1 /*-------------------------------------------------------------------------
2 *
3 * xlog.c
4 * PostgreSQL write-ahead log manager
5 *
6 *
7 * Portions Copyright (c) 1996-2019, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * src/backend/access/transam/xlog.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <math.h>
19 #include <time.h>
20 #include <fcntl.h>
21 #include <sys/stat.h>
22 #include <sys/time.h>
23 #include <unistd.h>
24
25 #include "access/clog.h"
26 #include "access/commit_ts.h"
27 #include "access/multixact.h"
28 #include "access/rewriteheap.h"
29 #include "access/subtrans.h"
30 #include "access/timeline.h"
31 #include "access/transam.h"
32 #include "access/tuptoaster.h"
33 #include "access/twophase.h"
34 #include "access/xact.h"
35 #include "access/xlog_internal.h"
36 #include "access/xloginsert.h"
37 #include "access/xlogreader.h"
38 #include "access/xlogutils.h"
39 #include "catalog/catversion.h"
40 #include "catalog/pg_control.h"
41 #include "catalog/pg_database.h"
42 #include "commands/tablespace.h"
43 #include "common/controldata_utils.h"
44 #include "miscadmin.h"
45 #include "pgstat.h"
46 #include "port/atomics.h"
47 #include "postmaster/bgwriter.h"
48 #include "postmaster/walwriter.h"
49 #include "postmaster/startup.h"
50 #include "replication/basebackup.h"
51 #include "replication/logical.h"
52 #include "replication/slot.h"
53 #include "replication/origin.h"
54 #include "replication/snapbuild.h"
55 #include "replication/walreceiver.h"
56 #include "replication/walsender.h"
57 #include "storage/bufmgr.h"
58 #include "storage/fd.h"
59 #include "storage/ipc.h"
60 #include "storage/large_object.h"
61 #include "storage/latch.h"
62 #include "storage/pmsignal.h"
63 #include "storage/predicate.h"
64 #include "storage/proc.h"
65 #include "storage/procarray.h"
66 #include "storage/reinit.h"
67 #include "storage/smgr.h"
68 #include "storage/spin.h"
69 #include "storage/sync.h"
70 #include "utils/builtins.h"
71 #include "utils/guc.h"
72 #include "utils/memutils.h"
73 #include "utils/ps_status.h"
74 #include "utils/relmapper.h"
75 #include "utils/snapmgr.h"
76 #include "utils/timestamp.h"
77 #include "pg_trace.h"
78
extern uint32 bootstrap_data_checksum_version;

/* Unsupported old recovery command file names (relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"

/*
 * User-settable parameters.  These are all GUCs; see the matching entries
 * in guc.c for descriptions, units, and valid ranges.
 */
int			max_wal_size_mb = 1024; /* 1 GB */
int			min_wal_size_mb = 80;	/* 80 MB */
int			wal_keep_segments = 0;
int			XLOGbuffers = -1;	/* NOTE(review): -1 presumably means "auto-size"; confirm in guc.c */
int			XLogArchiveTimeout = 0;
int			XLogArchiveMode = ARCHIVE_MODE_OFF;
char	   *XLogArchiveCommand = NULL;
bool		EnableHotStandby = false;
bool		fullPageWrites = true;
bool		wal_log_hints = false;
bool		wal_compression = false;
char	   *wal_consistency_checking_string = NULL;
bool	   *wal_consistency_checking = NULL;	/* parsed form of the string above */
bool		wal_init_zero = true;
bool		wal_recycle = true;
bool		log_checkpoints = false;
int			sync_method = DEFAULT_SYNC_METHOD;
int			wal_level = WAL_LEVEL_MINIMAL;
int			CommitDelay = 0;	/* precommit delay in microseconds */
int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int			wal_retrieve_retry_interval = 5000; /* NOTE(review): presumably msec; confirm */

#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif

/* Size of a WAL segment; fixed per cluster, defaulting at compile time */
int			wal_segment_size = DEFAULT_XLOG_SEG_SIZE;

/*
 * Number of WAL insertion locks to use. A higher value allows more insertions
 * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 * which needs to iterate all the locks.
 */
#define NUM_XLOGINSERT_LOCKS  8

/*
 * Max distance from last checkpoint, before triggering a new xlog-based
 * checkpoint.
 */
int			CheckPointSegments;

/* Estimated distance between checkpoints, in bytes */
static double CheckPointDistanceEstimate = 0;
static double PrevCheckPointDistance = 0;
130
/*
 * GUC support: possible values for the wal_sync_method GUC.  Only the
 * methods supported by the current platform are compiled in.
 */
const struct config_enum_entry sync_method_options[] = {
	{"fsync", SYNC_METHOD_FSYNC, false},
#ifdef HAVE_FSYNC_WRITETHROUGH
	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
#endif
#ifdef HAVE_FDATASYNC
	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
#endif
#ifdef OPEN_SYNC_FLAG
	{"open_sync", SYNC_METHOD_OPEN, false},
#endif
#ifdef OPEN_DATASYNC_FLAG
	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
#endif
	{NULL, 0, false}			/* list terminator */
};


/*
 * Possible values for the archive_mode GUC.
 *
 * Although only "on", "off", and "always" are documented,
 * we accept all the likely variants of "on" and "off".
 * (The entries flagged "true" are those undocumented aliases.)
 */
const struct config_enum_entry archive_mode_options[] = {
	{"always", ARCHIVE_MODE_ALWAYS, false},
	{"on", ARCHIVE_MODE_ON, false},
	{"off", ARCHIVE_MODE_OFF, false},
	{"true", ARCHIVE_MODE_ON, true},
	{"false", ARCHIVE_MODE_OFF, true},
	{"yes", ARCHIVE_MODE_ON, true},
	{"no", ARCHIVE_MODE_OFF, true},
	{"1", ARCHIVE_MODE_ON, true},
	{"0", ARCHIVE_MODE_OFF, true},
	{NULL, 0, false}			/* list terminator */
};

/* Possible values for the recovery_target_action GUC */
const struct config_enum_entry recovery_target_action_options[] = {
	{"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
	{"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
	{"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
	{NULL, 0, false}			/* list terminator */
};
175
/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the checkpointer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

/*
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system.
 */
TimeLineID	ThisTimeLineID = 0;

/*
 * Are we doing recovery from XLOG?
 *
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode". It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging). To check whether the system is in recovery regardless of which
 * process you're running in, use RecoveryInProgress() but only after shared
 * memory startup and lock initialization.
 */
bool		InRecovery = false;

/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
HotStandbyState standbyState = STANDBY_DISABLED;

/* NOTE(review): presumably start LSN of the last record replayed — confirm at use sites */
static XLogRecPtr LastRec;

/* Local copy of WalRcv->receivedUpto */
static XLogRecPtr receivedUpto = 0;
static TimeLineID receiveTLI = 0;

/*
 * abortedRecPtr is the start pointer of a broken record at end of WAL when
 * recovery completes; missingContrecPtr is the location of the first
 * contrecord that went missing. See CreateOverwriteContrecordRecord for
 * details.
 */
static XLogRecPtr abortedRecPtr;
static XLogRecPtr missingContrecPtr;

/*
 * During recovery, lastFullPageWrites keeps track of full_page_writes that
 * the replayed WAL records indicate. It's initialized with full_page_writes
 * that the recovery starting checkpoint record indicates, and then updated
 * each time XLOG_FPW_CHANGE record is replayed.
 */
static bool lastFullPageWrites;

/*
 * Local copy of the state tracked by SharedRecoveryState in shared memory.
 * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually
 * means "not known, need to check the shared state".
 */
static bool LocalRecoveryInProgress = true;

/*
 * Local copy of SharedHotStandbyActive variable. False actually means "not
 * known, need to check the shared state".
 */
static bool LocalHotStandbyActive = false;

/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress. But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int	LocalXLogInsertAllowed = -1;

/*
 * When ArchiveRecoveryRequested is set, archive recovery was requested,
 * ie. signal files were present. When InArchiveRecovery is set, we are
 * currently recovering using offline XLOG archives. These variables are only
 * valid in the startup process.
 *
 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 * currently performing crash recovery using only XLOG files in pg_wal, but
 * will switch to using offline XLOG archives as soon as we reach the end of
 * WAL in pg_wal.
 */
bool		ArchiveRecoveryRequested = false;
bool		InArchiveRecovery = false;

/* Which recovery signal files were found at startup (see readRecoverySignalFile) */
static bool standby_signal_file_found = false;
static bool recovery_signal_file_found = false;
269
/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;

/* Buffers dedicated to consistency checks of size BLCKSZ */
static char *replay_image_masked = NULL;
static char *master_image_masked = NULL;

/*
 * Options formerly taken from recovery.conf for archive recovery.
 * These are now ordinary GUCs; see the recovery_target_* parameters.
 */
char	   *recoveryRestoreCommand = NULL;
char	   *recoveryEndCommand = NULL;
char	   *archiveCleanupCommand = NULL;
RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
bool		recoveryTargetInclusive = true;
int			recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
TransactionId recoveryTargetXid;
char	   *recovery_target_time_string;
static TimestampTz recoveryTargetTime;
const char *recoveryTargetName;
XLogRecPtr	recoveryTargetLSN;
int			recovery_min_apply_delay = 0;
TimestampTz recoveryDelayUntilTime;

/* options formerly taken from recovery.conf for XLOG streaming */
bool		StandbyModeRequested = false;
char	   *PrimaryConnInfo = NULL;
char	   *PrimarySlotName = NULL;
char	   *PromoteTriggerFile = NULL;

/* are we currently in standby mode? */
bool		StandbyMode = false;

/* whether request for fast promotion has been made yet */
static bool fast_promote = false;

/*
 * if recoveryStopsBefore/After returns true, it saves information of the stop
 * point here
 */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static XLogRecPtr recoveryStopLSN;
static char recoveryStopName[MAXFNAMELEN];
static bool recoveryStopAfter;

/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated. To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated). We also need these
 * timeline values:
 *
 * recoveryTargetTimeLineGoal: what the user requested, if any
 *
 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
 *
 * recoveryTargetTLI: the currently understood target timeline; changes
 *
 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
 * the timelines of its known parents, newest first (so recoveryTargetTLI is
 * always the first list member). Only these TLIs are expected to be seen in
 * the WAL segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could
 * be scanning data that was copied from an ancestor timeline when the current
 * file was created.) During a sequential scan we do not allow this value
 * to decrease.
 */
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
TimeLineID	recoveryTargetTLIRequested = 0;
TimeLineID	recoveryTargetTLI = 0;
static List *expectedTLEs;
static TimeLineID curFileTLI;
345
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend. It is updated for all inserts. XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction has
 * created any XLOG records.
 *
 * While in parallel mode, this may not be fully up to date. When committing,
 * a transaction can assume this covers all xlog records written either by the
 * user backend or by any parallel worker which was present at any point during
 * the transaction. But when aborting, or when still in parallel mode, other
 * parallel backends may have written WAL records at later LSNs than the value
 * stored here. The parallel leader advances its own copy, when necessary,
 * in WaitForParallelWorkersToFinish.
 */
XLogRecPtr	ProcLastRecPtr = InvalidXLogRecPtr;
XLogRecPtr	XactLastRecEnd = InvalidXLogRecPtr;
XLogRecPtr	XactLastCommitEnd = InvalidXLogRecPtr;

/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
 * CHECKPOINT record). We update this from the shared-memory copy,
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 * hold an insertion lock). See XLogInsertRecord for details. We are also
 * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 * see GetRedoRecPtr. A freshly spawned backend obtains the value during
 * InitXLOGAccess.
 */
static XLogRecPtr RedoRecPtr;

/*
 * doPageWrites is this backend's local copy of (forcePageWrites ||
 * fullPageWrites). It is used together with RedoRecPtr to decide whether
 * a full-page image of a page needs to be taken.
 */
static bool doPageWrites;

/* Has the recovery code requested a walreceiver wakeup? */
static bool doRequestWalReceiverReply;

/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file. In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found. But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming. Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
398
399 /*----------
400 * Shared-memory data structures for XLOG control
401 *
402 * LogwrtRqst indicates a byte position that we need to write and/or fsync
403 * the log up to (all records before that point must be written or fsynced).
404 * LogwrtResult indicates the byte positions we have already written/fsynced.
405 * These structs are identical but are declared separately to indicate their
406 * slightly different functions.
407 *
408 * To read XLogCtl->LogwrtResult, you must hold either info_lck or
409 * WALWriteLock. To update it, you need to hold both locks. The point of
410 * this arrangement is that the value can be examined by code that already
411 * holds WALWriteLock without needing to grab info_lck as well. In addition
412 * to the shared variable, each backend has a private copy of LogwrtResult,
413 * which is updated when convenient.
414 *
415 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
416 * (protected by info_lck), but we don't need to cache any copies of it.
417 *
418 * info_lck is only held long enough to read/update the protected variables,
419 * so it's a plain spinlock. The other locks are held longer (potentially
420 * over I/O operations), so we use LWLocks for them. These locks are:
421 *
422 * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
423 * It is only held while initializing and changing the mapping. If the
424 * contents of the buffer being replaced haven't been written yet, the mapping
425 * lock is released while the write is done, and reacquired afterwards.
426 *
427 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
428 * XLogFlush).
429 *
430 * ControlFileLock: must be held to read/update control file or create
431 * new log file.
432 *
433 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
434 * only one checkpointer at a time; currently, with all checkpoints done by
435 * the checkpointer, this is just pro forma).
436 *
437 *----------
438 */
439
/*
 * Write/flush request and result positions.  The two structs have identical
 * layout but are declared separately to make clear which role (requested
 * vs. already-completed position) a given variable plays; see the long
 * comment above on LogwrtRqst/LogwrtResult locking rules.
 */
typedef struct XLogwrtRqst
{
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
} XLogwrtRqst;

typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;
451
452 /*
453 * Inserting to WAL is protected by a small fixed number of WAL insertion
454 * locks. To insert to the WAL, you must hold one of the locks - it doesn't
 * matter which one. To lock out other concurrent insertions, you must hold
 * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
457 * indicator of how far the insertion has progressed (insertingAt).
458 *
459 * The insertingAt values are read when a process wants to flush WAL from
460 * the in-memory buffers to disk, to check that all the insertions to the
461 * region the process is about to write out have finished. You could simply
462 * wait for all currently in-progress insertions to finish, but the
463 * insertingAt indicator allows you to ignore insertions to later in the WAL,
464 * so that you only wait for the insertions that are modifying the buffers
465 * you're about to write out.
466 *
467 * This isn't just an optimization. If all the WAL buffers are dirty, an
468 * inserter that's holding a WAL insert lock might need to evict an old WAL
469 * buffer, which requires flushing the WAL. If it's possible for an inserter
470 * to block on another inserter unnecessarily, deadlock can arise when two
471 * inserters holding a WAL insert lock wait for each other to finish their
472 * insertion.
473 *
474 * Small WAL records that don't cross a page boundary never update the value,
475 * the WAL record is just copied to the page and the lock is released. But
476 * to avoid the deadlock-scenario explained above, the indicator is always
477 * updated before sleeping while holding an insertion lock.
478 *
479 * lastImportantAt contains the LSN of the last important WAL record inserted
480 * using a given lock. This value is used to detect if there has been
481 * important WAL activity since the last time some action, like a checkpoint,
482 * was performed - allowing to not repeat the action if not. The LSN is
483 * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
484 * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
485 * records. Tracking the WAL activity directly in WALInsertLock has the
486 * advantage of not needing any additional locks to update the value.
487 */
/*
 * One WAL insertion lock: the lock proper plus the progress indicators
 * described in the comment block above.
 */
typedef struct
{
	LWLock		lock;			/* the lightweight lock itself */
	XLogRecPtr	insertingAt;	/* how far the current holder's insertion has
								 * progressed; used by WAL flushers to avoid
								 * waiting on insertions past the flush point */
	XLogRecPtr	lastImportantAt;	/* LSN of last "important" record inserted
									 * under this lock (never cleared, only
									 * advanced) */
} WALInsertLock;

/*
 * All the WAL insertion locks are allocated as an array in shared memory. We
 * force the array stride to be a power of 2, which saves a few cycles in
 * indexing, but more importantly also ensures that individual slots don't
 * cross cache line boundaries. (Of course, we have to also ensure that the
 * array start address is suitably aligned.)
 */
typedef union WALInsertLockPadded
{
	WALInsertLock l;
	char		pad[PG_CACHE_LINE_SIZE];
} WALInsertLockPadded;
507
/*
 * State of an exclusive backup, necessary to control concurrent activities
 * across sessions when working on exclusive backups.
 *
 * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
 * running, to be more precise pg_start_backup() is not being executed for
 * an exclusive backup and there is no exclusive backup in progress.
 * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
 * exclusive backup.
 * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
 * running and an exclusive backup is in progress. pg_stop_backup() is
 * needed to finish it.
 * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
 * exclusive backup.
 */
typedef enum ExclusiveBackupState
{
	EXCLUSIVE_BACKUP_NONE = 0,
	EXCLUSIVE_BACKUP_STARTING,
	EXCLUSIVE_BACKUP_IN_PROGRESS,
	EXCLUSIVE_BACKUP_STOPPING
} ExclusiveBackupState;

/*
 * Session status of running backup, used for sanity checks in SQL-callable
 * functions to start and stop backups.
 */
static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
536
/*
 * Shared state data for WAL insertion.  Lives inside XLogCtlData; the field
 * layout is deliberate — see the cache-line comment below before reordering.
 */
typedef struct XLogCtlInsert
{
	slock_t		insertpos_lck;	/* protects CurrBytePos and PrevBytePos */

	/*
	 * CurrBytePos is the end of reserved WAL. The next record will be
	 * inserted at that position. PrevBytePos is the start position of the
	 * previously inserted (or rather, reserved) record - it is copied to the
	 * prev-link of the next record. These are stored as "usable byte
	 * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
	 */
	uint64		CurrBytePos;
	uint64		PrevBytePos;

	/*
	 * Make sure the above heavily-contended spinlock and byte positions are
	 * on their own cache line. In particular, the RedoRecPtr and full page
	 * write variables below should be on a different cache line. They are
	 * read on every WAL insertion, but updated rarely, and we don't want
	 * those reads to steal the cache line containing Curr/PrevBytePos.
	 */
	char		pad[PG_CACHE_LINE_SIZE];

	/*
	 * fullPageWrites is the master copy used by all backends to determine
	 * whether to write full-page to WAL, instead of using process-local one.
	 * This is required because, when full_page_writes is changed by SIGHUP,
	 * we must WAL-log it before it actually affects WAL-logging by backends.
	 * Checkpointer sets at startup or after SIGHUP.
	 *
	 * To read these fields, you must hold an insertion lock. To modify them,
	 * you must hold ALL the locks.
	 */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
	bool		fullPageWrites;

	/*
	 * exclusiveBackupState indicates the state of an exclusive backup (see
	 * comments of ExclusiveBackupState for more details). nonExclusiveBackups
	 * is a counter indicating the number of streaming base backups currently
	 * in progress. forcePageWrites is set to true when either of these is
	 * non-zero. lastBackupStart is the latest checkpoint redo location used
	 * as a starting point for an online backup.
	 */
	ExclusiveBackupState exclusiveBackupState;
	int			nonExclusiveBackups;
	XLogRecPtr	lastBackupStart;

	/*
	 * WAL insertion locks.
	 */
	WALInsertLockPadded *WALInsertLocks;
} XLogCtlInsert;
594
/*
 * Total shared-memory state for XLOG.
 */
typedef struct XLogCtlData
{
	XLogCtlInsert Insert;

	/* Protected by info_lck: */
	XLogwrtRqst LogwrtRqst;
	XLogRecPtr	RedoRecPtr;		/* a recent copy of Insert->RedoRecPtr */
	FullTransactionId ckptFullXid;	/* nextFullXid of latest checkpoint */
	XLogRecPtr	asyncXactLSN;	/* LSN of newest async commit/abort */
	XLogRecPtr	replicationSlotMinLSN;	/* oldest LSN needed by any slot */

	XLogSegNo	lastRemovedSegNo;	/* latest removed/recycled XLOG segment */

	/* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
	XLogRecPtr	unloggedLSN;
	slock_t		ulsn_lck;

	/* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
	pg_time_t	lastSegSwitchTime;
	XLogRecPtr	lastSegSwitchLSN;

	/*
	 * Protected by info_lck and WALWriteLock (you must hold either lock to
	 * read it, but both to update)
	 */
	XLogwrtResult LogwrtResult;

	/*
	 * Latest initialized page in the cache (last byte position + 1).
	 *
	 * To change the identity of a buffer (and InitializedUpTo), you need to
	 * hold WALBufMappingLock. To change the identity of a buffer that's
	 * still dirty, the old page needs to be written out first, and for that
	 * you need WALWriteLock, and you need to ensure that there are no
	 * in-progress insertions to the page by calling
	 * WaitXLogInsertionsToFinish().
	 */
	XLogRecPtr	InitializedUpTo;

	/*
	 * These values do not change after startup, although the pointed-to pages
	 * and xlblocks values certainly do. xlblock values are protected by
	 * WALBufMappingLock.
	 */
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */

	/*
	 * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
	 * If we created a new timeline when the system was started up,
	 * PrevTimeLineID is the old timeline's ID that we forked off from.
	 * Otherwise it's equal to ThisTimeLineID.
	 */
	TimeLineID	ThisTimeLineID;
	TimeLineID	PrevTimeLineID;

	/*
	 * SharedRecoveryState indicates if we're still in crash or archive
	 * recovery. Protected by info_lck.
	 */
	RecoveryState SharedRecoveryState;

	/*
	 * SharedHotStandbyActive indicates whether Hot Standby is active, ie.
	 * the shared counterpart of LocalHotStandbyActive above. Protected by
	 * info_lck. (The previous comment here was a copy-paste of the
	 * SharedRecoveryState text.)
	 */
	bool		SharedHotStandbyActive;

	/*
	 * WalWriterSleeping indicates whether the WAL writer is currently in
	 * low-power mode (and hence should be nudged if an async commit occurs).
	 * Protected by info_lck.
	 */
	bool		WalWriterSleeping;

	/*
	 * recoveryWakeupLatch is used to wake up the startup process to continue
	 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
	 * to appear.
	 */
	Latch		recoveryWakeupLatch;

	/*
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * lastCheckPointRecPtr points to start of checkpoint record and
	 * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
	 * checkpointer when it wants to create a restartpoint.
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	XLogRecPtr	lastCheckPointEndPtr;
	CheckPoint	lastCheckPoint;

	/*
	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
	 * replayed. When we're currently replaying a record, ie. in a redo
	 * function, replayEndRecPtr points to the end+1 of the record being
	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
	 */
	XLogRecPtr	lastReplayedEndRecPtr;
	TimeLineID	lastReplayedTLI;
	XLogRecPtr	replayEndRecPtr;
	TimeLineID	replayEndTLI;
	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
	TimestampTz recoveryLastXTime;

	/*
	 * timestamp of when we started replaying the current chunk of WAL data,
	 * only relevant for replication or archive recovery
	 */
	TimestampTz currentChunkStartTime;
	/* Are we requested to pause recovery? */
	bool		recoveryPause;

	/*
	 * lastFpwDisableRecPtr points to the start of the last replayed
	 * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
	 */
	XLogRecPtr	lastFpwDisableRecPtr;

	slock_t		info_lck;		/* locks shared variables shown above */
} XLogCtlData;

static XLogCtlData *XLogCtl = NULL;
724
/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
static WALInsertLockPadded *WALInsertLocks = NULL;

/*
 * We maintain an image of pg_control in shared memory.
 */
static ControlFileData *ControlFile = NULL;

/*
 * Calculate the amount of space left on the page after 'endptr'. Beware
 * multiple evaluation!
 */
#define INSERT_FREESPACE(endptr)	\
	(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))

/* Macro to advance to next buffer index (wraps at XLogCacheBlck). */
#define NextBufIdx(idx)		\
	(((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))

/*
 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 * would hold if it was in cache, the page containing 'recptr'.
 */
#define XLogRecPtrToBufIdx(recptr)	\
	(((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))

/*
 * These are the number of bytes in a WAL page usable for WAL data.
 */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)

/*
 * Convert min_wal_size_mb and max_wal_size_mb to equivalent segment count.
 * Rounds down.
 */
#define ConvertToXSegs(x, segsize)	\
	((x) / ((segsize) / (1024 * 1024)))

/* The number of bytes in a WAL segment usable for WAL data. */
static int	UsableBytesInSegment;

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {0, 0};
771
/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.  The values are made explicit so that the mapping
 * onto xlogSourceNames below is obvious.
 */
typedef enum
{
	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
	XLOG_FROM_ARCHIVE = 1,		/* restored using restore_command */
	XLOG_FROM_PG_WAL = 2,		/* existing file in pg_wal */
	XLOG_FROM_STREAM = 3		/* streamed from master */
} XLogSource;

/* human-readable names for XLogSources, for debugging output */
static const char *xlogSourceNames[] = {
	"any",						/* XLOG_FROM_ANY */
	"archive",					/* XLOG_FROM_ARCHIVE */
	"pg_wal",					/* XLOG_FROM_PG_WAL */
	"stream"					/* XLOG_FROM_STREAM */
};
786
787 /*
788 * openLogFile is -1 or a kernel FD for an open log file segment.
789 * openLogSegNo identifies the segment. These variables are only used to
790 * write the XLOG, and so will normally refer to the active segment.
791 */
792 static int openLogFile = -1;
793 static XLogSegNo openLogSegNo = 0;
794
795 /*
796 * These variables are used similarly to the ones above, but for reading
797 * the XLOG. readOff is the offset of the page just read, readLen
798 * indicates how much of it has been read into readBuf, and readSource
799 * indicates where we got the currently open file from.
800 */
801 static int readFile = -1;
802 static XLogSegNo readSegNo = 0;
803 static uint32 readOff = 0;
804 static uint32 readLen = 0;
805 static XLogSource readSource = 0; /* XLOG_FROM_* code */
806
807 /*
808 * Keeps track of which source we're currently reading from. This is
809 * different from readSource in that this is always set, even when we don't
810 * currently have a WAL file open. If lastSourceFailed is set, our last
811 * attempt to read from currentSource failed, and we should try another source
812 * next.
813 */
814 static XLogSource currentSource = 0; /* XLOG_FROM_* code */
815 static bool lastSourceFailed = false;
816
817 typedef struct XLogPageReadPrivate
818 {
819 int emode;
820 bool fetching_ckpt; /* are we fetching a checkpoint record? */
821 bool randAccess;
822 } XLogPageReadPrivate;
823
824 /*
825 * These variables track when we last obtained some WAL data to process,
826 * and where we got it from. (XLogReceiptSource is initially the same as
827 * readSource, but readSource gets reset to zero when we don't have data
828 * to process right now. It is also different from currentSource, which
829 * also changes when we try to read from a source and fail, while
830 * XLogReceiptSource tracks where we last successfully read some WAL.)
831 */
832 static TimestampTz XLogReceiptTime = 0;
833 static XLogSource XLogReceiptSource = 0; /* XLOG_FROM_* code */
834
835 /* State information for XLOG reading */
836 static XLogRecPtr ReadRecPtr; /* start of last record read */
837 static XLogRecPtr EndRecPtr; /* end+1 of last record read */
838
839 /*
840 * Local copies of equivalent fields in the control file. When running
841 * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
842 * expect to replay all the WAL available, and updateMinRecoveryPoint is
843 * switched to false to prevent any updates while replaying records.
844 * Those values are kept consistent as long as crash recovery runs.
845 */
846 static XLogRecPtr minRecoveryPoint;
847 static TimeLineID minRecoveryPointTLI;
848 static bool updateMinRecoveryPoint = true;
849
850 /*
851 * Have we reached a consistent database state? In crash recovery, we have
852 * to replay all the WAL, so reachedConsistency is never set. During archive
853 * recovery, the database is consistent once minRecoveryPoint is reached.
854 */
855 bool reachedConsistency = false;
856
857 static bool InRedo = false;
858
859 /* Have we launched bgwriter during recovery? */
860 static bool bgwriterLaunched = false;
861
862 /* For WALInsertLockAcquire/Release functions */
863 static int MyLockNo = 0;
864 static bool holdingAllLocks = false;
865
866 #ifdef WAL_DEBUG
867 static MemoryContext walDebugCxt = NULL;
868 #endif
869
/* Recovery control: signal files, stop/pause/delay decisions, parameters */
static void readRecoverySignalFile(void);
static void validateRecoveryParameters(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
static bool recoveryStopsBefore(XLogReaderState *record);
static bool recoveryStopsAfter(XLogReaderState *record);
static void recoveryPausesHere(void);
static bool recoveryApplyDelay(XLogReaderState *record);
static void SetLatestXTime(TimestampTz xtime);
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
								TimeLineID prevTLI);
static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
									  XLogReaderState *state);
static void LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);

/* WAL buffer management, segment files, and checkpoint plumbing */
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
								   bool find_free, XLogSegNo max_segno,
								   bool use_lock);
static int	XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
						 int source, bool notfoundOk);
static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
static int	XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
						 int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
						 TimeLineID *readTLI);
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
										bool fetching_ckpt, XLogRecPtr tliRecPtr);
static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveTempXlogFiles(void);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr);
static void RemoveXlogFile(const char *segname, XLogRecPtr lastredoptr, XLogRecPtr endptr);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);

/* Record reading during recovery, control file access */
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
							  int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
										XLogRecPtr RecPtr, int whichChkpti, bool report);
static bool rescanLatestTimeLine(void);
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(pg_time_t tnow);
static bool CheckForStandbyTrigger(void);

#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
#endif
static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
static void pg_start_backup_callback(int code, Datum arg);
static void pg_stop_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
							  bool *backupEndRequired, bool *backupFromStandby);
static bool read_tablespace_map(List **tablespaces);

static void rm_redo_error_callback(void *arg);
static int	get_sync_bit(int method);

/* WAL insertion: space reservation, record copy, position conversions */
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
								XLogRecData *rdata,
								XLogRecPtr StartPos, XLogRecPtr EndPos);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
									  XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
							  XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
static void checkXLogConsistency(XLogReaderState *record);

/* WAL insertion lock helpers */
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
958
/*
 * Insert an XLOG record represented by an already-constructed chain of data
 * chunks.  This is a low-level routine; to construct the WAL record header
 * and data, use the higher-level routines in xloginsert.c.
 *
 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 * WAL record applies to, that were not included in the record as full page
 * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
 * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 * record is always inserted.
 *
 * 'flags' gives more in-depth control on the record being inserted. See
 * XLogSetRecordFlags() for details.
 *
 * The first XLogRecData in the chain must be for the record header, and its
 * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 * xl_crc fields in the header, the rest of the header must already be filled
 * by the caller.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata,
				 XLogRecPtr fpw_lsn,
				 uint8 flags)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	pg_crc32c	rdata_crc;
	bool		inserted;
	XLogRecord *rechdr = (XLogRecord *) rdata->data;
	uint8		info = rechdr->xl_info & ~XLR_INFO_MASK;
	bool		isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
							   info == XLOG_SWITCH);
	XLogRecPtr	StartPos;
	XLogRecPtr	EndPos;
	bool		prevDoPageWrites = doPageWrites;

	/* we assume that all of the record header is in the first chunk */
	Assert(rdata->len >= SizeOfXLogRecord);

	/* cross-check on whether we should be here or not */
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");

	/*----------
	 *
	 * We have now done all the preparatory work we can without holding a
	 * lock or modifying shared state. From here on, inserting the new WAL
	 * record to the shared WAL buffer cache is a two-step process:
	 *
	 * 1. Reserve the right amount of space from the WAL. The current head of
	 *	  reserved space is kept in Insert->CurrBytePos, and is protected by
	 *	  insertpos_lck.
	 *
	 * 2. Copy the record to the reserved WAL space. This involves finding the
	 *	  correct WAL buffer containing the reserved space, and copying the
	 *	  record in place. This can be done concurrently in multiple processes.
	 *
	 * To keep track of which insertions are still in-progress, each concurrent
	 * inserter acquires an insertion lock. In addition to just indicating that
	 * an insertion is in progress, the lock tells others how far the inserter
	 * has progressed. There is a small fixed number of insertion locks,
	 * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
	 * boundary, it updates the value stored in the lock to the how far it has
	 * inserted, to allow the previous buffer to be flushed.
	 *
	 * Holding onto an insertion lock also protects RedoRecPtr and
	 * fullPageWrites from changing until the insertion is finished.
	 *
	 * Step 2 can usually be done completely in parallel. If the required WAL
	 * page is not initialized yet, you have to grab WALBufMappingLock to
	 * initialize it, but the WAL writer tries to do that ahead of insertions
	 * to avoid that from happening in the critical path.
	 *
	 *----------
	 */
	START_CRIT_SECTION();
	if (isLogSwitch)
		WALInsertLockAcquireExclusive();
	else
		WALInsertLockAcquire();

	/*
	 * Check to see if my copy of RedoRecPtr is out of date. If so, may have
	 * to go back and have the caller recompute everything. This can only
	 * happen just after a checkpoint, so it's better to be slow in this case
	 * and fast otherwise.
	 *
	 * Also check to see if fullPageWrites or forcePageWrites was just turned
	 * on; if we weren't already doing full-page writes then go back and
	 * recompute.
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.  (If doPageWrites was just turned off,
	 * we could recompute the record without full pages, but we choose not to
	 * bother.)
	 */
	if (RedoRecPtr != Insert->RedoRecPtr)
	{
		Assert(RedoRecPtr < Insert->RedoRecPtr);
		RedoRecPtr = Insert->RedoRecPtr;
	}
	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

	if (doPageWrites &&
		(!prevDoPageWrites ||
		 (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
	{
		/*
		 * Oops, some buffer now needs to be backed up that the caller didn't
		 * back up.  Start over.
		 */
		WALInsertLockRelease();
		END_CRIT_SECTION();
		return InvalidXLogRecPtr;
	}

	/*
	 * Reserve space for the record in the WAL. This also sets the xl_prev
	 * pointer.
	 */
	if (isLogSwitch)
		inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
	else
	{
		ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
								  &rechdr->xl_prev);
		inserted = true;
	}

	if (inserted)
	{
		/*
		 * Now that xl_prev has been filled in, calculate CRC of the record
		 * header.  (The CRC over the record *data* was already accumulated
		 * into xl_crc by the caller; here we finish it off with the header
		 * bytes up to, but not including, the xl_crc field itself.)
		 */
		rdata_crc = rechdr->xl_crc;
		COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
		FIN_CRC32C(rdata_crc);
		rechdr->xl_crc = rdata_crc;

		/*
		 * All the record data, including the header, is now ready to be
		 * inserted. Copy the record in the space reserved.
		 */
		CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
							StartPos, EndPos);

		/*
		 * Unless record is flagged as not important, update LSN of last
		 * important record in the current slot. When holding all locks, just
		 * update the first one.
		 */
		if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
		{
			int			lockno = holdingAllLocks ? 0 : MyLockNo;

			WALInsertLocks[lockno].l.lastImportantAt = StartPos;
		}
	}
	else
	{
		/*
		 * This was an xlog-switch record, but the current insert location was
		 * already exactly at the beginning of a segment, so there was no need
		 * to do anything.
		 */
	}

	/*
	 * Done! Let others know that we're finished.
	 */
	WALInsertLockRelease();

	MarkCurrentTransactionIdLoggedIfAny();

	END_CRIT_SECTION();

	/*
	 * Update shared LogwrtRqst.Write, if we crossed page boundary.
	 */
	if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		/* advance global request to include new block(s) */
		if (XLogCtl->LogwrtRqst.Write < EndPos)
			XLogCtl->LogwrtRqst.Write = EndPos;
		/* update local result copy while I have the chance */
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);
	}

	/*
	 * If this was an XLOG_SWITCH record, flush the record and the empty
	 * padding space that fills the rest of the segment, and perform
	 * end-of-segment actions (eg, notifying archiver).
	 */
	if (isLogSwitch)
	{
		TRACE_POSTGRESQL_WAL_SWITCH();
		XLogFlush(EndPos);

		/*
		 * Even though we reserved the rest of the segment for us, which is
		 * reflected in EndPos, we return a pointer to just the end of the
		 * xlog-switch record.  If the record's end lands on a page boundary,
		 * account for the next page's header so the returned LSN points at
		 * real record space.
		 */
		if (inserted)
		{
			EndPos = StartPos + SizeOfXLogRecord;
			if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
			{
				uint64		offset = XLogSegmentOffset(EndPos, wal_segment_size);

				if (offset == EndPos % XLOG_BLCKSZ)
					EndPos += SizeOfXLogLongPHD;
				else
					EndPos += SizeOfXLogShortPHD;
			}
		}
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
	{
		static XLogReaderState *debug_reader = NULL;
		StringInfoData buf;
		StringInfoData recordBuf;
		char	   *errormsg = NULL;
		MemoryContext oldCxt;

		oldCxt = MemoryContextSwitchTo(walDebugCxt);

		initStringInfo(&buf);
		appendStringInfo(&buf, "INSERT @ %X/%X: ",
						 (uint32) (EndPos >> 32), (uint32) EndPos);

		/*
		 * We have to piece together the WAL record data from the XLogRecData
		 * entries, so that we can pass it to the rm_desc function as one
		 * contiguous chunk.
		 */
		initStringInfo(&recordBuf);
		for (; rdata != NULL; rdata = rdata->next)
			appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

		/* the reader is allocated lazily and reused across calls */
		if (!debug_reader)
			debug_reader = XLogReaderAllocate(wal_segment_size, NULL, NULL);

		if (!debug_reader)
		{
			appendStringInfoString(&buf, "error decoding record: out of memory");
		}
		else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
								   &errormsg))
		{
			appendStringInfo(&buf, "error decoding record: %s",
							 errormsg ? errormsg : "no error message");
		}
		else
		{
			appendStringInfoString(&buf, " - ");
			xlog_outdesc(&buf, debug_reader);
		}
		elog(LOG, "%s", buf.data);

		pfree(buf.data);
		pfree(recordBuf.data);
		MemoryContextSwitchTo(oldCxt);
	}
#endif

	/*
	 * Update our global variables
	 */
	ProcLastRecPtr = StartPos;
	XactLastRecEnd = EndPos;

	return EndPos;
}
1245
/*
 * Reserves the right amount of space for a record of given size from the WAL.
 * *StartPos is set to the beginning of the reserved section, *EndPos to
 * its end+1. *PrevPtr is set to the beginning of the previous record; it is
 * used to set the xl_prev of this record.
 *
 * This is the performance critical part of XLogInsert that must be serialized
 * across backends. The rest can happen mostly in parallel. Try to keep this
 * section as short as possible, insertpos_lck can be heavily contended on a
 * busy system.
 *
 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
 * where we actually copy the record to the reserved space.
 */
static void
ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
						  XLogRecPtr *PrevPtr)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint64		startbytepos;
	uint64		endbytepos;
	uint64		prevbytepos;

	/* round up so the next record starts MAXALIGNed */
	size = MAXALIGN(size);

	/* All (non xlog-switch) records should contain data. */
	Assert(size > SizeOfXLogRecord);

	/*
	 * The duration the spinlock needs to be held is minimized by minimizing
	 * the calculations that have to be done while holding the lock. The
	 * current tip of reserved WAL is kept in CurrBytePos, as a byte position
	 * that only counts "usable" bytes in WAL, that is, it excludes all WAL
	 * page headers. The mapping between "usable" byte positions and physical
	 * positions (XLogRecPtrs) can be done outside the locked region, and
	 * because the usable byte position doesn't include any headers, reserving
	 * X bytes from WAL is almost as simple as "CurrBytePos += X".
	 */
	SpinLockAcquire(&Insert->insertpos_lck);

	startbytepos = Insert->CurrBytePos;
	endbytepos = startbytepos + size;
	prevbytepos = Insert->PrevBytePos;
	Insert->CurrBytePos = endbytepos;
	Insert->PrevBytePos = startbytepos;

	SpinLockRelease(&Insert->insertpos_lck);

	/* convert byte positions to XLogRecPtrs outside the spinlock */
	*StartPos = XLogBytePosToRecPtr(startbytepos);
	*EndPos = XLogBytePosToEndRecPtr(endbytepos);
	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);

	/*
	 * Check that the conversions between "usable byte positions" and
	 * XLogRecPtrs work consistently in both directions.
	 */
	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
}
1306
/*
 * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
 *
 * A log-switch record is handled slightly differently. The rest of the
 * segment will be reserved for this insertion, as indicated by the returned
 * *EndPos value. However, if we are already at the beginning of the current
 * segment, *StartPos and *EndPos are set to the current location without
 * reserving any space, and the function returns false.
 */
static bool
ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint64		startbytepos;
	uint64		endbytepos;
	uint64		prevbytepos;
	uint32		size = MAXALIGN(SizeOfXLogRecord);
	XLogRecPtr	ptr;
	uint32		segleft;

	/*
	 * These calculations are a bit heavy-weight to be done while holding a
	 * spinlock, but since we're holding all the WAL insertion locks, there
	 * are no other inserters competing for it. GetXLogInsertRecPtr() does
	 * compete for it, but that's not called very frequently.
	 */
	SpinLockAcquire(&Insert->insertpos_lck);

	startbytepos = Insert->CurrBytePos;

	/* already exactly at a segment boundary?  then there's nothing to do */
	ptr = XLogBytePosToEndRecPtr(startbytepos);
	if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
	{
		SpinLockRelease(&Insert->insertpos_lck);
		*EndPos = *StartPos = ptr;
		return false;
	}

	endbytepos = startbytepos + size;
	prevbytepos = Insert->PrevBytePos;

	*StartPos = XLogBytePosToRecPtr(startbytepos);
	*EndPos = XLogBytePosToEndRecPtr(endbytepos);

	/* reserve everything up to the start of the next segment */
	segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
	if (segleft != wal_segment_size)
	{
		/* consume the rest of the segment */
		*EndPos += segleft;
		endbytepos = XLogRecPtrToBytePos(*EndPos);
	}
	Insert->CurrBytePos = endbytepos;
	Insert->PrevBytePos = startbytepos;

	SpinLockRelease(&Insert->insertpos_lck);

	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);

	Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);

	return true;
}
1372
/*
 * Checks whether the current buffer page and backup page stored in the
 * WAL record are consistent or not. Before comparing the two pages, a
 * masking can be applied to the pages to ignore certain areas like hint bits,
 * unused space between pd_lower and pd_upper among other things. This
 * function should be called once WAL replay has been completed for a
 * given record.
 */
static void
checkXLogConsistency(XLogReaderState *record)
{
	RmgrId		rmid = XLogRecGetRmid(record);
	RelFileNode rnode;
	ForkNumber	forknum;
	BlockNumber blkno;
	int			block_id;

	/* Records with no backup blocks have no need for consistency checks. */
	if (!XLogRecHasAnyBlockRefs(record))
		return;

	Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);

	for (block_id = 0; block_id <= record->max_block_id; block_id++)
	{
		Buffer		buf;
		Page		page;

		if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
		{
			/*
			 * WAL record doesn't contain a block reference with the given id.
			 * Do nothing.
			 */
			continue;
		}

		Assert(XLogRecHasBlockImage(record, block_id));

		if (XLogRecBlockImageApply(record, block_id))
		{
			/*
			 * WAL record has already applied the page, so bypass the
			 * consistency check as that would result in comparing the full
			 * page stored in the record with itself.
			 */
			continue;
		}

		/*
		 * Read the contents from the current buffer and store it in a
		 * temporary page.
		 */
		buf = XLogReadBufferExtended(rnode, forknum, blkno,
									 RBM_NORMAL_NO_LOG);
		if (!BufferIsValid(buf))
			continue;

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);

		/*
		 * Take a copy of the local page where WAL has been applied to have a
		 * comparison base before masking it...
		 */
		memcpy(replay_image_masked, page, BLCKSZ);

		/* No need for this page anymore now that a copy is in. */
		UnlockReleaseBuffer(buf);

		/*
		 * If the block LSN is already ahead of this WAL record, we can't
		 * expect contents to match.  This can happen if recovery is
		 * restarted.
		 */
		if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
			continue;

		/*
		 * Read the contents from the backup copy, stored in WAL record and
		 * store it in a temporary page. There is no need to allocate a new
		 * page here, a local buffer is fine to hold its contents and a mask
		 * can be directly applied on it.
		 */
		if (!RestoreBlockImage(record, block_id, master_image_masked))
			elog(ERROR, "failed to restore block image");

		/*
		 * If masking function is defined, mask both the master and replay
		 * images
		 */
		if (RmgrTable[rmid].rm_mask != NULL)
		{
			RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
			RmgrTable[rmid].rm_mask(master_image_masked, blkno);
		}

		/* Time to compare the master and replay images. */
		if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
		{
			/* FATAL: a mismatch here means replay diverged from the master */
			elog(FATAL,
				 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
				 rnode.spcNode, rnode.dbNode, rnode.relNode,
				 forknum, blkno);
		}
	}
}
1480
/*
 * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
 * area in the WAL.
 *
 * The record data chain 'rdata' is copied into the WAL buffers starting at
 * StartPos; page headers encountered along the way are skipped over and have
 * their continuation info (xlp_rem_len, XLP_FIRST_IS_CONTRECORD) filled in.
 * The space accounting here must agree exactly with what
 * ReserveXLogInsertLocation/ReserveXLogSwitch reserved, which is verified by
 * the PANIC check at the end.
 */
static void
CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
					XLogRecPtr StartPos, XLogRecPtr EndPos)
{
	char	   *currpos;
	int			freespace;
	int			written;
	XLogRecPtr	CurrPos;
	XLogPageHeader pagehdr;

	/*
	 * Get a pointer to the right place in the right WAL buffer to start
	 * inserting to.
	 */
	CurrPos = StartPos;
	currpos = GetXLogBuffer(CurrPos);
	freespace = INSERT_FREESPACE(CurrPos);

	/*
	 * there should be enough space for at least the first field (xl_tot_len)
	 * on this page.
	 */
	Assert(freespace >= sizeof(uint32));

	/* Copy record data */
	written = 0;
	while (rdata != NULL)
	{
		char	   *rdata_data = rdata->data;
		int			rdata_len = rdata->len;

		while (rdata_len > freespace)
		{
			/*
			 * Write what fits on this page, and continue on the next page.
			 */
			Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
			memcpy(currpos, rdata_data, freespace);
			rdata_data += freespace;
			rdata_len -= freespace;
			written += freespace;
			CurrPos += freespace;

			/*
			 * Get pointer to beginning of next page, and set the xlp_rem_len
			 * in the page header. Set XLP_FIRST_IS_CONTRECORD.
			 *
			 * It's safe to set the contrecord flag and xlp_rem_len without a
			 * lock on the page. All the other flags were already set when the
			 * page was initialized, in AdvanceXLInsertBuffer, and we're the
			 * only backend that needs to set the contrecord flag.
			 */
			currpos = GetXLogBuffer(CurrPos);
			pagehdr = (XLogPageHeader) currpos;
			pagehdr->xlp_rem_len = write_len - written;
			pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;

			/* skip over the page header (long variant on a segment's first page) */
			if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
			{
				CurrPos += SizeOfXLogLongPHD;
				currpos += SizeOfXLogLongPHD;
			}
			else
			{
				CurrPos += SizeOfXLogShortPHD;
				currpos += SizeOfXLogShortPHD;
			}
			freespace = INSERT_FREESPACE(CurrPos);
		}

		/* the rest of this chunk fits on the current page */
		Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
		memcpy(currpos, rdata_data, rdata_len);
		currpos += rdata_len;
		CurrPos += rdata_len;
		freespace -= rdata_len;
		written += rdata_len;

		rdata = rdata->next;
	}
	Assert(written == write_len);

	/*
	 * If this was an xlog-switch, it's not enough to write the switch record,
	 * we also have to consume all the remaining space in the WAL segment.  We
	 * have already reserved that space, but we need to actually fill it.
	 */
	if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
	{
		/* An xlog-switch record doesn't contain any data besides the header */
		Assert(write_len == SizeOfXLogRecord);

		/* Assert that we did reserve the right amount of space */
		Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);

		/* Use up all the remaining space on the current page */
		CurrPos += freespace;

		/*
		 * Cause all remaining pages in the segment to be flushed, leaving the
		 * XLog position where it should be, at the start of the next segment.
		 * We do this one page at a time, to make sure we don't deadlock
		 * against ourselves if wal_buffers < wal_segment_size.
		 */
		while (CurrPos < EndPos)
		{
			/*
			 * The minimal action to flush the page would be to call
			 * WALInsertLockUpdateInsertingAt(CurrPos) followed by
			 * AdvanceXLInsertBuffer(...).  The page would be left initialized
			 * mostly to zeros, except for the page header (always the short
			 * variant, as this is never a segment's first page).
			 *
			 * The large vistas of zeros are good for compressibility, but the
			 * headers interrupting them every XLOG_BLCKSZ (with values that
			 * differ from page to page) are not. The effect varies with
			 * compression tool, but bzip2 for instance compresses about an
			 * order of magnitude worse if those headers are left in place.
			 *
			 * Rather than complicating AdvanceXLInsertBuffer itself (which is
			 * called in heavily-loaded circumstances as well as this lightly-
			 * loaded one) with variant behavior, we just use GetXLogBuffer
			 * (which itself calls the two methods we need) to get the pointer
			 * and zero most of the page.  Then we just zero the page header.
			 */
			currpos = GetXLogBuffer(CurrPos);
			MemSet(currpos, 0, SizeOfXLogShortPHD);

			CurrPos += XLOG_BLCKSZ;
		}
	}
	else
	{
		/* Align the end position, so that the next record starts aligned */
		CurrPos = MAXALIGN64(CurrPos);
	}

	if (CurrPos != EndPos)
		elog(PANIC, "space reserved for WAL record does not match what was written");
}
1625
1626 /*
1627 * Acquire a WAL insertion lock, for inserting to WAL.
1628 */
1629 static void
WALInsertLockAcquire(void)1630 WALInsertLockAcquire(void)
1631 {
1632 bool immed;
1633
1634 /*
1635 * It doesn't matter which of the WAL insertion locks we acquire, so try
1636 * the one we used last time. If the system isn't particularly busy, it's
1637 * a good bet that it's still available, and it's good to have some
1638 * affinity to a particular lock so that you don't unnecessarily bounce
1639 * cache lines between processes when there's no contention.
1640 *
1641 * If this is the first time through in this backend, pick a lock
1642 * (semi-)randomly. This allows the locks to be used evenly if you have a
1643 * lot of very short connections.
1644 */
1645 static int lockToTry = -1;
1646
1647 if (lockToTry == -1)
1648 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1649 MyLockNo = lockToTry;
1650
1651 /*
1652 * The insertingAt value is initially set to 0, as we don't know our
1653 * insert location yet.
1654 */
1655 immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1656 if (!immed)
1657 {
1658 /*
1659 * If we couldn't get the lock immediately, try another lock next
1660 * time. On a system with more insertion locks than concurrent
1661 * inserters, this causes all the inserters to eventually migrate to a
1662 * lock that no-one else is using. On a system with more inserters
1663 * than locks, it still helps to distribute the inserters evenly
1664 * across the locks.
1665 */
1666 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1667 }
1668 }
1669
1670 /*
1671 * Acquire all WAL insertion locks, to prevent other backends from inserting
1672 * to WAL.
1673 */
static void
WALInsertLockAcquireExclusive(void)
{
	int			i;

	/*
	 * When holding all the locks, all but the last lock's insertingAt
	 * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
	 * XLogRecPtr value, to make sure that no-one blocks waiting on those.
	 */
	for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
	{
		LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
		LWLockUpdateVar(&WALInsertLocks[i].l.lock,
						&WALInsertLocks[i].l.insertingAt,
						PG_UINT64_MAX);
	}
	/* Variable value reset to 0 at release */
	LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);

	/* Tells WALInsertLockRelease() to release all locks, not just one. */
	holdingAllLocks = true;
}
1696
1697 /*
1698 * Release our insertion lock (or locks, if we're holding them all).
1699 *
1700 * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1701 * next time the lock is acquired.
1702 */
static void
WALInsertLockRelease(void)
{
	if (holdingAllLocks)
	{
		/* Set by WALInsertLockAcquireExclusive(): release every lock. */
		int			i;

		for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
			LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
								  &WALInsertLocks[i].l.insertingAt,
								  0);

		holdingAllLocks = false;
	}
	else
	{
		/* Normal case: release just the one lock we picked in Acquire. */
		LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
							  &WALInsertLocks[MyLockNo].l.insertingAt,
							  0);
	}
}
1724
1725 /*
1726 * Update our insertingAt value, to let others know that we've finished
1727 * inserting up to that point.
1728 */
static void
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
{
	if (holdingAllLocks)
	{
		/*
		 * We use the last lock to mark our actual position, see comments in
		 * WALInsertLockAcquireExclusive.
		 */
		LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
						&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
						insertingAt);
	}
	else
		/* Normal case: we hold just the single lock chosen at acquire time. */
		LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
						&WALInsertLocks[MyLockNo].l.insertingAt,
						insertingAt);
}
1747
1748 /*
1749 * Wait for any WAL insertions < upto to finish.
1750 *
1751 * Returns the location of the oldest insertion that is still in-progress.
1752 * Any WAL prior to that point has been fully copied into WAL buffers, and
1753 * can be flushed out to disk. Because this waits for any insertions older
1754 * than 'upto' to finish, the return value is always >= 'upto'.
1755 *
1756 * Note: When you are about to write out WAL, you must call this function
1757 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1758 * need to wait for an insertion to finish (or at least advance to next
1759 * uninitialized page), and the inserter might need to evict an old WAL buffer
1760 * to make room for a new one, which in turn requires WALWriteLock.
1761 */
static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)
{
	uint64		bytepos;
	XLogRecPtr	reservedUpto;
	XLogRecPtr	finishedUpto;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			i;

	/* LWLockWaitForVar below needs our PGPROC to sleep on. */
	if (MyProc == NULL)
		elog(PANIC, "cannot wait without a PGPROC structure");

	/* Read the current insert position */
	SpinLockAcquire(&Insert->insertpos_lck);
	bytepos = Insert->CurrBytePos;
	SpinLockRelease(&Insert->insertpos_lck);
	reservedUpto = XLogBytePosToEndRecPtr(bytepos);

	/*
	 * No-one should request to flush a piece of WAL that hasn't even been
	 * reserved yet.  However, it can happen if there is a block with a bogus
	 * LSN on disk, for example.  XLogFlush checks for that situation and
	 * complains, but only after the flush.  Here we just assume that to mean
	 * that all WAL that has been reserved needs to be finished.  In this
	 * corner-case, the return value can be smaller than 'upto' argument.
	 */
	if (upto > reservedUpto)
	{
		elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
			 (uint32) (upto >> 32), (uint32) upto,
			 (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
		upto = reservedUpto;
	}

	/*
	 * Loop through all the locks, sleeping on any in-progress insert older
	 * than 'upto'.
	 *
	 * finishedUpto is our return value, indicating the point upto which all
	 * the WAL insertions have been finished.  Initialize it to the head of
	 * reserved WAL, and as we iterate through the insertion locks, back it
	 * out for any insertion that's still in progress.
	 */
	finishedUpto = reservedUpto;
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		XLogRecPtr	insertingat = InvalidXLogRecPtr;

		do
		{
			/*
			 * See if this insertion is in progress.  LWLockWait will wait for
			 * the lock to be released, or for the 'value' to be set by a
			 * LWLockUpdateVar call.  When a lock is initially acquired, its
			 * value is 0 (InvalidXLogRecPtr), which means that we don't know
			 * where it's inserting yet.  We will have to wait for it.  If
			 * it's a small insertion, the record will most likely fit on the
			 * same page and the inserter will release the lock without ever
			 * calling LWLockUpdateVar.  But if it has to sleep, it will
			 * advertise the insertion point with LWLockUpdateVar before
			 * sleeping.
			 */
			if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
								 &WALInsertLocks[i].l.insertingAt,
								 insertingat, &insertingat))
			{
				/* the lock was free, so no insertion in progress */
				insertingat = InvalidXLogRecPtr;
				break;
			}

			/*
			 * This insertion is still in progress. Have to wait, unless the
			 * inserter has proceeded past 'upto'.
			 */
		} while (insertingat < upto);

		/* Keep the minimum over all still-in-progress insertion points. */
		if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
			finishedUpto = insertingat;
	}
	return finishedUpto;
}
1844
1845 /*
1846 * Get a pointer to the right location in the WAL buffer containing the
1847 * given XLogRecPtr.
1848 *
1849 * If the page is not initialized yet, it is initialized. That might require
1850 * evicting an old dirty buffer from the buffer cache, which means I/O.
1851 *
1852 * The caller must ensure that the page containing the requested location
1853 * isn't evicted yet, and won't be evicted. The way to ensure that is to
1854 * hold onto a WAL insertion lock with the insertingAt position set to
1855 * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1856 * to evict an old page from the buffer. (This means that once you call
1857 * GetXLogBuffer() with a given 'ptr', you must not access anything before
1858 * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1859 * later, because older buffers might be recycled already)
1860 */
static char *
GetXLogBuffer(XLogRecPtr ptr)
{
	int			idx;
	XLogRecPtr	endptr;

	/* Per-backend cache of the last page looked up, for the fast path. */
	static uint64 cachedPage = 0;
	static char *cachedPos = NULL;
	XLogRecPtr	expectedEndPtr;

	/*
	 * Fast path for the common case that we need to access again the same
	 * page as last time.
	 */
	if (ptr / XLOG_BLCKSZ == cachedPage)
	{
		Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
		Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
		return cachedPos + ptr % XLOG_BLCKSZ;
	}

	/*
	 * The XLog buffer cache is organized so that a page is always loaded to a
	 * particular buffer.  That way we can easily calculate the buffer a given
	 * page must be loaded into, from the XLogRecPtr alone.
	 */
	idx = XLogRecPtrToBufIdx(ptr);

	/*
	 * See what page is loaded in the buffer at the moment.  It could be the
	 * page we're looking for, or something older.  It can't be anything newer
	 * - that would imply the page we're looking for has already been written
	 * out to disk and evicted, and the caller is responsible for making sure
	 * that doesn't happen.
	 *
	 * However, we don't hold a lock while we read the value.  If someone has
	 * just initialized the page, it's possible that we get a "torn read" of
	 * the XLogRecPtr if 64-bit fetches are not atomic on this platform.  In
	 * that case we will see a bogus value.  That's ok, we'll grab the mapping
	 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
	 * the page we're looking for.  But it means that when we do this unlocked
	 * read, we might see a value that appears to be ahead of the page we're
	 * looking for.  Don't PANIC on that, until we've verified the value while
	 * holding the lock.
	 */
	expectedEndPtr = ptr;
	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;

	endptr = XLogCtl->xlblocks[idx];
	if (expectedEndPtr != endptr)
	{
		XLogRecPtr	initializedUpto;

		/*
		 * Before calling AdvanceXLInsertBuffer(), which can block, let others
		 * know how far we're finished with inserting the record.
		 *
		 * NB: If 'ptr' points to just after the page header, advertise a
		 * position at the beginning of the page rather than 'ptr' itself. If
		 * there are no other insertions running, someone might try to flush
		 * up to our advertised location. If we advertised a position after
		 * the page header, someone might try to flush the page header, even
		 * though page might actually not be initialized yet. As the first
		 * inserter on the page, we are effectively responsible for making
		 * sure that it's initialized, before we let insertingAt to move past
		 * the page header.
		 */
		if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
			XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogShortPHD;
		else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
				 XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogLongPHD;
		else
			initializedUpto = ptr;

		WALInsertLockUpdateInsertingAt(initializedUpto);

		AdvanceXLInsertBuffer(ptr, false);
		/* Re-read under the assumption AdvanceXLInsertBuffer mapped our page */
		endptr = XLogCtl->xlblocks[idx];

		if (expectedEndPtr != endptr)
			elog(PANIC, "could not find WAL buffer for %X/%X",
				 (uint32) (ptr >> 32), (uint32) ptr);
	}
	else
	{
		/*
		 * Make sure the initialization of the page is visible to us, and
		 * won't arrive later to overwrite the WAL data we write on the page.
		 */
		pg_memory_barrier();
	}

	/*
	 * Found the buffer holding this page. Return a pointer to the right
	 * offset within the page.
	 */
	cachedPage = ptr / XLOG_BLCKSZ;
	cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;

	Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
	Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));

	return cachedPos + ptr % XLOG_BLCKSZ;
}
1966
1967 /*
1968 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1969 * is the position starting from the beginning of WAL, excluding all WAL
1970 * page headers.
1971 */
1972 static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)1973 XLogBytePosToRecPtr(uint64 bytepos)
1974 {
1975 uint64 fullsegs;
1976 uint64 fullpages;
1977 uint64 bytesleft;
1978 uint32 seg_offset;
1979 XLogRecPtr result;
1980
1981 fullsegs = bytepos / UsableBytesInSegment;
1982 bytesleft = bytepos % UsableBytesInSegment;
1983
1984 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1985 {
1986 /* fits on first page of segment */
1987 seg_offset = bytesleft + SizeOfXLogLongPHD;
1988 }
1989 else
1990 {
1991 /* account for the first page on segment with long header */
1992 seg_offset = XLOG_BLCKSZ;
1993 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1994
1995 fullpages = bytesleft / UsableBytesInPage;
1996 bytesleft = bytesleft % UsableBytesInPage;
1997
1998 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1999 }
2000
2001 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2002
2003 return result;
2004 }
2005
2006 /*
2007 * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2008 * returns a pointer to the beginning of the page (ie. before page header),
2009 * not to where the first xlog record on that page would go to. This is used
2010 * when converting a pointer to the end of a record.
2011 */
2012 static XLogRecPtr
XLogBytePosToEndRecPtr(uint64 bytepos)2013 XLogBytePosToEndRecPtr(uint64 bytepos)
2014 {
2015 uint64 fullsegs;
2016 uint64 fullpages;
2017 uint64 bytesleft;
2018 uint32 seg_offset;
2019 XLogRecPtr result;
2020
2021 fullsegs = bytepos / UsableBytesInSegment;
2022 bytesleft = bytepos % UsableBytesInSegment;
2023
2024 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2025 {
2026 /* fits on first page of segment */
2027 if (bytesleft == 0)
2028 seg_offset = 0;
2029 else
2030 seg_offset = bytesleft + SizeOfXLogLongPHD;
2031 }
2032 else
2033 {
2034 /* account for the first page on segment with long header */
2035 seg_offset = XLOG_BLCKSZ;
2036 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2037
2038 fullpages = bytesleft / UsableBytesInPage;
2039 bytesleft = bytesleft % UsableBytesInPage;
2040
2041 if (bytesleft == 0)
2042 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2043 else
2044 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2045 }
2046
2047 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2048
2049 return result;
2050 }
2051
2052 /*
2053 * Convert an XLogRecPtr to a "usable byte position".
2054 */
2055 static uint64
XLogRecPtrToBytePos(XLogRecPtr ptr)2056 XLogRecPtrToBytePos(XLogRecPtr ptr)
2057 {
2058 uint64 fullsegs;
2059 uint32 fullpages;
2060 uint32 offset;
2061 uint64 result;
2062
2063 XLByteToSeg(ptr, fullsegs, wal_segment_size);
2064
2065 fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
2066 offset = ptr % XLOG_BLCKSZ;
2067
2068 if (fullpages == 0)
2069 {
2070 result = fullsegs * UsableBytesInSegment;
2071 if (offset > 0)
2072 {
2073 Assert(offset >= SizeOfXLogLongPHD);
2074 result += offset - SizeOfXLogLongPHD;
2075 }
2076 }
2077 else
2078 {
2079 result = fullsegs * UsableBytesInSegment +
2080 (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2081 (fullpages - 1) * UsableBytesInPage; /* full pages */
2082 if (offset > 0)
2083 {
2084 Assert(offset >= SizeOfXLogShortPHD);
2085 result += offset - SizeOfXLogShortPHD;
2086 }
2087 }
2088
2089 return result;
2090 }
2091
2092 /*
2093 * Initialize XLOG buffers, writing out old buffers if they still contain
2094 * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2095 * true, initialize as many pages as we can without having to write out
2096 * unwritten data. Any new pages are initialized to zeros, with pages headers
2097 * initialized properly.
2098 */
static void
AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			nextidx;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
	XLogRecPtr	NewPageEndPtr = InvalidXLogRecPtr;
	XLogRecPtr	NewPageBeginPtr;
	XLogPageHeader NewPage;
	int			npages = 0;		/* number of pages initialized, for WAL_DEBUG */

	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);

	/*
	 * Now that we have the lock, check if someone initialized the page
	 * already.
	 */
	while (upto >= XLogCtl->InitializedUpTo || opportunistic)
	{
		nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);

		/*
		 * Get ending-offset of the buffer page we need to replace (this may
		 * be zero if the buffer hasn't been used yet).  Fall through if it's
		 * already written out.
		 */
		OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
		if (LogwrtResult.Write < OldPageRqstPtr)
		{
			/*
			 * Nope, got work to do. If we just want to pre-initialize as much
			 * as we can without flushing, give up now.
			 */
			if (opportunistic)
				break;

			/* Before waiting, get info_lck and update LogwrtResult */
			SpinLockAcquire(&XLogCtl->info_lck);
			if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
				XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
			LogwrtResult = XLogCtl->LogwrtResult;
			SpinLockRelease(&XLogCtl->info_lck);

			/*
			 * Now that we have an up-to-date LogwrtResult value, see if we
			 * still need to write it or if someone else already did.
			 */
			if (LogwrtResult.Write < OldPageRqstPtr)
			{
				/*
				 * Must acquire write lock. Release WALBufMappingLock first,
				 * to make sure that all insertions that we need to wait for
				 * can finish (up to this same position). Otherwise we risk
				 * deadlock.
				 */
				LWLockRelease(WALBufMappingLock);

				WaitXLogInsertionsToFinish(OldPageRqstPtr);

				LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

				LogwrtResult = XLogCtl->LogwrtResult;
				if (LogwrtResult.Write >= OldPageRqstPtr)
				{
					/* OK, someone wrote it already */
					LWLockRelease(WALWriteLock);
				}
				else
				{
					/* Have to write it ourselves */
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
					WriteRqst.Write = OldPageRqstPtr;
					WriteRqst.Flush = 0;	/* write only; no fsync requested */
					XLogWrite(WriteRqst, false);
					LWLockRelease(WALWriteLock);
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
				}
				/* Re-acquire WALBufMappingLock and retry */
				LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
				continue;
			}
		}

		/*
		 * Now the next buffer slot is free and we can set it up to be the
		 * next output page.
		 */
		NewPageBeginPtr = XLogCtl->InitializedUpTo;
		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;

		Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);

		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);

		/*
		 * Be sure to re-zero the buffer so that bytes beyond what we've
		 * written will look like zeroes and not valid XLOG records...
		 */
		MemSet((char *) NewPage, 0, XLOG_BLCKSZ);

		/*
		 * Fill the new page's header
		 */
		NewPage->xlp_magic = XLOG_PAGE_MAGIC;

		/* NewPage->xlp_info = 0; */	/* done by memset */
		NewPage->xlp_tli = ThisTimeLineID;
		NewPage->xlp_pageaddr = NewPageBeginPtr;

		/* NewPage->xlp_rem_len = 0; */ /* done by memset */

		/*
		 * If online backup is not in progress, mark the header to indicate
		 * that WAL records beginning in this page have removable backup
		 * blocks.  This allows the WAL archiver to know whether it is safe to
		 * compress archived WAL data by transforming full-block records into
		 * the non-full-block format.  It is sufficient to record this at the
		 * page level because we force a page switch (in fact a segment
		 * switch) when starting a backup, so the flag will be off before any
		 * records can be written during the backup.  At the end of a backup,
		 * the last page will be marked as all unsafe when perhaps only part
		 * is unsafe, but at worst the archiver would miss the opportunity to
		 * compress a few records.
		 */
		if (!Insert->forcePageWrites)
			NewPage->xlp_info |= XLP_BKP_REMOVABLE;

		/*
		 * If a record was found to be broken at the end of recovery, and
		 * we're going to write on the page where its first contrecord was
		 * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page
		 * header.  See CreateOverwriteContrecordRecord().
		 */
		if (missingContrecPtr == NewPageBeginPtr)
		{
			NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
			missingContrecPtr = InvalidXLogRecPtr;
		}

		/*
		 * If first page of an XLOG segment file, make it a long header.
		 */
		if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
		{
			XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;

			NewLongPage->xlp_sysid = ControlFile->system_identifier;
			NewLongPage->xlp_seg_size = wal_segment_size;
			NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
			NewPage->xlp_info |= XLP_LONG_HEADER;
		}

		/*
		 * Make sure the initialization of the page becomes visible to others
		 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
		 * holding a lock.
		 */
		pg_write_barrier();

		*((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;

		XLogCtl->InitializedUpTo = NewPageEndPtr;

		npages++;
	}
	LWLockRelease(WALBufMappingLock);

#ifdef WAL_DEBUG
	if (XLOG_DEBUG && npages > 0)
	{
		elog(DEBUG1, "initialized %d pages, up to %X/%X",
			 npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
	}
#endif
}
2275
2276 /*
2277 * Calculate CheckPointSegments based on max_wal_size_mb and
2278 * checkpoint_completion_target.
2279 */
2280 static void
CalculateCheckpointSegments(void)2281 CalculateCheckpointSegments(void)
2282 {
2283 double target;
2284
2285 /*-------
2286 * Calculate the distance at which to trigger a checkpoint, to avoid
2287 * exceeding max_wal_size_mb. This is based on two assumptions:
2288 *
2289 * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2290 * WAL for two checkpoint cycles to allow us to recover from the
2291 * secondary checkpoint if the first checkpoint failed, though we
2292 * only did this on the master anyway, not on standby. Keeping just
2293 * one checkpoint simplifies processing and reduces disk space in
2294 * many smaller databases.)
2295 * b) during checkpoint, we consume checkpoint_completion_target *
2296 * number of segments consumed between checkpoints.
2297 *-------
2298 */
2299 target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2300 (1.0 + CheckPointCompletionTarget);
2301
2302 /* round down */
2303 CheckPointSegments = (int) target;
2304
2305 if (CheckPointSegments < 1)
2306 CheckPointSegments = 1;
2307 }
2308
/*
 * GUC assign hook for max_wal_size: store the new value and recompute the
 * derived CheckPointSegments setting.
 */
void
assign_max_wal_size(int newval, void *extra)
{
	max_wal_size_mb = newval;
	CalculateCheckpointSegments();
}
2315
/*
 * GUC assign hook for checkpoint_completion_target: store the new value and
 * recompute the derived CheckPointSegments setting.
 */
void
assign_checkpoint_completion_target(double newval, void *extra)
{
	CheckPointCompletionTarget = newval;
	CalculateCheckpointSegments();
}
2322
2323 /*
2324 * At a checkpoint, how many WAL segments to recycle as preallocated future
2325 * XLOG segments? Returns the highest segment that should be preallocated.
2326 */
2327 static XLogSegNo
XLOGfileslop(XLogRecPtr lastredoptr)2328 XLOGfileslop(XLogRecPtr lastredoptr)
2329 {
2330 XLogSegNo minSegNo;
2331 XLogSegNo maxSegNo;
2332 double distance;
2333 XLogSegNo recycleSegNo;
2334
2335 /*
2336 * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2337 * correspond to. Always recycle enough segments to meet the minimum, and
2338 * remove enough segments to stay below the maximum.
2339 */
2340 minSegNo = lastredoptr / wal_segment_size +
2341 ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2342 maxSegNo = lastredoptr / wal_segment_size +
2343 ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2344
2345 /*
2346 * Between those limits, recycle enough segments to get us through to the
2347 * estimated end of next checkpoint.
2348 *
2349 * To estimate where the next checkpoint will finish, assume that the
2350 * system runs steadily consuming CheckPointDistanceEstimate bytes between
2351 * every checkpoint.
2352 */
2353 distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2354 /* add 10% for good measure. */
2355 distance *= 1.10;
2356
2357 recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
2358 wal_segment_size);
2359
2360 if (recycleSegNo < minSegNo)
2361 recycleSegNo = minSegNo;
2362 if (recycleSegNo > maxSegNo)
2363 recycleSegNo = maxSegNo;
2364
2365 return recycleSegNo;
2366 }
2367
2368 /*
2369 * Check whether we've consumed enough xlog space that a checkpoint is needed.
2370 *
2371 * new_segno indicates a log file that has just been filled up (or read
2372 * during recovery). We measure the distance from RedoRecPtr to new_segno
2373 * and see if that exceeds CheckPointSegments.
2374 *
2375 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2376 */
2377 static bool
XLogCheckpointNeeded(XLogSegNo new_segno)2378 XLogCheckpointNeeded(XLogSegNo new_segno)
2379 {
2380 XLogSegNo old_segno;
2381
2382 XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2383
2384 if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2385 return true;
2386 return false;
2387 }
2388
2389 /*
2390 * Write and/or fsync the log at least as far as WriteRqst indicates.
2391 *
2392 * If flexible == true, we don't have to write as far as WriteRqst, but
2393 * may stop at any convenient boundary (such as a cache or logfile boundary).
2394 * This option allows us to avoid uselessly issuing multiple writes when a
2395 * single one would do.
2396 *
2397 * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2398 * must be called before grabbing the lock, to make sure the data is ready to
2399 * write.
2400 */
static void
XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
	bool		ispartialpage;	/* stopping mid-page on the final write? */
	bool		last_iteration;
	bool		finishing_seg;	/* did this write complete a segment? */
	bool		use_existent;
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;

	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

	/*
	 * Update local LogwrtResult (caller probably did this already, but...)
	 */
	LogwrtResult = XLogCtl->LogwrtResult;

	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  Begin at the buffer containing the next unwritten
	 * page, or last partially written page.
	 */
	curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);

	while (LogwrtResult.Write < WriteRqst.Write)
	{
		/*
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
		 */
		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];

		if (LogwrtResult.Write >= EndPtr)
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
				 (uint32) (LogwrtResult.Write >> 32),
				 (uint32) LogwrtResult.Write,
				 (uint32) (EndPtr >> 32), (uint32) EndPtr);

		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = EndPtr;
		ispartialpage = WriteRqst.Write < LogwrtResult.Write;

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
							 wal_segment_size))
		{
			/*
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
			 */
			Assert(npages == 0);
			if (openLogFile >= 0)
				XLogFileClose();
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
							wal_segment_size);

			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
		}

		/* Make sure we have the current logfile open */
		if (openLogFile < 0)
		{
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
							wal_segment_size);
			openLogFile = XLogFileOpen(openLogSegNo);
		}

		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
			startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
											wal_segment_size);
		}
		npages++;

		/*
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
		 */
		last_iteration = WriteRqst.Write <= LogwrtResult.Write;

		finishing_seg = !ispartialpage &&
			(startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;

		if (last_iteration ||
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
		{
			char	   *from;
			Size		nbytes;
			Size		nleft;
			int			written;

			/* OK to write the page(s) */
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
			nleft = nbytes;
			do
			{
				errno = 0;
				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
				written = pg_pwrite(openLogFile, from, nleft, startoffset);
				pgstat_report_wait_end();
				if (written <= 0)
				{
					/* retry the write if a signal interrupted it */
					if (errno == EINTR)
						continue;
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not write to log file %s "
									"at offset %u, length %zu: %m",
									XLogFileNameP(ThisTimeLineID, openLogSegNo),
									startoffset, nleft)));
				}
				/* handle short writes by advancing and looping */
				nleft -= written;
				from += written;
				startoffset += written;
			} while (nleft > 0);

			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
			 * This is also the right place to notify the Archiver that the
			 * segment is ready to copy to archival storage, and to update the
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
			 */
			if (finishing_seg)
			{
				issue_xlog_fsync(openLogFile, openLogSegNo);

				/* signal that we need to wakeup walsenders later */
				WalSndWakeupRequest();

				LogwrtResult.Flush = LogwrtResult.Write;	/* end of page */

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogSegNo);

				XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
				XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;

				/*
				 * Request a checkpoint if we've consumed too much xlog since
				 * the last one.  For speed, we first check using the local
				 * copy of RedoRecPtr, which might be out of date; if it looks
				 * like a checkpoint is needed, forcibly update RedoRecPtr and
				 * recheck.
				 */
				if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
				{
					(void) GetRedoRecPtr();
					if (XLogCheckpointNeeded(openLogSegNo))
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
				}
			}
		}

		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
	}

	Assert(npages == 0);

	/*
	 * If asked to flush, do so
	 */
	if (LogwrtResult.Flush < WriteRqst.Flush &&
		LogwrtResult.Flush < LogwrtResult.Write)

	{
		/*
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.  However, we do not need to
		 * fsync more than one file.
		 */
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
		{
			if (openLogFile >= 0 &&
				!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
								 wal_segment_size))
				XLogFileClose();
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
								wal_segment_size);
				openLogFile = XLogFileOpen(openLogSegNo);
			}

			issue_xlog_fsync(openLogFile, openLogSegNo);
		}

		/* signal that we need to wakeup walsenders later */
		WalSndWakeupRequest();

		LogwrtResult.Flush = LogwrtResult.Write;
	}

	/*
	 * Update shared-memory status
	 *
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
	 */
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->LogwrtResult = LogwrtResult;
		if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
			XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
			XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease(&XLogCtl->info_lck);
	}
}
2655
2656 /*
2657 * Record the LSN for an asynchronous transaction commit/abort
2658 * and nudge the WALWriter if there is work for it to do.
2659 * (This should not be called for synchronous commits.)
2660 */
2661 void
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)2662 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2663 {
2664 XLogRecPtr WriteRqstPtr = asyncXactLSN;
2665 bool sleeping;
2666
2667 SpinLockAcquire(&XLogCtl->info_lck);
2668 LogwrtResult = XLogCtl->LogwrtResult;
2669 sleeping = XLogCtl->WalWriterSleeping;
2670 if (XLogCtl->asyncXactLSN < asyncXactLSN)
2671 XLogCtl->asyncXactLSN = asyncXactLSN;
2672 SpinLockRelease(&XLogCtl->info_lck);
2673
2674 /*
2675 * If the WALWriter is sleeping, we should kick it to make it come out of
2676 * low-power mode. Otherwise, determine whether there's a full page of
2677 * WAL available to write.
2678 */
2679 if (!sleeping)
2680 {
2681 /* back off to last completed page boundary */
2682 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2683
2684 /* if we have already flushed that far, we're done */
2685 if (WriteRqstPtr <= LogwrtResult.Flush)
2686 return;
2687 }
2688
2689 /*
2690 * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2691 * to come out of low-power mode so that this async commit will reach disk
2692 * within the expected amount of time.
2693 */
2694 if (ProcGlobal->walwriterLatch)
2695 SetLatch(ProcGlobal->walwriterLatch);
2696 }
2697
2698 /*
2699 * Record the LSN up to which we can remove WAL because it's not required by
2700 * any replication slot.
2701 */
2702 void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)2703 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2704 {
2705 SpinLockAcquire(&XLogCtl->info_lck);
2706 XLogCtl->replicationSlotMinLSN = lsn;
2707 SpinLockRelease(&XLogCtl->info_lck);
2708 }
2709
2710
2711 /*
2712 * Return the oldest LSN we must retain to satisfy the needs of some
2713 * replication slot.
2714 */
2715 static XLogRecPtr
XLogGetReplicationSlotMinimumLSN(void)2716 XLogGetReplicationSlotMinimumLSN(void)
2717 {
2718 XLogRecPtr retval;
2719
2720 SpinLockAcquire(&XLogCtl->info_lck);
2721 retval = XLogCtl->replicationSlotMinLSN;
2722 SpinLockRelease(&XLogCtl->info_lck);
2723
2724 return retval;
2725 }
2726
2727 /*
2728 * Advance minRecoveryPoint in control file.
2729 *
2730 * If we crash during recovery, we must reach this point again before the
2731 * database is consistent.
2732 *
2733 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2734 * is only updated if it's not already greater than or equal to 'lsn'.
2735 */
2736 static void
UpdateMinRecoveryPoint(XLogRecPtr lsn,bool force)2737 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2738 {
2739 /* Quick check using our local copy of the variable */
2740 if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2741 return;
2742
2743 /*
2744 * An invalid minRecoveryPoint means that we need to recover all the WAL,
2745 * i.e., we're doing crash recovery. We never modify the control file's
2746 * value in that case, so we can short-circuit future checks here too. The
2747 * local values of minRecoveryPoint and minRecoveryPointTLI should not be
2748 * updated until crash recovery finishes. We only do this for the startup
2749 * process as it should not update its own reference of minRecoveryPoint
2750 * until it has finished crash recovery to make sure that all WAL
2751 * available is replayed in this case. This also saves from extra locks
2752 * taken on the control file from the startup process.
2753 */
2754 if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
2755 {
2756 updateMinRecoveryPoint = false;
2757 return;
2758 }
2759
2760 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2761
2762 /* update local copy */
2763 minRecoveryPoint = ControlFile->minRecoveryPoint;
2764 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2765
2766 if (XLogRecPtrIsInvalid(minRecoveryPoint))
2767 updateMinRecoveryPoint = false;
2768 else if (force || minRecoveryPoint < lsn)
2769 {
2770 XLogRecPtr newMinRecoveryPoint;
2771 TimeLineID newMinRecoveryPointTLI;
2772
2773 /*
2774 * To avoid having to update the control file too often, we update it
2775 * all the way to the last record being replayed, even though 'lsn'
2776 * would suffice for correctness. This also allows the 'force' case
2777 * to not need a valid 'lsn' value.
2778 *
2779 * Another important reason for doing it this way is that the passed
2780 * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2781 * the caller got it from a corrupted heap page. Accepting such a
2782 * value as the min recovery point would prevent us from coming up at
2783 * all. Instead, we just log a warning and continue with recovery.
2784 * (See also the comments about corrupt LSNs in XLogFlush.)
2785 */
2786 SpinLockAcquire(&XLogCtl->info_lck);
2787 newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2788 newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2789 SpinLockRelease(&XLogCtl->info_lck);
2790
2791 if (!force && newMinRecoveryPoint < lsn)
2792 elog(WARNING,
2793 "xlog min recovery request %X/%X is past current point %X/%X",
2794 (uint32) (lsn >> 32), (uint32) lsn,
2795 (uint32) (newMinRecoveryPoint >> 32),
2796 (uint32) newMinRecoveryPoint);
2797
2798 /* update control file */
2799 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2800 {
2801 ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2802 ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2803 UpdateControlFile();
2804 minRecoveryPoint = newMinRecoveryPoint;
2805 minRecoveryPointTLI = newMinRecoveryPointTLI;
2806
2807 ereport(DEBUG2,
2808 (errmsg("updated min recovery point to %X/%X on timeline %u",
2809 (uint32) (minRecoveryPoint >> 32),
2810 (uint32) minRecoveryPoint,
2811 newMinRecoveryPointTLI)));
2812 }
2813 }
2814 LWLockRelease(ControlFileLock);
2815 }
2816
2817 /*
2818 * Ensure that all XLOG data through the given position is flushed to disk.
2819 *
2820 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2821 * already held, and we try to avoid acquiring it if possible.
2822 */
2823 void
XLogFlush(XLogRecPtr record)2824 XLogFlush(XLogRecPtr record)
2825 {
2826 XLogRecPtr WriteRqstPtr;
2827 XLogwrtRqst WriteRqst;
2828
2829 /*
2830 * During REDO, we are reading not writing WAL. Therefore, instead of
2831 * trying to flush the WAL, we should update minRecoveryPoint instead. We
2832 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2833 * to act this way too, and because when it tries to write the
2834 * end-of-recovery checkpoint, it should indeed flush.
2835 */
2836 if (!XLogInsertAllowed())
2837 {
2838 UpdateMinRecoveryPoint(record, false);
2839 return;
2840 }
2841
2842 /* Quick exit if already known flushed */
2843 if (record <= LogwrtResult.Flush)
2844 return;
2845
2846 #ifdef WAL_DEBUG
2847 if (XLOG_DEBUG)
2848 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2849 (uint32) (record >> 32), (uint32) record,
2850 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2851 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2852 #endif
2853
2854 START_CRIT_SECTION();
2855
2856 /*
2857 * Since fsync is usually a horribly expensive operation, we try to
2858 * piggyback as much data as we can on each fsync: if we see any more data
2859 * entered into the xlog buffer, we'll write and fsync that too, so that
2860 * the final value of LogwrtResult.Flush is as large as possible. This
2861 * gives us some chance of avoiding another fsync immediately after.
2862 */
2863
2864 /* initialize to given target; may increase below */
2865 WriteRqstPtr = record;
2866
2867 /*
2868 * Now wait until we get the write lock, or someone else does the flush
2869 * for us.
2870 */
2871 for (;;)
2872 {
2873 XLogRecPtr insertpos;
2874
2875 /* read LogwrtResult and update local state */
2876 SpinLockAcquire(&XLogCtl->info_lck);
2877 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2878 WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2879 LogwrtResult = XLogCtl->LogwrtResult;
2880 SpinLockRelease(&XLogCtl->info_lck);
2881
2882 /* done already? */
2883 if (record <= LogwrtResult.Flush)
2884 break;
2885
2886 /*
2887 * Before actually performing the write, wait for all in-flight
2888 * insertions to the pages we're about to write to finish.
2889 */
2890 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2891
2892 /*
2893 * Try to get the write lock. If we can't get it immediately, wait
2894 * until it's released, and recheck if we still need to do the flush
2895 * or if the backend that held the lock did it for us already. This
2896 * helps to maintain a good rate of group committing when the system
2897 * is bottlenecked by the speed of fsyncing.
2898 */
2899 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2900 {
2901 /*
2902 * The lock is now free, but we didn't acquire it yet. Before we
2903 * do, loop back to check if someone else flushed the record for
2904 * us already.
2905 */
2906 continue;
2907 }
2908
2909 /* Got the lock; recheck whether request is satisfied */
2910 LogwrtResult = XLogCtl->LogwrtResult;
2911 if (record <= LogwrtResult.Flush)
2912 {
2913 LWLockRelease(WALWriteLock);
2914 break;
2915 }
2916
2917 /*
2918 * Sleep before flush! By adding a delay here, we may give further
2919 * backends the opportunity to join the backlog of group commit
2920 * followers; this can significantly improve transaction throughput,
2921 * at the risk of increasing transaction latency.
2922 *
2923 * We do not sleep if enableFsync is not turned on, nor if there are
2924 * fewer than CommitSiblings other backends with active transactions.
2925 */
2926 if (CommitDelay > 0 && enableFsync &&
2927 MinimumActiveBackends(CommitSiblings))
2928 {
2929 pg_usleep(CommitDelay);
2930
2931 /*
2932 * Re-check how far we can now flush the WAL. It's generally not
2933 * safe to call WaitXLogInsertionsToFinish while holding
2934 * WALWriteLock, because an in-progress insertion might need to
2935 * also grab WALWriteLock to make progress. But we know that all
2936 * the insertions up to insertpos have already finished, because
2937 * that's what the earlier WaitXLogInsertionsToFinish() returned.
2938 * We're only calling it again to allow insertpos to be moved
2939 * further forward, not to actually wait for anyone.
2940 */
2941 insertpos = WaitXLogInsertionsToFinish(insertpos);
2942 }
2943
2944 /* try to write/flush later additions to XLOG as well */
2945 WriteRqst.Write = insertpos;
2946 WriteRqst.Flush = insertpos;
2947
2948 XLogWrite(WriteRqst, false);
2949
2950 LWLockRelease(WALWriteLock);
2951 /* done */
2952 break;
2953 }
2954
2955 END_CRIT_SECTION();
2956
2957 /* wake up walsenders now that we've released heavily contended locks */
2958 WalSndWakeupProcessRequests();
2959
2960 /*
2961 * If we still haven't flushed to the request point then we have a
2962 * problem; most likely, the requested flush point is past end of XLOG.
2963 * This has been seen to occur when a disk page has a corrupted LSN.
2964 *
2965 * Formerly we treated this as a PANIC condition, but that hurts the
2966 * system's robustness rather than helping it: we do not want to take down
2967 * the whole system due to corruption on one data page. In particular, if
2968 * the bad page is encountered again during recovery then we would be
2969 * unable to restart the database at all! (This scenario actually
2970 * happened in the field several times with 7.1 releases.) As of 8.4, bad
2971 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2972 * the only time we can reach here during recovery is while flushing the
2973 * end-of-recovery checkpoint record, and we don't expect that to have a
2974 * bad LSN.
2975 *
2976 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2977 * since xact.c calls this routine inside a critical section. However,
2978 * calls from bufmgr.c are not within critical sections and so we will not
2979 * force a restart for a bad LSN on a data page.
2980 */
2981 if (LogwrtResult.Flush < record)
2982 elog(ERROR,
2983 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2984 (uint32) (record >> 32), (uint32) record,
2985 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2986 }
2987
2988 /*
2989 * Write & flush xlog, but without specifying exactly where to.
2990 *
2991 * We normally write only completed blocks; but if there is nothing to do on
2992 * that basis, we check for unwritten async commits in the current incomplete
2993 * block, and write through the latest one of those. Thus, if async commits
2994 * are not being used, we will write complete blocks only.
2995 *
2996 * If, based on the above, there's anything to write we do so immediately. But
2997 * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2998 * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2999 * more than wal_writer_flush_after unflushed blocks.
3000 *
3001 * We can guarantee that async commits reach disk after at most three
3002 * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
3003 * to write "flexibly", meaning it can stop at the end of the buffer ring;
3004 * this makes a difference only with very high load or long wal_writer_delay,
3005 * but imposes one extra cycle for the worst case for async commits.)
3006 *
3007 * This routine is invoked periodically by the background walwriter process.
3008 *
3009 * Returns true if there was any work to do, even if we skipped flushing due
3010 * to wal_writer_delay/wal_writer_flush_after.
3011 */
3012 bool
XLogBackgroundFlush(void)3013 XLogBackgroundFlush(void)
3014 {
3015 XLogwrtRqst WriteRqst;
3016 bool flexible = true;
3017 static TimestampTz lastflush;
3018 TimestampTz now;
3019 int flushbytes;
3020
3021 /* XLOG doesn't need flushing during recovery */
3022 if (RecoveryInProgress())
3023 return false;
3024
3025 /* read LogwrtResult and update local state */
3026 SpinLockAcquire(&XLogCtl->info_lck);
3027 LogwrtResult = XLogCtl->LogwrtResult;
3028 WriteRqst = XLogCtl->LogwrtRqst;
3029 SpinLockRelease(&XLogCtl->info_lck);
3030
3031 /* back off to last completed page boundary */
3032 WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
3033
3034 /* if we have already flushed that far, consider async commit records */
3035 if (WriteRqst.Write <= LogwrtResult.Flush)
3036 {
3037 SpinLockAcquire(&XLogCtl->info_lck);
3038 WriteRqst.Write = XLogCtl->asyncXactLSN;
3039 SpinLockRelease(&XLogCtl->info_lck);
3040 flexible = false; /* ensure it all gets written */
3041 }
3042
3043 /*
3044 * If already known flushed, we're done. Just need to check if we are
3045 * holding an open file handle to a logfile that's no longer in use,
3046 * preventing the file from being deleted.
3047 */
3048 if (WriteRqst.Write <= LogwrtResult.Flush)
3049 {
3050 if (openLogFile >= 0)
3051 {
3052 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
3053 wal_segment_size))
3054 {
3055 XLogFileClose();
3056 }
3057 }
3058 return false;
3059 }
3060
3061 /*
3062 * Determine how far to flush WAL, based on the wal_writer_delay and
3063 * wal_writer_flush_after GUCs.
3064 */
3065 now = GetCurrentTimestamp();
3066 flushbytes =
3067 WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
3068
3069 if (WalWriterFlushAfter == 0 || lastflush == 0)
3070 {
3071 /* first call, or block based limits disabled */
3072 WriteRqst.Flush = WriteRqst.Write;
3073 lastflush = now;
3074 }
3075 else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
3076 {
3077 /*
3078 * Flush the writes at least every WalWriteDelay ms. This is important
3079 * to bound the amount of time it takes for an asynchronous commit to
3080 * hit disk.
3081 */
3082 WriteRqst.Flush = WriteRqst.Write;
3083 lastflush = now;
3084 }
3085 else if (flushbytes >= WalWriterFlushAfter)
3086 {
3087 /* exceeded wal_writer_flush_after blocks, flush */
3088 WriteRqst.Flush = WriteRqst.Write;
3089 lastflush = now;
3090 }
3091 else
3092 {
3093 /* no flushing, this time round */
3094 WriteRqst.Flush = 0;
3095 }
3096
3097 #ifdef WAL_DEBUG
3098 if (XLOG_DEBUG)
3099 elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3100 (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
3101 (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
3102 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3103 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3104 #endif
3105
3106 START_CRIT_SECTION();
3107
3108 /* now wait for any in-progress insertions to finish and get write lock */
3109 WaitXLogInsertionsToFinish(WriteRqst.Write);
3110 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3111 LogwrtResult = XLogCtl->LogwrtResult;
3112 if (WriteRqst.Write > LogwrtResult.Write ||
3113 WriteRqst.Flush > LogwrtResult.Flush)
3114 {
3115 XLogWrite(WriteRqst, flexible);
3116 }
3117 LWLockRelease(WALWriteLock);
3118
3119 END_CRIT_SECTION();
3120
3121 /* wake up walsenders now that we've released heavily contended locks */
3122 WalSndWakeupProcessRequests();
3123
3124 /*
3125 * Great, done. To take some work off the critical path, try to initialize
3126 * as many of the no-longer-needed WAL buffers for future use as we can.
3127 */
3128 AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3129
3130 /*
3131 * If we determined that we need to write data, but somebody else
3132 * wrote/flushed already, it should be considered as being active, to
3133 * avoid hibernating too early.
3134 */
3135 return true;
3136 }
3137
3138 /*
3139 * Test whether XLOG data has been flushed up to (at least) the given position.
3140 *
3141 * Returns true if a flush is still needed. (It may be that someone else
3142 * is already in process of flushing that far, however.)
3143 */
3144 bool
XLogNeedsFlush(XLogRecPtr record)3145 XLogNeedsFlush(XLogRecPtr record)
3146 {
3147 /*
3148 * During recovery, we don't flush WAL but update minRecoveryPoint
3149 * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3150 * would need to be updated.
3151 */
3152 if (RecoveryInProgress())
3153 {
3154 /*
3155 * An invalid minRecoveryPoint means that we need to recover all the
3156 * WAL, i.e., we're doing crash recovery. We never modify the control
3157 * file's value in that case, so we can short-circuit future checks
3158 * here too. This triggers a quick exit path for the startup process,
3159 * which cannot update its local copy of minRecoveryPoint as long as
3160 * it has not replayed all WAL available when doing crash recovery.
3161 */
3162 if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
3163 updateMinRecoveryPoint = false;
3164
3165 /* Quick exit if already known to be updated or cannot be updated */
3166 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3167 return false;
3168
3169 /*
3170 * Update local copy of minRecoveryPoint. But if the lock is busy,
3171 * just return a conservative guess.
3172 */
3173 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3174 return true;
3175 minRecoveryPoint = ControlFile->minRecoveryPoint;
3176 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3177 LWLockRelease(ControlFileLock);
3178
3179 /*
3180 * Check minRecoveryPoint for any other process than the startup
3181 * process doing crash recovery, which should not update the control
3182 * file value if crash recovery is still running.
3183 */
3184 if (XLogRecPtrIsInvalid(minRecoveryPoint))
3185 updateMinRecoveryPoint = false;
3186
3187 /* check again */
3188 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3189 return false;
3190 else
3191 return true;
3192 }
3193
3194 /* Quick exit if already known flushed */
3195 if (record <= LogwrtResult.Flush)
3196 return false;
3197
3198 /* read LogwrtResult and update local state */
3199 SpinLockAcquire(&XLogCtl->info_lck);
3200 LogwrtResult = XLogCtl->LogwrtResult;
3201 SpinLockRelease(&XLogCtl->info_lck);
3202
3203 /* check again */
3204 if (record <= LogwrtResult.Flush)
3205 return false;
3206
3207 return true;
3208 }
3209
3210 /*
3211 * Create a new XLOG file segment, or open a pre-existing one.
3212 *
3213 * log, seg: identify segment to be created/opened.
3214 *
3215 * *use_existent: if true, OK to use a pre-existing file (else, any
3216 * pre-existing file will be deleted). On return, true if a pre-existing
3217 * file was used.
3218 *
3219 * use_lock: if true, acquire ControlFileLock while moving file into
3220 * place. This should be true except during bootstrap log creation. The
3221 * caller must *not* hold the lock at call.
3222 *
3223 * Returns FD of opened file.
3224 *
3225 * Note: errors here are ERROR not PANIC because we might or might not be
3226 * inside a critical section (eg, during checkpoint there is no reason to
3227 * take down the system on failure). They will promote to PANIC if we are
3228 * in a critical section.
3229 */
3230 int
XLogFileInit(XLogSegNo logsegno,bool * use_existent,bool use_lock)3231 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3232 {
3233 char path[MAXPGPATH];
3234 char tmppath[MAXPGPATH];
3235 PGAlignedXLogBlock zbuffer;
3236 XLogSegNo installed_segno;
3237 XLogSegNo max_segno;
3238 int fd;
3239 int nbytes;
3240 int save_errno;
3241
3242 XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
3243
3244 /*
3245 * Try to use existent file (checkpoint maker may have created it already)
3246 */
3247 if (*use_existent)
3248 {
3249 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3250 if (fd < 0)
3251 {
3252 if (errno != ENOENT)
3253 ereport(ERROR,
3254 (errcode_for_file_access(),
3255 errmsg("could not open file \"%s\": %m", path)));
3256 }
3257 else
3258 return fd;
3259 }
3260
3261 /*
3262 * Initialize an empty (all zeroes) segment. NOTE: it is possible that
3263 * another process is doing the same thing. If so, we will end up
3264 * pre-creating an extra log segment. That seems OK, and better than
3265 * holding the lock throughout this lengthy process.
3266 */
3267 elog(DEBUG2, "creating and filling new WAL file");
3268
3269 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3270
3271 unlink(tmppath);
3272
3273 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3274 fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3275 if (fd < 0)
3276 ereport(ERROR,
3277 (errcode_for_file_access(),
3278 errmsg("could not create file \"%s\": %m", tmppath)));
3279
3280 memset(zbuffer.data, 0, XLOG_BLCKSZ);
3281
3282 pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3283 save_errno = 0;
3284 if (wal_init_zero)
3285 {
3286 /*
3287 * Zero-fill the file. With this setting, we do this the hard way to
3288 * ensure that all the file space has really been allocated. On
3289 * platforms that allow "holes" in files, just seeking to the end
3290 * doesn't allocate intermediate space. This way, we know that we
3291 * have all the space and (after the fsync below) that all the
3292 * indirect blocks are down on disk. Therefore, fdatasync(2) or
3293 * O_DSYNC will be sufficient to sync future writes to the log file.
3294 */
3295 for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
3296 {
3297 errno = 0;
3298 if (write(fd, zbuffer.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3299 {
3300 /* if write didn't set errno, assume no disk space */
3301 save_errno = errno ? errno : ENOSPC;
3302 break;
3303 }
3304 }
3305 }
3306 else
3307 {
3308 /*
3309 * Otherwise, seeking to the end and writing a solitary byte is
3310 * enough.
3311 */
3312 errno = 0;
3313 if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
3314 {
3315 /* if write didn't set errno, assume no disk space */
3316 save_errno = errno ? errno : ENOSPC;
3317 }
3318 }
3319 pgstat_report_wait_end();
3320
3321 if (save_errno)
3322 {
3323 /*
3324 * If we fail to make the file, delete it to release disk space
3325 */
3326 unlink(tmppath);
3327
3328 close(fd);
3329
3330 errno = save_errno;
3331
3332 ereport(ERROR,
3333 (errcode_for_file_access(),
3334 errmsg("could not write to file \"%s\": %m", tmppath)));
3335 }
3336
3337 pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3338 if (pg_fsync(fd) != 0)
3339 {
3340 int save_errno = errno;
3341
3342 close(fd);
3343 errno = save_errno;
3344 ereport(ERROR,
3345 (errcode_for_file_access(),
3346 errmsg("could not fsync file \"%s\": %m", tmppath)));
3347 }
3348 pgstat_report_wait_end();
3349
3350 if (close(fd))
3351 ereport(ERROR,
3352 (errcode_for_file_access(),
3353 errmsg("could not close file \"%s\": %m", tmppath)));
3354
3355 /*
3356 * Now move the segment into place with its final name.
3357 *
3358 * If caller didn't want to use a pre-existing file, get rid of any
3359 * pre-existing file. Otherwise, cope with possibility that someone else
3360 * has created the file while we were filling ours: if so, use ours to
3361 * pre-create a future log segment.
3362 */
3363 installed_segno = logsegno;
3364
3365 /*
3366 * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3367 * that was a constant, but that was always a bit dubious: normally, at a
3368 * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3369 * here, it was the offset from the insert location. We can't do the
3370 * normal XLOGfileslop calculation here because we don't have access to
3371 * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3372 * CheckPointSegments.
3373 */
3374 max_segno = logsegno + CheckPointSegments;
3375 if (!InstallXLogFileSegment(&installed_segno, tmppath,
3376 *use_existent, max_segno,
3377 use_lock))
3378 {
3379 /*
3380 * No need for any more future segments, or InstallXLogFileSegment()
3381 * failed to rename the file into place. If the rename failed, opening
3382 * the file below will fail.
3383 */
3384 unlink(tmppath);
3385 }
3386
3387 /* Set flag to tell caller there was no existent file */
3388 *use_existent = false;
3389
3390 /* Now open original target segment (might not be file I just made) */
3391 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3392 if (fd < 0)
3393 ereport(ERROR,
3394 (errcode_for_file_access(),
3395 errmsg("could not open file \"%s\": %m", path)));
3396
3397 elog(DEBUG2, "done creating and filling new WAL file");
3398
3399 return fd;
3400 }
3401
3402 /*
3403 * Create a new XLOG file segment by copying a pre-existing one.
3404 *
3405 * destsegno: identify segment to be created.
3406 *
3407 * srcTLI, srcsegno: identify segment to be copied (could be from
3408 * a different timeline)
3409 *
3410 * upto: how much of the source file to copy (the rest is filled with
3411 * zeros)
3412 *
3413 * Currently this is only used during recovery, and so there are no locking
3414 * considerations. But we should be just as tense as XLogFileInit to avoid
3415 * emplacing a bogus file.
3416 */
3417 static void
XLogFileCopy(XLogSegNo destsegno,TimeLineID srcTLI,XLogSegNo srcsegno,int upto)3418 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3419 int upto)
3420 {
3421 char path[MAXPGPATH];
3422 char tmppath[MAXPGPATH];
3423 PGAlignedXLogBlock buffer;
3424 int srcfd;
3425 int fd;
3426 int nbytes;
3427
3428 /*
3429 * Open the source file
3430 */
3431 XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3432 srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3433 if (srcfd < 0)
3434 ereport(ERROR,
3435 (errcode_for_file_access(),
3436 errmsg("could not open file \"%s\": %m", path)));
3437
3438 /*
3439 * Copy into a temp file name.
3440 */
3441 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3442
3443 unlink(tmppath);
3444
3445 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3446 fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3447 if (fd < 0)
3448 ereport(ERROR,
3449 (errcode_for_file_access(),
3450 errmsg("could not create file \"%s\": %m", tmppath)));
3451
3452 /*
3453 * Do the data copying.
3454 */
3455 for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3456 {
3457 int nread;
3458
3459 nread = upto - nbytes;
3460
3461 /*
3462 * The part that is not read from the source file is filled with
3463 * zeros.
3464 */
3465 if (nread < sizeof(buffer))
3466 memset(buffer.data, 0, sizeof(buffer));
3467
3468 if (nread > 0)
3469 {
3470 int r;
3471
3472 if (nread > sizeof(buffer))
3473 nread = sizeof(buffer);
3474 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3475 r = read(srcfd, buffer.data, nread);
3476 if (r != nread)
3477 {
3478 if (r < 0)
3479 ereport(ERROR,
3480 (errcode_for_file_access(),
3481 errmsg("could not read file \"%s\": %m",
3482 path)));
3483 else
3484 ereport(ERROR,
3485 (errcode(ERRCODE_DATA_CORRUPTED),
3486 errmsg("could not read file \"%s\": read %d of %zu",
3487 path, r, (Size) nread)));
3488 }
3489 pgstat_report_wait_end();
3490 }
3491 errno = 0;
3492 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3493 if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3494 {
3495 int save_errno = errno;
3496
3497 /*
3498 * If we fail to make the file, delete it to release disk space
3499 */
3500 unlink(tmppath);
3501 /* if write didn't set errno, assume problem is no disk space */
3502 errno = save_errno ? save_errno : ENOSPC;
3503
3504 ereport(ERROR,
3505 (errcode_for_file_access(),
3506 errmsg("could not write to file \"%s\": %m", tmppath)));
3507 }
3508 pgstat_report_wait_end();
3509 }
3510
3511 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3512 if (pg_fsync(fd) != 0)
3513 ereport(data_sync_elevel(ERROR),
3514 (errcode_for_file_access(),
3515 errmsg("could not fsync file \"%s\": %m", tmppath)));
3516 pgstat_report_wait_end();
3517
3518 if (CloseTransientFile(fd))
3519 ereport(ERROR,
3520 (errcode_for_file_access(),
3521 errmsg("could not close file \"%s\": %m", tmppath)));
3522
3523 if (CloseTransientFile(srcfd))
3524 ereport(ERROR,
3525 (errcode_for_file_access(),
3526 errmsg("could not close file \"%s\": %m", path)));
3527
3528 /*
3529 * Now move the segment into place with its final name.
3530 */
3531 if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3532 elog(ERROR, "InstallXLogFileSegment should not have failed");
3533 }
3534
/*
 * Install a new XLOG segment file as a current or future log segment.
 *
 * This is used both to install a newly-created segment (which has a temp
 * filename while it's being created) and to recycle an old segment.
 *
 * *segno: identify segment to install as (or first possible target).
 * When find_free is true, this is modified on return to indicate the
 * actual installation location or last segment searched.
 *
 * tmppath: initial name of file to install.  It will be renamed into place.
 *
 * find_free: if true, install the new segment at the first empty segno
 * number at or after the passed numbers.  If false, install the new segment
 * exactly where specified, deleting any existing segment file there.
 *
 * max_segno: maximum segment number to install the new file as.  Fail if no
 * free slot is found between *segno and max_segno. (Ignored when find_free
 * is false.)
 *
 * use_lock: if true, acquire ControlFileLock while moving file into
 * place.  This should be true except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns true if the file was installed successfully.  false indicates that
 * max_segno limit was exceeded, or an error occurred while renaming the
 * file into place.  On every failure path the lock (if taken) has been
 * released before returning.
 */
static bool
InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
					   bool find_free, XLogSegNo max_segno,
					   bool use_lock)
{
	char		path[MAXPGPATH];
	struct stat stat_buf;

	XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);

	/*
	 * We want to be sure that only one process does this at a time.
	 */
	if (use_lock)
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	if (!find_free)
	{
		/* Force installation: get rid of any pre-existing segment file */
		durable_unlink(path, DEBUG1);
	}
	else
	{
		/* Find a free slot to put it in: stat() == 0 means slot is taken */
		while (stat(path, &stat_buf) == 0)
		{
			if ((*segno) >= max_segno)
			{
				/* Failed to find a free slot within specified range */
				if (use_lock)
					LWLockRelease(ControlFileLock);
				return false;
			}
			/* Advance to the next candidate segment and recompute its path */
			(*segno)++;
			XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
		}
	}

	/*
	 * Perform the rename using link if available, paranoidly trying to avoid
	 * overwriting an existing file (there shouldn't be one).
	 */
	if (durable_link_or_rename(tmppath, path, LOG) != 0)
	{
		if (use_lock)
			LWLockRelease(ControlFileLock);
		/* durable_link_or_rename already emitted log message */
		return false;
	}

	if (use_lock)
		LWLockRelease(ControlFileLock);

	return true;
}
3618
3619 /*
3620 * Open a pre-existing logfile segment for writing.
3621 */
3622 int
XLogFileOpen(XLogSegNo segno)3623 XLogFileOpen(XLogSegNo segno)
3624 {
3625 char path[MAXPGPATH];
3626 int fd;
3627
3628 XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
3629
3630 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3631 if (fd < 0)
3632 ereport(PANIC,
3633 (errcode_for_file_access(),
3634 errmsg("could not open file \"%s\": %m", path)));
3635
3636 return fd;
3637 }
3638
/*
 * Open a logfile segment for reading (during recovery).
 *
 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
 * Otherwise, it's assumed to be already available in pg_wal.
 *
 * segno/tli identify the segment to open.  If notfoundOk is true, a plain
 * missing-file condition (open fails with ENOENT) returns -1 instead of
 * raising PANIC; any other open failure always PANICs.
 *
 * NOTE(review): 'emode' is not referenced anywhere in this function's body;
 * presumably retained for call-signature symmetry with XLogFileReadAnyTLI —
 * confirm before relying on it.
 *
 * On success, returns an open file descriptor and updates the state
 * variables curFileTLI, readSource, XLogReceiptSource (and XLogReceiptTime,
 * except in the FROM_STREAM case) as a side effect.
 */
static int
XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
			 int source, bool notfoundOk)
{
	char		xlogfname[MAXFNAMELEN];
	char		activitymsg[MAXFNAMELEN + 16];
	char		path[MAXPGPATH];
	int			fd;

	XLogFileName(xlogfname, tli, segno, wal_segment_size);

	switch (source)
	{
		case XLOG_FROM_ARCHIVE:
			/* Report recovery progress in PS display */
			snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
					 xlogfname);
			set_ps_display(activitymsg, false);

			restoredFromArchive = RestoreArchivedFile(path, xlogfname,
													  "RECOVERYXLOG",
													  wal_segment_size,
													  InRedo);
			if (!restoredFromArchive)
				return -1;
			break;

		case XLOG_FROM_PG_WAL:
		case XLOG_FROM_STREAM:
			XLogFilePath(path, tli, segno, wal_segment_size);
			restoredFromArchive = false;
			break;

		default:
			elog(ERROR, "invalid XLogFileRead source %d", source);
	}

	/*
	 * If the segment was fetched from archival storage, replace the existing
	 * xlog segment (if any) with the archival version.
	 */
	if (source == XLOG_FROM_ARCHIVE)
	{
		KeepFileRestoredFromArchive(path, xlogfname);

		/*
		 * Set path to point at the new file in pg_wal.
		 */
		snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
	}

	fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
	if (fd >= 0)
	{
		/* Success! */
		curFileTLI = tli;

		/* Report recovery progress in PS display */
		snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
				 xlogfname);
		set_ps_display(activitymsg, false);

		/* Track source of data in assorted state variables */
		readSource = source;
		XLogReceiptSource = source;
		/* In FROM_STREAM case, caller tracks receipt time, not me */
		if (source != XLOG_FROM_STREAM)
			XLogReceiptTime = GetCurrentTimestamp();

		return fd;
	}
	if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));
	return -1;
}
3722
/*
 * Open a logfile segment for reading (during recovery).
 *
 * This version searches for the segment with any TLI listed in expectedTLEs.
 * Returns an open file descriptor on success; on failure, reports at the
 * given 'emode' level and returns -1.  'source' restricts where we look
 * (XLOG_FROM_ARCHIVE, XLOG_FROM_PG_WAL, or XLOG_FROM_ANY for both).
 */
static int
XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
{
	char		path[MAXPGPATH];
	ListCell   *cell;
	int			fd;
	List	   *tles;

	/*
	 * Loop looking for a suitable timeline ID: we might need to read any of
	 * the timelines listed in expectedTLEs.
	 *
	 * We expect curFileTLI on entry to be the TLI of the preceding file in
	 * sequence, or 0 if there was no predecessor.  We do not allow curFileTLI
	 * to go backwards; this prevents us from picking up the wrong file when a
	 * parent timeline extends to higher segment numbers than the child we
	 * want to read.
	 *
	 * If we haven't read the timeline history file yet, read it now, so that
	 * we know which TLIs to scan.  We don't save the list in expectedTLEs,
	 * however, unless we actually find a valid segment.  That way if there is
	 * neither a timeline history file nor a WAL segment in the archive, and
	 * streaming replication is set up, we'll read the timeline history file
	 * streamed from the master when we start streaming, instead of recovering
	 * with a dummy history generated here.
	 */
	if (expectedTLEs)
		tles = expectedTLEs;
	else
		tles = readTimeLineHistory(recoveryTargetTLI);

	/* Timelines are listed newest-first; scan in descending TLI order */
	foreach(cell, tles)
	{
		TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
		TimeLineID	tli = hent->tli;

		if (tli < curFileTLI)
			break;				/* don't bother looking at too-old TLIs */

		/*
		 * Skip scanning the timeline ID that the logfile segment to read
		 * doesn't belong to
		 */
		if (hent->begin != InvalidXLogRecPtr)
		{
			XLogSegNo	beginseg = 0;

			XLByteToSeg(hent->begin, beginseg, wal_segment_size);

			/*
			 * The logfile segment that doesn't belong to the timeline is
			 * older or newer than the segment that the timeline started or
			 * ended at, respectively.  It's sufficient to check only the
			 * starting segment of the timeline here.  Since the timelines are
			 * scanned in descending order in this loop, any segments newer
			 * than the ending segment should belong to newer timeline and
			 * have already been read before.  So it's not necessary to check
			 * the ending segment of the timeline here.
			 */
			if (segno < beginseg)
				continue;
		}

		if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
		{
			fd = XLogFileRead(segno, emode, tli,
							  XLOG_FROM_ARCHIVE, true);
			if (fd != -1)
			{
				elog(DEBUG1, "got WAL segment from archive");
				/* First successful read fixes the timeline list in place */
				if (!expectedTLEs)
					expectedTLEs = tles;
				return fd;
			}
		}

		if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
		{
			fd = XLogFileRead(segno, emode, tli,
							  XLOG_FROM_PG_WAL, true);
			if (fd != -1)
			{
				if (!expectedTLEs)
					expectedTLEs = tles;
				return fd;
			}
		}
	}

	/* Couldn't find it.  For simplicity, complain about front timeline */
	XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
	errno = ENOENT;
	ereport(emode,
			(errcode_for_file_access(),
			 errmsg("could not open file \"%s\": %m", path)));
	return -1;
}
3825
/*
 * Close the current logfile segment for writing.
 *
 * Requires openLogFile to hold a valid descriptor; resets it to -1 on
 * success.  A close failure is treated as unrecoverable (PANIC), since we
 * can no longer trust what reached the kernel for this WAL segment.
 */
static void
XLogFileClose(void)
{
	Assert(openLogFile >= 0);

	/*
	 * WAL segment files will not be re-read in normal operation, so we advise
	 * the OS to release any cached pages.  But do not do so if WAL archiving
	 * or streaming is active, because archiver and walsender process could
	 * use the cache to read the WAL segment.
	 */
#if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
	if (!XLogIsNeeded())
		(void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
#endif

	if (close(openLogFile))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m",
						XLogFileNameP(ThisTimeLineID, openLogSegNo))));
	openLogFile = -1;
}
3852
3853 /*
3854 * Preallocate log files beyond the specified log endpoint.
3855 *
3856 * XXX this is currently extremely conservative, since it forces only one
3857 * future log segment to exist, and even that only if we are 75% done with
3858 * the current one. This is only appropriate for very low-WAL-volume systems.
3859 * High-volume systems will be OK once they've built up a sufficient set of
3860 * recycled log segments, but the startup transient is likely to include
3861 * a lot of segment creations by foreground processes, which is not so good.
3862 */
3863 static void
PreallocXlogFiles(XLogRecPtr endptr)3864 PreallocXlogFiles(XLogRecPtr endptr)
3865 {
3866 XLogSegNo _logSegNo;
3867 int lf;
3868 bool use_existent;
3869 uint64 offset;
3870
3871 XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3872 offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3873 if (offset >= (uint32) (0.75 * wal_segment_size))
3874 {
3875 _logSegNo++;
3876 use_existent = true;
3877 lf = XLogFileInit(_logSegNo, &use_existent, true);
3878 close(lf);
3879 if (!use_existent)
3880 CheckpointStats.ckpt_segs_added++;
3881 }
3882 }
3883
3884 /*
3885 * Throws an error if the given log segment has already been removed or
3886 * recycled. The caller should only pass a segment that it knows to have
3887 * existed while the server has been running, as this function always
3888 * succeeds if no WAL segments have been removed since startup.
3889 * 'tli' is only used in the error message.
3890 *
3891 * Note: this function guarantees to keep errno unchanged on return.
3892 * This supports callers that use this to possibly deliver a better
3893 * error message about a missing file, while still being able to throw
3894 * a normal file-access error afterwards, if this does return.
3895 */
3896 void
CheckXLogRemoved(XLogSegNo segno,TimeLineID tli)3897 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3898 {
3899 int save_errno = errno;
3900 XLogSegNo lastRemovedSegNo;
3901
3902 SpinLockAcquire(&XLogCtl->info_lck);
3903 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3904 SpinLockRelease(&XLogCtl->info_lck);
3905
3906 if (segno <= lastRemovedSegNo)
3907 {
3908 char filename[MAXFNAMELEN];
3909
3910 XLogFileName(filename, tli, segno, wal_segment_size);
3911 errno = save_errno;
3912 ereport(ERROR,
3913 (errcode_for_file_access(),
3914 errmsg("requested WAL segment %s has already been removed",
3915 filename)));
3916 }
3917 errno = save_errno;
3918 }
3919
3920 /*
3921 * Return the last WAL segment removed, or 0 if no segment has been removed
3922 * since startup.
3923 *
3924 * NB: the result can be out of date arbitrarily fast, the caller has to deal
3925 * with that.
3926 */
3927 XLogSegNo
XLogGetLastRemovedSegno(void)3928 XLogGetLastRemovedSegno(void)
3929 {
3930 XLogSegNo lastRemovedSegNo;
3931
3932 SpinLockAcquire(&XLogCtl->info_lck);
3933 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3934 SpinLockRelease(&XLogCtl->info_lck);
3935
3936 return lastRemovedSegNo;
3937 }
3938
3939 /*
3940 * Update the last removed segno pointer in shared memory, to reflect
3941 * that the given XLOG file has been removed.
3942 */
3943 static void
UpdateLastRemovedPtr(char * filename)3944 UpdateLastRemovedPtr(char *filename)
3945 {
3946 uint32 tli;
3947 XLogSegNo segno;
3948
3949 XLogFromFileName(filename, &tli, &segno, wal_segment_size);
3950
3951 SpinLockAcquire(&XLogCtl->info_lck);
3952 if (segno > XLogCtl->lastRemovedSegNo)
3953 XLogCtl->lastRemovedSegNo = segno;
3954 SpinLockRelease(&XLogCtl->info_lck);
3955 }
3956
3957 /*
3958 * Remove all temporary log files in pg_wal
3959 *
3960 * This is called at the beginning of recovery after a previous crash,
3961 * at a point where no other processes write fresh WAL data.
3962 */
3963 static void
RemoveTempXlogFiles(void)3964 RemoveTempXlogFiles(void)
3965 {
3966 DIR *xldir;
3967 struct dirent *xlde;
3968
3969 elog(DEBUG2, "removing all temporary WAL segments");
3970
3971 xldir = AllocateDir(XLOGDIR);
3972 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3973 {
3974 char path[MAXPGPATH];
3975
3976 if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
3977 continue;
3978
3979 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
3980 unlink(path);
3981 elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
3982 }
3983 FreeDir(xldir);
3984 }
3985
/*
 * Recycle or remove all log files older or equal to passed segno.
 *
 * endptr is current (or recent) end of xlog, and lastredoptr is the
 * redo pointer of the last checkpoint.  These are used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
 *
 * Only segments whose archiving is complete (per XLogArchiveCheckDone) are
 * touched; others are left for a later pass.
 */
static void
RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr)
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		lastoff[MAXFNAMELEN];

	/*
	 * Construct a filename of the last segment to be kept.  The timeline ID
	 * doesn't matter, we ignore that in the comparison.  (During recovery,
	 * ThisTimeLineID isn't set, so we can't use that.)
	 */
	XLogFileName(lastoff, 0, segno, wal_segment_size);

	elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
		 lastoff);

	xldir = AllocateDir(XLOGDIR);

	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
	{
		/* Ignore files that are not XLOG segments */
		if (!IsXLogFileName(xlde->d_name) &&
			!IsPartialXLogFileName(xlde->d_name))
			continue;

		/*
		 * We ignore the timeline part of the XLOG segment identifiers in
		 * deciding whether a segment is still needed.  This ensures that we
		 * won't prematurely remove a segment from a parent timeline.  We could
		 * probably be a little more proactive about removing segments of
		 * non-parent timelines, but that would be a whole lot more
		 * complicated.
		 *
		 * We use the alphanumeric sorting property of the filenames to decide
		 * which ones are earlier than the lastoff segment.  (d_name + 8 skips
		 * the 8-hex-digit timeline prefix, comparing only the segment part.)
		 */
		if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
		{
			if (XLogArchiveCheckDone(xlde->d_name))
			{
				/* Update the last removed location in shared memory first */
				UpdateLastRemovedPtr(xlde->d_name);

				RemoveXlogFile(xlde->d_name, lastredoptr, endptr);
			}
		}
	}

	FreeDir(xldir);
}
4044
/*
 * Remove WAL files that are not part of the given timeline's history.
 *
 * This is called during recovery, whenever we switch to follow a new
 * timeline, and at the end of recovery when we create a new timeline.  We
 * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
 * might be leftover pre-allocated or recycled WAL segments on the old timeline
 * that we haven't used yet, and contain garbage.  If we just leave them in
 * pg_wal, they will eventually be archived, and we can't let that happen.
 * Files that belong to our timeline history are valid, because we have
 * successfully replayed them, but from others we can't be sure.
 *
 * 'switchpoint' is the current point in WAL where we switch to new timeline,
 * and 'newTLI' is the new timeline we switch to.
 */
static void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
{
	DIR		   *xldir;
	struct dirent *xlde;
	char		switchseg[MAXFNAMELEN];
	XLogSegNo	endLogSegNo;

	XLByteToPrevSeg(switchpoint, endLogSegNo, wal_segment_size);

	/*
	 * Construct a filename of the last segment to be kept.
	 */
	XLogFileName(switchseg, newTLI, endLogSegNo, wal_segment_size);

	elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
		 switchseg);

	xldir = AllocateDir(XLOGDIR);

	while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
	{
		/* Ignore files that are not XLOG segments */
		if (!IsXLogFileName(xlde->d_name))
			continue;

		/*
		 * Remove files that are on a timeline older than the new one we're
		 * switching to, but with a segment number >= the first segment on the
		 * new timeline.  (The first 8 hex digits of the name are the TLI, the
		 * rest identify the segment, hence the split comparison.)
		 */
		if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
			strcmp(xlde->d_name + 8, switchseg + 8) > 0)
		{
			/*
			 * If the file has already been marked as .ready, however, don't
			 * remove it yet.  It should be OK to remove it - files that are
			 * not part of our timeline history are not required for recovery
			 * - but seems safer to let them be archived and removed later.
			 */
			if (!XLogArchiveIsReady(xlde->d_name))
				RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
		}
	}

	FreeDir(xldir);
}
4107
/*
 * Recycle or remove a log file that's no longer needed.
 *
 * endptr is current (or recent) end of xlog, and lastredoptr is the
 * redo pointer of the last checkpoint.  These are used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
 * If lastredoptr is not known, pass invalid, and the function will recycle,
 * somewhat arbitrarily, 10 future segments.
 */
static void
RemoveXlogFile(const char *segname, XLogRecPtr lastredoptr, XLogRecPtr endptr)
{
	char		path[MAXPGPATH];
#ifdef WIN32
	char		newpath[MAXPGPATH];
#endif
	struct stat statbuf;
	XLogSegNo	endlogSegNo;
	XLogSegNo	recycleSegNo;

	if (wal_recycle)
	{
		/*
		 * Initialize info about where to try to recycle to.
		 */
		XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
		if (lastredoptr == InvalidXLogRecPtr)
			recycleSegNo = endlogSegNo + 10;
		else
			recycleSegNo = XLOGfileslop(lastredoptr);
	}
	else
		recycleSegNo = 0;		/* keep compiler quiet */

	snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);

	/*
	 * Before deleting the file, see if it can be recycled as a future log
	 * segment.  Only recycle normal files, pg_standby for example can create
	 * symbolic links pointing to a separate archive directory.
	 *
	 * Note: endlogSegNo is only initialized when wal_recycle is true; the
	 * short-circuit on wal_recycle below keeps it from being read otherwise.
	 */
	if (wal_recycle &&
		endlogSegNo <= recycleSegNo &&
		lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
		InstallXLogFileSegment(&endlogSegNo, path,
							   true, recycleSegNo, true))
	{
		ereport(DEBUG2,
				(errmsg("recycled write-ahead log file \"%s\"",
						segname)));
		CheckpointStats.ckpt_segs_recycled++;
		/* Needn't recheck that slot on future iterations */
		endlogSegNo++;
	}
	else
	{
		/* No need for any more future segments... */
		int			rc;

		ereport(DEBUG2,
				(errmsg("removing write-ahead log file \"%s\"",
						segname)));

#ifdef WIN32

		/*
		 * On Windows, if another process (e.g another backend) holds the file
		 * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
		 * will still show up in directory listing until the last handle is
		 * closed.  To avoid confusing the lingering deleted file for a live
		 * WAL file that needs to be archived, rename it before deleting it.
		 *
		 * If another process holds the file open without FILE_SHARE_DELETE
		 * flag, rename will fail.  We'll try again at the next checkpoint.
		 */
		snprintf(newpath, MAXPGPATH, "%s.deleted", path);
		if (rename(path, newpath) != 0)
		{
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not rename file \"%s\": %m",
							path)));
			return;
		}
		rc = durable_unlink(newpath, LOG);
#else
		rc = durable_unlink(path, LOG);
#endif
		if (rc != 0)
		{
			/* Message already logged by durable_unlink() */
			return;
		}
		CheckpointStats.ckpt_segs_removed++;
	}

	/* Drop any archive-status files associated with the old name */
	XLogArchiveCleanup(segname);
}
4206
/*
 * Verify whether pg_wal and pg_wal/archive_status exist.
 * If the latter does not exist, recreate it.
 *
 * It is not the goal of this function to verify the contents of these
 * directories, but to help in cases where someone has performed a cluster
 * copy for PITR purposes but omitted pg_wal from the copy.
 *
 * We could also recreate pg_wal if it doesn't exist, but a deliberate
 * policy decision was made not to.  It is fairly common for pg_wal to be
 * a symlink, and if that was the DBA's intent then automatically making a
 * plain directory would result in degraded performance with no notice.
 */
static void
ValidateXLOGDirectoryStructure(void)
{
	char		path[MAXPGPATH];
	struct stat stat_buf;

	/* Check for pg_wal; if it doesn't exist, error out */
	if (stat(XLOGDIR, &stat_buf) != 0 ||
		!S_ISDIR(stat_buf.st_mode))
		ereport(FATAL,
				(errmsg("required WAL directory \"%s\" does not exist",
						XLOGDIR)));

	/* Check for archive_status */
	snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
	if (stat(path, &stat_buf) == 0)
	{
		/*
		 * Check for weird cases where it exists but isn't a directory.
		 *
		 * NOTE(review): the "does not exist" wording is misleading here --
		 * this branch fires when the path exists but is not a directory.
		 * Consider a more accurate message in a behavior-changing pass.
		 */
		if (!S_ISDIR(stat_buf.st_mode))
			ereport(FATAL,
					(errmsg("required WAL directory \"%s\" does not exist",
							path)));
	}
	else
	{
		ereport(LOG,
				(errmsg("creating missing WAL directory \"%s\"", path)));
		if (MakePGDirectory(path) < 0)
			ereport(FATAL,
					(errmsg("could not create missing directory \"%s\": %m",
							path)));
	}
}
4253
4254 /*
4255 * Remove previous backup history files. This also retries creation of
4256 * .ready files for any backup history files for which XLogArchiveNotify
4257 * failed earlier.
4258 */
4259 static void
CleanupBackupHistory(void)4260 CleanupBackupHistory(void)
4261 {
4262 DIR *xldir;
4263 struct dirent *xlde;
4264 char path[MAXPGPATH + sizeof(XLOGDIR)];
4265
4266 xldir = AllocateDir(XLOGDIR);
4267
4268 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4269 {
4270 if (IsBackupHistoryFileName(xlde->d_name))
4271 {
4272 if (XLogArchiveCheckDone(xlde->d_name))
4273 {
4274 elog(DEBUG2, "removing WAL backup history file \"%s\"",
4275 xlde->d_name);
4276 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4277 unlink(path);
4278 XLogArchiveCleanup(xlde->d_name);
4279 }
4280 }
4281 }
4282
4283 FreeDir(xldir);
4284 }
4285
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is valid, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC, LOG).  In standby mode, retries until a valid
 * record is available.
 *
 * 'fetching_ckpt' tells XLogPageRead that we're fetching a checkpoint
 * record; it also gates the crash-recovery-to-archive-recovery switch below.
 * As side effects, updates ReadRecPtr/EndRecPtr and may transition the
 * server into archive recovery.
 */
static XLogRecord *
ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
		   bool fetching_ckpt)
{
	XLogRecord *record;
	XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;

	/* Pass through parameters to XLogPageRead */
	private->fetching_ckpt = fetching_ckpt;
	private->emode = emode;
	private->randAccess = (RecPtr != InvalidXLogRecPtr);

	/* This is the first attempt to read this page. */
	lastSourceFailed = false;

	for (;;)
	{
		char	   *errormsg;

		record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
		ReadRecPtr = xlogreader->ReadRecPtr;
		EndRecPtr = xlogreader->EndRecPtr;
		if (record == NULL)
		{
			/*
			 * If, while not in standby mode, we find that WAL ends in an
			 * incomplete record, keep track of that record.  After recovery
			 * is done, we'll write a record to indicate to downstream WAL
			 * readers that that portion is to be ignored.
			 */
			if (!StandbyMode &&
				!XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
			{
				abortedRecPtr = xlogreader->abortedRecPtr;
				missingContrecPtr = xlogreader->missingContrecPtr;
			}

			if (readFile >= 0)
			{
				close(readFile);
				readFile = -1;
			}

			/*
			 * We only end up here without a message when XLogPageRead()
			 * failed - in that case we already logged something.  In
			 * StandbyMode that only happens if we have been triggered, so we
			 * shouldn't loop anymore in that case.
			 */
			if (errormsg)
				ereport(emode_for_corrupt_record(emode,
												 RecPtr ? RecPtr : EndRecPtr),
						(errmsg_internal("%s", errormsg) /* already translated */ ));
		}

		/*
		 * Check page TLI is one of the expected values.
		 */
		else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
		{
			char		fname[MAXFNAMELEN];
			XLogSegNo	segno;
			int32		offset;

			XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
			offset = XLogSegmentOffset(xlogreader->latestPagePtr,
									   wal_segment_size);
			XLogFileName(fname, xlogreader->readPageTLI, segno,
						 wal_segment_size);
			ereport(emode_for_corrupt_record(emode,
											 RecPtr ? RecPtr : EndRecPtr),
					(errmsg("unexpected timeline ID %u in log segment %s, offset %u",
							xlogreader->latestPageTLI,
							fname,
							offset)));
			record = NULL;
		}

		if (record)
		{
			/* Great, got a record */
			return record;
		}
		else
		{
			/* No valid record available from this source */
			lastSourceFailed = true;

			/*
			 * If archive recovery was requested, but we were still doing
			 * crash recovery, switch to archive recovery and retry using the
			 * offline archive.  We have now replayed all the valid WAL in
			 * pg_wal, so we are presumably now consistent.
			 *
			 * We require that there's at least some valid WAL present in
			 * pg_wal, however (!fetching_ckpt).  We could recover using the
			 * WAL from the archive, even if pg_wal is completely empty, but
			 * we'd have no idea how far we'd have to replay to reach
			 * consistency.  So err on the safe side and give up.
			 */
			if (!InArchiveRecovery && ArchiveRecoveryRequested &&
				!fetching_ckpt)
			{
				ereport(DEBUG1,
						(errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
				InArchiveRecovery = true;
				if (StandbyModeRequested)
					StandbyMode = true;

				/* initialize minRecoveryPoint to this record */
				LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
				ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
				if (ControlFile->minRecoveryPoint < EndRecPtr)
				{
					ControlFile->minRecoveryPoint = EndRecPtr;
					ControlFile->minRecoveryPointTLI = ThisTimeLineID;
				}
				/* update local copy */
				minRecoveryPoint = ControlFile->minRecoveryPoint;
				minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

				/*
				 * The startup process can update its local copy of
				 * minRecoveryPoint from this point.
				 */
				updateMinRecoveryPoint = true;

				UpdateControlFile();

				/*
				 * We update SharedRecoveryState while holding the lock on
				 * ControlFileLock so both states are consistent in shared
				 * memory.
				 */
				SpinLockAcquire(&XLogCtl->info_lck);
				XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
				SpinLockRelease(&XLogCtl->info_lck);

				LWLockRelease(ControlFileLock);

				CheckRecoveryConsistency();

				/*
				 * Before we retry, reset lastSourceFailed and currentSource
				 * so that we will check the archive next.
				 */
				lastSourceFailed = false;
				currentSource = 0;

				continue;
			}

			/* In standby mode, loop back to retry.  Otherwise, give up. */
			if (StandbyMode && !CheckForStandbyTrigger())
				continue;
			else
				return NULL;
		}
	}
}
4456
/*
 * Scan for new timelines that might have appeared in the archive since we
 * started recovery.
 *
 * If there are any, the function changes recovery target TLI to the latest
 * one and returns 'true'.  Returns 'false' (leaving recoveryTargetTLI
 * untouched) when no newer timeline exists, when the new timeline is not a
 * descendant of the current one, or when it forked off before the current
 * replay position.
 */
static bool
rescanLatestTimeLine(void)
{
	List	   *newExpectedTLEs;
	bool		found;
	ListCell   *cell;
	TimeLineID	newtarget;
	TimeLineID	oldtarget = recoveryTargetTLI;
	TimeLineHistoryEntry *currentTle = NULL;

	newtarget = findNewestTimeLine(recoveryTargetTLI);
	if (newtarget == recoveryTargetTLI)
	{
		/* No new timelines found */
		return false;
	}

	/*
	 * Determine the list of expected TLIs for the new TLI
	 */

	newExpectedTLEs = readTimeLineHistory(newtarget);

	/*
	 * If the current timeline is not part of the history of the new timeline,
	 * we cannot proceed to it.
	 */
	found = false;
	foreach(cell, newExpectedTLEs)
	{
		currentTle = (TimeLineHistoryEntry *) lfirst(cell);

		if (currentTle->tli == recoveryTargetTLI)
		{
			found = true;
			break;
		}
	}
	if (!found)
	{
		ereport(LOG,
				(errmsg("new timeline %u is not a child of database system timeline %u",
						newtarget,
						ThisTimeLineID)));
		return false;
	}

	/*
	 * The current timeline was found in the history file, but check that the
	 * next timeline was forked off from it *after* the current recovery
	 * location.
	 */
	if (currentTle->end < EndRecPtr)
	{
		ereport(LOG,
				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
						newtarget,
						ThisTimeLineID,
						(uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
		return false;
	}

	/* The new timeline history seems valid.  Switch target */
	recoveryTargetTLI = newtarget;
	/* Replace the old expected-TLE list wholesale with the new one */
	list_free_deep(expectedTLEs);
	expectedTLEs = newExpectedTLEs;

	/*
	 * As in StartupXLOG(), try to ensure we have all the history files
	 * between the old target and new target in pg_wal.
	 */
	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);

	ereport(LOG,
			(errmsg("new target timeline is %u",
					recoveryTargetTLI)));

	return true;
}
4543
4544 /*
4545 * I/O routines for pg_control
4546 *
4547 * *ControlFile is a buffer in shared memory that holds an image of the
4548 * contents of pg_control. WriteControlFile() initializes pg_control
4549 * given a preloaded buffer, ReadControlFile() loads the buffer from
4550 * the pg_control file (during postmaster or standalone-backend startup),
4551 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4552 *
4553 * For simplicity, WriteControlFile() initializes the fields of pg_control
4554 * that are related to checking backend/database compatibility, and
4555 * ReadControlFile() verifies they are correct. We could split out the
4556 * I/O and compatibility-check functions, but there seems no need currently.
4557 */
/*
 * Write the initial pg_control file from the in-memory ControlFile image.
 *
 * Called only from BootStrapXLOG(); O_EXCL guarantees we never clobber an
 * existing pg_control.  The compile-time compatibility fields are filled in
 * here, and ReadControlFile() re-verifies them at every subsequent startup.
 * Every failure is PANIC, since a half-written pg_control leaves the
 * cluster unusable.
 */
static void
WriteControlFile(void)
{
	int			fd;
	char		buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */

	/*
	 * Ensure that the size of the pg_control data structure is sane. See the
	 * comments for these symbols in pg_control.h.
	 */
	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
					 "pg_control is too large for atomic disk writes");
	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
					 "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");

	/*
	 * Initialize version and compatibility-check fields
	 */
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
	ControlFile->xlog_seg_size = wal_segment_size;

	ControlFile->nameDataLen = NAMEDATALEN;
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;

	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
	ControlFile->loblksize = LOBLKSIZE;

	ControlFile->float4ByVal = FLOAT4PASSBYVAL;
	ControlFile->float8ByVal = FLOAT8PASSBYVAL;

	/* Contents are protected with a CRC covering everything up to the crc field */
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(ControlFile->crc);

	/*
	 * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
	 * the excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
	 */
	memset(buffer, 0, PG_CONTROL_FILE_SIZE);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
	if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m",
						XLOG_CONTROL_FILE)));
	}
	pgstat_report_wait_end();

	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
	if (pg_fsync(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						XLOG_CONTROL_FILE)));
	pgstat_report_wait_end();

	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m",
						XLOG_CONTROL_FILE)));
}
4649
/*
 * Read pg_control into the ControlFile buffer and verify it is compatible
 * with this server binary.
 *
 * Any incompatibility (wrong pg_control version, bad CRC, mismatched
 * compile-time options) is reported FATAL before we can do any damage to
 * the data directory.  On success, also derives the settings that depend
 * on wal_segment_size and exports several initdb-time choices as
 * read-only GUC variables.
 */
static void
ReadControlFile(void)
{
	pg_crc32c	crc;
	int			fd;
	static char wal_segsz_str[20];
	int			r;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
	r = read(fd, ControlFile, sizeof(ControlFileData));
	if (r != sizeof(ControlFileData))
	{
		/* distinguish a true I/O error from a short read (likely corruption) */
		if (r < 0)
			ereport(PANIC,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							XLOG_CONTROL_FILE)));
		else
			ereport(PANIC,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read file \"%s\": read %d of %zu",
							XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
	}
	pgstat_report_wait_end();

	close(fd);

	/*
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
	 *
	 * The first test below looks for a version number whose significant bits
	 * are all in the wrong 16-bit half of the word, which suggests the file
	 * was written on a machine of the opposite byte ordering (see errhint).
	 */

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
						   " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
						   ControlFile->pg_control_version, ControlFile->pg_control_version,
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
						   " but the server was compiled with PG_CONTROL_VERSION %d.",
						   ControlFile->pg_control_version, PG_CONTROL_VERSION),
				 errhint("It looks like you need to initdb.")));

	/* Now check the CRC. */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(crc);

	if (!EQ_CRC32C(crc, ControlFile->crc))
		ereport(FATAL,
				(errmsg("incorrect checksum in control file")));

	/*
	 * Do compatibility checking immediately.  If the database isn't
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
	 */
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
						   " but the server was compiled with CATALOG_VERSION_NO %d.",
						   ControlFile->catalog_version_no, CATALOG_VERSION_NO),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with MAXALIGN %d,"
						   " but the server was compiled with MAXALIGN %d.",
						   ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->blcksz != BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with BLCKSZ %d,"
						   " but the server was compiled with BLCKSZ %d.",
						   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->relseg_size != RELSEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
						   " but the server was compiled with RELSEG_SIZE %d.",
						   ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
						   " but the server was compiled with XLOG_BLCKSZ %d.",
						   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->nameDataLen != NAMEDATALEN)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
						   " but the server was compiled with NAMEDATALEN %d.",
						   ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
						   " but the server was compiled with INDEX_MAX_KEYS %d.",
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
						   " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
						   ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->loblksize != LOBLKSIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with LOBLKSIZE %d,"
						   " but the server was compiled with LOBLKSIZE %d.",
						   ControlFile->loblksize, (int) LOBLKSIZE),
				 errhint("It looks like you need to recompile or initdb.")));

#ifdef USE_FLOAT4_BYVAL
	if (ControlFile->float4ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
						   " but the server was compiled with USE_FLOAT4_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float4ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
						   " but the server was compiled without USE_FLOAT4_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
						   " but the server was compiled with USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
						   " but the server was compiled without USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

	/* The WAL segment size comes from the control file, not compile time */
	wal_segment_size = ControlFile->xlog_seg_size;

	if (!IsValidWalSegSize(wal_segment_size))
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
									  "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
									  wal_segment_size,
									  wal_segment_size)));

	snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
	SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
					PGC_S_OVERRIDE);

	/* check and update variables dependent on wal_segment_size */
	if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));

	if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));

	/* payload bytes per segment: full pages minus the long-vs-short header delta */
	UsableBytesInSegment =
		(wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
		(SizeOfXLogLongPHD - SizeOfXLogShortPHD);

	CalculateCheckpointSegments();

	/* Make the initdb settings visible as GUC variables, too */
	SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
					PGC_INTERNAL, PGC_S_OVERRIDE);
}
4861
4862 /*
4863 * Utility wrapper to update the control file. Note that the control
4864 * file gets flushed.
4865 */
4866 void
UpdateControlFile(void)4867 UpdateControlFile(void)
4868 {
4869 update_controlfile(DataDir, ControlFile, true);
4870 }
4871
4872 /*
4873 * Returns the unique system identifier from control file.
4874 */
4875 uint64
GetSystemIdentifier(void)4876 GetSystemIdentifier(void)
4877 {
4878 Assert(ControlFile != NULL);
4879 return ControlFile->system_identifier;
4880 }
4881
4882 /*
4883 * Returns the random nonce from control file.
4884 */
4885 char *
GetMockAuthenticationNonce(void)4886 GetMockAuthenticationNonce(void)
4887 {
4888 Assert(ControlFile != NULL);
4889 return ControlFile->mock_authentication_nonce;
4890 }
4891
4892 /*
4893 * Are checksums enabled for data pages?
4894 */
4895 bool
DataChecksumsEnabled(void)4896 DataChecksumsEnabled(void)
4897 {
4898 Assert(ControlFile != NULL);
4899 return (ControlFile->data_checksum_version > 0);
4900 }
4901
4902 /*
4903 * Returns a fake LSN for unlogged relations.
4904 *
4905 * Each call generates an LSN that is greater than any previous value
4906 * returned. The current counter value is saved and restored across clean
4907 * shutdowns, but like unlogged relations, does not survive a crash. This can
4908 * be used in lieu of real LSN values returned by XLogInsert, if you need an
4909 * LSN-like increasing sequence of numbers without writing any WAL.
4910 */
4911 XLogRecPtr
GetFakeLSNForUnloggedRel(void)4912 GetFakeLSNForUnloggedRel(void)
4913 {
4914 XLogRecPtr nextUnloggedLSN;
4915
4916 /* increment the unloggedLSN counter, need SpinLock */
4917 SpinLockAcquire(&XLogCtl->ulsn_lck);
4918 nextUnloggedLSN = XLogCtl->unloggedLSN++;
4919 SpinLockRelease(&XLogCtl->ulsn_lck);
4920
4921 return nextUnloggedLSN;
4922 }
4923
4924 /*
4925 * Auto-tune the number of XLOG buffers.
4926 *
4927 * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4928 * a maximum of one XLOG segment (there is little reason to think that more
4929 * is helpful, at least so long as we force an fsync when switching log files)
4930 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4931 * 9.1, when auto-tuning was added).
4932 *
4933 * This should not be called until NBuffers has received its final value.
4934 */
4935 static int
XLOGChooseNumBuffers(void)4936 XLOGChooseNumBuffers(void)
4937 {
4938 int xbuffers;
4939
4940 xbuffers = NBuffers / 32;
4941 if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
4942 xbuffers = (wal_segment_size / XLOG_BLCKSZ);
4943 if (xbuffers < 8)
4944 xbuffers = 8;
4945 return xbuffers;
4946 }
4947
4948 /*
4949 * GUC check_hook for wal_buffers
4950 */
4951 bool
check_wal_buffers(int * newval,void ** extra,GucSource source)4952 check_wal_buffers(int *newval, void **extra, GucSource source)
4953 {
4954 /*
4955 * -1 indicates a request for auto-tune.
4956 */
4957 if (*newval == -1)
4958 {
4959 /*
4960 * If we haven't yet changed the boot_val default of -1, just let it
4961 * be. We'll fix it when XLOGShmemSize is called.
4962 */
4963 if (XLOGbuffers == -1)
4964 return true;
4965
4966 /* Otherwise, substitute the auto-tune value */
4967 *newval = XLOGChooseNumBuffers();
4968 }
4969
4970 /*
4971 * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
4972 * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4973 * the case, we just silently treat such values as a request for the
4974 * minimum. (We could throw an error instead, but that doesn't seem very
4975 * helpful.)
4976 */
4977 if (*newval < 4)
4978 *newval = 4;
4979
4980 return true;
4981 }
4982
4983 /*
4984 * Read the control file, set respective GUCs.
4985 *
4986 * This is to be called during startup, including a crash recovery cycle,
4987 * unless in bootstrap mode, where no control file yet exists. As there's no
4988 * usable shared memory yet (its sizing can depend on the contents of the
4989 * control file!), first store the contents in local memory. XLOGShmemInit()
4990 * will then copy it to shared memory later.
4991 *
4992 * reset just controls whether previous contents are to be expected (in the
4993 * reset case, there's a dangling pointer into old shared memory), or not.
4994 */
4995 void
LocalProcessControlFile(bool reset)4996 LocalProcessControlFile(bool reset)
4997 {
4998 Assert(reset || ControlFile == NULL);
4999 ControlFile = palloc(sizeof(ControlFileData));
5000 ReadControlFile();
5001 }
5002
5003 /*
5004 * Initialization of shared memory for XLOG
5005 */
5006 Size
XLOGShmemSize(void)5007 XLOGShmemSize(void)
5008 {
5009 Size size;
5010
5011 /*
5012 * If the value of wal_buffers is -1, use the preferred auto-tune value.
5013 * This isn't an amazingly clean place to do this, but we must wait till
5014 * NBuffers has received its final value, and must do it before using the
5015 * value of XLOGbuffers to do anything important.
5016 */
5017 if (XLOGbuffers == -1)
5018 {
5019 char buf[32];
5020
5021 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5022 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5023 }
5024 Assert(XLOGbuffers > 0);
5025
5026 /* XLogCtl */
5027 size = sizeof(XLogCtlData);
5028
5029 /* WAL insertion locks, plus alignment */
5030 size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
5031 /* xlblocks array */
5032 size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5033 /* extra alignment padding for XLOG I/O buffers */
5034 size = add_size(size, XLOG_BLCKSZ);
5035 /* and the buffers themselves */
5036 size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5037
5038 /*
5039 * Note: we don't count ControlFileData, it comes out of the "slop factor"
5040 * added by CreateSharedMemoryAndSemaphores. This lets us use this
5041 * routine again below to compute the actual allocation size.
5042 */
5043
5044 return size;
5045 }
5046
/*
 * Allocate and initialize XLOG shared memory: the XLogCtl struct together
 * with its xlblocks array, WAL insertion locks, and page buffers, plus the
 * shared copy of pg_control.  If the structures already exist (i.e. we are
 * re-attaching after a backend restart under the postmaster), just set up
 * the process-local pointers and return.
 */
void
XLOGShmemInit(void)
{
	bool		foundCFile,
				foundXLog;
	char	   *allocptr;
	int			i;
	ControlFileData *localControlFile;

#ifdef WAL_DEBUG

	/*
	 * Create a memory context for WAL debugging that's exempt from the normal
	 * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
	 * an allocation fails, but wal_debug is not for production use anyway.
	 */
	if (walDebugCxt == NULL)
	{
		walDebugCxt = AllocSetContextCreate(TopMemoryContext,
											"WAL Debug",
											ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(walDebugCxt, true);
	}
#endif


	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);

	/* stash any local pg_control image read by LocalProcessControlFile() */
	localControlFile = ControlFile;
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);

	if (foundCFile || foundXLog)
	{
		/* both should be present or neither */
		Assert(foundCFile && foundXLog);

		/* Initialize local copy of WALInsertLocks and register the tranche */
		WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
		LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
							  "wal_insert");

		/* the shared copy is authoritative; discard the local image */
		if (localControlFile)
			pfree(localControlFile);
		return;
	}
	memset(XLogCtl, 0, sizeof(XLogCtlData));

	/*
	 * Already have read control file locally, unless in bootstrap mode. Move
	 * contents into shared memory.
	 */
	if (localControlFile)
	{
		memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
		pfree(localControlFile);
	}

	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
	 */
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;


	/* WAL insertion locks. Ensure they're aligned to the full padded size */
	allocptr += sizeof(WALInsertLockPadded) -
		((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
	WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
		(WALInsertLockPadded *) allocptr;
	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;

	LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert");
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
		WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
		WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
	}

	/*
	 * Align the start of the page buffers to a full xlog block size boundary.
	 * This simplifies some calculations in XLOG insertion. It is also
	 * required for O_DIRECT.
	 */
	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
	XLogCtl->pages = allocptr;
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

	/*
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
	XLogCtl->SharedHotStandbyActive = false;
	XLogCtl->WalWriterSleeping = false;

	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
	SpinLockInit(&XLogCtl->info_lck);
	SpinLockInit(&XLogCtl->ulsn_lck);
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
}
5155
5156 /*
5157 * This func must be called ONCE on system install. It creates pg_control
5158 * and the initial XLOG segment.
5159 */
5160 void
BootStrapXLOG(void)5161 BootStrapXLOG(void)
5162 {
5163 CheckPoint checkPoint;
5164 char *buffer;
5165 XLogPageHeader page;
5166 XLogLongPageHeader longpage;
5167 XLogRecord *record;
5168 char *recptr;
5169 bool use_existent;
5170 uint64 sysidentifier;
5171 char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
5172 struct timeval tv;
5173 pg_crc32c crc;
5174
5175 /*
5176 * Select a hopefully-unique system identifier code for this installation.
5177 * We use the result of gettimeofday(), including the fractional seconds
5178 * field, as being about as unique as we can easily get. (Think not to
5179 * use random(), since it hasn't been seeded and there's no portable way
5180 * to seed it other than the system clock value...) The upper half of the
5181 * uint64 value is just the tv_sec part, while the lower half contains the
5182 * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
5183 * PID for a little extra uniqueness. A person knowing this encoding can
5184 * determine the initialization time of the installation, which could
5185 * perhaps be useful sometimes.
5186 */
5187 gettimeofday(&tv, NULL);
5188 sysidentifier = ((uint64) tv.tv_sec) << 32;
5189 sysidentifier |= ((uint64) tv.tv_usec) << 12;
5190 sysidentifier |= getpid() & 0xFFF;
5191
5192 /*
5193 * Generate a random nonce. This is used for authentication requests that
5194 * will fail because the user does not exist. The nonce is used to create
5195 * a genuine-looking password challenge for the non-existent user, in lieu
5196 * of an actual stored password.
5197 */
5198 if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
5199 ereport(PANIC,
5200 (errcode(ERRCODE_INTERNAL_ERROR),
5201 errmsg("could not generate secret authorization token")));
5202
5203 /* First timeline ID is always 1 */
5204 ThisTimeLineID = 1;
5205
5206 /* page buffer must be aligned suitably for O_DIRECT */
5207 buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5208 page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5209 memset(page, 0, XLOG_BLCKSZ);
5210
5211 /*
5212 * Set up information for the initial checkpoint record
5213 *
5214 * The initial checkpoint record is written to the beginning of the WAL
5215 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5216 * used, so that we can use 0/0 to mean "before any valid WAL segment".
5217 */
5218 checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
5219 checkPoint.ThisTimeLineID = ThisTimeLineID;
5220 checkPoint.PrevTimeLineID = ThisTimeLineID;
5221 checkPoint.fullPageWrites = fullPageWrites;
5222 checkPoint.nextFullXid =
5223 FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
5224 checkPoint.nextOid = FirstBootstrapObjectId;
5225 checkPoint.nextMulti = FirstMultiXactId;
5226 checkPoint.nextMultiOffset = 0;
5227 checkPoint.oldestXid = FirstNormalTransactionId;
5228 checkPoint.oldestXidDB = TemplateDbOid;
5229 checkPoint.oldestMulti = FirstMultiXactId;
5230 checkPoint.oldestMultiDB = TemplateDbOid;
5231 checkPoint.oldestCommitTsXid = InvalidTransactionId;
5232 checkPoint.newestCommitTsXid = InvalidTransactionId;
5233 checkPoint.time = (pg_time_t) time(NULL);
5234 checkPoint.oldestActiveXid = InvalidTransactionId;
5235
5236 ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
5237 ShmemVariableCache->nextOid = checkPoint.nextOid;
5238 ShmemVariableCache->oidCount = 0;
5239 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5240 AdvanceOldestClogXid(checkPoint.oldestXid);
5241 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5242 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5243 SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5244
5245 /* Set up the XLOG page header */
5246 page->xlp_magic = XLOG_PAGE_MAGIC;
5247 page->xlp_info = XLP_LONG_HEADER;
5248 page->xlp_tli = ThisTimeLineID;
5249 page->xlp_pageaddr = wal_segment_size;
5250 longpage = (XLogLongPageHeader) page;
5251 longpage->xlp_sysid = sysidentifier;
5252 longpage->xlp_seg_size = wal_segment_size;
5253 longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5254
5255 /* Insert the initial checkpoint record */
5256 recptr = ((char *) page + SizeOfXLogLongPHD);
5257 record = (XLogRecord *) recptr;
5258 record->xl_prev = 0;
5259 record->xl_xid = InvalidTransactionId;
5260 record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5261 record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5262 record->xl_rmid = RM_XLOG_ID;
5263 recptr += SizeOfXLogRecord;
5264 /* fill the XLogRecordDataHeaderShort struct */
5265 *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
5266 *(recptr++) = sizeof(checkPoint);
5267 memcpy(recptr, &checkPoint, sizeof(checkPoint));
5268 recptr += sizeof(checkPoint);
5269 Assert(recptr - (char *) record == record->xl_tot_len);
5270
5271 INIT_CRC32C(crc);
5272 COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5273 COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5274 FIN_CRC32C(crc);
5275 record->xl_crc = crc;
5276
5277 /* Create first XLOG segment file */
5278 use_existent = false;
5279 openLogFile = XLogFileInit(1, &use_existent, false);
5280
5281 /* Write the first page with the initial record */
5282 errno = 0;
5283 pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5284 if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5285 {
5286 /* if write didn't set errno, assume problem is no disk space */
5287 if (errno == 0)
5288 errno = ENOSPC;
5289 ereport(PANIC,
5290 (errcode_for_file_access(),
5291 errmsg("could not write bootstrap write-ahead log file: %m")));
5292 }
5293 pgstat_report_wait_end();
5294
5295 pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5296 if (pg_fsync(openLogFile) != 0)
5297 ereport(PANIC,
5298 (errcode_for_file_access(),
5299 errmsg("could not fsync bootstrap write-ahead log file: %m")));
5300 pgstat_report_wait_end();
5301
5302 if (close(openLogFile))
5303 ereport(PANIC,
5304 (errcode_for_file_access(),
5305 errmsg("could not close bootstrap write-ahead log file: %m")));
5306
5307 openLogFile = -1;
5308
5309 /* Now create pg_control */
5310
5311 memset(ControlFile, 0, sizeof(ControlFileData));
5312 /* Initialize pg_control status fields */
5313 ControlFile->system_identifier = sysidentifier;
5314 memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
5315 ControlFile->state = DB_SHUTDOWNED;
5316 ControlFile->time = checkPoint.time;
5317 ControlFile->checkPoint = checkPoint.redo;
5318 ControlFile->checkPointCopy = checkPoint;
5319 ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
5320
5321 /* Set important parameter values for use when replaying WAL */
5322 ControlFile->MaxConnections = MaxConnections;
5323 ControlFile->max_worker_processes = max_worker_processes;
5324 ControlFile->max_wal_senders = max_wal_senders;
5325 ControlFile->max_prepared_xacts = max_prepared_xacts;
5326 ControlFile->max_locks_per_xact = max_locks_per_xact;
5327 ControlFile->wal_level = wal_level;
5328 ControlFile->wal_log_hints = wal_log_hints;
5329 ControlFile->track_commit_timestamp = track_commit_timestamp;
5330 ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5331
5332 /* some additional ControlFile fields are set in WriteControlFile() */
5333
5334 WriteControlFile();
5335
5336 /* Bootstrap the commit log, too */
5337 BootStrapCLOG();
5338 BootStrapCommitTs();
5339 BootStrapSUBTRANS();
5340 BootStrapMultiXact();
5341
5342 pfree(buffer);
5343
5344 /*
5345 * Force control file to be read - in contrast to normal processing we'd
5346 * otherwise never run the checks and GUC related initializations therein.
5347 */
5348 ReadControlFile();
5349 }
5350
/*
 * Format a pg_time_t as "YYYY-MM-DD HH:MM:SS TZ" in the log timezone.
 *
 * Note: the result points to a static buffer, so it is overwritten by the
 * next call and the function is not re-entrant.
 */
static char *
str_time(pg_time_t tnow)
{
	static char buf[128];

	pg_strftime(buf, sizeof(buf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&tnow, log_timezone));

	return buf;
}
5362
5363 /*
5364 * See if there are any recovery signal files and if so, set state for
5365 * recovery.
5366 *
5367 * See if there is a recovery command file (recovery.conf), and if so
5368 * throw an ERROR since as of PG12 we no longer recognize that.
5369 */
5370 static void
readRecoverySignalFile(void)5371 readRecoverySignalFile(void)
5372 {
5373 struct stat stat_buf;
5374
5375 if (IsBootstrapProcessingMode())
5376 return;
5377
5378 /*
5379 * Check for old recovery API file: recovery.conf
5380 */
5381 if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
5382 ereport(FATAL,
5383 (errcode_for_file_access(),
5384 errmsg("using recovery command file \"%s\" is not supported",
5385 RECOVERY_COMMAND_FILE)));
5386
5387 /*
5388 * Remove unused .done file, if present. Ignore if absent.
5389 */
5390 unlink(RECOVERY_COMMAND_DONE);
5391
5392 /*
5393 * Check for recovery signal files and if found, fsync them since they
5394 * represent server state information. We don't sweat too much about the
5395 * possibility of fsync failure, however.
5396 *
5397 * If present, standby signal file takes precedence. If neither is present
5398 * then we won't enter archive recovery.
5399 */
5400 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
5401 {
5402 int fd;
5403
5404 fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5405 S_IRUSR | S_IWUSR);
5406 if (fd >= 0)
5407 {
5408 (void) pg_fsync(fd);
5409 close(fd);
5410 }
5411 standby_signal_file_found = true;
5412 }
5413 else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
5414 {
5415 int fd;
5416
5417 fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5418 S_IRUSR | S_IWUSR);
5419 if (fd >= 0)
5420 {
5421 (void) pg_fsync(fd);
5422 close(fd);
5423 }
5424 recovery_signal_file_found = true;
5425 }
5426
5427 StandbyModeRequested = false;
5428 ArchiveRecoveryRequested = false;
5429 if (standby_signal_file_found)
5430 {
5431 StandbyModeRequested = true;
5432 ArchiveRecoveryRequested = true;
5433 }
5434 else if (recovery_signal_file_found)
5435 {
5436 StandbyModeRequested = false;
5437 ArchiveRecoveryRequested = true;
5438 }
5439 else
5440 return;
5441
5442 /*
5443 * We don't support standby mode in standalone backends; that requires
5444 * other processes such as the WAL receiver to be alive.
5445 */
5446 if (StandbyModeRequested && !IsUnderPostmaster)
5447 ereport(FATAL,
5448 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5449 errmsg("standby mode is not supported by single-user servers")));
5450 }
5451
/*
 * Sanity-check the recovery-related settings and finish computing derived
 * recovery state (recoveryTargetTime, recoveryTargetTLI).
 *
 * Called once during startup, after readRecoverySignalFile() has set
 * ArchiveRecoveryRequested / StandbyModeRequested.  No-op unless archive
 * recovery was requested.  Throws FATAL for unusable configurations.
 */
static void
validateRecoveryParameters(void)
{
	if (!ArchiveRecoveryRequested)
		return;

	/*
	 * Check for compulsory parameters
	 */
	if (StandbyModeRequested)
	{
		/*
		 * A standby without primary_conninfo or restore_command can still
		 * make progress by polling pg_wal, so this is only a WARNING.
		 */
		if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
			(recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
			ereport(WARNING,
					(errmsg("specified neither primary_conninfo nor restore_command"),
					 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
	}
	else
	{
		/* Plain archive recovery has no other WAL source, so this is FATAL. */
		if (recoveryRestoreCommand == NULL ||
			strcmp(recoveryRestoreCommand, "") == 0)
			ereport(FATAL,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("must specify restore_command when standby mode is not enabled")));
	}

	/*
	 * Override any inconsistent requests. Note that this is a change of
	 * behaviour in 9.5; prior to this we simply ignored a request to pause if
	 * hot_standby = off, which was surprising behaviour.
	 */
	if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
		!EnableHotStandby)
		recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;

	/*
	 * Final parsing of recovery_target_time string; see also
	 * check_recovery_target_time().
	 */
	if (recoveryTarget == RECOVERY_TARGET_TIME)
	{
		recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
																	 CStringGetDatum(recovery_target_time_string),
																	 ObjectIdGetDatum(InvalidOid),
																	 Int32GetDatum(-1)));
	}

	/*
	 * If user specified recovery_target_timeline, validate it or compute the
	 * "latest" value.  We can't do this until after we've gotten the restore
	 * command and set InArchiveRecovery, because we need to fetch timeline
	 * history files from the archive.
	 */
	if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
	{
		TimeLineID	rtli = recoveryTargetTLIRequested;

		/* Timeline 1 does not have a history file, all else should */
		if (rtli != 1 && !existsTimeLineHistory(rtli))
			ereport(FATAL,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("recovery target timeline %u does not exist",
							rtli)));
		recoveryTargetTLI = rtli;
	}
	else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
	{
		/* We start the "latest" search from pg_control's timeline */
		recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
	}
	else
	{
		/*
		 * else we just use the recoveryTargetTLI as already read from
		 * ControlFile
		 */
		Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
	}
}
5531
/*
 * Exit archive-recovery state.
 *
 * endTLI is the timeline we recovered on, and endOfLog is the first WAL
 * location that was not replayed.  The caller has already selected the new
 * timeline (ThisTimeLineID); here we prepare the first WAL segment on that
 * timeline and remove the recovery signal files so that a subsequent crash
 * recovery does not re-enter archive recovery.
 */
static void
exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
{
	char		xlogfname[MAXFNAMELEN];
	XLogSegNo	endLogSegNo;
	XLogSegNo	startLogSegNo;

	/* we always switch to a new timeline after archive recovery */
	Assert(endTLI != ThisTimeLineID);

	/*
	 * We are no longer in archive recovery state.
	 */
	InArchiveRecovery = false;

	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

	/*
	 * If the ending log segment is still open, close it (to avoid problems on
	 * Windows with trying to rename or delete an open file).
	 */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}

	/*
	 * Calculate the last segment on the old timeline, and the first segment
	 * on the new timeline. If the switch happens in the middle of a segment,
	 * they are the same, but if the switch happens exactly at a segment
	 * boundary, startLogSegNo will be endLogSegNo + 1.
	 */
	XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
	XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);

	/*
	 * Initialize the starting WAL segment for the new timeline. If the switch
	 * happens in the middle of a segment, copy data from the last WAL segment
	 * of the old timeline up to the switch point, to the starting WAL segment
	 * on the new timeline.
	 */
	if (endLogSegNo == startLogSegNo)
	{
		/*
		 * Make a copy of the file on the new timeline.
		 *
		 * Writing WAL isn't allowed yet, so there are no locking
		 * considerations. But we should be just as tense as XLogFileInit to
		 * avoid emplacing a bogus file.
		 */
		XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
					 XLogSegmentOffset(endOfLog, wal_segment_size));
	}
	else
	{
		/*
		 * The switch happened at a segment boundary, so just create the next
		 * segment on the new timeline.
		 */
		bool		use_existent = true;
		int			fd;

		fd = XLogFileInit(startLogSegNo, &use_existent, true);

		if (close(fd))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close file \"%s\": %m",
							XLogFileNameP(ThisTimeLineID, startLogSegNo))));
	}

	/*
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
	 */
	XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
	XLogArchiveCleanup(xlogfname);

	/*
	 * Remove the signal files out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
	 */
	if (standby_signal_file_found)
		durable_unlink(STANDBY_SIGNAL_FILE, FATAL);

	if (recovery_signal_file_found)
		durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);

	ereport(LOG,
			(errmsg("archive recovery complete")));
}
5630
5631 /*
5632 * Extract timestamp from WAL record.
5633 *
5634 * If the record contains a timestamp, returns true, and saves the timestamp
5635 * in *recordXtime. If the record type has no timestamp, returns false.
5636 * Currently, only transaction commit/abort records and restore points contain
5637 * timestamps.
5638 */
5639 static bool
getRecordTimestamp(XLogReaderState * record,TimestampTz * recordXtime)5640 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5641 {
5642 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5643 uint8 xact_info = info & XLOG_XACT_OPMASK;
5644 uint8 rmid = XLogRecGetRmid(record);
5645
5646 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5647 {
5648 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5649 return true;
5650 }
5651 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5652 xact_info == XLOG_XACT_COMMIT_PREPARED))
5653 {
5654 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5655 return true;
5656 }
5657 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5658 xact_info == XLOG_XACT_ABORT_PREPARED))
5659 {
5660 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5661 return true;
5662 }
5663 return false;
5664 }
5665
/*
 * For point-in-time recovery, this function decides whether we want to
 * stop applying the XLOG before the current record.
 *
 * Returns true if we are stopping, false otherwise. If stopping, some
 * information is saved in recoveryStopXid et al for use in annotating the
 * new timeline's history file.
 *
 * Note: the "before" checks handle the exclusive forms of the recovery
 * targets (and RECOVERY_TARGET_IMMEDIATE); the inclusive forms are handled
 * by recoveryStopsAfter() once the record has been applied.
 */
static bool
recoveryStopsBefore(XLogReaderState *record)
{
	bool		stopsHere = false;
	uint8		xact_info;
	bool		isCommit;
	TimestampTz recordXtime = 0;
	TransactionId recordXid;

	/*
	 * Ignore recovery target settings when not in archive recovery (meaning
	 * we are in crash recovery).
	 */
	if (!ArchiveRecoveryRequested)
		return false;

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg("recovery stopping after reaching consistency")));

		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		return true;
	}

	/* Check if target LSN has been reached (exclusive case only here) */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		!recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
						(uint32) (recoveryStopLSN >> 32),
						(uint32) recoveryStopLSN)));
		return true;
	}

	/* Otherwise we only consider stopping before COMMIT or ABORT records. */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	/*
	 * Determine the XID that is ending here.  For prepared-transaction
	 * records the interesting XID is the two-phase XID embedded in the
	 * record payload, not the XID in the record header.
	 */
	if (xact_info == XLOG_XACT_COMMIT)
	{
		isCommit = true;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
	{
		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
		xl_xact_parsed_commit parsed;

		isCommit = true;
		ParseCommitRecord(XLogRecGetInfo(record),
						  xlrec,
						  &parsed);
		recordXid = parsed.twophase_xid;
	}
	else if (xact_info == XLOG_XACT_ABORT)
	{
		isCommit = false;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
		xl_xact_parsed_abort parsed;

		isCommit = false;
		ParseAbortRecord(XLogRecGetInfo(record),
						 xlrec,
						 &parsed);
		recordXid = parsed.twophase_xid;
	}
	else
		return false;

	if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
	{
		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		stopsHere = (recordXid == recoveryTargetXid);
	}

	if (recoveryTarget == RECOVERY_TARGET_TIME &&
		getRecordTimestamp(record, &recordXtime))
	{
		/*
		 * There can be many transactions that share the same commit time, so
		 * we stop after the last one, if we are inclusive, or stop at the
		 * first one if we are exclusive
		 */
		if (recoveryTargetInclusive)
			stopsHere = (recordXtime > recoveryTargetTime);
		else
			stopsHere = (recordXtime >= recoveryTargetTime);
	}

	if (stopsHere)
	{
		/* Record the stop point for the new timeline's history file. */
		recoveryStopAfter = false;
		recoveryStopXid = recordXid;
		recoveryStopTime = recordXtime;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';

		if (isCommit)
		{
			ereport(LOG,
					(errmsg("recovery stopping before commit of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
		else
		{
			ereport(LOG,
					(errmsg("recovery stopping before abort of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
	}

	return stopsHere;
}
5816
5817 /*
5818 * Same as recoveryStopsBefore, but called after applying the record.
5819 *
5820 * We also track the timestamp of the latest applied COMMIT/ABORT
5821 * record in XLogCtl->recoveryLastXTime.
5822 */
5823 static bool
recoveryStopsAfter(XLogReaderState * record)5824 recoveryStopsAfter(XLogReaderState *record)
5825 {
5826 uint8 info;
5827 uint8 xact_info;
5828 uint8 rmid;
5829 TimestampTz recordXtime;
5830
5831 /*
5832 * Ignore recovery target settings when not in archive recovery (meaning
5833 * we are in crash recovery).
5834 */
5835 if (!ArchiveRecoveryRequested)
5836 return false;
5837
5838 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5839 rmid = XLogRecGetRmid(record);
5840
5841 /*
5842 * There can be many restore points that share the same name; we stop at
5843 * the first one.
5844 */
5845 if (recoveryTarget == RECOVERY_TARGET_NAME &&
5846 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5847 {
5848 xl_restore_point *recordRestorePointData;
5849
5850 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5851
5852 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5853 {
5854 recoveryStopAfter = true;
5855 recoveryStopXid = InvalidTransactionId;
5856 recoveryStopLSN = InvalidXLogRecPtr;
5857 (void) getRecordTimestamp(record, &recoveryStopTime);
5858 strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5859
5860 ereport(LOG,
5861 (errmsg("recovery stopping at restore point \"%s\", time %s",
5862 recoveryStopName,
5863 timestamptz_to_str(recoveryStopTime))));
5864 return true;
5865 }
5866 }
5867
5868 /* Check if the target LSN has been reached */
5869 if (recoveryTarget == RECOVERY_TARGET_LSN &&
5870 recoveryTargetInclusive &&
5871 record->ReadRecPtr >= recoveryTargetLSN)
5872 {
5873 recoveryStopAfter = true;
5874 recoveryStopXid = InvalidTransactionId;
5875 recoveryStopLSN = record->ReadRecPtr;
5876 recoveryStopTime = 0;
5877 recoveryStopName[0] = '\0';
5878 ereport(LOG,
5879 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
5880 (uint32) (recoveryStopLSN >> 32),
5881 (uint32) recoveryStopLSN)));
5882 return true;
5883 }
5884
5885 if (rmid != RM_XACT_ID)
5886 return false;
5887
5888 xact_info = info & XLOG_XACT_OPMASK;
5889
5890 if (xact_info == XLOG_XACT_COMMIT ||
5891 xact_info == XLOG_XACT_COMMIT_PREPARED ||
5892 xact_info == XLOG_XACT_ABORT ||
5893 xact_info == XLOG_XACT_ABORT_PREPARED)
5894 {
5895 TransactionId recordXid;
5896
5897 /* Update the last applied transaction timestamp */
5898 if (getRecordTimestamp(record, &recordXtime))
5899 SetLatestXTime(recordXtime);
5900
5901 /* Extract the XID of the committed/aborted transaction */
5902 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5903 {
5904 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5905 xl_xact_parsed_commit parsed;
5906
5907 ParseCommitRecord(XLogRecGetInfo(record),
5908 xlrec,
5909 &parsed);
5910 recordXid = parsed.twophase_xid;
5911 }
5912 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5913 {
5914 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5915 xl_xact_parsed_abort parsed;
5916
5917 ParseAbortRecord(XLogRecGetInfo(record),
5918 xlrec,
5919 &parsed);
5920 recordXid = parsed.twophase_xid;
5921 }
5922 else
5923 recordXid = XLogRecGetXid(record);
5924
5925 /*
5926 * There can be only one transaction end record with this exact
5927 * transactionid
5928 *
5929 * when testing for an xid, we MUST test for equality only, since
5930 * transactions are numbered in the order they start, not the order
5931 * they complete. A higher numbered xid will complete before you about
5932 * 50% of the time...
5933 */
5934 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5935 recordXid == recoveryTargetXid)
5936 {
5937 recoveryStopAfter = true;
5938 recoveryStopXid = recordXid;
5939 recoveryStopTime = recordXtime;
5940 recoveryStopLSN = InvalidXLogRecPtr;
5941 recoveryStopName[0] = '\0';
5942
5943 if (xact_info == XLOG_XACT_COMMIT ||
5944 xact_info == XLOG_XACT_COMMIT_PREPARED)
5945 {
5946 ereport(LOG,
5947 (errmsg("recovery stopping after commit of transaction %u, time %s",
5948 recoveryStopXid,
5949 timestamptz_to_str(recoveryStopTime))));
5950 }
5951 else if (xact_info == XLOG_XACT_ABORT ||
5952 xact_info == XLOG_XACT_ABORT_PREPARED)
5953 {
5954 ereport(LOG,
5955 (errmsg("recovery stopping after abort of transaction %u, time %s",
5956 recoveryStopXid,
5957 timestamptz_to_str(recoveryStopTime))));
5958 }
5959 return true;
5960 }
5961 }
5962
5963 /* Check if we should stop as soon as reaching consistency */
5964 if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5965 {
5966 ereport(LOG,
5967 (errmsg("recovery stopping after reaching consistency")));
5968
5969 recoveryStopAfter = true;
5970 recoveryStopXid = InvalidTransactionId;
5971 recoveryStopTime = 0;
5972 recoveryStopLSN = InvalidXLogRecPtr;
5973 recoveryStopName[0] = '\0';
5974 return true;
5975 }
5976
5977 return false;
5978 }
5979
5980 /*
5981 * Wait until shared recoveryPause flag is cleared.
5982 *
5983 * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5984 * Probably not worth the trouble though. This state shouldn't be one that
5985 * anyone cares about server power consumption in.
5986 */
5987 static void
recoveryPausesHere(void)5988 recoveryPausesHere(void)
5989 {
5990 /* Don't pause unless users can connect! */
5991 if (!LocalHotStandbyActive)
5992 return;
5993
5994 ereport(LOG,
5995 (errmsg("recovery has paused"),
5996 errhint("Execute pg_wal_replay_resume() to continue.")));
5997
5998 while (RecoveryIsPaused())
5999 {
6000 pg_usleep(1000000L); /* 1000 ms */
6001 HandleStartupProcInterrupts();
6002 }
6003 }
6004
6005 bool
RecoveryIsPaused(void)6006 RecoveryIsPaused(void)
6007 {
6008 bool recoveryPause;
6009
6010 SpinLockAcquire(&XLogCtl->info_lck);
6011 recoveryPause = XLogCtl->recoveryPause;
6012 SpinLockRelease(&XLogCtl->info_lck);
6013
6014 return recoveryPause;
6015 }
6016
/*
 * Set or clear the shared recoveryPause flag, under XLogCtl->info_lck.
 */
void
SetRecoveryPause(bool recoveryPause)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->recoveryPause = recoveryPause;
	SpinLockRelease(&XLogCtl->info_lck);
}
6024
/*
 * When recovery_min_apply_delay is set, we wait long enough to make sure
 * certain record types are applied at least that interval behind the master.
 *
 * Returns true if we waited.
 *
 * Note that the delay is calculated between the WAL record log time and
 * the current time on standby. We would prefer to keep track of when this
 * standby received each WAL record, which would allow a more consistent
 * approach and one not affected by time synchronisation issues, but that
 * is significantly more effort and complexity for little actual gain in
 * usability.
 *
 * Only COMMIT records are delayed (see below); the wait is interruptible
 * via XLogCtl->recoveryWakeupLatch and by standby promotion.
 */
static bool
recoveryApplyDelay(XLogReaderState *record)
{
	uint8		xact_info;
	TimestampTz xtime;
	long		msecs;

	/* nothing to do if no delay configured */
	if (recovery_min_apply_delay <= 0)
		return false;

	/* no delay is applied on a database not yet consistent */
	if (!reachedConsistency)
		return false;

	/* nothing to do if crash recovery is requested */
	if (!ArchiveRecoveryRequested)
		return false;

	/*
	 * Is it a COMMIT record?
	 *
	 * We deliberately choose not to delay aborts since they have no effect on
	 * MVCC. We already allow replay of records that don't have a timestamp,
	 * so there is already opportunity for issues caused by early conflicts on
	 * standbys.
	 */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	if (xact_info != XLOG_XACT_COMMIT &&
		xact_info != XLOG_XACT_COMMIT_PREPARED)
		return false;

	if (!getRecordTimestamp(record, &xtime))
		return false;

	/* recoveryDelayUntilTime is a global read elsewhere in this module */
	recoveryDelayUntilTime =
		TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

	/*
	 * Exit without arming the latch if it's already past time to apply this
	 * record
	 */
	msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
											recoveryDelayUntilTime);
	if (msecs <= 0)
		return false;

	while (true)
	{
		/* Reset before each wait so we don't miss a wakeup. */
		ResetLatch(&XLogCtl->recoveryWakeupLatch);

		/*
		 * This might change recovery_min_apply_delay or the trigger file's
		 * location.
		 */
		HandleStartupProcInterrupts();

		/* Stop delaying immediately if promotion has been requested. */
		if (CheckForStandbyTrigger())
			break;

		/*
		 * Recalculate recoveryDelayUntilTime as recovery_min_apply_delay
		 * could have changed while waiting in this loop.
		 */
		recoveryDelayUntilTime =
			TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

		/*
		 * Wait for difference between GetCurrentTimestamp() and
		 * recoveryDelayUntilTime
		 */
		msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
												recoveryDelayUntilTime);

		if (msecs <= 0)
			break;

		elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);

		(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
						 msecs,
						 WAIT_EVENT_RECOVERY_APPLY_DELAY);
	}
	return true;
}
6128
/*
 * Save timestamp of latest processed commit/abort record.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by processes other than the startup process. Note in particular
 * that CreateRestartPoint is executed in the checkpointer.
 */
static void
SetLatestXTime(TimestampTz xtime)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->recoveryLastXTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6143
6144 /*
6145 * Fetch timestamp of latest processed commit/abort record.
6146 */
6147 TimestampTz
GetLatestXTime(void)6148 GetLatestXTime(void)
6149 {
6150 TimestampTz xtime;
6151
6152 SpinLockAcquire(&XLogCtl->info_lck);
6153 xtime = XLogCtl->recoveryLastXTime;
6154 SpinLockRelease(&XLogCtl->info_lck);
6155
6156 return xtime;
6157 }
6158
/*
 * Save timestamp of the next chunk of WAL records to apply.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by all backends.
 */
static void
SetCurrentChunkStartTime(TimestampTz xtime)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->currentChunkStartTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6172
/*
 * Fetch timestamp of the current chunk of WAL records being applied, as
 * recorded in shared memory by SetCurrentChunkStartTime().
 *
 * NOTE(review): the previous comment here said "latest processed
 * commit/abort record", which describes GetLatestXTime(), not this function.
 */
TimestampTz
GetCurrentChunkReplayStartTime(void)
{
	TimestampTz xtime;

	SpinLockAcquire(&XLogCtl->info_lck);
	xtime = XLogCtl->currentChunkStartTime;
	SpinLockRelease(&XLogCtl->info_lck);

	return xtime;
}
6188
/*
 * Returns time of receipt of current chunk of XLOG data, as well as
 * whether it was received from streaming replication or from archives.
 *
 * *rtime receives XLogReceiptTime; *fromStream is set true when the data
 * came via streaming replication (XLOG_FROM_STREAM), false otherwise.
 */
void
GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
{
	/*
	 * This must be executed in the startup process, since we don't export the
	 * relevant state to shared memory.
	 */
	Assert(InRecovery);

	*rtime = XLogReceiptTime;
	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
}
6205
/*
 * Raise an ERROR if a hot-standby-relevant integer setting on this server is
 * lower than the value recorded from the master in pg_control.
 *
 * Note that text field supplied is a parameter name and does not require
 * translation.
 *
 * NB: function-like macro; currValue and minValue are evaluated more than
 * once, so pass only simple expressions.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						param_name, \
						currValue, \
						minValue))); \
} while(0)
6222
/*
 * Check to see if required parameters are set high enough on this server
 * for various aspects of recovery operation.
 *
 * Note that all the parameters which this function tests need to be
 * listed in Administrator's Overview section in high-availability.sgml.
 * If you change them, don't forget to update the list.
 *
 * Comparisons are made against the master's settings as recorded in
 * pg_control (ControlFile); violations are reported via WARNING or ERROR
 * depending on severity.
 */
static void
CheckRequiredParameterValues(void)
{
	/*
	 * For archive recovery, the WAL must be generated with at least 'replica'
	 * wal_level.
	 */
	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
	{
		ereport(WARNING,
				(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
				 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
	}

	/*
	 * For Hot Standby, the WAL must be generated with 'replica' mode, and we
	 * must have at least as many backend slots as the primary.
	 */
	if (ArchiveRecoveryRequested && EnableHotStandby)
	{
		if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
			ereport(ERROR,
					(errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
					 errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));

		/* We ignore autovacuum_max_workers when we make this test. */
		RecoveryRequiresIntParameter("max_connections",
									 MaxConnections,
									 ControlFile->MaxConnections);
		RecoveryRequiresIntParameter("max_worker_processes",
									 max_worker_processes,
									 ControlFile->max_worker_processes);
		RecoveryRequiresIntParameter("max_wal_senders",
									 max_wal_senders,
									 ControlFile->max_wal_senders);
		RecoveryRequiresIntParameter("max_prepared_transactions",
									 max_prepared_xacts,
									 ControlFile->max_prepared_xacts);
		RecoveryRequiresIntParameter("max_locks_per_transaction",
									 max_locks_per_xact,
									 ControlFile->max_locks_per_xact);
	}
}
6274
6275 /*
6276 * This must be called ONCE during postmaster or standalone-backend startup
6277 */
6278 void
StartupXLOG(void)6279 StartupXLOG(void)
6280 {
6281 XLogCtlInsert *Insert;
6282 CheckPoint checkPoint;
6283 bool wasShutdown;
6284 bool reachedStopPoint = false;
6285 bool haveBackupLabel = false;
6286 bool haveTblspcMap = false;
6287 XLogRecPtr RecPtr,
6288 checkPointLoc,
6289 EndOfLog;
6290 TimeLineID EndOfLogTLI;
6291 TimeLineID PrevTimeLineID;
6292 XLogRecord *record;
6293 TransactionId oldestActiveXID;
6294 bool backupEndRequired = false;
6295 bool backupFromStandby = false;
6296 DBState dbstate_at_startup;
6297 XLogReaderState *xlogreader;
6298 XLogPageReadPrivate private;
6299 bool fast_promoted = false;
6300 struct stat st;
6301
6302 /*
6303 * We should have an aux process resource owner to use, and we should not
6304 * be in a transaction that's installed some other resowner.
6305 */
6306 Assert(AuxProcessResourceOwner != NULL);
6307 Assert(CurrentResourceOwner == NULL ||
6308 CurrentResourceOwner == AuxProcessResourceOwner);
6309 CurrentResourceOwner = AuxProcessResourceOwner;
6310
6311 /*
6312 * Verify XLOG status looks valid.
6313 */
6314 if (ControlFile->state < DB_SHUTDOWNED ||
6315 ControlFile->state > DB_IN_PRODUCTION ||
6316 !XRecOffIsValid(ControlFile->checkPoint))
6317 ereport(FATAL,
6318 (errmsg("control file contains invalid data")));
6319
6320 if (ControlFile->state == DB_SHUTDOWNED)
6321 {
6322 /* This is the expected case, so don't be chatty in standalone mode */
6323 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6324 (errmsg("database system was shut down at %s",
6325 str_time(ControlFile->time))));
6326 }
6327 else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6328 ereport(LOG,
6329 (errmsg("database system was shut down in recovery at %s",
6330 str_time(ControlFile->time))));
6331 else if (ControlFile->state == DB_SHUTDOWNING)
6332 ereport(LOG,
6333 (errmsg("database system shutdown was interrupted; last known up at %s",
6334 str_time(ControlFile->time))));
6335 else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6336 ereport(LOG,
6337 (errmsg("database system was interrupted while in recovery at %s",
6338 str_time(ControlFile->time)),
6339 errhint("This probably means that some data is corrupted and"
6340 " you will have to use the last backup for recovery.")));
6341 else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6342 ereport(LOG,
6343 (errmsg("database system was interrupted while in recovery at log time %s",
6344 str_time(ControlFile->checkPointCopy.time)),
6345 errhint("If this has occurred more than once some data might be corrupted"
6346 " and you might need to choose an earlier recovery target.")));
6347 else if (ControlFile->state == DB_IN_PRODUCTION)
6348 ereport(LOG,
6349 (errmsg("database system was interrupted; last known up at %s",
6350 str_time(ControlFile->time))));
6351
6352 /* This is just to allow attaching to startup process with a debugger */
6353 #ifdef XLOG_REPLAY_DELAY
6354 if (ControlFile->state != DB_SHUTDOWNED)
6355 pg_usleep(60000000L);
6356 #endif
6357
6358 /*
6359 * Verify that pg_wal and pg_wal/archive_status exist. In cases where
6360 * someone has performed a copy for PITR, these directories may have been
6361 * excluded and need to be re-created.
6362 */
6363 ValidateXLOGDirectoryStructure();
6364
6365 /*----------
6366 * If we previously crashed, perform a couple of actions:
6367 * - The pg_wal directory may still include some temporary WAL segments
6368 * used when creating a new segment, so perform some clean up to not
6369 * bloat this path. This is done first as there is no point to sync this
6370 * temporary data.
6371 * - There might be data which we had written, intending to fsync it,
6372 * but which we had not actually fsync'd yet. Therefore, a power failure
6373 * in the near future might cause earlier unflushed writes to be lost,
6374 * even though more recent data written to disk from here on would be
6375 * persisted. To avoid that, fsync the entire data directory.
6376 *---------
6377 */
6378 if (ControlFile->state != DB_SHUTDOWNED &&
6379 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6380 {
6381 RemoveTempXlogFiles();
6382 SyncDataDirectory();
6383 }
6384
6385 /*
6386 * Initialize on the assumption we want to recover to the latest timeline
6387 * that's active according to pg_control.
6388 */
6389 if (ControlFile->minRecoveryPointTLI >
6390 ControlFile->checkPointCopy.ThisTimeLineID)
6391 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6392 else
6393 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6394
6395 /*
6396 * Check for signal files, and if so set up state for offline recovery
6397 */
6398 readRecoverySignalFile();
6399 validateRecoveryParameters();
6400
6401 if (ArchiveRecoveryRequested)
6402 {
6403 if (StandbyModeRequested)
6404 ereport(LOG,
6405 (errmsg("entering standby mode")));
6406 else if (recoveryTarget == RECOVERY_TARGET_XID)
6407 ereport(LOG,
6408 (errmsg("starting point-in-time recovery to XID %u",
6409 recoveryTargetXid)));
6410 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6411 ereport(LOG,
6412 (errmsg("starting point-in-time recovery to %s",
6413 timestamptz_to_str(recoveryTargetTime))));
6414 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6415 ereport(LOG,
6416 (errmsg("starting point-in-time recovery to \"%s\"",
6417 recoveryTargetName)));
6418 else if (recoveryTarget == RECOVERY_TARGET_LSN)
6419 ereport(LOG,
6420 (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6421 (uint32) (recoveryTargetLSN >> 32),
6422 (uint32) recoveryTargetLSN)));
6423 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6424 ereport(LOG,
6425 (errmsg("starting point-in-time recovery to earliest consistent point")));
6426 else
6427 ereport(LOG,
6428 (errmsg("starting archive recovery")));
6429 }
6430
6431 /*
6432 * Take ownership of the wakeup latch if we're going to sleep during
6433 * recovery.
6434 */
6435 if (ArchiveRecoveryRequested)
6436 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6437
6438 /* Set up XLOG reader facility */
6439 MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6440 xlogreader = XLogReaderAllocate(wal_segment_size, &XLogPageRead, &private);
6441 if (!xlogreader)
6442 ereport(ERROR,
6443 (errcode(ERRCODE_OUT_OF_MEMORY),
6444 errmsg("out of memory"),
6445 errdetail("Failed while allocating a WAL reading processor.")));
6446 xlogreader->system_identifier = ControlFile->system_identifier;
6447
6448 /*
6449 * Allocate two page buffers dedicated to WAL consistency checks. We do
6450 * it this way, rather than just making static arrays, for two reasons:
6451 * (1) no need to waste the storage in most instantiations of the backend;
6452 * (2) a static char array isn't guaranteed to have any particular
6453 * alignment, whereas palloc() will provide MAXALIGN'd storage.
6454 */
6455 replay_image_masked = (char *) palloc(BLCKSZ);
6456 master_image_masked = (char *) palloc(BLCKSZ);
6457
6458 if (read_backup_label(&checkPointLoc, &backupEndRequired,
6459 &backupFromStandby))
6460 {
6461 List *tablespaces = NIL;
6462
6463 /*
6464 * Archive recovery was requested, and thanks to the backup label
6465 * file, we know how far we need to replay to reach consistency. Enter
6466 * archive recovery directly.
6467 */
6468 InArchiveRecovery = true;
6469 if (StandbyModeRequested)
6470 StandbyMode = true;
6471
6472 /*
6473 * When a backup_label file is present, we want to roll forward from
6474 * the checkpoint it identifies, rather than using pg_control.
6475 */
6476 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6477 if (record != NULL)
6478 {
6479 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6480 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6481 ereport(DEBUG1,
6482 (errmsg("checkpoint record is at %X/%X",
6483 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6484 InRecovery = true; /* force recovery even if SHUTDOWNED */
6485
6486 /*
6487 * Make sure that REDO location exists. This may not be the case
6488 * if there was a crash during an online backup, which left a
6489 * backup_label around that references a WAL segment that's
6490 * already been archived.
6491 */
6492 if (checkPoint.redo < checkPointLoc)
6493 {
6494 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6495 ereport(FATAL,
6496 (errmsg("could not find redo location referenced by checkpoint record"),
6497 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6498 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6499 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6500 DataDir, DataDir, DataDir)));
6501 }
6502 }
6503 else
6504 {
6505 ereport(FATAL,
6506 (errmsg("could not locate required checkpoint record"),
6507 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6508 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6509 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6510 DataDir, DataDir, DataDir)));
6511 wasShutdown = false; /* keep compiler quiet */
6512 }
6513
6514 /* read the tablespace_map file if present and create symlinks. */
6515 if (read_tablespace_map(&tablespaces))
6516 {
6517 ListCell *lc;
6518
6519 foreach(lc, tablespaces)
6520 {
6521 tablespaceinfo *ti = lfirst(lc);
6522 char *linkloc;
6523
6524 linkloc = psprintf("pg_tblspc/%s", ti->oid);
6525
6526 /*
6527 * Remove the existing symlink if any and Create the symlink
6528 * under PGDATA.
6529 */
6530 remove_tablespace_symlink(linkloc);
6531
6532 if (symlink(ti->path, linkloc) < 0)
6533 ereport(ERROR,
6534 (errcode_for_file_access(),
6535 errmsg("could not create symbolic link \"%s\": %m",
6536 linkloc)));
6537
6538 pfree(ti->oid);
6539 pfree(ti->path);
6540 pfree(ti);
6541 }
6542
6543 /* set flag to delete it later */
6544 haveTblspcMap = true;
6545 }
6546
6547 /* set flag to delete it later */
6548 haveBackupLabel = true;
6549 }
6550 else
6551 {
6552 /*
6553 * If tablespace_map file is present without backup_label file, there
6554 * is no use of such file. There is no harm in retaining it, but it
6555 * is better to get rid of the map file so that we don't have any
6556 * redundant file in data directory and it will avoid any sort of
6557 * confusion. It seems prudent though to just rename the file out of
6558 * the way rather than delete it completely, also we ignore any error
6559 * that occurs in rename operation as even if map file is present
6560 * without backup_label file, it is harmless.
6561 */
6562 if (stat(TABLESPACE_MAP, &st) == 0)
6563 {
6564 unlink(TABLESPACE_MAP_OLD);
6565 if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6566 ereport(LOG,
6567 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6568 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6569 errdetail("File \"%s\" was renamed to \"%s\".",
6570 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6571 else
6572 ereport(LOG,
6573 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6574 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6575 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6576 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6577 }
6578
6579 /*
6580 * It's possible that archive recovery was requested, but we don't
6581 * know how far we need to replay the WAL before we reach consistency.
6582 * This can happen for example if a base backup is taken from a
6583 * running server using an atomic filesystem snapshot, without calling
6584 * pg_start/stop_backup. Or if you just kill a running master server
6585 * and put it into archive recovery by creating a recovery signal
6586 * file.
6587 *
6588 * Our strategy in that case is to perform crash recovery first,
6589 * replaying all the WAL present in pg_wal, and only enter archive
6590 * recovery after that.
6591 *
6592 * But usually we already know how far we need to replay the WAL (up
6593 * to minRecoveryPoint, up to backupEndPoint, or until we see an
6594 * end-of-backup record), and we can enter archive recovery directly.
6595 */
6596 if (ArchiveRecoveryRequested &&
6597 (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6598 ControlFile->backupEndRequired ||
6599 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6600 ControlFile->state == DB_SHUTDOWNED))
6601 {
6602 InArchiveRecovery = true;
6603 if (StandbyModeRequested)
6604 StandbyMode = true;
6605 }
6606
6607 /* Get the last valid checkpoint record. */
6608 checkPointLoc = ControlFile->checkPoint;
6609 RedoStartLSN = ControlFile->checkPointCopy.redo;
6610 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6611 if (record != NULL)
6612 {
6613 ereport(DEBUG1,
6614 (errmsg("checkpoint record is at %X/%X",
6615 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6616 }
6617 else
6618 {
6619 /*
6620 * We used to attempt to go back to a secondary checkpoint record
6621 * here, but only when not in standby mode. We now just fail if we
6622 * can't read the last checkpoint because this allows us to
6623 * simplify processing around checkpoints.
6624 */
6625 ereport(PANIC,
6626 (errmsg("could not locate a valid checkpoint record")));
6627 }
6628 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6629 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6630 }
6631
6632 /*
6633 * Clear out any old relcache cache files. This is *necessary* if we do
6634 * any WAL replay, since that would probably result in the cache files
6635 * being out of sync with database reality. In theory we could leave them
6636 * in place if the database had been cleanly shut down, but it seems
6637 * safest to just remove them always and let them be rebuilt during the
6638 * first backend startup. These files needs to be removed from all
6639 * directories including pg_tblspc, however the symlinks are created only
6640 * after reading tablespace_map file in case of archive recovery from
6641 * backup, so needs to clear old relcache files here after creating
6642 * symlinks.
6643 */
6644 RelationCacheInitFileRemove();
6645
6646 /*
6647 * If the location of the checkpoint record is not on the expected
6648 * timeline in the history of the requested timeline, we cannot proceed:
6649 * the backup is not part of the history of the requested timeline.
6650 */
6651 Assert(expectedTLEs); /* was initialized by reading checkpoint
6652 * record */
6653 if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6654 checkPoint.ThisTimeLineID)
6655 {
6656 XLogRecPtr switchpoint;
6657
6658 /*
6659 * tliSwitchPoint will throw an error if the checkpoint's timeline is
6660 * not in expectedTLEs at all.
6661 */
6662 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6663 ereport(FATAL,
6664 (errmsg("requested timeline %u is not a child of this server's history",
6665 recoveryTargetTLI),
6666 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6667 (uint32) (ControlFile->checkPoint >> 32),
6668 (uint32) ControlFile->checkPoint,
6669 ControlFile->checkPointCopy.ThisTimeLineID,
6670 (uint32) (switchpoint >> 32),
6671 (uint32) switchpoint)));
6672 }
6673
6674 /*
6675 * The min recovery point should be part of the requested timeline's
6676 * history, too.
6677 */
6678 if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6679 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6680 ControlFile->minRecoveryPointTLI)
6681 ereport(FATAL,
6682 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6683 recoveryTargetTLI,
6684 (uint32) (ControlFile->minRecoveryPoint >> 32),
6685 (uint32) ControlFile->minRecoveryPoint,
6686 ControlFile->minRecoveryPointTLI)));
6687
6688 LastRec = RecPtr = checkPointLoc;
6689
6690 ereport(DEBUG1,
6691 (errmsg_internal("redo record is at %X/%X; shutdown %s",
6692 (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6693 wasShutdown ? "true" : "false")));
6694 ereport(DEBUG1,
6695 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
6696 U64FromFullTransactionId(checkPoint.nextFullXid),
6697 checkPoint.nextOid)));
6698 ereport(DEBUG1,
6699 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6700 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6701 ereport(DEBUG1,
6702 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6703 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6704 ereport(DEBUG1,
6705 (errmsg_internal("oldest MultiXactId: %u, in database %u",
6706 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6707 ereport(DEBUG1,
6708 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6709 checkPoint.oldestCommitTsXid,
6710 checkPoint.newestCommitTsXid)));
6711 if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextFullXid)))
6712 ereport(PANIC,
6713 (errmsg("invalid next transaction ID")));
6714
6715 /* initialize shared memory variables from the checkpoint record */
6716 ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
6717 ShmemVariableCache->nextOid = checkPoint.nextOid;
6718 ShmemVariableCache->oidCount = 0;
6719 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6720 AdvanceOldestClogXid(checkPoint.oldestXid);
6721 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6722 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6723 SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6724 checkPoint.newestCommitTsXid);
6725 XLogCtl->ckptFullXid = checkPoint.nextFullXid;
6726
6727 /*
6728 * Initialize replication slots, before there's a chance to remove
6729 * required resources.
6730 */
6731 StartupReplicationSlots();
6732
6733 /*
6734 * Startup logical state, needs to be setup now so we have proper data
6735 * during crash recovery.
6736 */
6737 StartupReorderBuffer();
6738
6739 /*
6740 * Startup MultiXact. We need to do this early to be able to replay
6741 * truncations.
6742 */
6743 StartupMultiXact();
6744
6745 /*
6746 * Ditto for commit timestamps. Activate the facility if the setting is
6747 * enabled in the control file, as there should be no tracking of commit
6748 * timestamps done when the setting was disabled. This facility can be
6749 * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
6750 */
6751 if (ControlFile->track_commit_timestamp)
6752 StartupCommitTs();
6753
6754 /*
6755 * Recover knowledge about replay progress of known replication partners.
6756 */
6757 StartupReplicationOrigin();
6758
6759 /*
6760 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6761 * control file. On recovery, all unlogged relations are blown away, so
6762 * the unlogged LSN counter can be reset too.
6763 */
6764 if (ControlFile->state == DB_SHUTDOWNED)
6765 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6766 else
6767 XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
6768
6769 /*
6770 * We must replay WAL entries using the same TimeLineID they were created
6771 * under, so temporarily adopt the TLI indicated by the checkpoint (see
6772 * also xlog_redo()).
6773 */
6774 ThisTimeLineID = checkPoint.ThisTimeLineID;
6775
6776 /*
6777 * Copy any missing timeline history files between 'now' and the recovery
6778 * target timeline from archive to pg_wal. While we don't need those files
6779 * ourselves - the history file of the recovery target timeline covers all
6780 * the previous timelines in the history too - a cascading standby server
6781 * might be interested in them. Or, if you archive the WAL from this
6782 * server to a different archive than the master, it'd be good for all the
6783 * history files to get archived there after failover, so that you can use
6784 * one of the old timelines as a PITR target. Timeline history files are
6785 * small, so it's better to copy them unnecessarily than not copy them and
6786 * regret later.
6787 */
6788 restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6789
6790 /*
6791 * Before running in recovery, scan pg_twophase and fill in its status to
6792 * be able to work on entries generated by redo. Doing a scan before
6793 * taking any recovery action has the merit to discard any 2PC files that
6794 * are newer than the first record to replay, saving from any conflicts at
6795 * replay. This avoids as well any subsequent scans when doing recovery
6796 * of the on-disk two-phase data.
6797 */
6798 restoreTwoPhaseData();
6799
6800 lastFullPageWrites = checkPoint.fullPageWrites;
6801
6802 RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6803 doPageWrites = lastFullPageWrites;
6804
6805 if (RecPtr < checkPoint.redo)
6806 ereport(PANIC,
6807 (errmsg("invalid redo in checkpoint record")));
6808
6809 /*
6810 * Check whether we need to force recovery from WAL. If it appears to
6811 * have been a clean shutdown and we did not have a recovery signal file,
6812 * then assume no recovery needed.
6813 */
6814 if (checkPoint.redo < RecPtr)
6815 {
6816 if (wasShutdown)
6817 ereport(PANIC,
6818 (errmsg("invalid redo record in shutdown checkpoint")));
6819 InRecovery = true;
6820 }
6821 else if (ControlFile->state != DB_SHUTDOWNED)
6822 InRecovery = true;
6823 else if (ArchiveRecoveryRequested)
6824 {
6825 /* force recovery due to presence of recovery signal file */
6826 InRecovery = true;
6827 }
6828
6829 /*
6830 * Start recovery assuming that the final record isn't lost.
6831 */
6832 abortedRecPtr = InvalidXLogRecPtr;
6833 missingContrecPtr = InvalidXLogRecPtr;
6834
6835 /* REDO */
6836 if (InRecovery)
6837 {
6838 int rmid;
6839
6840 /*
6841 * Update pg_control to show that we are recovering and to show the
6842 * selected checkpoint as the place we are starting from. We also mark
6843 * pg_control with any minimum recovery stop point obtained from a
6844 * backup history file.
6845 */
6846 dbstate_at_startup = ControlFile->state;
6847 if (InArchiveRecovery)
6848 {
6849 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6850
6851 SpinLockAcquire(&XLogCtl->info_lck);
6852 XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
6853 SpinLockRelease(&XLogCtl->info_lck);
6854 }
6855 else
6856 {
6857 ereport(LOG,
6858 (errmsg("database system was not properly shut down; "
6859 "automatic recovery in progress")));
6860 if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6861 ereport(LOG,
6862 (errmsg("crash recovery starts in timeline %u "
6863 "and has target timeline %u",
6864 ControlFile->checkPointCopy.ThisTimeLineID,
6865 recoveryTargetTLI)));
6866 ControlFile->state = DB_IN_CRASH_RECOVERY;
6867
6868 SpinLockAcquire(&XLogCtl->info_lck);
6869 XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
6870 SpinLockRelease(&XLogCtl->info_lck);
6871 }
6872 ControlFile->checkPoint = checkPointLoc;
6873 ControlFile->checkPointCopy = checkPoint;
6874 if (InArchiveRecovery)
6875 {
6876 /* initialize minRecoveryPoint if not set yet */
6877 if (ControlFile->minRecoveryPoint < checkPoint.redo)
6878 {
6879 ControlFile->minRecoveryPoint = checkPoint.redo;
6880 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6881 }
6882 }
6883
6884 /*
6885 * Set backupStartPoint if we're starting recovery from a base backup.
6886 *
6887 * Also set backupEndPoint and use minRecoveryPoint as the backup end
6888 * location if we're starting recovery from a base backup which was
6889 * taken from a standby. In this case, the database system status in
6890 * pg_control must indicate that the database was already in recovery.
6891 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6892 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6893 * before reaching this point; e.g. because restore_command or
6894 * primary_conninfo were faulty.
6895 *
6896 * Any other state indicates that the backup somehow became corrupted
6897 * and we can't sensibly continue with recovery.
6898 */
6899 if (haveBackupLabel)
6900 {
6901 ControlFile->backupStartPoint = checkPoint.redo;
6902 ControlFile->backupEndRequired = backupEndRequired;
6903
6904 if (backupFromStandby)
6905 {
6906 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6907 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6908 ereport(FATAL,
6909 (errmsg("backup_label contains data inconsistent with control file"),
6910 errhint("This means that the backup is corrupted and you will "
6911 "have to use another backup for recovery.")));
6912 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6913 }
6914 }
6915 ControlFile->time = (pg_time_t) time(NULL);
6916 /* No need to hold ControlFileLock yet, we aren't up far enough */
6917 UpdateControlFile();
6918
6919 /*
6920 * Initialize our local copy of minRecoveryPoint. When doing crash
6921 * recovery we want to replay up to the end of WAL. Particularly, in
6922 * the case of a promoted standby minRecoveryPoint value in the
6923 * control file is only updated after the first checkpoint. However,
6924 * if the instance crashes before the first post-recovery checkpoint
6925 * is completed then recovery will use a stale location causing the
6926 * startup process to think that there are still invalid page
6927 * references when checking for data consistency.
6928 */
6929 if (InArchiveRecovery)
6930 {
6931 minRecoveryPoint = ControlFile->minRecoveryPoint;
6932 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6933 }
6934 else
6935 {
6936 minRecoveryPoint = InvalidXLogRecPtr;
6937 minRecoveryPointTLI = 0;
6938 }
6939
6940 /*
6941 * Reset pgstat data, because it may be invalid after recovery.
6942 */
6943 pgstat_reset_all();
6944
6945 /*
6946 * If there was a backup label file, it's done its job and the info
6947 * has now been propagated into pg_control. We must get rid of the
6948 * label file so that if we crash during recovery, we'll pick up at
6949 * the latest recovery restartpoint instead of going all the way back
6950 * to the backup start point. It seems prudent though to just rename
6951 * the file out of the way rather than delete it completely.
6952 */
6953 if (haveBackupLabel)
6954 {
6955 unlink(BACKUP_LABEL_OLD);
6956 durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
6957 }
6958
6959 /*
6960 * If there was a tablespace_map file, it's done its job and the
6961 * symlinks have been created. We must get rid of the map file so
6962 * that if we crash during recovery, we don't create symlinks again.
6963 * It seems prudent though to just rename the file out of the way
6964 * rather than delete it completely.
6965 */
6966 if (haveTblspcMap)
6967 {
6968 unlink(TABLESPACE_MAP_OLD);
6969 durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
6970 }
6971
6972 /* Check that the GUCs used to generate the WAL allow recovery */
6973 CheckRequiredParameterValues();
6974
6975 /*
6976 * We're in recovery, so unlogged relations may be trashed and must be
6977 * reset. This should be done BEFORE allowing Hot Standby
6978 * connections, so that read-only backends don't try to read whatever
6979 * garbage is left over from before.
6980 */
6981 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6982
6983 /*
6984 * Likewise, delete any saved transaction snapshot files that got left
6985 * behind by crashed backends.
6986 */
6987 DeleteAllExportedSnapshotFiles();
6988
6989 /*
6990 * Initialize for Hot Standby, if enabled. We won't let backends in
6991 * yet, not until we've reached the min recovery point specified in
6992 * control file and we've established a recovery snapshot from a
6993 * running-xacts WAL record.
6994 */
6995 if (ArchiveRecoveryRequested && EnableHotStandby)
6996 {
6997 TransactionId *xids;
6998 int nxids;
6999
7000 ereport(DEBUG1,
7001 (errmsg("initializing for hot standby")));
7002
7003 InitRecoveryTransactionEnvironment();
7004
7005 if (wasShutdown)
7006 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7007 else
7008 oldestActiveXID = checkPoint.oldestActiveXid;
7009 Assert(TransactionIdIsValid(oldestActiveXID));
7010
7011 /* Tell procarray about the range of xids it has to deal with */
7012 ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextFullXid));
7013
7014 /*
7015 * Startup commit log and subtrans only. MultiXact and commit
7016 * timestamp have already been started up and other SLRUs are not
7017 * maintained during recovery and need not be started yet.
7018 */
7019 StartupCLOG();
7020 StartupSUBTRANS(oldestActiveXID);
7021
7022 /*
7023 * If we're beginning at a shutdown checkpoint, we know that
7024 * nothing was running on the master at this point. So fake-up an
7025 * empty running-xacts record and use that here and now. Recover
7026 * additional standby state for prepared transactions.
7027 */
7028 if (wasShutdown)
7029 {
7030 RunningTransactionsData running;
7031 TransactionId latestCompletedXid;
7032
7033 /*
7034 * Construct a RunningTransactions snapshot representing a
7035 * shut down server, with only prepared transactions still
7036 * alive. We're never overflowed at this point because all
7037 * subxids are listed with their parent prepared transactions.
7038 */
7039 running.xcnt = nxids;
7040 running.subxcnt = 0;
7041 running.subxid_overflow = false;
7042 running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
7043 running.oldestRunningXid = oldestActiveXID;
7044 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
7045 TransactionIdRetreat(latestCompletedXid);
7046 Assert(TransactionIdIsNormal(latestCompletedXid));
7047 running.latestCompletedXid = latestCompletedXid;
7048 running.xids = xids;
7049
7050 ProcArrayApplyRecoveryInfo(&running);
7051
7052 StandbyRecoverPreparedTransactions();
7053 }
7054 }
7055
7056 /* Initialize resource managers */
7057 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7058 {
7059 if (RmgrTable[rmid].rm_startup != NULL)
7060 RmgrTable[rmid].rm_startup();
7061 }
7062
7063 /*
7064 * Initialize shared variables for tracking progress of WAL replay, as
7065 * if we had just replayed the record before the REDO location (or the
7066 * checkpoint record itself, if it's a shutdown checkpoint).
7067 */
7068 SpinLockAcquire(&XLogCtl->info_lck);
7069 if (checkPoint.redo < RecPtr)
7070 XLogCtl->replayEndRecPtr = checkPoint.redo;
7071 else
7072 XLogCtl->replayEndRecPtr = EndRecPtr;
7073 XLogCtl->replayEndTLI = ThisTimeLineID;
7074 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
7075 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
7076 XLogCtl->recoveryLastXTime = 0;
7077 XLogCtl->currentChunkStartTime = 0;
7078 XLogCtl->recoveryPause = false;
7079 SpinLockRelease(&XLogCtl->info_lck);
7080
7081 /* Also ensure XLogReceiptTime has a sane value */
7082 XLogReceiptTime = GetCurrentTimestamp();
7083
7084 /*
7085 * Let postmaster know we've started redo now, so that it can launch
7086 * checkpointer to perform restartpoints. We don't bother during
7087 * crash recovery as restartpoints can only be performed during
7088 * archive recovery. And we'd like to keep crash recovery simple, to
7089 * avoid introducing bugs that could affect you when recovering after
7090 * crash.
7091 *
7092 * After this point, we can no longer assume that we're the only
7093 * process in addition to postmaster! Also, fsync requests are
7094 * subsequently to be handled by the checkpointer, not locally.
7095 */
7096 if (ArchiveRecoveryRequested && IsUnderPostmaster)
7097 {
7098 PublishStartupProcessInformation();
7099 EnableSyncRequestForwarding();
7100 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7101 bgwriterLaunched = true;
7102 }
7103
7104 /*
7105 * Allow read-only connections immediately if we're consistent
7106 * already.
7107 */
7108 CheckRecoveryConsistency();
7109
7110 /*
7111 * Find the first record that logically follows the checkpoint --- it
7112 * might physically precede it, though.
7113 */
7114 if (checkPoint.redo < RecPtr)
7115 {
7116 /* back up to find the record */
7117 record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
7118 }
7119 else
7120 {
7121 /* just have to read next record after CheckPoint */
7122 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7123 }
7124
7125 if (record != NULL)
7126 {
7127 ErrorContextCallback errcallback;
7128 TimestampTz xtime;
7129
7130 InRedo = true;
7131
7132 ereport(LOG,
7133 (errmsg("redo starts at %X/%X",
7134 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7135
7136 /*
7137 * main redo apply loop
7138 */
7139 do
7140 {
7141 bool switchedTLI = false;
7142
7143 #ifdef WAL_DEBUG
7144 if (XLOG_DEBUG ||
7145 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7146 (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7147 {
7148 StringInfoData buf;
7149
7150 initStringInfo(&buf);
7151 appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7152 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
7153 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
7154 xlog_outrec(&buf, xlogreader);
7155 appendStringInfoString(&buf, " - ");
7156 xlog_outdesc(&buf, xlogreader);
7157 elog(LOG, "%s", buf.data);
7158 pfree(buf.data);
7159 }
7160 #endif
7161
7162 /* Handle interrupt signals of startup process */
7163 HandleStartupProcInterrupts();
7164
7165 /*
7166 * Pause WAL replay, if requested by a hot-standby session via
7167 * SetRecoveryPause().
7168 *
7169 * Note that we intentionally don't take the info_lck spinlock
7170 * here. We might therefore read a slightly stale value of
7171 * the recoveryPause flag, but it can't be very stale (no
7172 * worse than the last spinlock we did acquire). Since a
7173 * pause request is a pretty asynchronous thing anyway,
7174 * possibly responding to it one WAL record later than we
7175 * otherwise would is a minor issue, so it doesn't seem worth
7176 * adding another spinlock cycle to prevent that.
7177 */
7178 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7179 recoveryPausesHere();
7180
7181 /*
7182 * Have we reached our recovery target?
7183 */
7184 if (recoveryStopsBefore(xlogreader))
7185 {
7186 reachedStopPoint = true; /* see below */
7187 break;
7188 }
7189
7190 /*
7191 * If we've been asked to lag the master, wait on latch until
7192 * enough time has passed.
7193 */
7194 if (recoveryApplyDelay(xlogreader))
7195 {
7196 /*
7197 * We test for paused recovery again here. If user sets
7198 * delayed apply, it may be because they expect to pause
7199 * recovery in case of problems, so we must test again
7200 * here otherwise pausing during the delay-wait wouldn't
7201 * work.
7202 */
7203 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7204 recoveryPausesHere();
7205 }
7206
7207 /* Setup error traceback support for ereport() */
7208 errcallback.callback = rm_redo_error_callback;
7209 errcallback.arg = (void *) xlogreader;
7210 errcallback.previous = error_context_stack;
7211 error_context_stack = &errcallback;
7212
7213 /*
7214 * ShmemVariableCache->nextFullXid must be beyond record's
7215 * xid.
7216 */
7217 AdvanceNextFullTransactionIdPastXid(record->xl_xid);
7218
7219 /*
7220 * Before replaying this record, check if this record causes
7221 * the current timeline to change. The record is already
7222 * considered to be part of the new timeline, so we update
7223 * ThisTimeLineID before replaying it. That's important so
7224 * that replayEndTLI, which is recorded as the minimum
7225 * recovery point's TLI if recovery stops after this record,
7226 * is set correctly.
7227 */
7228 if (record->xl_rmid == RM_XLOG_ID)
7229 {
7230 TimeLineID newTLI = ThisTimeLineID;
7231 TimeLineID prevTLI = ThisTimeLineID;
7232 uint8 info = record->xl_info & ~XLR_INFO_MASK;
7233
7234 if (info == XLOG_CHECKPOINT_SHUTDOWN)
7235 {
7236 CheckPoint checkPoint;
7237
7238 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7239 newTLI = checkPoint.ThisTimeLineID;
7240 prevTLI = checkPoint.PrevTimeLineID;
7241 }
7242 else if (info == XLOG_END_OF_RECOVERY)
7243 {
7244 xl_end_of_recovery xlrec;
7245
7246 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7247 newTLI = xlrec.ThisTimeLineID;
7248 prevTLI = xlrec.PrevTimeLineID;
7249 }
7250
7251 if (newTLI != ThisTimeLineID)
7252 {
7253 /* Check that it's OK to switch to this TLI */
7254 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7255
7256 /* Following WAL records should be run with new TLI */
7257 ThisTimeLineID = newTLI;
7258 switchedTLI = true;
7259 }
7260 }
7261
7262 /*
7263 * Update shared replayEndRecPtr before replaying this record,
7264 * so that XLogFlush will update minRecoveryPoint correctly.
7265 */
7266 SpinLockAcquire(&XLogCtl->info_lck);
7267 XLogCtl->replayEndRecPtr = EndRecPtr;
7268 XLogCtl->replayEndTLI = ThisTimeLineID;
7269 SpinLockRelease(&XLogCtl->info_lck);
7270
7271 /*
7272 * If we are attempting to enter Hot Standby mode, process
7273 * XIDs we see
7274 */
7275 if (standbyState >= STANDBY_INITIALIZED &&
7276 TransactionIdIsValid(record->xl_xid))
7277 RecordKnownAssignedTransactionIds(record->xl_xid);
7278
7279 /* Now apply the WAL record itself */
7280 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7281
7282 /*
7283 * After redo, check whether the backup pages associated with
7284 * the WAL record are consistent with the existing pages. This
7285 * check is done only if consistency check is enabled for this
7286 * record.
7287 */
7288 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7289 checkXLogConsistency(xlogreader);
7290
7291 /* Pop the error context stack */
7292 error_context_stack = errcallback.previous;
7293
7294 /*
7295 * Update lastReplayedEndRecPtr after this record has been
7296 * successfully replayed.
7297 */
7298 SpinLockAcquire(&XLogCtl->info_lck);
7299 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7300 XLogCtl->lastReplayedTLI = ThisTimeLineID;
7301 SpinLockRelease(&XLogCtl->info_lck);
7302
7303 /*
7304 * If rm_redo called XLogRequestWalReceiverReply, then we wake
7305 * up the receiver so that it notices the updated
7306 * lastReplayedEndRecPtr and sends a reply to the master.
7307 */
7308 if (doRequestWalReceiverReply)
7309 {
7310 doRequestWalReceiverReply = false;
7311 WalRcvForceReply();
7312 }
7313
7314 /* Remember this record as the last-applied one */
7315 LastRec = ReadRecPtr;
7316
7317 /* Allow read-only connections if we're consistent now */
7318 CheckRecoveryConsistency();
7319
7320 /* Is this a timeline switch? */
7321 if (switchedTLI)
7322 {
7323 /*
7324 * Before we continue on the new timeline, clean up any
7325 * (possibly bogus) future WAL segments on the old
7326 * timeline.
7327 */
7328 RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7329
7330 /*
7331 * Wake up any walsenders to notice that we are on a new
7332 * timeline.
7333 */
7334 if (switchedTLI && AllowCascadeReplication())
7335 WalSndWakeup();
7336 }
7337
7338 /* Exit loop if we reached inclusive recovery target */
7339 if (recoveryStopsAfter(xlogreader))
7340 {
7341 reachedStopPoint = true;
7342 break;
7343 }
7344
7345 /* Else, try to fetch the next WAL record */
7346 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7347 } while (record != NULL);
7348
7349 /*
7350 * end of main redo apply loop
7351 */
7352
7353 if (reachedStopPoint)
7354 {
7355 if (!reachedConsistency)
7356 ereport(FATAL,
7357 (errmsg("requested recovery stop point is before consistent recovery point")));
7358
7359 /*
7360 * This is the last point where we can restart recovery with a
7361 * new recovery target, if we shutdown and begin again. After
7362 * this, Resource Managers may choose to do permanent
7363 * corrective actions at end of recovery.
7364 */
7365 switch (recoveryTargetAction)
7366 {
7367 case RECOVERY_TARGET_ACTION_SHUTDOWN:
7368
7369 /*
7370 * exit with special return code to request shutdown
7371 * of postmaster. Log messages issued from
7372 * postmaster.
7373 */
7374 proc_exit(3);
7375
7376 case RECOVERY_TARGET_ACTION_PAUSE:
7377 SetRecoveryPause(true);
7378 recoveryPausesHere();
7379
7380 /* drop into promote */
7381
7382 case RECOVERY_TARGET_ACTION_PROMOTE:
7383 break;
7384 }
7385 }
7386
7387 /* Allow resource managers to do any required cleanup. */
7388 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7389 {
7390 if (RmgrTable[rmid].rm_cleanup != NULL)
7391 RmgrTable[rmid].rm_cleanup();
7392 }
7393
7394 ereport(LOG,
7395 (errmsg("redo done at %X/%X",
7396 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7397 xtime = GetLatestXTime();
7398 if (xtime)
7399 ereport(LOG,
7400 (errmsg("last completed transaction was at log time %s",
7401 timestamptz_to_str(xtime))));
7402
7403 InRedo = false;
7404 }
7405 else
7406 {
7407 /* there are no WAL records following the checkpoint */
7408 ereport(LOG,
7409 (errmsg("redo is not required")));
7410 }
7411 }
7412
7413 /*
7414 * Kill WAL receiver, if it's still running, before we continue to write
7415 * the startup checkpoint and aborted-contrecord records. It will trump
7416 * over these records and subsequent ones if it's still alive when we
7417 * start writing WAL.
7418 */
7419 ShutdownWalRcv();
7420
7421 /*
7422 * Reset unlogged relations to the contents of their INIT fork. This is
7423 * done AFTER recovery is complete so as to include any unlogged relations
7424 * created during recovery, but BEFORE recovery is marked as having
7425 * completed successfully. Otherwise we'd not retry if any of the post
7426 * end-of-recovery steps fail.
7427 */
7428 if (InRecovery)
7429 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7430
7431 /*
7432 * We don't need the latch anymore. It's not strictly necessary to disown
7433 * it, but let's do it for the sake of tidiness.
7434 */
7435 if (ArchiveRecoveryRequested)
7436 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7437
7438 /*
7439 * We are now done reading the xlog from stream. Turn off streaming
7440 * recovery to force fetching the files (which would be required at end of
7441 * recovery, e.g., timeline history file) from archive or pg_wal.
7442 *
7443 * Note that standby mode must be turned off after killing WAL receiver,
7444 * i.e., calling ShutdownWalRcv().
7445 */
7446 Assert(!WalRcvStreaming());
7447 StandbyMode = false;
7448
7449 /*
7450 * Determine where to start writing WAL next.
7451 *
7452 * When recovery ended in an incomplete record, write a WAL record about
7453 * that and continue after it. In all other cases, re-fetch the last
7454 * valid or last applied record, so we can identify the exact endpoint of
7455 * what we consider the valid portion of WAL.
7456 */
7457 record = ReadRecord(xlogreader, LastRec, PANIC, false);
7458 EndOfLog = EndRecPtr;
7459
7460 /*
7461 * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7462 * the end-of-log. It could be different from the timeline that EndOfLog
7463 * nominally belongs to, if there was a timeline switch in that segment,
7464 * and we were reading the old WAL from a segment belonging to a higher
7465 * timeline.
7466 */
7467 EndOfLogTLI = xlogreader->readPageTLI;
7468
7469 /*
7470 * Complain if we did not roll forward far enough to render the backup
7471 * dump consistent. Note: it is indeed okay to look at the local variable
7472 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7473 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7474 * advanced beyond the WAL we processed.
7475 */
7476 if (InRecovery &&
7477 (EndOfLog < minRecoveryPoint ||
7478 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7479 {
7480 /*
7481 * Ran off end of WAL before reaching end-of-backup WAL record, or
7482 * minRecoveryPoint. That's usually a bad sign, indicating that you
7483 * tried to recover from an online backup but never called
7484 * pg_stop_backup(), or you didn't archive all the WAL up to that
7485 * point. However, this also happens in crash recovery, if the system
7486 * crashes while an online backup is in progress. We must not treat
7487 * that as an error, or the database will refuse to start up.
7488 */
7489 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7490 {
7491 if (ControlFile->backupEndRequired)
7492 ereport(FATAL,
7493 (errmsg("WAL ends before end of online backup"),
7494 errhint("All WAL generated while online backup was taken must be available at recovery.")));
7495 else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7496 ereport(FATAL,
7497 (errmsg("WAL ends before end of online backup"),
7498 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7499 else
7500 ereport(FATAL,
7501 (errmsg("WAL ends before consistent recovery point")));
7502 }
7503 }
7504
7505 /*
7506 * Pre-scan prepared transactions to find out the range of XIDs present.
7507 * This information is not quite needed yet, but it is positioned here so
7508 * as potential problems are detected before any on-disk change is done.
7509 */
7510 oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7511
7512 /*
7513 * Consider whether we need to assign a new timeline ID.
7514 *
7515 * If we are doing an archive recovery, we always assign a new ID. This
7516 * handles a couple of issues. If we stopped short of the end of WAL
7517 * during recovery, then we are clearly generating a new timeline and must
7518 * assign it a unique new ID. Even if we ran to the end, modifying the
7519 * current last segment is problematic because it may result in trying to
7520 * overwrite an already-archived copy of that segment, and we encourage
7521 * DBAs to make their archive_commands reject that. We can dodge the
7522 * problem by making the new active segment have a new timeline ID.
7523 *
7524 * In a normal crash recovery, we can just extend the timeline we were in.
7525 */
7526 PrevTimeLineID = ThisTimeLineID;
7527 if (ArchiveRecoveryRequested)
7528 {
7529 char reason[200];
7530 char recoveryPath[MAXPGPATH];
7531
7532 Assert(InArchiveRecovery);
7533
7534 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7535 ereport(LOG,
7536 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7537
7538 /*
7539 * Create a comment for the history file to explain why and where
7540 * timeline changed.
7541 */
7542 if (recoveryTarget == RECOVERY_TARGET_XID)
7543 snprintf(reason, sizeof(reason),
7544 "%s transaction %u",
7545 recoveryStopAfter ? "after" : "before",
7546 recoveryStopXid);
7547 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7548 snprintf(reason, sizeof(reason),
7549 "%s %s\n",
7550 recoveryStopAfter ? "after" : "before",
7551 timestamptz_to_str(recoveryStopTime));
7552 else if (recoveryTarget == RECOVERY_TARGET_LSN)
7553 snprintf(reason, sizeof(reason),
7554 "%s LSN %X/%X\n",
7555 recoveryStopAfter ? "after" : "before",
7556 (uint32) (recoveryStopLSN >> 32),
7557 (uint32) recoveryStopLSN);
7558 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7559 snprintf(reason, sizeof(reason),
7560 "at restore point \"%s\"",
7561 recoveryStopName);
7562 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7563 snprintf(reason, sizeof(reason), "reached consistency");
7564 else
7565 snprintf(reason, sizeof(reason), "no recovery target specified");
7566
7567 /*
7568 * We are now done reading the old WAL. Turn off archive fetching if
7569 * it was active, and make a writable copy of the last WAL segment.
7570 * (Note that we also have a copy of the last block of the old WAL in
7571 * readBuf; we will use that below.)
7572 */
7573 exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7574
7575 /*
7576 * Write the timeline history file, and have it archived. After this
7577 * point (or rather, as soon as the file is archived), the timeline
7578 * will appear as "taken" in the WAL archive and to any standby
7579 * servers. If we crash before actually switching to the new
7580 * timeline, standby servers will nevertheless think that we switched
7581 * to the new timeline, and will try to connect to the new timeline.
7582 * To minimize the window for that, try to do as little as possible
7583 * between here and writing the end-of-recovery record.
7584 */
7585 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7586 EndRecPtr, reason);
7587
7588 /*
7589 * Since there might be a partial WAL segment named RECOVERYXLOG, get
7590 * rid of it.
7591 */
7592 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
7593 unlink(recoveryPath); /* ignore any error */
7594
7595 /* Get rid of any remaining recovered timeline-history file, too */
7596 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
7597 unlink(recoveryPath); /* ignore any error */
7598 }
7599
7600 /* Save the selected TimeLineID in shared memory, too */
7601 XLogCtl->ThisTimeLineID = ThisTimeLineID;
7602 XLogCtl->PrevTimeLineID = PrevTimeLineID;
7603
7604 /*
7605 * Actually, if WAL ended in an incomplete record, skip the parts that
7606 * made it through and start writing after the portion that persisted.
7607 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
7608 * we'll do as soon as we're open for writing new WAL.)
7609 */
7610 if (!XLogRecPtrIsInvalid(missingContrecPtr))
7611 {
7612 Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
7613 EndOfLog = missingContrecPtr;
7614 }
7615
7616 /*
7617 * Prepare to write WAL starting at EndOfLog location, and init xlog
7618 * buffer cache using the block containing the last record from the
7619 * previous incarnation.
7620 */
7621 Insert = &XLogCtl->Insert;
7622 Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7623 Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7624
7625 /*
7626 * Tricky point here: readBuf contains the *last* block that the LastRec
7627 * record spans, not the one it starts in. The last block is indeed the
7628 * one we want to use.
7629 */
7630 if (EndOfLog % XLOG_BLCKSZ != 0)
7631 {
7632 char *page;
7633 int len;
7634 int firstIdx;
7635 XLogRecPtr pageBeginPtr;
7636
7637 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7638 Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
7639
7640 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7641
7642 /* Copy the valid part of the last block, and zero the rest */
7643 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7644 len = EndOfLog % XLOG_BLCKSZ;
7645 memcpy(page, xlogreader->readBuf, len);
7646 memset(page + len, 0, XLOG_BLCKSZ - len);
7647
7648 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7649 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7650 }
7651 else
7652 {
7653 /*
7654 * There is no partial block to copy. Just set InitializedUpTo, and
7655 * let the first attempt to insert a log record to initialize the next
7656 * buffer.
7657 */
7658 XLogCtl->InitializedUpTo = EndOfLog;
7659 }
7660
7661 LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7662
7663 XLogCtl->LogwrtResult = LogwrtResult;
7664
7665 XLogCtl->LogwrtRqst.Write = EndOfLog;
7666 XLogCtl->LogwrtRqst.Flush = EndOfLog;
7667
7668 LocalSetXLogInsertAllowed();
7669
7670 /* If necessary, write overwrite-contrecord before doing anything else */
7671 if (!XLogRecPtrIsInvalid(abortedRecPtr))
7672 {
7673 Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
7674 CreateOverwriteContrecordRecord(abortedRecPtr);
7675 abortedRecPtr = InvalidXLogRecPtr;
7676 missingContrecPtr = InvalidXLogRecPtr;
7677 }
7678
7679 /*
7680 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7681 * record before resource manager writes cleanup WAL records or checkpoint
7682 * record is written.
7683 */
7684 Insert->fullPageWrites = lastFullPageWrites;
7685 UpdateFullPageWrites();
7686 LocalXLogInsertAllowed = -1;
7687
7688 if (InRecovery)
7689 {
7690 /*
7691 * Perform a checkpoint to update all our recovery activity to disk.
7692 *
7693 * Note that we write a shutdown checkpoint rather than an on-line
7694 * one. This is not particularly critical, but since we may be
7695 * assigning a new TLI, using a shutdown checkpoint allows us to have
7696 * the rule that TLI only changes in shutdown checkpoints, which
7697 * allows some extra error checking in xlog_redo.
7698 *
7699 * In fast promotion, only create a lightweight end-of-recovery record
7700 * instead of a full checkpoint. A checkpoint is requested later,
7701 * after we're fully out of recovery mode and already accepting
7702 * queries.
7703 */
7704 if (bgwriterLaunched)
7705 {
7706 if (fast_promote)
7707 {
7708 checkPointLoc = ControlFile->checkPoint;
7709
7710 /*
7711 * Confirm the last checkpoint is available for us to recover
7712 * from if we fail.
7713 */
7714 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7715 if (record != NULL)
7716 {
7717 fast_promoted = true;
7718
7719 /*
7720 * Insert a special WAL record to mark the end of
7721 * recovery, since we aren't doing a checkpoint. That
7722 * means that the checkpointer process may likely be in
7723 * the middle of a time-smoothed restartpoint and could
7724 * continue to be for minutes after this. That sounds
7725 * strange, but the effect is roughly the same and it
7726 * would be stranger to try to come out of the
7727 * restartpoint and then checkpoint. We request a
7728 * checkpoint later anyway, just for safety.
7729 */
7730 CreateEndOfRecoveryRecord();
7731 }
7732 }
7733
7734 if (!fast_promoted)
7735 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7736 CHECKPOINT_IMMEDIATE |
7737 CHECKPOINT_WAIT);
7738 }
7739 else
7740 CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7741 }
7742
7743 if (ArchiveRecoveryRequested)
7744 {
7745 /*
7746 * And finally, execute the recovery_end_command, if any.
7747 */
7748 if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
7749 ExecuteRecoveryCommand(recoveryEndCommand,
7750 "recovery_end_command",
7751 true);
7752
7753 /*
7754 * We switched to a new timeline. Clean up segments on the old
7755 * timeline.
7756 *
7757 * If there are any higher-numbered segments on the old timeline,
7758 * remove them. They might contain valid WAL, but they might also be
7759 * pre-allocated files containing garbage. In any case, they are not
7760 * part of the new timeline's history so we don't need them.
7761 */
7762 RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7763
7764 /*
7765 * If the switch happened in the middle of a segment, what to do with
7766 * the last, partial segment on the old timeline? If we don't archive
7767 * it, and the server that created the WAL never archives it either
7768 * (e.g. because it was hit by a meteor), it will never make it to the
7769 * archive. That's OK from our point of view, because the new segment
7770 * that we created with the new TLI contains all the WAL from the old
7771 * timeline up to the switch point. But if you later try to do PITR to
7772 * the "missing" WAL on the old timeline, recovery won't find it in
7773 * the archive. It's physically present in the new file with new TLI,
7774 * but recovery won't look there when it's recovering to the older
7775 * timeline. On the other hand, if we archive the partial segment, and
7776 * the original server on that timeline is still running and archives
7777 * the completed version of the same segment later, it will fail. (We
7778 * used to do that in 9.4 and below, and it caused such problems).
7779 *
7780 * As a compromise, we rename the last segment with the .partial
7781 * suffix, and archive it. Archive recovery will never try to read
7782 * .partial segments, so they will normally go unused. But in the odd
7783 * PITR case, the administrator can copy them manually to the pg_wal
7784 * directory (removing the suffix). They can be useful in debugging,
7785 * too.
7786 *
7787 * If a .done or .ready file already exists for the old timeline,
7788 * however, we had already determined that the segment is complete, so
7789 * we can let it be archived normally. (In particular, if it was
7790 * restored from the archive to begin with, it's expected to have a
7791 * .done file).
7792 */
7793 if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
7794 XLogArchivingActive())
7795 {
7796 char origfname[MAXFNAMELEN];
7797 XLogSegNo endLogSegNo;
7798
7799 XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
7800 XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
7801
7802 if (!XLogArchiveIsReadyOrDone(origfname))
7803 {
7804 char origpath[MAXPGPATH];
7805 char partialfname[MAXFNAMELEN];
7806 char partialpath[MAXPGPATH];
7807
7808 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
7809 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7810 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7811
7812 /*
7813 * Make sure there's no .done or .ready file for the .partial
7814 * file.
7815 */
7816 XLogArchiveCleanup(partialfname);
7817
7818 durable_rename(origpath, partialpath, ERROR);
7819 XLogArchiveNotify(partialfname);
7820 }
7821 }
7822 }
7823
7824 /*
7825 * Preallocate additional log files, if wanted.
7826 */
7827 PreallocXlogFiles(EndOfLog);
7828
7829 /*
7830 * Okay, we're officially UP.
7831 */
7832 InRecovery = false;
7833
7834 /* start the archive_timeout timer and LSN running */
7835 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7836 XLogCtl->lastSegSwitchLSN = EndOfLog;
7837
7838 /* also initialize latestCompletedXid, to nextXid - 1 */
7839 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7840 ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
7841 TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7842 LWLockRelease(ProcArrayLock);
7843
7844 /*
7845 * Start up the commit log and subtrans, if not already done for hot
7846 * standby. (commit timestamps are started below, if necessary.)
7847 */
7848 if (standbyState == STANDBY_DISABLED)
7849 {
7850 StartupCLOG();
7851 StartupSUBTRANS(oldestActiveXID);
7852 }
7853
7854 /*
7855 * Perform end of recovery actions for any SLRUs that need it.
7856 */
7857 TrimCLOG();
7858 TrimMultiXact();
7859
7860 /* Reload shared-memory state for prepared transactions */
7861 RecoverPreparedTransactions();
7862
7863 /* Shut down xlogreader */
7864 if (readFile >= 0)
7865 {
7866 close(readFile);
7867 readFile = -1;
7868 }
7869 XLogReaderFree(xlogreader);
7870
7871 /*
7872 * If any of the critical GUCs have changed, log them before we allow
7873 * backends to write WAL.
7874 */
7875 LocalSetXLogInsertAllowed();
7876 XLogReportParameters();
7877
7878 /*
7879 * Local WAL inserts enabled, so it's time to finish initialization of
7880 * commit timestamp.
7881 */
7882 CompleteCommitTsInitialization();
7883
7884 /*
7885 * All done with end-of-recovery actions.
7886 *
7887 * Now allow backends to write WAL and update the control file status in
7888 * consequence. The boolean flag allowing backends to write WAL is
7889 * updated while holding ControlFileLock to prevent other backends to look
7890 * at an inconsistent state of the control file in shared memory. There
7891 * is still a small window during which backends can write WAL and the
7892 * control file is still referring to a system not in DB_IN_PRODUCTION
7893 * state while looking at the on-disk control file.
7894 *
7895 * Also, although the boolean flag to allow WAL is probably atomic in
7896 * itself, we use the info_lck here to ensure that there are no race
7897 * conditions concerning visibility of other recent updates to shared
7898 * memory.
7899 */
7900 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7901 ControlFile->state = DB_IN_PRODUCTION;
7902 ControlFile->time = (pg_time_t) time(NULL);
7903
7904 SpinLockAcquire(&XLogCtl->info_lck);
7905 XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
7906 SpinLockRelease(&XLogCtl->info_lck);
7907
7908 UpdateControlFile();
7909 LWLockRelease(ControlFileLock);
7910
7911 /*
7912 * Shutdown the recovery environment. This must occur after
7913 * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
7914 * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
7915 * any session building a snapshot will not rely on KnownAssignedXids as
7916 * RecoveryInProgress() would return false at this stage. This is
7917 * particularly critical for prepared 2PC transactions, that would still
7918 * need to be included in snapshots once recovery has ended.
7919 */
7920 if (standbyState != STANDBY_DISABLED)
7921 ShutdownRecoveryTransactionEnvironment();
7922
7923 /*
7924 * If there were cascading standby servers connected to us, nudge any wal
7925 * sender processes to notice that we've been promoted.
7926 */
7927 WalSndWakeup();
7928
7929 /*
7930 * If this was a fast promotion, request an (online) checkpoint now. This
7931 * isn't required for consistency, but the last restartpoint might be far
7932 * back, and in case of a crash, recovering from it might take a longer
7933 * than is appropriate now that we're not in standby mode anymore.
7934 */
7935 if (fast_promoted)
7936 RequestCheckpoint(CHECKPOINT_FORCE);
7937 }
7938
/*
 * Checks if recovery has reached a consistent state. When consistency is
 * reached and we have a valid starting standby snapshot, tell postmaster
 * that it can start accepting read-only connections.
 *
 * NOTE(review): per the comment below, this is assumed to run only in the
 * startup process, which is what makes the unlocked read of
 * lastReplayedEndRecPtr safe — confirm no other caller exists.
 */
static void
CheckRecoveryConsistency(void)
{
	XLogRecPtr	lastReplayedEndRecPtr;

	/*
	 * During crash recovery, we don't reach a consistent state until we've
	 * replayed all the WAL.
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		return;

	/* A valid minRecoveryPoint implies we are doing archive recovery */
	Assert(InArchiveRecovery);

	/*
	 * assume that we are called in the startup process, and hence don't need
	 * a lock to read lastReplayedEndRecPtr
	 */
	lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;

	/*
	 * Have we reached the point where our base backup was completed?
	 */
	if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
		ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
	{
		/*
		 * We have reached the end of base backup, as indicated by pg_control.
		 * The data on disk is now consistent. Reset backupStartPoint and
		 * backupEndPoint, and update minRecoveryPoint to make sure we don't
		 * allow starting up at an earlier point even if recovery is stopped
		 * and restarted soon after this.
		 */
		elog(DEBUG1, "end of backup reached");

		/* All control-file fields are reset under one lock acquisition */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

		if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
			ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;

		ControlFile->backupStartPoint = InvalidXLogRecPtr;
		ControlFile->backupEndPoint = InvalidXLogRecPtr;
		ControlFile->backupEndRequired = false;
		UpdateControlFile();

		LWLockRelease(ControlFileLock);
	}

	/*
	 * Have we passed our safe starting point? Note that minRecoveryPoint is
	 * known to be incorrectly set if ControlFile->backupEndRequired, until
	 * the XLOG_BACKUP_END arrives to advise us of the correct
	 * minRecoveryPoint. All we know prior to that is that we're not
	 * consistent yet.
	 */
	if (!reachedConsistency && !ControlFile->backupEndRequired &&
		minRecoveryPoint <= lastReplayedEndRecPtr &&
		XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
	{
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

		reachedConsistency = true;
		ereport(LOG,
				(errmsg("consistent recovery state reached at %X/%X",
						(uint32) (lastReplayedEndRecPtr >> 32),
						(uint32) lastReplayedEndRecPtr)));
	}

	/*
	 * Have we got a valid starting snapshot that will allow queries to be
	 * run? If so, we can tell postmaster that the database is consistent now,
	 * enabling connections.
	 */
	if (standbyState == STANDBY_SNAPSHOT_READY &&
		!LocalHotStandbyActive &&
		reachedConsistency &&
		IsUnderPostmaster)
	{
		/* Publish the state under info_lck before flipping the local copy */
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->SharedHotStandbyActive = true;
		SpinLockRelease(&XLogCtl->info_lck);

		LocalHotStandbyActive = true;

		SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
	}
}
8035
/*
 * Is the system still in recovery?
 *
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
 *
 * Returns true while recovery is ongoing, false once it has ended; once
 * false has been returned in a process, it stays false for that process.
 */
bool
RecoveryInProgress(void)
{
	/*
	 * We check shared state each time only until we leave recovery mode. We
	 * can't re-enter recovery, so there's no need to keep checking after the
	 * shared variable has once been seen false.
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
	{
		/*
		 * use volatile pointer to make sure we make a fresh read of the
		 * shared variable.
		 */
		volatile XLogCtlData *xlogctl = XLogCtl;

		/* Unlocked read; the barrier below handles weak memory ordering */
		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);

		/*
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
		 * is finished. InitPostgres() relies upon this behaviour to ensure
		 * that InitXLOGAccess() is called at backend startup. (If you change
		 * this, see also LocalSetXLogInsertAllowed.)
		 */
		if (!LocalRecoveryInProgress)
		{
			/*
			 * If we just exited recovery, make sure we read TimeLineID and
			 * RedoRecPtr after SharedRecoveryState (for machines with weak
			 * memory ordering).
			 */
			pg_memory_barrier();
			InitXLOGAccess();
		}

		/*
		 * Note: We don't need a memory barrier when we're still in recovery.
		 * We might exit recovery immediately after return, so the caller
		 * can't rely on 'true' meaning that we're still in recovery anyway.
		 */

		return LocalRecoveryInProgress;
	}
}
8091
8092 /*
8093 * Returns current recovery state from shared memory.
8094 *
8095 * This returned state is kept consistent with the contents of the control
8096 * file. See details about the possible values of RecoveryState in xlog.h.
8097 */
8098 RecoveryState
GetRecoveryState(void)8099 GetRecoveryState(void)
8100 {
8101 RecoveryState retval;
8102
8103 SpinLockAcquire(&XLogCtl->info_lck);
8104 retval = XLogCtl->SharedRecoveryState;
8105 SpinLockRelease(&XLogCtl->info_lck);
8106
8107 return retval;
8108 }
8109
8110 /*
8111 * Is HotStandby active yet? This is only important in special backends
8112 * since normal backends won't ever be able to connect until this returns
8113 * true. Postmaster knows this by way of signal, not via shared memory.
8114 *
8115 * Unlike testing standbyState, this works in any process that's connected to
8116 * shared memory. (And note that standbyState alone doesn't tell the truth
8117 * anyway.)
8118 */
8119 bool
HotStandbyActive(void)8120 HotStandbyActive(void)
8121 {
8122 /*
8123 * We check shared state each time only until Hot Standby is active. We
8124 * can't de-activate Hot Standby, so there's no need to keep checking
8125 * after the shared variable has once been seen true.
8126 */
8127 if (LocalHotStandbyActive)
8128 return true;
8129 else
8130 {
8131 /* spinlock is essential on machines with weak memory ordering! */
8132 SpinLockAcquire(&XLogCtl->info_lck);
8133 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
8134 SpinLockRelease(&XLogCtl->info_lck);
8135
8136 return LocalHotStandbyActive;
8137 }
8138 }
8139
/*
 * Like HotStandbyActive(), but to be used only in WAL replay code,
 * where we don't need to ask any other process what the state is.
 *
 * Reading the process-local flag without a lock is safe here only because,
 * as asserted, the caller is the startup process (or a standalone backend).
 */
bool
HotStandbyActiveInReplay(void)
{
	Assert(AmStartupProcess() || !IsPostmasterEnvironment);
	return LocalHotStandbyActive;
}
8150
8151 /*
8152 * Is this process allowed to insert new WAL records?
8153 *
8154 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
8155 * But we also have provisions for forcing the result "true" or "false"
8156 * within specific processes regardless of the global state.
8157 */
8158 bool
XLogInsertAllowed(void)8159 XLogInsertAllowed(void)
8160 {
8161 /*
8162 * If value is "unconditionally true" or "unconditionally false", just
8163 * return it. This provides the normal fast path once recovery is known
8164 * done.
8165 */
8166 if (LocalXLogInsertAllowed >= 0)
8167 return (bool) LocalXLogInsertAllowed;
8168
8169 /*
8170 * Else, must check to see if we're still in recovery.
8171 */
8172 if (RecoveryInProgress())
8173 return false;
8174
8175 /*
8176 * On exit from recovery, reset to "unconditionally true", since there is
8177 * no need to keep checking.
8178 */
8179 LocalXLogInsertAllowed = 1;
8180 return true;
8181 }
8182
/*
 * Make XLogInsertAllowed() return true in the current process only.
 *
 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
 * and even call LocalSetXLogInsertAllowed() again after that.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	/* Caller must currently be in the default "ask shared state" mode. */
	Assert(LocalXLogInsertAllowed == -1);
	LocalXLogInsertAllowed = 1;

	/* Initialize as RecoveryInProgress() would do when switching state */
	InitXLOGAccess();
}

8199 /*
8200 * Subroutine to try to fetch and validate a prior checkpoint record.
8201 *
8202 * whichChkpt identifies the checkpoint (merely for reporting purposes).
8203 * 1 for "primary", 0 for "other" (backup_label)
8204 */
8205 static XLogRecord *
ReadCheckpointRecord(XLogReaderState * xlogreader,XLogRecPtr RecPtr,int whichChkpt,bool report)8206 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8207 int whichChkpt, bool report)
8208 {
8209 XLogRecord *record;
8210 uint8 info;
8211
8212 if (!XRecOffIsValid(RecPtr))
8213 {
8214 if (!report)
8215 return NULL;
8216
8217 switch (whichChkpt)
8218 {
8219 case 1:
8220 ereport(LOG,
8221 (errmsg("invalid primary checkpoint link in control file")));
8222 break;
8223 default:
8224 ereport(LOG,
8225 (errmsg("invalid checkpoint link in backup_label file")));
8226 break;
8227 }
8228 return NULL;
8229 }
8230
8231 record = ReadRecord(xlogreader, RecPtr, LOG, true);
8232
8233 if (record == NULL)
8234 {
8235 if (!report)
8236 return NULL;
8237
8238 switch (whichChkpt)
8239 {
8240 case 1:
8241 ereport(LOG,
8242 (errmsg("invalid primary checkpoint record")));
8243 break;
8244 default:
8245 ereport(LOG,
8246 (errmsg("invalid checkpoint record")));
8247 break;
8248 }
8249 return NULL;
8250 }
8251 if (record->xl_rmid != RM_XLOG_ID)
8252 {
8253 switch (whichChkpt)
8254 {
8255 case 1:
8256 ereport(LOG,
8257 (errmsg("invalid resource manager ID in primary checkpoint record")));
8258 break;
8259 default:
8260 ereport(LOG,
8261 (errmsg("invalid resource manager ID in checkpoint record")));
8262 break;
8263 }
8264 return NULL;
8265 }
8266 info = record->xl_info & ~XLR_INFO_MASK;
8267 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8268 info != XLOG_CHECKPOINT_ONLINE)
8269 {
8270 switch (whichChkpt)
8271 {
8272 case 1:
8273 ereport(LOG,
8274 (errmsg("invalid xl_info in primary checkpoint record")));
8275 break;
8276 default:
8277 ereport(LOG,
8278 (errmsg("invalid xl_info in checkpoint record")));
8279 break;
8280 }
8281 return NULL;
8282 }
8283 if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8284 {
8285 switch (whichChkpt)
8286 {
8287 case 1:
8288 ereport(LOG,
8289 (errmsg("invalid length of primary checkpoint record")));
8290 break;
8291 default:
8292 ereport(LOG,
8293 (errmsg("invalid length of checkpoint record")));
8294 break;
8295 }
8296 return NULL;
8297 }
8298 return record;
8299 }
8300
/*
 * This must be called in a backend process before creating WAL records
 * (except in a standalone backend, which does StartupXLOG instead). We need
 * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
 *
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
 * process's copies of ThisTimeLineID and RedoRecPtr valid too. This was
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
 */
void
InitXLOGAccess(void)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;

	/* ThisTimeLineID doesn't change so we need no lock to copy it */
	ThisTimeLineID = XLogCtl->ThisTimeLineID;
	Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());

	/* set wal_segment_size */
	wal_segment_size = ControlFile->xlog_seg_size;

	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
	(void) GetRedoRecPtr();

	/*
	 * Also update our copy of doPageWrites.  NOTE(review): these flags are
	 * read without any lock here; presumably a slightly stale value is
	 * acceptable since XLogInsertRecord rechecks — confirm.
	 */
	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

	/* Also initialize the working areas for constructing WAL records */
	InitXLogInsert();
}

8331 /*
8332 * Return the current Redo pointer from shared memory.
8333 *
8334 * As a side-effect, the local RedoRecPtr copy is updated.
8335 */
8336 XLogRecPtr
GetRedoRecPtr(void)8337 GetRedoRecPtr(void)
8338 {
8339 XLogRecPtr ptr;
8340
8341 /*
8342 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8343 * grabbed a WAL insertion lock to read the master copy, someone might
8344 * update it just after we've released the lock.
8345 */
8346 SpinLockAcquire(&XLogCtl->info_lck);
8347 ptr = XLogCtl->RedoRecPtr;
8348 SpinLockRelease(&XLogCtl->info_lck);
8349
8350 if (RedoRecPtr < ptr)
8351 RedoRecPtr = ptr;
8352
8353 return RedoRecPtr;
8354 }
8355
8356 /*
8357 * Return information needed to decide whether a modified block needs a
8358 * full-page image to be included in the WAL record.
8359 *
8360 * The returned values are cached copies from backend-private memory, and
8361 * possibly out-of-date. XLogInsertRecord will re-check them against
8362 * up-to-date values, while holding the WAL insert lock.
8363 */
8364 void
GetFullPageWriteInfo(XLogRecPtr * RedoRecPtr_p,bool * doPageWrites_p)8365 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8366 {
8367 *RedoRecPtr_p = RedoRecPtr;
8368 *doPageWrites_p = doPageWrites;
8369 }
8370
8371 /*
8372 * GetInsertRecPtr -- Returns the current insert position.
8373 *
8374 * NOTE: The value *actually* returned is the position of the last full
8375 * xlog page. It lags behind the real insert position by at most 1 page.
8376 * For that, we don't need to scan through WAL insertion locks, and an
8377 * approximation is enough for the current usage of this function.
8378 */
8379 XLogRecPtr
GetInsertRecPtr(void)8380 GetInsertRecPtr(void)
8381 {
8382 XLogRecPtr recptr;
8383
8384 SpinLockAcquire(&XLogCtl->info_lck);
8385 recptr = XLogCtl->LogwrtRqst.Write;
8386 SpinLockRelease(&XLogCtl->info_lck);
8387
8388 return recptr;
8389 }
8390
/*
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
 * position known to be fsync'd to disk.
 *
 * As a side effect, this refreshes the whole backend-local LogwrtResult
 * cache (both Write and Flush positions) from shared memory.
 */
XLogRecPtr
GetFlushRecPtr(void)
{
	/* spinlock guards against torn reads of the 64-bit positions */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Flush;
}

8405 /*
8406 * GetLastImportantRecPtr -- Returns the LSN of the last important record
8407 * inserted. All records not explicitly marked as unimportant are considered
8408 * important.
8409 *
8410 * The LSN is determined by computing the maximum of
8411 * WALInsertLocks[i].lastImportantAt.
8412 */
8413 XLogRecPtr
GetLastImportantRecPtr(void)8414 GetLastImportantRecPtr(void)
8415 {
8416 XLogRecPtr res = InvalidXLogRecPtr;
8417 int i;
8418
8419 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8420 {
8421 XLogRecPtr last_important;
8422
8423 /*
8424 * Need to take a lock to prevent torn reads of the LSN, which are
8425 * possible on some of the supported platforms. WAL insert locks only
8426 * support exclusive mode, so we have to use that.
8427 */
8428 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8429 last_important = WALInsertLocks[i].l.lastImportantAt;
8430 LWLockRelease(&WALInsertLocks[i].l.lock);
8431
8432 if (res < last_important)
8433 res = last_important;
8434 }
8435
8436 return res;
8437 }
8438
8439 /*
8440 * Get the time and LSN of the last xlog segment switch
8441 */
8442 pg_time_t
GetLastSegSwitchData(XLogRecPtr * lastSwitchLSN)8443 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8444 {
8445 pg_time_t result;
8446
8447 /* Need WALWriteLock, but shared lock is sufficient */
8448 LWLockAcquire(WALWriteLock, LW_SHARED);
8449 result = XLogCtl->lastSegSwitchTime;
8450 *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8451 LWLockRelease(WALWriteLock);
8452
8453 return result;
8454 }
8455
/*
 * This must be called ONCE during postmaster or standalone-backend shutdown.
 *
 * Quiesces walsenders, writes the final shutdown checkpoint (or restartpoint
 * if still in recovery), then shuts down the SLRU subsystems.
 */
void
ShutdownXLOG(int code, Datum arg)
{
	/*
	 * We should have an aux process resource owner to use, and we should not
	 * be in a transaction that's installed some other resowner.
	 */
	Assert(AuxProcessResourceOwner != NULL);
	Assert(CurrentResourceOwner == NULL ||
		   CurrentResourceOwner == AuxProcessResourceOwner);
	CurrentResourceOwner = AuxProcessResourceOwner;

	/* Don't be chatty in standalone mode */
	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
			(errmsg("shutting down")));

	/*
	 * Signal walsenders to move to stopping state.
	 */
	WalSndInitStopping();

	/*
	 * Wait for WAL senders to be in stopping state.  This prevents commands
	 * from writing new WAL.
	 */
	WalSndWaitStopping();

	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch(false);

		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	}

	/* Flush out pending SLRU data; must follow the shutdown checkpoint */
	ShutdownCLOG();
	ShutdownCommitTs();
	ShutdownSUBTRANS();
	ShutdownMultiXact();
}

8507 /*
8508 * Log start of a checkpoint.
8509 */
8510 static void
LogCheckpointStart(int flags,bool restartpoint)8511 LogCheckpointStart(int flags, bool restartpoint)
8512 {
8513 elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
8514 restartpoint ? "restartpoint" : "checkpoint",
8515 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8516 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8517 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8518 (flags & CHECKPOINT_FORCE) ? " force" : "",
8519 (flags & CHECKPOINT_WAIT) ? " wait" : "",
8520 (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
8521 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
8522 (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
8523 }
8524
/*
 * Log end of a checkpoint.
 *
 * Always accumulates the write/sync timings into BgWriterStats; the LOG
 * message itself is emitted only when log_checkpoints is enabled.
 */
static void
LogCheckpointEnd(bool restartpoint)
{
	long		write_msecs,
				sync_msecs,
				total_msecs,
				longest_msecs,
				average_msecs;
	uint64		average_sync_time;

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
												  CheckpointStats.ckpt_sync_t);

	sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
												 CheckpointStats.ckpt_sync_end_t);

	/* Accumulate checkpoint timing summary data, in milliseconds. */
	BgWriterStats.m_checkpoint_write_time += write_msecs;
	BgWriterStats.m_checkpoint_sync_time += sync_msecs;

	/*
	 * All of the published timing statistics are accounted for.  Only
	 * continue if a log message is to be written.
	 */
	if (!log_checkpoints)
		return;

	total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
												  CheckpointStats.ckpt_end_t);

	/*
	 * Timing values returned from CheckpointStats are in microseconds.
	 * Convert to milliseconds for consistent printing; the +999 rounds up.
	 */
	longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);

	average_sync_time = 0;
	if (CheckpointStats.ckpt_sync_rels > 0)
		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
			CheckpointStats.ckpt_sync_rels;
	average_msecs = (long) ((average_sync_time + 999) / 1000);

	elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
		 "%d WAL file(s) added, %d removed, %d recycled; "
		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
		 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
		 "distance=%d kB, estimate=%d kB",
		 restartpoint ? "restartpoint" : "checkpoint",
		 CheckpointStats.ckpt_bufs_written,
		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
		 CheckpointStats.ckpt_segs_added,
		 CheckpointStats.ckpt_segs_removed,
		 CheckpointStats.ckpt_segs_recycled,
		 write_msecs / 1000, (int) (write_msecs % 1000),
		 sync_msecs / 1000, (int) (sync_msecs % 1000),
		 total_msecs / 1000, (int) (total_msecs % 1000),
		 CheckpointStats.ckpt_sync_rels,
		 longest_msecs / 1000, (int) (longest_msecs % 1000),
		 average_msecs / 1000, (int) (average_msecs % 1000),
		 (int) (PrevCheckPointDistance / 1024.0),
		 (int) (CheckPointDistanceEstimate / 1024.0));
}

8593 /*
8594 * Update the estimate of distance between checkpoints.
8595 *
8596 * The estimate is used to calculate the number of WAL segments to keep
8597 * preallocated, see XLOGFileSlop().
8598 */
8599 static void
UpdateCheckPointDistanceEstimate(uint64 nbytes)8600 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8601 {
8602 /*
8603 * To estimate the number of segments consumed between checkpoints, keep a
8604 * moving average of the amount of WAL generated in previous checkpoint
8605 * cycles. However, if the load is bursty, with quiet periods and busy
8606 * periods, we want to cater for the peak load. So instead of a plain
8607 * moving average, let the average decline slowly if the previous cycle
8608 * used less WAL than estimated, but bump it up immediately if it used
8609 * more.
8610 *
8611 * When checkpoints are triggered by max_wal_size, this should converge to
8612 * CheckpointSegments * wal_segment_size,
8613 *
8614 * Note: This doesn't pay any attention to what caused the checkpoint.
8615 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8616 * starting a base backup, are counted the same as those created
8617 * automatically. The slow-decline will largely mask them out, if they are
8618 * not frequent. If they are frequent, it seems reasonable to count them
8619 * in as any others; if you issue a manual checkpoint every 5 minutes and
8620 * never let a timed checkpoint happen, it makes sense to base the
8621 * preallocation on that 5 minute interval rather than whatever
8622 * checkpoint_timeout is set to.
8623 */
8624 PrevCheckPointDistance = nbytes;
8625 if (CheckPointDistanceEstimate < nbytes)
8626 CheckPointDistanceEstimate = nbytes;
8627 else
8628 CheckPointDistanceEstimate =
8629 (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8630 }
8631
/*
 * Perform a checkpoint --- either during shutdown, or on-the-fly
 *
 * flags is a bitwise OR of the following:
 *	CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
 *	CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
 *	CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
 *		ignoring checkpoint_completion_target parameter.
 *	CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
 *		since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
 *		CHECKPOINT_END_OF_RECOVERY).
 *	CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
 *
 * Note: flags contains other bits, of interest here only for logging purposes.
 * In particular note that this routine is synchronous and does not pay
 * attention to CHECKPOINT_WAIT.
 *
 * If !shutdown then we are writing an online checkpoint. This is a very special
 * kind of operation and WAL record because the checkpoint action occurs over
 * a period of time yet logically occurs at just a single LSN. The logical
 * position of the WAL record (redo ptr) is the same or earlier than the
 * physical position. When we replay WAL we locate the checkpoint via its
 * physical position then read the redo ptr and actually start replay at the
 * earlier logical position. Note that we don't write *anything* to WAL at
 * the logical position, so that location could be any other kind of WAL record.
 * All of this mechanism allows us to continue working while we checkpoint.
 * As a result, timing of actions is critical here and be careful to note that
 * this function will likely take minutes to execute on a busy system.
 */
void
CreateCheckPoint(int flags)
{
	bool		shutdown;
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogSegNo	_logSegNo;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint32		freespace;
	XLogRecPtr	PriorRedoPtr;
	XLogRecPtr	curInsert;
	XLogRecPtr	last_important_lsn;
	VirtualTransactionId *vxids;
	int			nvxids;

	/*
	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
	 * issued at a different time.
	 */
	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
		shutdown = true;
	else
		shutdown = false;

	/* sanity check */
	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
		elog(ERROR, "can't create a checkpoint during recovery");

	/*
	 * Initialize InitXLogInsert working areas before entering the critical
	 * section.  Normally, this is done by the first call to
	 * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
	 * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
	 * done below in a critical section, and InitXLogInsert cannot be called
	 * in a critical section.
	 */
	InitXLogInsert();

	/*
	 * Acquire CheckpointLock to ensure only one checkpoint happens at a
	 * time.  (This is just pro forma, since in the present system structure
	 * there is only one process that is allowed to issue checkpoints at any
	 * given time.)
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	/*
	 * Use a critical section to force system panic if we have trouble.
	 */
	START_CRIT_SECTION();

	if (shutdown)
	{
		/* Advertise in pg_control that a shutdown checkpoint has begun */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = (pg_time_t) time(NULL);
		UpdateControlFile();
		LWLockRelease(ControlFileLock);
	}

	/*
	 * Let smgr prepare for checkpoint; this has to happen before we
	 * determine the REDO pointer.  Note that smgr must not do anything
	 * that'd have to be undone if we decide no checkpoint is needed.
	 */
	SyncPreCheckpoint();

	/* Begin filling in the checkpoint WAL record */
	MemSet(&checkPoint, 0, sizeof(checkPoint));
	checkPoint.time = (pg_time_t) time(NULL);

	/*
	 * For Hot Standby, derive the oldestActiveXid before we fix the redo
	 * pointer.  This allows us to begin accumulating changes to assemble our
	 * starting snapshot of locks and transactions.
	 */
	if (!shutdown && XLogStandbyInfoActive())
		checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
	else
		checkPoint.oldestActiveXid = InvalidTransactionId;

	/*
	 * Get location of last important record before acquiring insert locks
	 * (as GetLastImportantRecPtr() also locks WAL locks).
	 */
	last_important_lsn = GetLastImportantRecPtr();

	/*
	 * We must block concurrent insertions while examining insert state to
	 * determine the checkpoint REDO pointer.
	 */
	WALInsertLockAcquireExclusive();
	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);

	/*
	 * If this isn't a shutdown or forced checkpoint, and if there has been
	 * no WAL activity requiring a checkpoint, skip it.  The idea here is to
	 * avoid inserting duplicate checkpoints when the system is idle.
	 */
	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
				  CHECKPOINT_FORCE)) == 0)
	{
		if (last_important_lsn == ControlFile->checkPoint)
		{
			/* Release everything acquired so far, in reverse order */
			WALInsertLockRelease();
			LWLockRelease(CheckpointLock);
			END_CRIT_SECTION();
			ereport(DEBUG1,
					(errmsg("checkpoint skipped because system is idle")));
			return;
		}
	}

	/*
	 * An end-of-recovery checkpoint is created before anyone is allowed to
	 * write WAL.  To allow us to write the checkpoint record, temporarily
	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
	 */
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		LocalSetXLogInsertAllowed();

	checkPoint.ThisTimeLineID = ThisTimeLineID;
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	else
		checkPoint.PrevTimeLineID = ThisTimeLineID;

	checkPoint.fullPageWrites = Insert->fullPageWrites;

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will
	 * be, since other backends may insert more XLOG records while we're off
	 * doing the buffer flush work.  Those XLOG records are logically after
	 * the checkpoint, even though physically before it.  Got that?
	 */
	freespace = INSERT_FREESPACE(curInsert);
	if (freespace == 0)
	{
		/* Page is exactly full; skip over the next page's header */
		if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
			curInsert += SizeOfXLogLongPHD;
		else
			curInsert += SizeOfXLogShortPHD;
	}
	checkPoint.redo = curInsert;

	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
	 * must be done while holding all the insertion locks.
	 *
	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
	 * pointing past where it really needs to point.  This is okay; the only
	 * consequence is that XLogInsert might back up whole buffers that it
	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
	 * XLogInserts that happen while we are dumping buffers must assume that
	 * their buffer changes are not included in the checkpoint.
	 */
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;

	/*
	 * Now we can release the WAL insertion locks, allowing other xacts to
	 * proceed while we are flushing disk buffers.
	 */
	WALInsertLockRelease();

	/* Update the info_lck-protected copy of RedoRecPtr as well */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = checkPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * If enabled, log checkpoint start.  We postpone this until now so as
	 * not to log anything if we decided to skip the checkpoint.
	 */
	if (log_checkpoints)
		LogCheckpointStart(flags, false);

	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

	/*
	 * Get the other info we need for the checkpoint record.
	 *
	 * We don't need to save oldestClogXid in the checkpoint, it only matters
	 * for the short period in which clog is being truncated, and if we crash
	 * during that we'll redo the clog truncation and fix up oldestClogXid
	 * there.
	 */
	LWLockAcquire(XidGenLock, LW_SHARED);
	checkPoint.nextFullXid = ShmemVariableCache->nextFullXid;
	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
	LWLockRelease(XidGenLock);

	LWLockAcquire(CommitTsLock, LW_SHARED);
	checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
	checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
	LWLockRelease(CommitTsLock);

	LWLockAcquire(OidGenLock, LW_SHARED);
	checkPoint.nextOid = ShmemVariableCache->nextOid;
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
	LWLockRelease(OidGenLock);

	MultiXactGetCheckptMulti(shutdown,
							 &checkPoint.nextMulti,
							 &checkPoint.nextMultiOffset,
							 &checkPoint.oldestMulti,
							 &checkPoint.oldestMultiDB);

	/*
	 * Having constructed the checkpoint record, ensure all shmem disk
	 * buffers and commit-log buffers are flushed to disk.
	 *
	 * This I/O could fail for various reasons.  If so, we will fail to
	 * complete the checkpoint, but there is no reason to force a system
	 * panic.  Accordingly, exit critical section while doing it.
	 */
	END_CRIT_SECTION();

	/*
	 * In some cases there are groups of actions that must all occur on one
	 * side or the other of a checkpoint record.  Before flushing the
	 * checkpoint record we must explicitly wait for any backend currently
	 * performing those groups of actions.
	 *
	 * One example is end of transaction, so we must wait for any
	 * transactions that are currently in commit critical sections.  If an
	 * xact inserted its commit record into XLOG just before the REDO point,
	 * then a crash restart from the REDO point would not replay that record,
	 * which means that our flushing had better include the xact's update of
	 * pg_xact.  So we wait till he's out of his commit critical section
	 * before proceeding.  See notes in RecordTransactionCommit().
	 *
	 * Because we've already released the insertion locks, this test is a bit
	 * fuzzy: it is possible that we will wait for xacts we didn't really
	 * need to wait for.  But the delay should be short and it seems better
	 * to make checkpoint take a bit longer than to hold off insertions
	 * longer than necessary.  (In fact, the whole reason we have this issue
	 * is that xact.c does commit record XLOG insertion and clog update as
	 * two separate steps protected by different locks, but again that seems
	 * best on grounds of minimizing lock contention.)
	 *
	 * A transaction that has not yet set delayChkpt when we look cannot be
	 * at risk, since he's not inserted his commit record yet; and one that's
	 * already cleared it is not at risk either, since he's done fixing clog
	 * and we will correctly flush the update below.  So we cannot miss any
	 * xacts we need to wait for.
	 */
	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
	if (nvxids > 0)
	{
		do
		{
			pg_usleep(10000L);	/* wait for 10 msec */
		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
	}
	pfree(vxids);

	/* Flush all dirty buffers, SLRUs, etc. down to the REDO point */
	CheckPointGuts(checkPoint.redo, flags);

	/*
	 * Take a snapshot of running transactions and write this to WAL.  This
	 * allows us to reconstruct the state of running transactions during
	 * archive recovery, if required.  Skip, if this info disabled.
	 *
	 * If we are shutting down, or Startup process is completing crash
	 * recovery we don't need to write running xact data.
	 */
	if (!shutdown && XLogStandbyInfoActive())
		LogStandbySnapshot();

	START_CRIT_SECTION();

	/*
	 * Now insert the checkpoint record into XLOG.
	 */
	XLogBeginInsert();
	XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE);

	XLogFlush(recptr);

	/*
	 * We mustn't write any new WAL after a shutdown checkpoint, or it will
	 * be overwritten at next startup.  No-one should even try, this just
	 * allows sanity-checking.  In the case of an end-of-recovery checkpoint,
	 * we want to just temporarily disable writing until the system has
	 * exited recovery.
	 */
	if (shutdown)
	{
		if (flags & CHECKPOINT_END_OF_RECOVERY)
			LocalXLogInsertAllowed = -1;	/* return to "check" state */
		else
			LocalXLogInsertAllowed = 0; /* never again write WAL */
	}

	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
	 * = end of actual checkpoint record.
	 */
	if (shutdown && checkPoint.redo != ProcLastRecPtr)
		ereport(PANIC,
				(errmsg("concurrent write-ahead log activity while database system is shutting down")));

	/*
	 * Remember the prior checkpoint's redo ptr for
	 * UpdateCheckPointDistanceEstimate()
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update the control file.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
	ControlFile->time = (pg_time_t) time(NULL);
	/* crash recovery should always recover to the end of WAL */
	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
	ControlFile->minRecoveryPointTLI = 0;

	/*
	 * Persist unloggedLSN value.  It's reset on crash recovery, so this goes
	 * unused on non-shutdown checkpoints, but seems useful to store it
	 * always for debugging purposes.
	 */
	SpinLockAcquire(&XLogCtl->ulsn_lck);
	ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
	SpinLockRelease(&XLogCtl->ulsn_lck);

	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	/* Update shared-memory copy of checkpoint XID/epoch */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->ckptFullXid = checkPoint.nextFullXid;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * We are now done with critical updates; no need for system panic if we
	 * have trouble while fooling with old log segments.
	 */
	END_CRIT_SECTION();

	/*
	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
	 */
	SyncPostCheckpoint();

	/*
	 * Update the average distance between checkpoints if the prior
	 * checkpoint exists.
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

	/*
	 * Delete old log files, those no longer needed for last checkpoint to
	 * prevent the disk holding the xlog from growing full.
	 */
	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
	KeepLogSeg(recptr, &_logSegNo);
	_logSegNo--;
	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);

	/*
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.  No future transaction
	 * will attempt to reference any pg_subtrans entry older than that (see
	 * Asserts in subtrans.c).  During recovery, though, we mustn't do this
	 * because StartupSUBTRANS hasn't been called yet.
	 */
	if (!RecoveryInProgress())
		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));

	/* Real work is done, but log and update stats before releasing lock. */
	LogCheckpointEnd(false);

	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
									 NBuffers,
									 CheckpointStats.ckpt_segs_added,
									 CheckpointStats.ckpt_segs_removed,
									 CheckpointStats.ckpt_segs_recycled);

	LWLockRelease(CheckpointLock);
}

/*
 * Mark the end of recovery in WAL though without running a full checkpoint.
 * We can expect that a restartpoint is likely to be in progress as we
 * do this, though we are unwilling to wait for it to complete. So be
 * careful to avoid taking the CheckpointLock anywhere here.
 *
 * CreateRestartPoint() allows for the case where recovery may end before
 * the restartpoint completes so there is no concern of concurrent behaviour.
 *
 * Emits an XLOG_END_OF_RECOVERY record recording the timeline switch and
 * the end-of-recovery timestamp, flushes it, and advances the control
 * file's minimum recovery point past it.
 */
static void
CreateEndOfRecoveryRecord(void)
{
	xl_end_of_recovery xlrec;
	XLogRecPtr	recptr;

	/* sanity check: this record only makes sense while ending recovery */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used to end recovery");

	xlrec.end_time = GetCurrentTimestamp();

	/*
	 * Read the old/new timeline IDs under the WAL-insert locks so we get a
	 * consistent pair.
	 */
	WALInsertLockAcquireExclusive();
	xlrec.ThisTimeLineID = ThisTimeLineID;
	xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	WALInsertLockRelease();

	/*
	 * Temporarily permit WAL insertion by this (startup) process even though
	 * recovery is still officially in progress.
	 */
	LocalSetXLogInsertAllowed();

	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
	recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);

	/* Make sure the record is durable before touching pg_control */
	XLogFlush(recptr);

	/*
	 * Update the control file so that crash recovery can follow the timeline
	 * changes to this point.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	ControlFile->time = (pg_time_t) time(NULL);
	ControlFile->minRecoveryPoint = recptr;
	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	END_CRIT_SECTION();

	LocalXLogInsertAllowed = -1;	/* return to "check" state */
}
9124
/*
 * Write an OVERWRITE_CONTRECORD message.
 *
 * When on WAL replay we expect a continuation record at the start of a page
 * that is not there, recovery ends and WAL writing resumes at that point.
 * But it's wrong to resume writing new WAL back at the start of the record
 * that was broken, because downstream consumers of that WAL (physical
 * replicas) are not prepared to "rewind". So the first action after
 * finishing replay of all valid WAL must be to write a record of this type
 * at the point where the contrecord was missing; to support xlogreader
 * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
 * to the page header where the record occurs. xlogreader has an ad-hoc
 * mechanism to report metadata about the broken record, which is what we
 * use here.
 *
 * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
 * skip the record it was reading, and pass back the LSN of the skipped
 * record, so that its caller can verify (on "replay" of that record) that the
 * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
 *
 * 'aborted_lsn' is the LSN of the broken record being overwritten; the
 * return value is the end+1 LSN of the record written here.
 */
static XLogRecPtr
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)
{
	xl_overwrite_contrecord xlrec;
	XLogRecPtr	recptr;

	/* sanity check: only valid while finishing recovery */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used at end of recovery");

	/* Record which LSN is being overwritten, and when */
	xlrec.overwritten_lsn = aborted_lsn;
	xlrec.overwrite_time = GetCurrentTimestamp();

	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));

	recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);

	/* Flush immediately so the overwrite marker is durable */
	XLogFlush(recptr);

	END_CRIT_SECTION();

	return recptr;
}
9171
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 *
 * checkPointRedo is the checkpoint's redo pointer, consumed only by
 * CheckPointTwoPhase(); flags is the CHECKPOINT_* bitmask, consumed only
 * by CheckPointBuffers().  The call order below is deliberate.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	/* Flush transaction-status and other auxiliary subsystems first */
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointBuffers(flags);	/* performs all required fsyncs */
	CheckPointReplicationOrigin();
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
9195
/*
 * Save a checkpoint for recovery restart if appropriate
 *
 * This function is called each time a checkpoint record is read from XLOG.
 * It must determine whether the checkpoint represents a safe restartpoint or
 * not.  If so, the checkpoint record is stashed in shared memory so that
 * CreateRestartPoint can consult it.  (Note that the latter function is
 * executed by the checkpointer, while this one will be executed by the
 * startup process.)
 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
	/*
	 * Also refrain from creating a restartpoint if we have seen any
	 * references to non-existent pages. Restarting recovery from the
	 * restartpoint would not see the references, so we would lose the
	 * cross-check that the pages belonged to a relation that was dropped
	 * later.
	 */
	if (XLogHaveInvalidPages())
	{
		elog(trace_recovery(DEBUG2),
			 "could not record restart point at %X/%X because there "
			 "are unresolved references to invalid pages",
			 (uint32) (checkPoint->redo >> 32),
			 (uint32) checkPoint->redo);
		return;
	}

	/*
	 * Copy the checkpoint record to shared memory, so that checkpointer can
	 * work out the next time it wants to perform a restartpoint.
	 * (ReadRecPtr/EndRecPtr are the start/end of the just-read record.)
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	XLogCtl->lastCheckPointEndPtr = EndRecPtr;
	XLogCtl->lastCheckPoint = *checkPoint;
	SpinLockRelease(&XLogCtl->info_lck);
}
9236
/*
 * Establish a restartpoint if possible.
 *
 * This is similar to CreateCheckPoint, but is used during WAL recovery
 * to establish a point from which recovery can roll forward without
 * replaying the entire recovery log.
 *
 * Returns true if a new restartpoint was established. We can only establish
 * a restartpoint if we have replayed a safe checkpoint record since last
 * restartpoint.
 *
 * 'flags' is a CHECKPOINT_* bitmask; only CHECKPOINT_IS_SHUTDOWN is
 * inspected directly here, the rest is passed through to CheckPointGuts()
 * and LogCheckpointStart().
 */
bool
CreateRestartPoint(int flags)
{
	XLogRecPtr	lastCheckPointRecPtr;
	XLogRecPtr	lastCheckPointEndPtr;
	CheckPoint	lastCheckPoint;
	XLogRecPtr	PriorRedoPtr;
	XLogRecPtr	receivePtr;
	XLogRecPtr	replayPtr;
	TimeLineID	replayTLI;
	XLogRecPtr	endptr;
	XLogSegNo	_logSegNo;
	TimestampTz xtime;

	/*
	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
	 * happens at a time.
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/* Get a local copy of the last safe checkpoint record. */
	SpinLockAcquire(&XLogCtl->info_lck);
	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
	lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
	lastCheckPoint = XLogCtl->lastCheckPoint;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, recovery has already ended")));
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
	 * necessary, but when hot standby is enabled, it would be rather weird if
	 * the database opened up for read-only connections at a point-in-time
	 * before the last shutdown. Such time travel is still possible in case of
	 * immediate shutdown, though.
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
	 */
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
	{
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, already performed at %X/%X",
						(uint32) (lastCheckPoint.redo >> 32),
						(uint32) lastCheckPoint.redo)));

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
		if (flags & CHECKPOINT_IS_SHUTDOWN)
		{
			/* Record a clean shutdown-in-recovery state in pg_control */
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
			ControlFile->time = (pg_time_t) time(NULL);
			UpdateControlFile();
			LWLockRelease(ControlFileLock);
		}
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * Update the shared RedoRecPtr so that the startup process can calculate
	 * the number of segments replayed since last restartpoint, and request a
	 * restartpoint if it exceeds CheckPointSegments.
	 *
	 * Like in CreateCheckPoint(), hold off insertions to update it, although
	 * during recovery this is just pro forma, because no WAL insertions are
	 * happening.
	 */
	WALInsertLockAcquireExclusive();
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
	WALInsertLockRelease();

	/* Also update the info_lck-protected copy */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	if (log_checkpoints)
		LogCheckpointStart(flags, true);

	/* Do the actual flushing and fsync work */
	CheckPointGuts(lastCheckPoint.redo, flags);

	/*
	 * Remember the prior checkpoint's redo ptr for
	 * UpdateCheckPointDistanceEstimate()
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update pg_control, using current time.  Check that it still shows
	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
	 * this is a quick hack to make sure nothing really bad happens if somehow
	 * we get here after the end-of-recovery checkpoint.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
	{
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);

		/*
		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
		 * this will have happened already while writing out dirty buffers,
		 * but not necessarily - e.g. because no buffers were dirtied.  We do
		 * this because a non-exclusive base backup uses minRecoveryPoint to
		 * determine which WAL files must be included in the backup, and the
		 * file (or files) containing the checkpoint record must be included,
		 * at a minimum. Note that for an ordinary restart of recovery there's
		 * no value in having the minimum recovery point any earlier than this
		 * anyway, because redo will begin just after the checkpoint record.
		 */
		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
		{
			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
			ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;

			/* update local copy */
			minRecoveryPoint = ControlFile->minRecoveryPoint;
			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		}
		if (flags & CHECKPOINT_IS_SHUTDOWN)
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
		UpdateControlFile();
	}
	LWLockRelease(ControlFileLock);

	/*
	 * Update the average distance between checkpoints/restartpoints if the
	 * prior checkpoint exists.
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

	/*
	 * Delete old log files, those no longer needed for last restartpoint to
	 * prevent the disk holding the xlog from growing full.
	 */
	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);

	/*
	 * Retreat _logSegNo using the current end of xlog replayed or received,
	 * whichever is later.
	 */
	receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
	replayPtr = GetXLogReplayRecPtr(&replayTLI);
	endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
	KeepLogSeg(endptr, &_logSegNo);
	_logSegNo--;

	/*
	 * Try to recycle segments on a useful timeline. If we've been promoted
	 * since the beginning of this restartpoint, use the new timeline chosen
	 * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
	 * case). If we're still in recovery, use the timeline we're currently
	 * replaying.
	 *
	 * There is no guarantee that the WAL segments will be useful on the
	 * current timeline; if recovery proceeds to a new timeline right after
	 * this, the pre-allocated WAL segments on this timeline will not be used,
	 * and will go wasted until recycled on the next restartpoint. We'll live
	 * with that.
	 */
	if (RecoveryInProgress())
		ThisTimeLineID = replayTLI;

	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);

	/*
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
	 */
	PreallocXlogFiles(endptr);

	/*
	 * ThisTimeLineID is normally not set when we're still in recovery.
	 * However, recycling/preallocating segments above needed ThisTimeLineID
	 * to determine which timeline to install the segments on. Reset it now,
	 * to restore the normal state of affairs for debugging purposes.
	 */
	if (RecoveryInProgress())
		ThisTimeLineID = 0;

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.  No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).  When hot standby is disabled, though, we mustn't do
	 * this because StartupSUBTRANS hasn't been called yet.
	 */
	if (EnableHotStandby)
		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));

	/* Real work is done, but log and update before releasing lock. */
	LogCheckpointEnd(true);

	xtime = GetLatestXTime();
	ereport((log_checkpoints ? LOG : DEBUG2),
			(errmsg("recovery restart point at %X/%X",
					(uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
			 xtime ? errdetail("Last completed transaction was at log time %s.",
							   timestamptz_to_str(xtime)) : 0));

	LWLockRelease(CheckpointLock);

	/*
	 * Finally, execute archive_cleanup_command, if any.
	 */
	if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
		ExecuteRecoveryCommand(archiveCleanupCommand,
							   "archive_cleanup_command",
							   false);

	return true;
}
9489
9490 /*
9491 * Retreat *logSegNo to the last segment that we need to retain because of
9492 * either wal_keep_segments or replication slots.
9493 *
9494 * This is calculated by subtracting wal_keep_segments from the given xlog
9495 * location, recptr and by making sure that that result is below the
9496 * requirement of replication slots.
9497 */
9498 static void
KeepLogSeg(XLogRecPtr recptr,XLogSegNo * logSegNo)9499 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9500 {
9501 XLogSegNo segno;
9502 XLogRecPtr keep;
9503
9504 XLByteToSeg(recptr, segno, wal_segment_size);
9505 keep = XLogGetReplicationSlotMinimumLSN();
9506
9507 /* compute limit for wal_keep_segments first */
9508 if (wal_keep_segments > 0)
9509 {
9510 /* avoid underflow, don't go below 1 */
9511 if (segno <= wal_keep_segments)
9512 segno = 1;
9513 else
9514 segno = segno - wal_keep_segments;
9515 }
9516
9517 /* then check whether slots limit removal further */
9518 if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9519 {
9520 XLogSegNo slotSegNo;
9521
9522 XLByteToSeg(keep, slotSegNo, wal_segment_size);
9523
9524 if (slotSegNo <= 0)
9525 segno = 1;
9526 else if (slotSegNo < segno)
9527 segno = slotSegNo;
9528 }
9529
9530 /* don't delete WAL segments newer than the calculated segment */
9531 if (segno < *logSegNo)
9532 *logSegNo = segno;
9533 }
9534
9535 /*
9536 * Write a NEXTOID log record
9537 */
9538 void
XLogPutNextOid(Oid nextOid)9539 XLogPutNextOid(Oid nextOid)
9540 {
9541 XLogBeginInsert();
9542 XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9543 (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9544
9545 /*
9546 * We need not flush the NEXTOID record immediately, because any of the
9547 * just-allocated OIDs could only reach disk as part of a tuple insert or
9548 * update that would have its own XLOG record that must follow the NEXTOID
9549 * record. Therefore, the standard buffer LSN interlock applied to those
9550 * records will ensure no such OID reaches disk before the NEXTOID record
9551 * does.
9552 *
9553 * Note, however, that the above statement only covers state "within" the
9554 * database. When we use a generated OID as a file or directory name, we
9555 * are in a sense violating the basic WAL rule, because that filesystem
9556 * change may reach disk before the NEXTOID WAL record does. The impact
9557 * of this is that if a database crash occurs immediately afterward, we
9558 * might after restart re-generate the same OID and find that it conflicts
9559 * with the leftover file or directory. But since for safety's sake we
9560 * always loop until finding a nonconflicting filename, this poses no real
9561 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9562 */
9563 }
9564
9565 /*
9566 * Write an XLOG SWITCH record.
9567 *
9568 * Here we just blindly issue an XLogInsert request for the record.
9569 * All the magic happens inside XLogInsert.
9570 *
9571 * The return value is either the end+1 address of the switch record,
9572 * or the end+1 address of the prior segment if we did not need to
9573 * write a switch record because we are already at segment start.
9574 */
9575 XLogRecPtr
RequestXLogSwitch(bool mark_unimportant)9576 RequestXLogSwitch(bool mark_unimportant)
9577 {
9578 XLogRecPtr RecPtr;
9579
9580 /* XLOG SWITCH has no data */
9581 XLogBeginInsert();
9582
9583 if (mark_unimportant)
9584 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9585 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9586
9587 return RecPtr;
9588 }
9589
9590 /*
9591 * Write a RESTORE POINT record
9592 */
9593 XLogRecPtr
XLogRestorePoint(const char * rpName)9594 XLogRestorePoint(const char *rpName)
9595 {
9596 XLogRecPtr RecPtr;
9597 xl_restore_point xlrec;
9598
9599 xlrec.rp_time = GetCurrentTimestamp();
9600 strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9601
9602 XLogBeginInsert();
9603 XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9604
9605 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9606
9607 ereport(LOG,
9608 (errmsg("restore point \"%s\" created at %X/%X",
9609 rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9610
9611 return RecPtr;
9612 }
9613
/*
 * Check if any of the GUC parameters that are critical for hot standby
 * have changed, and update the value in pg_control file if necessary.
 *
 * If any tracked setting differs from pg_control, optionally WAL-log the
 * new values (only when needed for archive recovery) and then always
 * refresh the copies stored in pg_control.
 */
static void
XLogReportParameters(void)
{
	/* Compare each tracked GUC against its saved copy in pg_control */
	if (wal_level != ControlFile->wal_level ||
		wal_log_hints != ControlFile->wal_log_hints ||
		MaxConnections != ControlFile->MaxConnections ||
		max_worker_processes != ControlFile->max_worker_processes ||
		max_wal_senders != ControlFile->max_wal_senders ||
		max_prepared_xacts != ControlFile->max_prepared_xacts ||
		max_locks_per_xact != ControlFile->max_locks_per_xact ||
		track_commit_timestamp != ControlFile->track_commit_timestamp)
	{
		/*
		 * The change in number of backend slots doesn't need to be WAL-logged
		 * if archiving is not enabled, as you can't start archive recovery
		 * with wal_level=minimal anyway. We don't really care about the
		 * values in pg_control either if wal_level=minimal, but seems better
		 * to keep them up-to-date to avoid confusion.
		 */
		if (wal_level != ControlFile->wal_level || XLogIsNeeded())
		{
			xl_parameter_change xlrec;
			XLogRecPtr	recptr;

			xlrec.MaxConnections = MaxConnections;
			xlrec.max_worker_processes = max_worker_processes;
			xlrec.max_wal_senders = max_wal_senders;
			xlrec.max_prepared_xacts = max_prepared_xacts;
			xlrec.max_locks_per_xact = max_locks_per_xact;
			xlrec.wal_level = wal_level;
			xlrec.wal_log_hints = wal_log_hints;
			xlrec.track_commit_timestamp = track_commit_timestamp;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, sizeof(xlrec));

			/* Flush so the record is durable before pg_control is updated */
			recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
			XLogFlush(recptr);
		}

		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

		/* Mirror the current settings into pg_control */
		ControlFile->MaxConnections = MaxConnections;
		ControlFile->max_worker_processes = max_worker_processes;
		ControlFile->max_wal_senders = max_wal_senders;
		ControlFile->max_prepared_xacts = max_prepared_xacts;
		ControlFile->max_locks_per_xact = max_locks_per_xact;
		ControlFile->wal_level = wal_level;
		ControlFile->wal_log_hints = wal_log_hints;
		ControlFile->track_commit_timestamp = track_commit_timestamp;
		UpdateControlFile();

		LWLockRelease(ControlFileLock);
	}
}
9673
/*
 * Update full_page_writes in shared memory, and write an
 * XLOG_FPW_CHANGE record if necessary.
 *
 * Note: this function assumes there is no other process running
 * concurrently that could update it.
 *
 * The order of operations below is significant: the shared flag is raised
 * before logging when enabling, and lowered after logging when disabling
 * (see comment in the body).
 */
void
UpdateFullPageWrites(void)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	bool		recoveryInProgress;

	/*
	 * Do nothing if full_page_writes has not been changed.
	 *
	 * It's safe to check the shared full_page_writes without the lock,
	 * because we assume that there is no concurrently running process which
	 * can update it.
	 */
	if (fullPageWrites == Insert->fullPageWrites)
		return;

	/*
	 * Perform this outside critical section so that the WAL insert
	 * initialization done by RecoveryInProgress() doesn't trigger an
	 * assertion failure.
	 */
	recoveryInProgress = RecoveryInProgress();

	START_CRIT_SECTION();

	/*
	 * It's always safe to take full page images, even when not strictly
	 * required, but not the other round. So if we're setting full_page_writes
	 * to true, first set it true and then write the WAL record. If we're
	 * setting it to false, first write the WAL record and then set the global
	 * flag.
	 */
	if (fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = true;
		WALInsertLockRelease();
	}

	/*
	 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
	 * full_page_writes during archive recovery, if required.
	 */
	if (XLogStandbyInfoActive() && !recoveryInProgress)
	{
		XLogBeginInsert();
		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));

		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
	}

	/* Now it is safe to lower the shared flag, if that's the direction */
	if (!fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = false;
		WALInsertLockRelease();
	}
	END_CRIT_SECTION();
}
9740
/*
 * Check that it's OK to switch to new timeline during recovery.
 *
 * 'lsn' is the address of the shutdown checkpoint record we're about to
 * replay. (Currently, timeline can only change at a shutdown checkpoint).
 *
 * Any violation is reported at PANIC level; on success the function simply
 * returns.
 */
static void
checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
{
	/* Check that the record agrees on what the current (old) timeline is */
	if (prevTLI != ThisTimeLineID)
		ereport(PANIC,
				(errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
						prevTLI, ThisTimeLineID)));

	/*
	 * The new timeline better be in the list of timelines we expect to see,
	 * according to the timeline history. It should also not decrease.
	 */
	if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
		ereport(PANIC,
				(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
						newTLI, ThisTimeLineID)));

	/*
	 * If we have not yet reached min recovery point, and we're about to
	 * switch to a timeline greater than the timeline of the min recovery
	 * point: trouble. After switching to the new timeline, we could not
	 * possibly visit the min recovery point on the correct timeline anymore.
	 * This can happen if there is a newer timeline in the archive that
	 * branched before the timeline the min recovery point is on, and you
	 * attempt to do PITR to the new timeline.
	 */
	if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
		lsn < minRecoveryPoint &&
		newTLI > minRecoveryPointTLI)
		ereport(PANIC,
				(errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
						newTLI,
						(uint32) (minRecoveryPoint >> 32),
						(uint32) minRecoveryPoint,
						minRecoveryPointTLI)));

	/* Looks good */
}
9786
9787 /*
9788 * XLOG resource manager's routines
9789 *
9790 * Definitions of info values are in include/catalog/pg_control.h, though
9791 * not all record types are related to control file updates.
9792 */
9793 void
xlog_redo(XLogReaderState * record)9794 xlog_redo(XLogReaderState *record)
9795 {
9796 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9797 XLogRecPtr lsn = record->EndRecPtr;
9798
9799 /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
9800 Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
9801 !XLogRecHasAnyBlockRefs(record));
9802
9803 if (info == XLOG_NEXTOID)
9804 {
9805 Oid nextOid;
9806
9807 /*
9808 * We used to try to take the maximum of ShmemVariableCache->nextOid
9809 * and the recorded nextOid, but that fails if the OID counter wraps
9810 * around. Since no OID allocation should be happening during replay
9811 * anyway, better to just believe the record exactly. We still take
9812 * OidGenLock while setting the variable, just in case.
9813 */
9814 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9815 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9816 ShmemVariableCache->nextOid = nextOid;
9817 ShmemVariableCache->oidCount = 0;
9818 LWLockRelease(OidGenLock);
9819 }
9820 else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9821 {
9822 CheckPoint checkPoint;
9823
9824 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9825 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9826 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9827 ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
9828 LWLockRelease(XidGenLock);
9829 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9830 ShmemVariableCache->nextOid = checkPoint.nextOid;
9831 ShmemVariableCache->oidCount = 0;
9832 LWLockRelease(OidGenLock);
9833 MultiXactSetNextMXact(checkPoint.nextMulti,
9834 checkPoint.nextMultiOffset);
9835
9836 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9837 checkPoint.oldestMultiDB);
9838
9839 /*
9840 * No need to set oldestClogXid here as well; it'll be set when we
9841 * redo an xl_clog_truncate if it changed since initialization.
9842 */
9843 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9844
9845 /*
9846 * If we see a shutdown checkpoint while waiting for an end-of-backup
9847 * record, the backup was canceled and the end-of-backup record will
9848 * never arrive.
9849 */
9850 if (ArchiveRecoveryRequested &&
9851 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9852 XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9853 ereport(PANIC,
9854 (errmsg("online backup was canceled, recovery cannot continue")));
9855
9856 /*
9857 * If we see a shutdown checkpoint, we know that nothing was running
9858 * on the master at this point. So fake-up an empty running-xacts
9859 * record and use that here and now. Recover additional standby state
9860 * for prepared transactions.
9861 */
9862 if (standbyState >= STANDBY_INITIALIZED)
9863 {
9864 TransactionId *xids;
9865 int nxids;
9866 TransactionId oldestActiveXID;
9867 TransactionId latestCompletedXid;
9868 RunningTransactionsData running;
9869
9870 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9871
9872 /*
9873 * Construct a RunningTransactions snapshot representing a shut
9874 * down server, with only prepared transactions still alive. We're
9875 * never overflowed at this point because all subxids are listed
9876 * with their parent prepared transactions.
9877 */
9878 running.xcnt = nxids;
9879 running.subxcnt = 0;
9880 running.subxid_overflow = false;
9881 running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
9882 running.oldestRunningXid = oldestActiveXID;
9883 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
9884 TransactionIdRetreat(latestCompletedXid);
9885 Assert(TransactionIdIsNormal(latestCompletedXid));
9886 running.latestCompletedXid = latestCompletedXid;
9887 running.xids = xids;
9888
9889 ProcArrayApplyRecoveryInfo(&running);
9890
9891 StandbyRecoverPreparedTransactions();
9892 }
9893
9894 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9895 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9896 ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
9897 LWLockRelease(ControlFileLock);
9898
9899 /* Update shared-memory copy of checkpoint XID/epoch */
9900 SpinLockAcquire(&XLogCtl->info_lck);
9901 XLogCtl->ckptFullXid = checkPoint.nextFullXid;
9902 SpinLockRelease(&XLogCtl->info_lck);
9903
9904 /*
9905 * We should've already switched to the new TLI before replaying this
9906 * record.
9907 */
9908 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9909 ereport(PANIC,
9910 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9911 checkPoint.ThisTimeLineID, ThisTimeLineID)));
9912
9913 RecoveryRestartPoint(&checkPoint);
9914 }
9915 else if (info == XLOG_CHECKPOINT_ONLINE)
9916 {
9917 CheckPoint checkPoint;
9918
9919 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9920 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
9921 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9922 if (FullTransactionIdPrecedes(ShmemVariableCache->nextFullXid,
9923 checkPoint.nextFullXid))
9924 ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
9925 LWLockRelease(XidGenLock);
9926
9927 /*
9928 * We ignore the nextOid counter in an ONLINE checkpoint, preferring
9929 * to track OID assignment through XLOG_NEXTOID records. The nextOid
9930 * counter is from the start of the checkpoint and might well be stale
9931 * compared to later XLOG_NEXTOID records. We could try to take the
9932 * maximum of the nextOid counter and our latest value, but since
9933 * there's no particular guarantee about the speed with which the OID
9934 * counter wraps around, that's a risky thing to do. In any case,
9935 * users of the nextOid counter are required to avoid assignment of
9936 * duplicates, so that a somewhat out-of-date value should be safe.
9937 */
9938
9939 /* Handle multixact */
9940 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
9941 checkPoint.nextMultiOffset);
9942
9943 /*
9944 * NB: This may perform multixact truncation when replaying WAL
9945 * generated by an older primary.
9946 */
9947 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9948 checkPoint.oldestMultiDB);
9949 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
9950 checkPoint.oldestXid))
9951 SetTransactionIdLimit(checkPoint.oldestXid,
9952 checkPoint.oldestXidDB);
9953 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
9954 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9955 ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
9956 LWLockRelease(ControlFileLock);
9957
9958 /* Update shared-memory copy of checkpoint XID/epoch */
9959 SpinLockAcquire(&XLogCtl->info_lck);
9960 XLogCtl->ckptFullXid = checkPoint.nextFullXid;
9961 SpinLockRelease(&XLogCtl->info_lck);
9962
9963 /* TLI should not change in an on-line checkpoint */
9964 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
9965 ereport(PANIC,
9966 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9967 checkPoint.ThisTimeLineID, ThisTimeLineID)));
9968
9969 RecoveryRestartPoint(&checkPoint);
9970 }
9971 else if (info == XLOG_OVERWRITE_CONTRECORD)
9972 {
9973 xl_overwrite_contrecord xlrec;
9974
9975 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
9976 VerifyOverwriteContrecord(&xlrec, record);
9977 }
9978 else if (info == XLOG_END_OF_RECOVERY)
9979 {
9980 xl_end_of_recovery xlrec;
9981
9982 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
9983
9984 /*
9985 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
9986 * but this case is rarer and harder to test, so the benefit doesn't
9987 * outweigh the potential extra cost of maintenance.
9988 */
9989
9990 /*
9991 * We should've already switched to the new TLI before replaying this
9992 * record.
9993 */
9994 if (xlrec.ThisTimeLineID != ThisTimeLineID)
9995 ereport(PANIC,
9996 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
9997 xlrec.ThisTimeLineID, ThisTimeLineID)));
9998 }
9999 else if (info == XLOG_NOOP)
10000 {
10001 /* nothing to do here */
10002 }
10003 else if (info == XLOG_SWITCH)
10004 {
10005 /* nothing to do here */
10006 }
10007 else if (info == XLOG_RESTORE_POINT)
10008 {
10009 /* nothing to do here */
10010 }
10011 else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
10012 {
10013 /*
10014 * Full-page image (FPI) records contain nothing else but a backup
10015 * block (or multiple backup blocks). Every block reference must
10016 * include a full-page image - otherwise there would be no point in
10017 * this record.
10018 *
10019 * No recovery conflicts are generated by these generic records - if a
10020 * resource manager needs to generate conflicts, it has to define a
10021 * separate WAL record type and redo routine.
10022 *
10023 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
10024 * WAL- logged because of a hint bit update. They are only generated
10025 * when checksums are enabled. There is no difference in handling
10026 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
10027 * code just to distinguish them for statistics purposes.
10028 */
10029 for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
10030 {
10031 Buffer buffer;
10032
10033 if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
10034 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
10035 UnlockReleaseBuffer(buffer);
10036 }
10037 }
10038 else if (info == XLOG_BACKUP_END)
10039 {
10040 XLogRecPtr startpoint;
10041
10042 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
10043
10044 if (ControlFile->backupStartPoint == startpoint)
10045 {
10046 /*
10047 * We have reached the end of base backup, the point where
10048 * pg_stop_backup() was done. The data on disk is now consistent.
10049 * Reset backupStartPoint, and update minRecoveryPoint to make
10050 * sure we don't allow starting up at an earlier point even if
10051 * recovery is stopped and restarted soon after this.
10052 */
10053 elog(DEBUG1, "end of backup reached");
10054
10055 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10056
10057 if (ControlFile->minRecoveryPoint < lsn)
10058 {
10059 ControlFile->minRecoveryPoint = lsn;
10060 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10061 }
10062 ControlFile->backupStartPoint = InvalidXLogRecPtr;
10063 ControlFile->backupEndRequired = false;
10064 UpdateControlFile();
10065
10066 LWLockRelease(ControlFileLock);
10067 }
10068 }
10069 else if (info == XLOG_PARAMETER_CHANGE)
10070 {
10071 xl_parameter_change xlrec;
10072
10073 /* Update our copy of the parameters in pg_control */
10074 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
10075
10076 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10077 ControlFile->MaxConnections = xlrec.MaxConnections;
10078 ControlFile->max_worker_processes = xlrec.max_worker_processes;
10079 ControlFile->max_wal_senders = xlrec.max_wal_senders;
10080 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
10081 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
10082 ControlFile->wal_level = xlrec.wal_level;
10083 ControlFile->wal_log_hints = xlrec.wal_log_hints;
10084
10085 /*
10086 * Update minRecoveryPoint to ensure that if recovery is aborted, we
10087 * recover back up to this point before allowing hot standby again.
10088 * This is important if the max_* settings are decreased, to ensure
10089 * you don't run queries against the WAL preceding the change. The
10090 * local copies cannot be updated as long as crash recovery is
10091 * happening and we expect all the WAL to be replayed.
10092 */
10093 if (InArchiveRecovery)
10094 {
10095 minRecoveryPoint = ControlFile->minRecoveryPoint;
10096 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
10097 }
10098 if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
10099 {
10100 ControlFile->minRecoveryPoint = lsn;
10101 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10102 }
10103
10104 CommitTsParameterChange(xlrec.track_commit_timestamp,
10105 ControlFile->track_commit_timestamp);
10106 ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
10107
10108 UpdateControlFile();
10109 LWLockRelease(ControlFileLock);
10110
10111 /* Check to see if any parameter change gives a problem on recovery */
10112 CheckRequiredParameterValues();
10113 }
10114 else if (info == XLOG_FPW_CHANGE)
10115 {
10116 bool fpw;
10117
10118 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
10119
10120 /*
10121 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
10122 * do_pg_start_backup() and do_pg_stop_backup() can check whether
10123 * full_page_writes has been disabled during online backup.
10124 */
10125 if (!fpw)
10126 {
10127 SpinLockAcquire(&XLogCtl->info_lck);
10128 if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
10129 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
10130 SpinLockRelease(&XLogCtl->info_lck);
10131 }
10132
10133 /* Keep track of full_page_writes */
10134 lastFullPageWrites = fpw;
10135 }
10136 }
10137
10138 /*
10139 * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record.
10140 */
10141 static void
VerifyOverwriteContrecord(xl_overwrite_contrecord * xlrec,XLogReaderState * state)10142 VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
10143 {
10144 if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
10145 elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
10146 (uint32) (xlrec->overwritten_lsn >> 32),
10147 (uint32) xlrec->overwritten_lsn,
10148 (uint32) (state->overwrittenRecPtr >> 32),
10149 (uint32) state->overwrittenRecPtr);
10150
10151 ereport(LOG,
10152 (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
10153 (uint32) (xlrec->overwritten_lsn >> 32),
10154 (uint32) xlrec->overwritten_lsn,
10155 timestamptz_to_str(xlrec->overwrite_time))));
10156
10157 /* Verifying the record should only happen once */
10158 state->overwrittenRecPtr = InvalidXLogRecPtr;
10159 }
10160
10161 #ifdef WAL_DEBUG
10162
10163 static void
xlog_outrec(StringInfo buf,XLogReaderState * record)10164 xlog_outrec(StringInfo buf, XLogReaderState *record)
10165 {
10166 int block_id;
10167
10168 appendStringInfo(buf, "prev %X/%X; xid %u",
10169 (uint32) (XLogRecGetPrev(record) >> 32),
10170 (uint32) XLogRecGetPrev(record),
10171 XLogRecGetXid(record));
10172
10173 appendStringInfo(buf, "; len %u",
10174 XLogRecGetDataLen(record));
10175
10176 /* decode block references */
10177 for (block_id = 0; block_id <= record->max_block_id; block_id++)
10178 {
10179 RelFileNode rnode;
10180 ForkNumber forknum;
10181 BlockNumber blk;
10182
10183 if (!XLogRecHasBlockRef(record, block_id))
10184 continue;
10185
10186 XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
10187 if (forknum != MAIN_FORKNUM)
10188 appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
10189 block_id,
10190 rnode.spcNode, rnode.dbNode, rnode.relNode,
10191 forknum,
10192 blk);
10193 else
10194 appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
10195 block_id,
10196 rnode.spcNode, rnode.dbNode, rnode.relNode,
10197 blk);
10198 if (XLogRecHasBlockImage(record, block_id))
10199 appendStringInfoString(buf, " FPW");
10200 }
10201 }
10202 #endif /* WAL_DEBUG */
10203
10204 /*
10205 * Returns a string describing an XLogRecord, consisting of its identity
10206 * optionally followed by a colon, a space, and a further description.
10207 */
10208 static void
xlog_outdesc(StringInfo buf,XLogReaderState * record)10209 xlog_outdesc(StringInfo buf, XLogReaderState *record)
10210 {
10211 RmgrId rmid = XLogRecGetRmid(record);
10212 uint8 info = XLogRecGetInfo(record);
10213 const char *id;
10214
10215 appendStringInfoString(buf, RmgrTable[rmid].rm_name);
10216 appendStringInfoChar(buf, '/');
10217
10218 id = RmgrTable[rmid].rm_identify(info);
10219 if (id == NULL)
10220 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
10221 else
10222 appendStringInfo(buf, "%s: ", id);
10223
10224 RmgrTable[rmid].rm_desc(buf, record);
10225 }
10226
10227
10228 /*
10229 * Return the (possible) sync flag used for opening a file, depending on the
10230 * value of the GUC wal_sync_method.
10231 */
10232 static int
get_sync_bit(int method)10233 get_sync_bit(int method)
10234 {
10235 int o_direct_flag = 0;
10236
10237 /* If fsync is disabled, never open in sync mode */
10238 if (!enableFsync)
10239 return 0;
10240
10241 /*
10242 * Optimize writes by bypassing kernel cache with O_DIRECT when using
10243 * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
10244 * disabled, otherwise the archive command or walsender process will read
10245 * the WAL soon after writing it, which is guaranteed to cause a physical
10246 * read if we bypassed the kernel cache. We also skip the
10247 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
10248 * reason.
10249 *
10250 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
10251 * written by walreceiver is normally read by the startup process soon
10252 * after its written. Also, walreceiver performs unaligned writes, which
10253 * don't work with O_DIRECT, so it is required for correctness too.
10254 */
10255 if (!XLogIsNeeded() && !AmWalReceiverProcess())
10256 o_direct_flag = PG_O_DIRECT;
10257
10258 switch (method)
10259 {
10260 /*
10261 * enum values for all sync options are defined even if they are
10262 * not supported on the current platform. But if not, they are
10263 * not included in the enum option array, and therefore will never
10264 * be seen here.
10265 */
10266 case SYNC_METHOD_FSYNC:
10267 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10268 case SYNC_METHOD_FDATASYNC:
10269 return 0;
10270 #ifdef OPEN_SYNC_FLAG
10271 case SYNC_METHOD_OPEN:
10272 return OPEN_SYNC_FLAG | o_direct_flag;
10273 #endif
10274 #ifdef OPEN_DATASYNC_FLAG
10275 case SYNC_METHOD_OPEN_DSYNC:
10276 return OPEN_DATASYNC_FLAG | o_direct_flag;
10277 #endif
10278 default:
10279 /* can't happen (unless we are out of sync with option array) */
10280 elog(ERROR, "unrecognized wal_sync_method: %d", method);
10281 return 0; /* silence warning */
10282 }
10283 }
10284
10285 /*
10286 * GUC support
10287 */
10288 void
assign_xlog_sync_method(int new_sync_method,void * extra)10289 assign_xlog_sync_method(int new_sync_method, void *extra)
10290 {
10291 if (sync_method != new_sync_method)
10292 {
10293 /*
10294 * To ensure that no blocks escape unsynced, force an fsync on the
10295 * currently open log segment (if any). Also, if the open flag is
10296 * changing, close the log file so it will be reopened (with new flag
10297 * bit) at next use.
10298 */
10299 if (openLogFile >= 0)
10300 {
10301 pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10302 if (pg_fsync(openLogFile) != 0)
10303 ereport(PANIC,
10304 (errcode_for_file_access(),
10305 errmsg("could not fsync file \"%s\": %m",
10306 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
10307 pgstat_report_wait_end();
10308 if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10309 XLogFileClose();
10310 }
10311 }
10312 }
10313
10314
10315 /*
10316 * Issue appropriate kind of fsync (if any) for an XLOG output file.
10317 *
10318 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10319 * 'segno' is for error reporting purposes.
10320 */
void
issue_xlog_fsync(int fd, XLogSegNo segno)
{
	/* Advertise the wait so monitoring views can attribute the stall. */
	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
	switch (sync_method)
	{
		case SYNC_METHOD_FSYNC:
			/* Plain fsync; a failure to flush WAL is treated as fatal. */
			if (pg_fsync_no_writethrough(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync file \"%s\": %m",
								XLogFileNameP(ThisTimeLineID, segno))));
			break;
#ifdef HAVE_FSYNC_WRITETHROUGH
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
			/* Platform-specific fsync variant (compiled in only where available). */
			if (pg_fsync_writethrough(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync write-through file \"%s\": %m",
								XLogFileNameP(ThisTimeLineID, segno))));
			break;
#endif
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			/* fdatasync: flush data, not necessarily file metadata. */
			if (pg_fdatasync(fd) != 0)
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fdatasync file \"%s\": %m",
								XLogFileNameP(ThisTimeLineID, segno))));
			break;
#endif
		case SYNC_METHOD_OPEN:
		case SYNC_METHOD_OPEN_DSYNC:
			/* write synced it already */
			break;
		default:
			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
			break;
	}
	pgstat_report_wait_end();
}
10362
10363 /*
10364 * Return the filename of given log segment, as a palloc'd string.
10365 */
10366 char *
XLogFileNameP(TimeLineID tli,XLogSegNo segno)10367 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
10368 {
10369 char *result = palloc(MAXFNAMELEN);
10370
10371 XLogFileName(result, tli, segno, wal_segment_size);
10372 return result;
10373 }
10374
10375 /*
10376 * do_pg_start_backup
10377 *
10378 * Utility function called at the start of an online backup. It creates the
10379 * necessary starting checkpoint and constructs the backup label file.
10380 *
 * There are two kinds of backups: exclusive and non-exclusive. An exclusive
10382 * backup is started with pg_start_backup(), and there can be only one active
10383 * at a time. The backup and tablespace map files of an exclusive backup are
10384 * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10385 * removed by pg_stop_backup().
10386 *
10387 * A non-exclusive backup is used for the streaming base backups (see
10388 * src/backend/replication/basebackup.c). The difference to exclusive backups
10389 * is that the backup label and tablespace map files are not written to disk.
10390 * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10391 * and the caller is responsible for including them in the backup archive as
10392 * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10393 * active at the same time, and they don't conflict with an exclusive backup
10394 * either.
10395 *
10396 * tblspcmapfile is required mainly for tar format in windows as native windows
10397 * utilities are not able to create symlinks while extracting files from tar.
10398 * However for consistency, the same is used for all platforms.
10399 *
 * needtblspcmapfile is true in the cases where the backup needs to generate
 * the tablespace_map file (an exclusive backup, or a non-exclusive backup
 * taken in tar format); it determines whether an escape character is
 * embedded before newline characters appearing in tablespace paths.
10404 *
10405 * Returns the minimum WAL location that must be present to restore from this
10406 * backup, and the corresponding timeline ID in *starttli_p.
10407 *
10408 * Every successfully started non-exclusive backup must be stopped by calling
10409 * do_pg_stop_backup() or do_pg_abort_backup().
10410 *
10411 * It is the responsibility of the caller of this function to verify the
10412 * permissions of the calling user!
10413 */
10414 XLogRecPtr
do_pg_start_backup(const char * backupidstr,bool fast,TimeLineID * starttli_p,StringInfo labelfile,List ** tablespaces,StringInfo tblspcmapfile,bool infotbssize,bool needtblspcmapfile)10415 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
10416 StringInfo labelfile, List **tablespaces,
10417 StringInfo tblspcmapfile, bool infotbssize,
10418 bool needtblspcmapfile)
10419 {
10420 bool exclusive = (labelfile == NULL);
10421 bool backup_started_in_recovery = false;
10422 XLogRecPtr checkpointloc;
10423 XLogRecPtr startpoint;
10424 TimeLineID starttli;
10425 pg_time_t stamp_time;
10426 char strfbuf[128];
10427 char xlogfilename[MAXFNAMELEN];
10428 XLogSegNo _logSegNo;
10429 struct stat stat_buf;
10430 FILE *fp;
10431
10432 backup_started_in_recovery = RecoveryInProgress();
10433
10434 /*
10435 * Currently only non-exclusive backup can be taken during recovery.
10436 */
10437 if (backup_started_in_recovery && exclusive)
10438 ereport(ERROR,
10439 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10440 errmsg("recovery is in progress"),
10441 errhint("WAL control functions cannot be executed during recovery.")));
10442
10443 /*
10444 * During recovery, we don't need to check WAL level. Because, if WAL
10445 * level is not sufficient, it's impossible to get here during recovery.
10446 */
10447 if (!backup_started_in_recovery && !XLogIsNeeded())
10448 ereport(ERROR,
10449 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10450 errmsg("WAL level not sufficient for making an online backup"),
10451 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10452
10453 if (strlen(backupidstr) > MAXPGPATH)
10454 ereport(ERROR,
10455 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10456 errmsg("backup label too long (max %d bytes)",
10457 MAXPGPATH)));
10458
10459 /*
10460 * Mark backup active in shared memory. We must do full-page WAL writes
10461 * during an on-line backup even if not doing so at other times, because
10462 * it's quite possible for the backup dump to obtain a "torn" (partially
10463 * written) copy of a database page if it reads the page concurrently with
10464 * our write to the same page. This can be fixed as long as the first
10465 * write to the page in the WAL sequence is a full-page write. Hence, we
10466 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10467 * are no dirty pages in shared memory that might get dumped while the
10468 * backup is in progress without having a corresponding WAL record. (Once
10469 * the backup is complete, we need not force full-page writes anymore,
10470 * since we expect that any pages not modified during the backup interval
10471 * must have been correctly captured by the backup.)
10472 *
10473 * Note that forcePageWrites has no effect during an online backup from
10474 * the standby.
10475 *
10476 * We must hold all the insertion locks to change the value of
10477 * forcePageWrites, to ensure adequate interlocking against
10478 * XLogInsertRecord().
10479 */
10480 WALInsertLockAcquireExclusive();
10481 if (exclusive)
10482 {
10483 /*
10484 * At first, mark that we're now starting an exclusive backup, to
10485 * ensure that there are no other sessions currently running
10486 * pg_start_backup() or pg_stop_backup().
10487 */
10488 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10489 {
10490 WALInsertLockRelease();
10491 ereport(ERROR,
10492 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10493 errmsg("a backup is already in progress"),
10494 errhint("Run pg_stop_backup() and try again.")));
10495 }
10496 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10497 }
10498 else
10499 XLogCtl->Insert.nonExclusiveBackups++;
10500 XLogCtl->Insert.forcePageWrites = true;
10501 WALInsertLockRelease();
10502
10503 /* Ensure we release forcePageWrites if fail below */
10504 PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10505 {
10506 bool gotUniqueStartpoint = false;
10507 DIR *tblspcdir;
10508 struct dirent *de;
10509 tablespaceinfo *ti;
10510 int datadirpathlen;
10511
10512 /*
10513 * Force an XLOG file switch before the checkpoint, to ensure that the
10514 * WAL segment the checkpoint is written to doesn't contain pages with
10515 * old timeline IDs. That would otherwise happen if you called
10516 * pg_start_backup() right after restoring from a PITR archive: the
10517 * first WAL segment containing the startup checkpoint has pages in
10518 * the beginning with the old timeline ID. That can cause trouble at
10519 * recovery: we won't have a history file covering the old timeline if
10520 * pg_wal directory was not included in the base backup and the WAL
10521 * archive was cleared too before starting the backup.
10522 *
10523 * This also ensures that we have emitted a WAL page header that has
10524 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10525 * Therefore, if a WAL archiver (such as pglesslog) is trying to
10526 * compress out removable backup blocks, it won't remove any that
10527 * occur after this point.
10528 *
10529 * During recovery, we skip forcing XLOG file switch, which means that
10530 * the backup taken during recovery is not available for the special
10531 * recovery case described above.
10532 */
10533 if (!backup_started_in_recovery)
10534 RequestXLogSwitch(false);
10535
10536 do
10537 {
10538 bool checkpointfpw;
10539
10540 /*
10541 * Force a CHECKPOINT. Aside from being necessary to prevent torn
10542 * page problems, this guarantees that two successive backup runs
10543 * will have different checkpoint positions and hence different
10544 * history file names, even if nothing happened in between.
10545 *
10546 * During recovery, establish a restartpoint if possible. We use
10547 * the last restartpoint as the backup starting checkpoint. This
10548 * means that two successive backup runs can have same checkpoint
10549 * positions.
10550 *
10551 * Since the fact that we are executing do_pg_start_backup()
10552 * during recovery means that checkpointer is running, we can use
10553 * RequestCheckpoint() to establish a restartpoint.
10554 *
10555 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10556 * passing fast = true). Otherwise this can take awhile.
10557 */
10558 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10559 (fast ? CHECKPOINT_IMMEDIATE : 0));
10560
10561 /*
10562 * Now we need to fetch the checkpoint record location, and also
10563 * its REDO pointer. The oldest point in WAL that would be needed
10564 * to restore starting from the checkpoint is precisely the REDO
10565 * pointer.
10566 */
10567 LWLockAcquire(ControlFileLock, LW_SHARED);
10568 checkpointloc = ControlFile->checkPoint;
10569 startpoint = ControlFile->checkPointCopy.redo;
10570 starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10571 checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10572 LWLockRelease(ControlFileLock);
10573
10574 if (backup_started_in_recovery)
10575 {
10576 XLogRecPtr recptr;
10577
10578 /*
10579 * Check to see if all WAL replayed during online backup
10580 * (i.e., since last restartpoint used as backup starting
10581 * checkpoint) contain full-page writes.
10582 */
10583 SpinLockAcquire(&XLogCtl->info_lck);
10584 recptr = XLogCtl->lastFpwDisableRecPtr;
10585 SpinLockRelease(&XLogCtl->info_lck);
10586
10587 if (!checkpointfpw || startpoint <= recptr)
10588 ereport(ERROR,
10589 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10590 errmsg("WAL generated with full_page_writes=off was replayed "
10591 "since last restartpoint"),
10592 errhint("This means that the backup being taken on the standby "
10593 "is corrupt and should not be used. "
10594 "Enable full_page_writes and run CHECKPOINT on the master, "
10595 "and then try an online backup again.")));
10596
10597 /*
10598 * During recovery, since we don't use the end-of-backup WAL
10599 * record and don't write the backup history file, the
10600 * starting WAL location doesn't need to be unique. This means
10601 * that two base backups started at the same time might use
10602 * the same checkpoint as starting locations.
10603 */
10604 gotUniqueStartpoint = true;
10605 }
10606
10607 /*
10608 * If two base backups are started at the same time (in WAL sender
10609 * processes), we need to make sure that they use different
10610 * checkpoints as starting locations, because we use the starting
10611 * WAL location as a unique identifier for the base backup in the
10612 * end-of-backup WAL record and when we write the backup history
10613 * file. Perhaps it would be better generate a separate unique ID
10614 * for each backup instead of forcing another checkpoint, but
10615 * taking a checkpoint right after another is not that expensive
10616 * either because only few buffers have been dirtied yet.
10617 */
10618 WALInsertLockAcquireExclusive();
10619 if (XLogCtl->Insert.lastBackupStart < startpoint)
10620 {
10621 XLogCtl->Insert.lastBackupStart = startpoint;
10622 gotUniqueStartpoint = true;
10623 }
10624 WALInsertLockRelease();
10625 } while (!gotUniqueStartpoint);
10626
10627 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
10628 XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
10629
10630 /*
10631 * Construct tablespace_map file
10632 */
10633 if (exclusive)
10634 tblspcmapfile = makeStringInfo();
10635
10636 datadirpathlen = strlen(DataDir);
10637
10638 /* Collect information about all tablespaces */
10639 tblspcdir = AllocateDir("pg_tblspc");
10640 while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
10641 {
10642 char fullpath[MAXPGPATH + 10];
10643 char linkpath[MAXPGPATH];
10644 char *relpath = NULL;
10645 int rllen;
10646 StringInfoData buflinkpath;
10647 char *s = linkpath;
10648
10649 /* Skip special stuff */
10650 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
10651 continue;
10652
10653 snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
10654
10655 #if defined(HAVE_READLINK) || defined(WIN32)
10656 rllen = readlink(fullpath, linkpath, sizeof(linkpath));
10657 if (rllen < 0)
10658 {
10659 ereport(WARNING,
10660 (errmsg("could not read symbolic link \"%s\": %m",
10661 fullpath)));
10662 continue;
10663 }
10664 else if (rllen >= sizeof(linkpath))
10665 {
10666 ereport(WARNING,
10667 (errmsg("symbolic link \"%s\" target is too long",
10668 fullpath)));
10669 continue;
10670 }
10671 linkpath[rllen] = '\0';
10672
10673 /*
10674 * Add the escape character '\\' before newline in a string to
10675 * ensure that we can distinguish between the newline in the
10676 * tablespace path and end of line while reading tablespace_map
10677 * file during archive recovery.
10678 */
10679 initStringInfo(&buflinkpath);
10680
10681 while (*s)
10682 {
10683 if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
10684 appendStringInfoChar(&buflinkpath, '\\');
10685 appendStringInfoChar(&buflinkpath, *s++);
10686 }
10687
10688 /*
10689 * Relpath holds the relative path of the tablespace directory
10690 * when it's located within PGDATA, or NULL if it's located
10691 * elsewhere.
10692 */
10693 if (rllen > datadirpathlen &&
10694 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
10695 IS_DIR_SEP(linkpath[datadirpathlen]))
10696 relpath = linkpath + datadirpathlen + 1;
10697
10698 ti = palloc(sizeof(tablespaceinfo));
10699 ti->oid = pstrdup(de->d_name);
10700 ti->path = pstrdup(buflinkpath.data);
10701 ti->rpath = relpath ? pstrdup(relpath) : NULL;
10702 ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
10703
10704 if (tablespaces)
10705 *tablespaces = lappend(*tablespaces, ti);
10706
10707 appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
10708
10709 pfree(buflinkpath.data);
10710 #else
10711
10712 /*
10713 * If the platform does not have symbolic links, it should not be
10714 * possible to have tablespaces - clearly somebody else created
10715 * them. Warn about it and ignore.
10716 */
10717 ereport(WARNING,
10718 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
10719 errmsg("tablespaces are not supported on this platform")));
10720 #endif
10721 }
10722 FreeDir(tblspcdir);
10723
10724 /*
10725 * Construct backup label file
10726 */
10727 if (exclusive)
10728 labelfile = makeStringInfo();
10729
10730 /* Use the log timezone here, not the session timezone */
10731 stamp_time = (pg_time_t) time(NULL);
10732 pg_strftime(strfbuf, sizeof(strfbuf),
10733 "%Y-%m-%d %H:%M:%S %Z",
10734 pg_localtime(&stamp_time, log_timezone));
10735 appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
10736 (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10737 appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
10738 (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10739 appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
10740 exclusive ? "pg_start_backup" : "streamed");
10741 appendStringInfo(labelfile, "BACKUP FROM: %s\n",
10742 backup_started_in_recovery ? "standby" : "master");
10743 appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
10744 appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
10745 appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
10746
10747 /*
10748 * Okay, write the file, or return its contents to caller.
10749 */
10750 if (exclusive)
10751 {
10752 /*
10753 * Check for existing backup label --- implies a backup is already
10754 * running. (XXX given that we checked exclusiveBackupState
10755 * above, maybe it would be OK to just unlink any such label
10756 * file?)
10757 */
10758 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10759 {
10760 if (errno != ENOENT)
10761 ereport(ERROR,
10762 (errcode_for_file_access(),
10763 errmsg("could not stat file \"%s\": %m",
10764 BACKUP_LABEL_FILE)));
10765 }
10766 else
10767 ereport(ERROR,
10768 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10769 errmsg("a backup is already in progress"),
10770 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10771 BACKUP_LABEL_FILE)));
10772
10773 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10774
10775 if (!fp)
10776 ereport(ERROR,
10777 (errcode_for_file_access(),
10778 errmsg("could not create file \"%s\": %m",
10779 BACKUP_LABEL_FILE)));
10780 if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
10781 fflush(fp) != 0 ||
10782 pg_fsync(fileno(fp)) != 0 ||
10783 ferror(fp) ||
10784 FreeFile(fp))
10785 ereport(ERROR,
10786 (errcode_for_file_access(),
10787 errmsg("could not write file \"%s\": %m",
10788 BACKUP_LABEL_FILE)));
10789 /* Allocated locally for exclusive backups, so free separately */
10790 pfree(labelfile->data);
10791 pfree(labelfile);
10792
10793 /* Write backup tablespace_map file. */
10794 if (tblspcmapfile->len > 0)
10795 {
10796 if (stat(TABLESPACE_MAP, &stat_buf) != 0)
10797 {
10798 if (errno != ENOENT)
10799 ereport(ERROR,
10800 (errcode_for_file_access(),
10801 errmsg("could not stat file \"%s\": %m",
10802 TABLESPACE_MAP)));
10803 }
10804 else
10805 ereport(ERROR,
10806 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10807 errmsg("a backup is already in progress"),
10808 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10809 TABLESPACE_MAP)));
10810
10811 fp = AllocateFile(TABLESPACE_MAP, "w");
10812
10813 if (!fp)
10814 ereport(ERROR,
10815 (errcode_for_file_access(),
10816 errmsg("could not create file \"%s\": %m",
10817 TABLESPACE_MAP)));
10818 if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
10819 fflush(fp) != 0 ||
10820 pg_fsync(fileno(fp)) != 0 ||
10821 ferror(fp) ||
10822 FreeFile(fp))
10823 ereport(ERROR,
10824 (errcode_for_file_access(),
10825 errmsg("could not write file \"%s\": %m",
10826 TABLESPACE_MAP)));
10827 }
10828
10829 /* Allocated locally for exclusive backups, so free separately */
10830 pfree(tblspcmapfile->data);
10831 pfree(tblspcmapfile);
10832 }
10833 }
10834 PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10835
10836 /*
10837 * Mark that start phase has correctly finished for an exclusive backup.
10838 * Session-level locks are updated as well to reflect that state.
10839 *
10840 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
10841 * counters and session-level lock. Otherwise they can be updated
10842 * inconsistently, and which might cause do_pg_abort_backup() to fail.
10843 */
10844 if (exclusive)
10845 {
10846 WALInsertLockAcquireExclusive();
10847 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10848
10849 /* Set session-level lock */
10850 sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
10851 WALInsertLockRelease();
10852 }
10853 else
10854 sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
10855
10856 /*
10857 * We're done. As a convenience, return the starting WAL location.
10858 */
10859 if (starttli_p)
10860 *starttli_p = starttli;
10861 return startpoint;
10862 }
10863
10864 /* Error cleanup callback for pg_start_backup */
10865 static void
pg_start_backup_callback(int code,Datum arg)10866 pg_start_backup_callback(int code, Datum arg)
10867 {
10868 bool exclusive = DatumGetBool(arg);
10869
10870 /* Update backup counters and forcePageWrites on failure */
10871 WALInsertLockAcquireExclusive();
10872 if (exclusive)
10873 {
10874 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10875 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10876 }
10877 else
10878 {
10879 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10880 XLogCtl->Insert.nonExclusiveBackups--;
10881 }
10882
10883 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10884 XLogCtl->Insert.nonExclusiveBackups == 0)
10885 {
10886 XLogCtl->Insert.forcePageWrites = false;
10887 }
10888 WALInsertLockRelease();
10889 }
10890
10891 /*
10892 * Error cleanup callback for pg_stop_backup
10893 */
10894 static void
pg_stop_backup_callback(int code,Datum arg)10895 pg_stop_backup_callback(int code, Datum arg)
10896 {
10897 bool exclusive = DatumGetBool(arg);
10898
10899 /* Update backup status on failure */
10900 WALInsertLockAcquireExclusive();
10901 if (exclusive)
10902 {
10903 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
10904 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10905 }
10906 WALInsertLockRelease();
10907 }
10908
10909 /*
10910 * Utility routine to fetch the session-level status of a backup running.
10911 */
10912 SessionBackupState
get_backup_status(void)10913 get_backup_status(void)
10914 {
10915 return sessionBackupState;
10916 }
10917
/*
 * do_pg_stop_backup
 *
 * Utility function called at the end of an online backup. It cleans up the
 * backup state and can optionally wait for WAL segments to be archived.
 *
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 *
 * Returns the last WAL location that must be present to restore from this
 * backup, and the corresponding timeline ID in *stoptli_p.
 *
 * It is the responsibility of the caller of this function to verify the
 * permissions of the calling user!
 */
XLogRecPtr
do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
{
	bool		exclusive = (labelfile == NULL);
	bool		backup_started_in_recovery = false;
	XLogRecPtr	startpoint;		/* parsed back out of the label file */
	XLogRecPtr	stoppoint;		/* backup end location, returned to caller */
	TimeLineID	stoptli;
	pg_time_t	stamp_time;
	char		strfbuf[128];	/* "STOP TIME" timestamp text */
	char		histfilepath[MAXPGPATH];
	char		startxlogfilename[MAXFNAMELEN];
	char		stopxlogfilename[MAXFNAMELEN];
	char		lastxlogfilename[MAXFNAMELEN];
	char		histfilename[MAXFNAMELEN];
	char		backupfrom[20];
	XLogSegNo	_logSegNo;
	FILE	   *lfp;
	FILE	   *fp;
	char		ch;
	int			seconds_before_warning;
	int			waits = 0;
	bool		reported_waiting = false;
	char	   *remaining;
	char	   *ptr;
	uint32		hi,
				lo;

	backup_started_in_recovery = RecoveryInProgress();

	/*
	 * Currently only non-exclusive backup can be taken during recovery.
	 */
	if (backup_started_in_recovery && exclusive)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

	/*
	 * During recovery, we don't need to check WAL level. Because, if WAL
	 * level is not sufficient, it's impossible to get here during recovery.
	 */
	if (!backup_started_in_recovery && !XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));

	if (exclusive)
	{
		/*
		 * At first, mark that we're now stopping an exclusive backup, to
		 * ensure that there are no other sessions currently running
		 * pg_start_backup() or pg_stop_backup().
		 */
		WALInsertLockAcquireExclusive();
		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
		{
			WALInsertLockRelease();
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("exclusive backup not in progress")));
		}
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
		WALInsertLockRelease();

		/*
		 * Remove backup_label. In case of failure, the state for an exclusive
		 * backup is switched back to in-progress.
		 */
		PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
		{
			/*
			 * Read the existing label file into memory.  For exclusive
			 * backups the label data lives on disk, not in the caller's
			 * hands, so we must fetch it here before removing the file.
			 */
			struct stat statbuf;
			int			r;

			if (stat(BACKUP_LABEL_FILE, &statbuf))
			{
				/* should not happen per the upper checks */
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat file \"%s\": %m",
									BACKUP_LABEL_FILE)));
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("a backup is not in progress")));
			}

			lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
			if (!lfp)
			{
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			}
			labelfile = palloc(statbuf.st_size + 1);
			r = fread(labelfile, statbuf.st_size, 1, lfp);
			labelfile[statbuf.st_size] = '\0';

			/*
			 * Close and remove the backup label file
			 */
			if (r != 1 || ferror(lfp) || FreeFile(lfp))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			durable_unlink(BACKUP_LABEL_FILE, ERROR);

			/*
			 * Remove tablespace_map file if present, it is created only if
			 * there are tablespaces.
			 */
			durable_unlink(TABLESPACE_MAP, DEBUG1);
		}
		PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
	}

	/*
	 * OK to update backup counters, forcePageWrites and session-level lock.
	 *
	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
	 * Otherwise they can be updated inconsistently, and which might cause
	 * do_pg_abort_backup() to fail.
	 */
	WALInsertLockAcquireExclusive();
	if (exclusive)
	{
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
	}
	else
	{
		/*
		 * The user-visible pg_start/stop_backup() functions that operate on
		 * exclusive backups can be called at any time, but for non-exclusive
		 * backups, it is expected that each do_pg_start_backup() call is
		 * matched by exactly one do_pg_stop_backup() call.
		 */
		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
		XLogCtl->Insert.nonExclusiveBackups--;
	}

	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}

	/*
	 * Clean up session-level lock.
	 *
	 * You might think that WALInsertLockRelease() can be called before
	 * cleaning up session-level lock because session-level lock doesn't need
	 * to be protected with WAL insertion lock. But since
	 * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
	 * cleaned up before it.
	 */
	sessionBackupState = SESSION_BACKUP_NONE;

	WALInsertLockRelease();

	/*
	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
	 * but we are not expecting any variability in the file format).
	 */
	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
			   &hi, &lo, startxlogfilename,
			   &ch) != 4 || ch != '\n')
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	startpoint = ((uint64) hi) << 32 | lo;
	remaining = strchr(labelfile, '\n') + 1;	/* %n is not portable enough */

	/*
	 * Parse the BACKUP FROM line. If we are taking an online backup from the
	 * standby, we confirm that the standby has not been promoted during the
	 * backup.
	 */
	ptr = strstr(remaining, "BACKUP FROM:");
	if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("the standby was promoted during online backup"),
				 errhint("This means that the backup being taken is corrupt "
						 "and should not be used. "
						 "Try taking another online backup.")));

	/*
	 * During recovery, we don't write an end-of-backup record. We assume that
	 * pg_control was backed up last and its minimum recovery point can be
	 * available as the backup end location. Since we don't have an
	 * end-of-backup record, we use the pg_control value to check whether
	 * we've reached the end of backup when starting recovery from this
	 * backup. We have no way of checking if pg_control wasn't backed up last
	 * however.
	 *
	 * We don't force a switch to new WAL file but it is still possible to
	 * wait for all the required files to be archived if waitforarchive is
	 * true. This is okay if we use the backup to start a standby and fetch
	 * the missing WAL using streaming replication. But in the case of an
	 * archive recovery, a user should set waitforarchive to true and wait for
	 * them to be archived to ensure that all the required files are
	 * available.
	 *
	 * We return the current minimum recovery point as the backup end
	 * location. Note that it can be greater than the exact backup end
	 * location if the minimum recovery point is updated after the backup of
	 * pg_control. This is harmless for current uses.
	 *
	 * XXX currently a backup history file is for informational and debug
	 * purposes only. It's not essential for an online backup. Furthermore,
	 * even if it's created, it will not be archived during recovery because
	 * an archiver is not invoked. So it doesn't seem worthwhile to write a
	 * backup history file during recovery.
	 */
	if (backup_started_in_recovery)
	{
		XLogRecPtr	recptr;

		/*
		 * Check to see if all WAL replayed during online backup contain
		 * full-page writes.
		 */
		SpinLockAcquire(&XLogCtl->info_lck);
		recptr = XLogCtl->lastFpwDisableRecPtr;
		SpinLockRelease(&XLogCtl->info_lck);

		if (startpoint <= recptr)
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("WAL generated with full_page_writes=off was replayed "
							"during online backup"),
					 errhint("This means that the backup being taken on the standby "
							 "is corrupt and should not be used. "
							 "Enable full_page_writes and run CHECKPOINT on the master, "
							 "and then try an online backup again.")));


		LWLockAcquire(ControlFileLock, LW_SHARED);
		stoppoint = ControlFile->minRecoveryPoint;
		stoptli = ControlFile->minRecoveryPointTLI;
		LWLockRelease(ControlFileLock);
	}
	else
	{
		/*
		 * Write the backup-end xlog record
		 */
		XLogBeginInsert();
		XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
		stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
		stoptli = ThisTimeLineID;

		/*
		 * Force a switch to a new xlog segment file, so that the backup is
		 * valid as soon as archiver moves out the current segment file.
		 */
		RequestXLogSwitch(false);

		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
		XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);

		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));

		/*
		 * Write the backup history file
		 */
		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
		BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
							  startpoint, wal_segment_size);
		fp = AllocateFile(histfilepath, "w");
		if (!fp)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m",
							histfilepath)));
		fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
				(uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
		fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
				(uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);

		/*
		 * Transfer remaining lines including label and start timeline to
		 * history file.
		 */
		fprintf(fp, "%s", remaining);
		fprintf(fp, "STOP TIME: %s\n", strfbuf);
		fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
		if (fflush(fp) || ferror(fp) || FreeFile(fp))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write file \"%s\": %m",
							histfilepath)));

		/*
		 * Clean out any no-longer-needed history files. As a side effect,
		 * this will post a .ready file for the newly created history file,
		 * notifying the archiver that history file may be archived
		 * immediately.
		 */
		CleanupBackupHistory();
	}

	/*
	 * If archiving is enabled, wait for all the required WAL files to be
	 * archived before returning. If archiving isn't enabled, the required WAL
	 * needs to be transported via streaming replication (hopefully with
	 * wal_keep_segments set high enough), or some more exotic mechanism like
	 * polling and copying files from pg_wal with script. We have no knowledge
	 * of those mechanisms, so it's up to the user to ensure that he gets all
	 * the required WAL.
	 *
	 * We wait until both the last WAL file filled during backup and the
	 * history file have been archived, and assume that the alphabetic sorting
	 * property of the WAL files ensures any earlier WAL files are safely
	 * archived as well.
	 *
	 * We wait forever, since archive_command is supposed to work and we
	 * assume the admin wanted his backup to work completely. If you don't
	 * wish to wait, then either waitforarchive should be passed in as false,
	 * or you can set statement_timeout. Also, some notices are issued to
	 * clue in anyone who might be doing this interactively.
	 */

	if (waitforarchive &&
		((!backup_started_in_recovery && XLogArchivingActive()) ||
		 (backup_started_in_recovery && XLogArchivingAlways())))
	{
		XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
		XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);

		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
		BackupHistoryFileName(histfilename, stoptli, _logSegNo,
							  startpoint, wal_segment_size);

		seconds_before_warning = 60;
		waits = 0;

		/* Poll once a second until neither file is pending archival */
		while (XLogArchiveIsBusy(lastxlogfilename) ||
			   XLogArchiveIsBusy(histfilename))
		{
			CHECK_FOR_INTERRUPTS();

			if (!reported_waiting && waits > 5)
			{
				ereport(NOTICE,
						(errmsg("base backup done, waiting for required WAL segments to be archived")));
				reported_waiting = true;
			}

			pg_usleep(1000000L);

			/* Warn with exponential backoff: 60s, 120s, 240s, ... */
			if (++waits >= seconds_before_warning)
			{
				seconds_before_warning *= 2;	/* This wraps in >10 years... */
				ereport(WARNING,
						(errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
								waits),
						 errhint("Check that your archive_command is executing properly. "
								 "You can safely cancel this backup, "
								 "but the database backup will not be usable without all the WAL segments.")));
			}
		}

		ereport(NOTICE,
				(errmsg("all required WAL segments have been archived")));
	}
	else if (waitforarchive)
		ereport(NOTICE,
				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));

	/*
	 * We're done. As a convenience, return the ending WAL location.
	 */
	if (stoptli_p)
		*stoptli_p = stoptli;
	return stoppoint;
}
11325
11326
11327 /*
11328 * do_pg_abort_backup: abort a running backup
11329 *
11330 * This does just the most basic steps of do_pg_stop_backup(), by taking the
11331 * system out of backup mode, thus making it a lot more safe to call from
11332 * an error handler.
11333 *
11334 * The caller can pass 'arg' as 'true' or 'false' to control whether a warning
11335 * is emitted.
11336 *
11337 * NB: This is only for aborting a non-exclusive backup that doesn't write
11338 * backup_label. A backup started with pg_start_backup() needs to be finished
11339 * with pg_stop_backup().
11340 *
11341 * NB: This gets used as a before_shmem_exit handler, hence the odd-looking
11342 * signature.
11343 */
11344 void
do_pg_abort_backup(int code,Datum arg)11345 do_pg_abort_backup(int code, Datum arg)
11346 {
11347 bool emit_warning = DatumGetBool(arg);
11348
11349 /*
11350 * Quick exit if session is not keeping around a non-exclusive backup
11351 * already started.
11352 */
11353 if (sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE)
11354 return;
11355
11356 WALInsertLockAcquireExclusive();
11357 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11358 XLogCtl->Insert.nonExclusiveBackups--;
11359
11360 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11361 XLogCtl->Insert.nonExclusiveBackups == 0)
11362 {
11363 XLogCtl->Insert.forcePageWrites = false;
11364 }
11365 WALInsertLockRelease();
11366
11367 if (emit_warning)
11368 ereport(WARNING,
11369 (errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
11370 }
11371
11372 /*
11373 * Register a handler that will warn about unterminated backups at end of
11374 * session, unless this has already been done.
11375 */
11376 void
register_persistent_abort_backup_handler(void)11377 register_persistent_abort_backup_handler(void)
11378 {
11379 static bool already_done = false;
11380
11381 if (already_done)
11382 return;
11383 before_shmem_exit(do_pg_abort_backup, DatumGetBool(true));
11384 already_done = true;
11385 }
11386
11387 /*
11388 * Get latest redo apply position.
11389 *
11390 * Exported to allow WALReceiver to read the pointer directly.
11391 */
11392 XLogRecPtr
GetXLogReplayRecPtr(TimeLineID * replayTLI)11393 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11394 {
11395 XLogRecPtr recptr;
11396 TimeLineID tli;
11397
11398 SpinLockAcquire(&XLogCtl->info_lck);
11399 recptr = XLogCtl->lastReplayedEndRecPtr;
11400 tli = XLogCtl->lastReplayedTLI;
11401 SpinLockRelease(&XLogCtl->info_lck);
11402
11403 if (replayTLI)
11404 *replayTLI = tli;
11405 return recptr;
11406 }
11407
11408 /*
11409 * Get latest WAL insert pointer
11410 */
11411 XLogRecPtr
GetXLogInsertRecPtr(void)11412 GetXLogInsertRecPtr(void)
11413 {
11414 XLogCtlInsert *Insert = &XLogCtl->Insert;
11415 uint64 current_bytepos;
11416
11417 SpinLockAcquire(&Insert->insertpos_lck);
11418 current_bytepos = Insert->CurrBytePos;
11419 SpinLockRelease(&Insert->insertpos_lck);
11420
11421 return XLogBytePosToRecPtr(current_bytepos);
11422 }
11423
/*
 * Get latest WAL write pointer.
 *
 * NOTE: as a side effect this refreshes the process-local LogwrtResult
 * cache (a file-scope variable) from shared memory, not just a local copy.
 */
XLogRecPtr
GetXLogWriteRecPtr(void)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Write;
}
11436
11437 /*
11438 * Returns the redo pointer of the last checkpoint or restartpoint. This is
11439 * the oldest point in WAL that we still need, if we have to restart recovery.
11440 */
11441 void
GetOldestRestartPoint(XLogRecPtr * oldrecptr,TimeLineID * oldtli)11442 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
11443 {
11444 LWLockAcquire(ControlFileLock, LW_SHARED);
11445 *oldrecptr = ControlFile->checkPointCopy.redo;
11446 *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
11447 LWLockRelease(ControlFileLock);
11448 }
11449
/*
 * read_backup_label: check to see if a backup_label file is present
 *
 * If we see a backup_label during recovery, we assume that we are recovering
 * from a backup dump file, and we therefore roll forward from the checkpoint
 * identified by the label file, NOT what pg_control says. This avoids the
 * problem that pg_control might have been archived one or more checkpoints
 * later than the start of the dump, and so if we rely on it as the start
 * point, we will fail to restore a consistent database state.
 *
 * Returns true if a backup_label was found (and fills the checkpoint
 * location and its REDO location into *checkPointLoc and RedoStartLSN,
 * respectively); returns false if not. If this backup_label came from a
 * streamed backup, *backupEndRequired is set to true. If this backup_label
 * was created during recovery, *backupFromStandby is set to true.
 */
static bool
read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
				  bool *backupFromStandby)
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli_from_walseg,	/* TLI embedded in the WAL segment name */
				tli_from_file;		/* TLI from the START TIMELINE line */
	FILE	   *lfp;
	char		ch;
	char		backuptype[20];
	char		backupfrom[20];
	char		backuplabel[MAXPGPATH];
	char		backuptime[128];
	uint32		hi,
				lo;

	/* Default the optional outputs before any early return */
	*backupEndRequired = false;
	*backupFromStandby = false;

	/*
	 * See if label file is present
	 */
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
	 * is pretty crude, but we are not expecting any variability in the file
	 * format).
	 *
	 * The "%08X%16s" split reads the 8-hex-digit timeline ID off the front
	 * of the 24-character WAL segment file name, leaving the remainder in
	 * startxlogfilename.
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
			   &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	RedoStartLSN = ((uint64) hi) << 32 | lo;
	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &hi, &lo, &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	*checkPointLoc = ((uint64) hi) << 32 | lo;

	/*
	 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
	 * from an older backup anyway, but since the information on it is not
	 * strictly required, don't error out if it's missing for some reason.
	 */
	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
	{
		if (strcmp(backuptype, "streamed") == 0)
			*backupEndRequired = true;
	}

	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
	{
		if (strcmp(backupfrom, "standby") == 0)
			*backupFromStandby = true;
	}

	/*
	 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
	 * but checking for their presence is useful for debugging and the next
	 * sanity checks. Cope also with the fact that the result buffers have a
	 * pre-allocated size, hence if the backup_label file has been generated
	 * with strings longer than the maximum assumed here an incorrect parsing
	 * happens. That's fine as only minor consistency checks are done
	 * afterwards.
	 */
	if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
		ereport(DEBUG1,
				(errmsg("backup time %s in file \"%s\"",
						backuptime, BACKUP_LABEL_FILE)));

	if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
		ereport(DEBUG1,
				(errmsg("backup label %s in file \"%s\"",
						backuplabel, BACKUP_LABEL_FILE)));

	/*
	 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
	 * it as a sanity check if present: it must agree with the TLI parsed out
	 * of the WAL segment name above.
	 */
	if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
	{
		if (tli_from_walseg != tli_from_file)
			ereport(FATAL,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
					 errdetail("Timeline ID parsed is %u, but expected %u.",
							   tli_from_file, tli_from_walseg)));

		ereport(DEBUG1,
				(errmsg("backup timeline %u in file \"%s\"",
						tli_from_file, BACKUP_LABEL_FILE)));
	}

	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						BACKUP_LABEL_FILE)));

	return true;
}
11579
/*
 * read_tablespace_map: check to see if a tablespace_map file is present
 *
 * If we see a tablespace_map file during recovery, we assume that we are
 * recovering from a backup dump file, and we therefore need to create symlinks
 * as per the information present in tablespace_map file.
 *
 * Returns true if a tablespace_map file was found (and fills the link
 * information for all the tablespace links present in file); returns false
 * if not.
 */
static bool
read_tablespace_map(List **tablespaces)
{
	tablespaceinfo *ti;
	FILE	   *lfp;
	char		tbsoid[MAXPGPATH];	/* tablespace OID, as a string */
	char	   *tbslinkpath;		/* points into str, after the OID */
	char		str[MAXPGPATH];		/* accumulates one logical line */
	int			ch,
				prev_ch = -1,
				i = 0,				/* current fill position in str */
				n;					/* chars consumed by the OID token */

	/*
	 * See if tablespace_map file is present
	 */
	lfp = AllocateFile(TABLESPACE_MAP, "r");
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							TABLESPACE_MAP)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the link name and path lines from tablespace_map file
	 * (this code is pretty crude, but we are not expecting any variability in
	 * the file format). While taking backup we embed escape character '\\'
	 * before newline in tablespace path, so that during reading of
	 * tablespace_map file, we could distinguish newline in tablespace path
	 * and end of line. Now while reading tablespace_map file, remove the
	 * escape character that has been added in tablespace path during backup.
	 */
	while ((ch = fgetc(lfp)) != EOF)
	{
		if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
		{
			/* Unescaped newline: str now holds one complete "OID path" entry */
			str[i] = '\0';
			if (sscanf(str, "%s %n", tbsoid, &n) != 1)
				ereport(FATAL,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
			tbslinkpath = str + n;
			i = 0;

			ti = palloc(sizeof(tablespaceinfo));
			ti->oid = pstrdup(tbsoid);
			ti->path = pstrdup(tbslinkpath);

			*tablespaces = lappend(*tablespaces, ti);
			continue;
		}
		else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
			/* Escaped newline inside a path: replace the '\\' with it */
			str[i - 1] = ch;
		else if (i < sizeof(str) - 1)
			/* Ordinary character; silently truncate over-long lines */
			str[i++] = ch;
		prev_ch = ch;
	}

	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						TABLESPACE_MAP)));

	return true;
}
11661
11662 /*
11663 * Error context callback for errors occurring during rm_redo().
11664 */
11665 static void
rm_redo_error_callback(void * arg)11666 rm_redo_error_callback(void *arg)
11667 {
11668 XLogReaderState *record = (XLogReaderState *) arg;
11669 StringInfoData buf;
11670
11671 initStringInfo(&buf);
11672 xlog_outdesc(&buf, record);
11673
11674 /* translator: %s is a WAL record description */
11675 errcontext("WAL redo at %X/%X for %s",
11676 (uint32) (record->ReadRecPtr >> 32),
11677 (uint32) record->ReadRecPtr,
11678 buf.data);
11679
11680 pfree(buf.data);
11681 }
11682
11683 /*
11684 * BackupInProgress: check if online backup mode is active
11685 *
11686 * This is done by checking for existence of the "backup_label" file.
11687 */
11688 bool
BackupInProgress(void)11689 BackupInProgress(void)
11690 {
11691 struct stat stat_buf;
11692
11693 return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11694 }
11695
11696 /*
11697 * CancelBackup: rename the "backup_label" and "tablespace_map"
11698 * files to cancel backup mode
11699 *
11700 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11701 * Similarly, if the "tablespace_map" file exists, it will be renamed to
11702 * "tablespace_map.old".
11703 *
11704 * Note that this will render an online backup in progress
11705 * useless. To correctly finish an online backup, pg_stop_backup must be
11706 * called.
11707 */
11708 void
CancelBackup(void)11709 CancelBackup(void)
11710 {
11711 struct stat stat_buf;
11712
11713 /* if the backup_label file is not there, return */
11714 if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
11715 return;
11716
11717 /* remove leftover file from previously canceled backup if it exists */
11718 unlink(BACKUP_LABEL_OLD);
11719
11720 if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
11721 {
11722 ereport(WARNING,
11723 (errcode_for_file_access(),
11724 errmsg("online backup mode was not canceled"),
11725 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
11726 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11727 return;
11728 }
11729
11730 /* if the tablespace_map file is not there, return */
11731 if (stat(TABLESPACE_MAP, &stat_buf) < 0)
11732 {
11733 ereport(LOG,
11734 (errmsg("online backup mode canceled"),
11735 errdetail("File \"%s\" was renamed to \"%s\".",
11736 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11737 return;
11738 }
11739
11740 /* remove leftover file from previously canceled backup if it exists */
11741 unlink(TABLESPACE_MAP_OLD);
11742
11743 if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
11744 {
11745 ereport(LOG,
11746 (errmsg("online backup mode canceled"),
11747 errdetail("Files \"%s\" and \"%s\" were renamed to "
11748 "\"%s\" and \"%s\", respectively.",
11749 BACKUP_LABEL_FILE, TABLESPACE_MAP,
11750 BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
11751 }
11752 else
11753 {
11754 ereport(WARNING,
11755 (errcode_for_file_access(),
11756 errmsg("online backup mode canceled"),
11757 errdetail("File \"%s\" was renamed to \"%s\", but "
11758 "file \"%s\" could not be renamed to \"%s\": %m.",
11759 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
11760 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
11761 }
11762 }
11763
/*
 * Read the XLOG page containing RecPtr into readBuf (if not read already).
 * Returns number of bytes read, if the page is read successfully, or -1
 * in case of errors. When errors occur, they are ereport'ed, but only
 * if they have not been previously reported.
 *
 * This is responsible for restoring files from archive as needed, as well
 * as for waiting for the requested WAL record to arrive in standby mode.
 *
 * 'emode' specifies the log level used for reporting "file not found" or
 * "end of WAL" situations in archive recovery, or in standby mode when a
 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
 * false in those situations, on higher log levels the ereport() won't
 * return.
 *
 * In standby mode, if after a successful return of XLogPageRead() the
 * caller finds the record it's interested in to be broken, it should
 * ereport the error with the level determined by
 * emode_for_corrupt_record(), and then set lastSourceFailed
 * and call XLogPageRead() again with the same arguments. This lets
 * XLogPageRead() to try fetching the record from another source, or to
 * sleep and retry.
 */
static int
XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
			 XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
{
	XLogPageReadPrivate *private =
	(XLogPageReadPrivate *) xlogreader->private_data;
	int			emode = private->emode;
	uint32		targetPageOff;
	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
	int			r;

	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 &&
		!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
	{
		/*
		 * Request a restartpoint if we've replayed too much xlog since the
		 * last one.
		 */
		if (bgwriterLaunched)
		{
			if (XLogCheckpointNeeded(readSegNo))
			{
				(void) GetRedoRecPtr();
				if (XLogCheckpointNeeded(readSegNo))
					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
			}
		}

		/* Done with the old segment; forget where it came from too. */
		close(readFile);
		readFile = -1;
		readSource = 0;
	}

	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM &&
		 receivedUpto < targetPagePtr + reqLen))
	{
		/* Block (in standby mode) until the requested WAL is available. */
		if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
										 private->randAccess,
										 private->fetching_ckpt,
										 targetRecPtr))
		{
			/* Recovery was ended (or failed); reset read state and bail. */
			if (readFile >= 0)
				close(readFile);
			readFile = -1;
			readLen = 0;
			readSource = 0;

			return -1;
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from master, calculate how
	 * much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
			readLen = XLOG_BLCKSZ;
		else
			readLen = XLogSegmentOffset(receivedUpto, wal_segment_size) -
				targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	/* Read the requested page (always a full XLOG_BLCKSZ bytes) */
	readOff = targetPageOff;

	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
	if (r != XLOG_BLCKSZ)
	{
		char		fname[MAXFNAMELEN];
		/* save errno before pgstat/XLogFileName calls can clobber it */
		int			save_errno = errno;

		pgstat_report_wait_end();
		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
		if (r < 0)
		{
			/* read error: report with errno restored for %m */
			errno = save_errno;
			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
					(errcode_for_file_access(),
					 errmsg("could not read from log segment %s, offset %u: %m",
							fname, readOff)));
		}
		else
			/* short read: treat as data corruption */
			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
							fname, readOff, r, (Size) XLOG_BLCKSZ)));
		goto next_record_is_invalid;
	}
	pgstat_report_wait_end();

	Assert(targetSegNo == readSegNo);
	Assert(targetPageOff == readOff);
	Assert(reqLen <= readLen);

	*readTLI = curFileTLI;

	/*
	 * Check the page header immediately, so that we can retry immediately if
	 * it's not valid. This may seem unnecessary, because XLogReadRecord()
	 * validates the page header anyway, and would propagate the failure up to
	 * ReadRecord(), which would retry. However, there's a corner case with
	 * continuation records, if a record is split across two pages such that
	 * we would need to read the two pages from different sources. For
	 * example, imagine a scenario where a streaming replica is started up,
	 * and replay reaches a record that's split across two WAL segments. The
	 * first page is only available locally, in pg_wal, because it's already
	 * been recycled in the master. The second page, however, is not present
	 * in pg_wal, and we should stream it from the master. There is a recycled
	 * WAL segment present in pg_wal, with garbage contents, however. We would
	 * read the first page from the local WAL segment, but when reading the
	 * second page, we would read the bogus, recycled, WAL segment. If we
	 * didn't catch that case here, we would never recover, because
	 * ReadRecord() would retry reading the whole record from the beginning.
	 *
	 * Of course, this only catches errors in the page header, which is what
	 * happens in the case of a recycled WAL segment. Other kinds of errors or
	 * corruption still has the same problem. But this at least fixes the
	 * common case, which can happen as part of normal operation.
	 *
	 * Validating the page header is cheap enough that doing it twice
	 * shouldn't be a big deal from a performance point of view.
	 */
	if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
	{
		/* reset any error XLogReaderValidatePageHeader() might have set */
		xlogreader->errormsg_buf[0] = '\0';
		goto next_record_is_invalid;
	}

	return readLen;

next_record_is_invalid:
	/* Common failure exit: mark the source failed and reset read state. */
	lastSourceFailed = true;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return -1;
}
11958
/*
 * Open the WAL segment containing WAL location 'RecPtr'.
 *
 * The segment can be fetched via restore_command, or via walreceiver having
 * streamed the record, or it can already be present in pg_wal. Checking
 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
 * too, in case someone copies a new segment directly to pg_wal. That is not
 * documented or recommended, though.
 *
 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
 * prepare to read WAL starting from RedoStartLSN after this.
 *
 * 'RecPtr' might not point to the beginning of the record we're interested
 * in, it might also point to the page or segment header. In that case,
 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
 * used to decide which timeline to stream the requested WAL from.
 *
 * If the record is not immediately available, the function returns false
 * if we're not in standby mode. In standby mode, waits for it to become
 * available.
 *
 * When the requested record becomes available, the function opens the file
 * containing it (if not open already), and returns true. When end of standby
 * mode is triggered by the user, and there is no more WAL available, returns
 * false.
 */
static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
							bool fetching_ckpt, XLogRecPtr tliRecPtr)
{
	static TimestampTz last_fail_time = 0;
	TimestampTz now;
	/* true once we've told the primary our replay position in this call */
	bool		streaming_reply_sent = false;

	/*-------
	 * Standby mode is implemented by a state machine:
	 *
	 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
	 *	  pg_wal (XLOG_FROM_PG_WAL)
	 * 2. Check trigger file
	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
	 * 4. Rescan timelines
	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
	 *
	 * Failure to read from the current source advances the state machine to
	 * the next state.
	 *
	 * 'currentSource' indicates the current state. There are no currentSource
	 * values for "check trigger", "rescan timelines", and "sleep" states,
	 * those actions are taken when reading from the previous source fails, as
	 * part of advancing to the next state.
	 *
	 * If standby mode is turned off while reading WAL from stream, we move
	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
	 * the files (which would be required at end of recovery, e.g., timeline
	 * history file) from archive or pg_wal. We don't need to kill WAL receiver
	 * here because it's already stopped when standby mode is turned off at
	 * the end of recovery.
	 *-------
	 */
	if (!InArchiveRecovery)
		currentSource = XLOG_FROM_PG_WAL;
	else if (currentSource == 0 ||
			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
	{
		lastSourceFailed = false;
		currentSource = XLOG_FROM_ARCHIVE;
	}

	for (;;)
	{
		/* remember the starting state, to log source switches below */
		int			oldSource = currentSource;

		/*
		 * First check if we failed to read from the current source, and
		 * advance the state machine if so. The failure to read might've
		 * happened outside this function, e.g when a CRC check fails on a
		 * record, or within this loop.
		 */
		if (lastSourceFailed)
		{
			switch (currentSource)
			{
				case XLOG_FROM_ARCHIVE:
				case XLOG_FROM_PG_WAL:

					/*
					 * Check to see if the trigger file exists. Note that we
					 * do this only after failure, so when you create the
					 * trigger file, we still finish replaying as much as we
					 * can from archive and pg_wal before failover.
					 */
					if (StandbyMode && CheckForStandbyTrigger())
					{
						ShutdownWalRcv();
						return false;
					}

					/*
					 * Not in standby mode, and we've now tried the archive
					 * and pg_wal.
					 */
					if (!StandbyMode)
						return false;

					/*
					 * If primary_conninfo is set, launch walreceiver to try
					 * to stream the missing WAL.
					 *
					 * If fetching_ckpt is true, RecPtr points to the initial
					 * checkpoint location. In that case, we use RedoStartLSN
					 * as the streaming start position instead of RecPtr, so
					 * that when we later jump backwards to start redo at
					 * RedoStartLSN, we will have the logs streamed already.
					 */
					if (PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
					{
						XLogRecPtr	ptr;
						TimeLineID	tli;

						if (fetching_ckpt)
						{
							ptr = RedoStartLSN;
							tli = ControlFile->checkPointCopy.ThisTimeLineID;
						}
						else
						{
							ptr = RecPtr;

							/*
							 * Use the record begin position to determine the
							 * TLI, rather than the position we're reading.
							 */
							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);

							if (curFileTLI > 0 && tli < curFileTLI)
								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
									 (uint32) (tliRecPtr >> 32),
									 (uint32) tliRecPtr,
									 tli, curFileTLI);
						}
						curFileTLI = tli;
						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
											 PrimarySlotName);

						/*
						 * Reset receivedUpto so the XLOG_FROM_STREAM code
						 * below re-fetches the flush position from the newly
						 * started walreceiver instead of trusting a stale
						 * value.
						 */
						receivedUpto = 0;
					}

					/*
					 * Move to XLOG_FROM_STREAM state in either case. We'll
					 * get immediate failure if we didn't launch walreceiver,
					 * and move on to the next state.
					 */
					currentSource = XLOG_FROM_STREAM;
					break;

				case XLOG_FROM_STREAM:

					/*
					 * Failure while streaming. Most likely, we got here
					 * because streaming replication was terminated, or
					 * promotion was triggered. But we also get here if we
					 * find an invalid record in the WAL streamed from master,
					 * in which case something is seriously wrong. There's
					 * little chance that the problem will just go away, but
					 * PANIC is not good for availability either, especially
					 * in hot standby mode. So, we treat that the same as
					 * disconnection, and retry from archive/pg_wal again. The
					 * WAL in the archive should be identical to what was
					 * streamed, so it's unlikely that it helps, but one can
					 * hope...
					 */

					/*
					 * We should be able to move to XLOG_FROM_STREAM
					 * only in standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * Before we leave XLOG_FROM_STREAM state, make sure that
					 * walreceiver is not active, so that it won't overwrite
					 * WAL that we restore from archive.
					 */
					if (WalRcvStreaming())
						ShutdownWalRcv();

					/*
					 * Before we sleep, re-scan for possible new timelines if
					 * we were requested to recover to the latest timeline.
					 */
					if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
					{
						/* found a new timeline: retry the archive at once */
						if (rescanLatestTimeLine())
						{
							currentSource = XLOG_FROM_ARCHIVE;
							break;
						}
					}

					/*
					 * XLOG_FROM_STREAM is the last state in our state
					 * machine, so we've exhausted all the options for
					 * obtaining the requested WAL. We're going to loop back
					 * and retry from the archive, but if it hasn't been long
					 * since last attempt, sleep wal_retrieve_retry_interval
					 * milliseconds to avoid busy-waiting.
					 */
					now = GetCurrentTimestamp();
					if (!TimestampDifferenceExceeds(last_fail_time, now,
													wal_retrieve_retry_interval))
					{
						long		wait_time;

						/* sleep only for the remainder of the interval */
						wait_time = wal_retrieve_retry_interval -
							TimestampDifferenceMilliseconds(last_fail_time, now);

						(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
										 WL_LATCH_SET | WL_TIMEOUT |
										 WL_EXIT_ON_PM_DEATH,
										 wait_time,
										 WAIT_EVENT_RECOVERY_WAL_STREAM);
						ResetLatch(&XLogCtl->recoveryWakeupLatch);
						now = GetCurrentTimestamp();

						/* Handle interrupt signals of startup process */
						HandleStartupProcInterrupts();
					}
					last_fail_time = now;
					currentSource = XLOG_FROM_ARCHIVE;
					break;

				default:
					elog(ERROR, "unexpected WAL source %d", currentSource);
			}
		}
		else if (currentSource == XLOG_FROM_PG_WAL)
		{
			/*
			 * We just successfully read a file in pg_wal. We prefer files in
			 * the archive over ones in pg_wal, so try the next file again
			 * from the archive first.
			 */
			if (InArchiveRecovery)
				currentSource = XLOG_FROM_ARCHIVE;
		}

		if (currentSource != oldSource)
			elog(DEBUG2, "switched WAL source from %s to %s after %s",
				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
				 lastSourceFailed ? "failure" : "success");

		/*
		 * We've now handled possible failure. Try to read from the chosen
		 * source.
		 */
		lastSourceFailed = false;

		switch (currentSource)
		{
			case XLOG_FROM_ARCHIVE:
			case XLOG_FROM_PG_WAL:
				/*
				 * WAL receiver must not be running when reading WAL from
				 * archive or pg_wal.
				 */
				Assert(!WalRcvStreaming());

				/* Close any old file we might have open. */
				if (readFile >= 0)
				{
					close(readFile);
					readFile = -1;
				}
				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				/*
				 * Try to restore the file from archive, or read an existing
				 * file from pg_wal.
				 */
				readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
											  currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
											  currentSource);
				if (readFile >= 0)
					return true;	/* success! */

				/*
				 * Nope, not found in archive or pg_wal.
				 */
				lastSourceFailed = true;
				break;

			case XLOG_FROM_STREAM:
				{
					bool		havedata;

					/*
					 * We should be able to move to XLOG_FROM_STREAM
					 * only in standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * Check if WAL receiver is still active.
					 */
					if (!WalRcvStreaming())
					{
						lastSourceFailed = true;
						break;
					}

					/*
					 * Walreceiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk. In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle. When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (RecPtr < receivedUpto)
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						/* ask walreceiver how far it has flushed */
						receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
						if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
						{
							havedata = true;
							if (latestChunkStart <= RecPtr)
							{
								XLogReceiptTime = GetCurrentTimestamp();
								SetCurrentChunkStartTime(XLogReceiptTime);
							}
						}
						else
							havedata = false;
					}
					if (havedata)
					{
						/*
						 * Great, streamed far enough. Open the file if it's
						 * not open already. Also read the timeline history
						 * file if we haven't initialized timeline history
						 * yet; it should be streamed over and present in
						 * pg_wal by now. Use XLOG_FROM_STREAM so that source
						 * info is set correctly and XLogReceiptTime isn't
						 * changed.
						 *
						 * NB: We must set readTimeLineHistory based on
						 * recoveryTargetTLI, not receiveTLI. Normally they'll
						 * be the same, but if recovery_target_timeline is
						 * 'latest' and archiving is configured, then it's
						 * possible that we managed to retrieve one or more
						 * new timeline history files from the archive,
						 * updating recoveryTargetTLI.
						 */
						if (readFile < 0)
						{
							if (!expectedTLEs)
								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
							readFile = XLogFileRead(readSegNo, PANIC,
													receiveTLI,
													XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
							return true;
						}
						break;
					}

					/*
					 * Data not here yet. Check for trigger, then wait for
					 * walreceiver to wake us up when new WAL arrives.
					 */
					if (CheckForStandbyTrigger())
					{
						/*
						 * Note that we don't "return false" immediately here.
						 * After being triggered, we still want to replay all
						 * the WAL that was already streamed. It's in pg_wal
						 * now, so we just treat this as a failure, and the
						 * state machine will move on to replay the streamed
						 * WAL from pg_wal, and then recheck the trigger and
						 * exit replay.
						 */
						lastSourceFailed = true;
						break;
					}

					/*
					 * Since we have replayed everything we have received so
					 * far and are about to start waiting for more WAL, let's
					 * tell the upstream server our replay location now so
					 * that pg_stat_replication doesn't show stale
					 * information.
					 */
					if (!streaming_reply_sent)
					{
						WalRcvForceReply();
						streaming_reply_sent = true;
					}

					/*
					 * Wait for more WAL to arrive. Time out after 5 seconds
					 * to react to a trigger file promptly and to check if the
					 * WAL receiver is still active.
					 */
					(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
									 WL_LATCH_SET | WL_TIMEOUT |
									 WL_EXIT_ON_PM_DEATH,
									 5000L, WAIT_EVENT_RECOVERY_WAL_ALL);
					ResetLatch(&XLogCtl->recoveryWakeupLatch);
					break;
				}

			default:
				elog(ERROR, "unexpected WAL source %d", currentSource);
		}

		/*
		 * This possibly-long loop needs to handle interrupts of startup
		 * process.
		 */
		HandleStartupProcInterrupts();
	}

	return false;				/* not reached */
}
12398
12399 /*
12400 * Determine what log level should be used to report a corrupt WAL record
12401 * in the current WAL page, previously read by XLogPageRead().
12402 *
12403 * 'emode' is the error mode that would be used to report a file-not-found
12404 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
12405 * we're retrying the exact same record that we've tried previously, only
12406 * complain the first time to keep the noise down. However, we only do when
12407 * reading from pg_wal, because we don't expect any invalid records in archive
12408 * or in records streamed from master. Files in the archive should be complete,
12409 * and we should never hit the end of WAL because we stop and wait for more WAL
12410 * to arrive before replaying it.
12411 *
12412 * NOTE: This function remembers the RecPtr value it was last called with,
12413 * to suppress repeated messages about the same record. Only call this when
12414 * you are about to ereport(), or you might cause a later message to be
12415 * erroneously suppressed.
12416 */
12417 static int
emode_for_corrupt_record(int emode,XLogRecPtr RecPtr)12418 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12419 {
12420 static XLogRecPtr lastComplaint = 0;
12421
12422 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12423 {
12424 if (RecPtr == lastComplaint)
12425 emode = DEBUG1;
12426 else
12427 lastComplaint = RecPtr;
12428 }
12429 return emode;
12430 }
12431
12432 /*
12433 * Check to see whether the user-specified trigger file exists and whether a
12434 * promote request has arrived. If either condition holds, return true.
12435 */
12436 static bool
CheckForStandbyTrigger(void)12437 CheckForStandbyTrigger(void)
12438 {
12439 struct stat stat_buf;
12440 static bool triggered = false;
12441
12442 if (triggered)
12443 return true;
12444
12445 if (IsPromoteTriggered())
12446 {
12447 /*
12448 * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
12449 * signal handler. It now leaves the file in place and lets the
12450 * Startup process do the unlink. This allows Startup to know whether
12451 * it should create a full checkpoint before starting up (fallback
12452 * mode). Fast promotion takes precedence.
12453 */
12454 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12455 {
12456 unlink(PROMOTE_SIGNAL_FILE);
12457 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12458 fast_promote = true;
12459 }
12460 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12461 {
12462 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12463 fast_promote = false;
12464 }
12465
12466 ereport(LOG, (errmsg("received promote request")));
12467
12468 ResetPromoteTriggered();
12469 triggered = true;
12470 return true;
12471 }
12472
12473 if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
12474 return false;
12475
12476 if (stat(PromoteTriggerFile, &stat_buf) == 0)
12477 {
12478 ereport(LOG,
12479 (errmsg("promote trigger file found: %s", PromoteTriggerFile)));
12480 unlink(PromoteTriggerFile);
12481 triggered = true;
12482 fast_promote = true;
12483 return true;
12484 }
12485 else if (errno != ENOENT)
12486 ereport(ERROR,
12487 (errcode_for_file_access(),
12488 errmsg("could not stat promote trigger file \"%s\": %m",
12489 PromoteTriggerFile)));
12490
12491 return false;
12492 }
12493
12494 /*
12495 * Remove the files signaling a standby promotion request.
12496 */
12497 void
RemovePromoteSignalFiles(void)12498 RemovePromoteSignalFiles(void)
12499 {
12500 unlink(PROMOTE_SIGNAL_FILE);
12501 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12502 }
12503
12504 /*
12505 * Check to see if a promote request has arrived. Should be
12506 * called by postmaster after receiving SIGUSR1.
12507 */
12508 bool
CheckPromoteSignal(void)12509 CheckPromoteSignal(void)
12510 {
12511 struct stat stat_buf;
12512
12513 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
12514 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12515 return true;
12516
12517 return false;
12518 }
12519
/*
 * Wake up startup process to replay newly arrived WAL, or to notice that
 * failover has been requested.
 *
 * Sets the shared recoveryWakeupLatch that the startup process waits on in
 * WaitForWALToBecomeAvailable(); safe to call from other processes.
 */
void
WakeupRecovery(void)
{
	SetLatch(&XLogCtl->recoveryWakeupLatch);
}
12529
/*
 * Update the WalWriterSleeping flag.
 *
 * The flag lives in shared memory (XLogCtl), so the store is done under
 * info_lck to keep readers consistent.
 */
void
SetWalWriterSleeping(bool sleeping)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->WalWriterSleeping = sleeping;
	SpinLockRelease(&XLogCtl->info_lck);
}
12540
/*
 * Schedule a walreceiver wakeup in the main recovery loop.
 *
 * Just sets a flag; the recovery loop is expected to notice it and force
 * the walreceiver to send a reply. No locking — presumably only the startup
 * process touches this flag (NOTE(review): confirm against readers).
 */
void
XLogRequestWalReceiverReply(void)
{
	doRequestWalReceiverReply = true;
}
12549