1 /*-------------------------------------------------------------------------
2 *
3 * xlog.c
4 * PostgreSQL write-ahead log manager
5 *
6 *
7 * Portions Copyright (c) 1996-2021, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * src/backend/access/transam/xlog.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <math.h>
19 #include <time.h>
20 #include <fcntl.h>
21 #include <sys/stat.h>
22 #include <sys/time.h>
23 #include <unistd.h>
24
25 #include "access/clog.h"
26 #include "access/commit_ts.h"
27 #include "access/heaptoast.h"
28 #include "access/multixact.h"
29 #include "access/rewriteheap.h"
30 #include "access/subtrans.h"
31 #include "access/timeline.h"
32 #include "access/transam.h"
33 #include "access/twophase.h"
34 #include "access/xact.h"
35 #include "access/xlog_internal.h"
36 #include "access/xlogarchive.h"
37 #include "access/xloginsert.h"
38 #include "access/xlogreader.h"
39 #include "access/xlogutils.h"
40 #include "catalog/catversion.h"
41 #include "catalog/pg_control.h"
42 #include "catalog/pg_database.h"
43 #include "commands/progress.h"
44 #include "commands/tablespace.h"
45 #include "common/controldata_utils.h"
46 #include "executor/instrument.h"
47 #include "miscadmin.h"
48 #include "pg_trace.h"
49 #include "pgstat.h"
50 #include "port/atomics.h"
51 #include "port/pg_iovec.h"
52 #include "postmaster/bgwriter.h"
53 #include "postmaster/startup.h"
54 #include "postmaster/walwriter.h"
55 #include "replication/basebackup.h"
56 #include "replication/logical.h"
57 #include "replication/origin.h"
58 #include "replication/slot.h"
59 #include "replication/snapbuild.h"
60 #include "replication/walreceiver.h"
61 #include "replication/walsender.h"
62 #include "storage/bufmgr.h"
63 #include "storage/fd.h"
64 #include "storage/ipc.h"
65 #include "storage/large_object.h"
66 #include "storage/latch.h"
67 #include "storage/pmsignal.h"
68 #include "storage/predicate.h"
69 #include "storage/proc.h"
70 #include "storage/procarray.h"
71 #include "storage/reinit.h"
72 #include "storage/smgr.h"
73 #include "storage/spin.h"
74 #include "storage/sync.h"
75 #include "utils/builtins.h"
76 #include "utils/guc.h"
77 #include "utils/memutils.h"
78 #include "utils/ps_status.h"
79 #include "utils/relmapper.h"
80 #include "utils/pg_rusage.h"
81 #include "utils/snapmgr.h"
82 #include "utils/timestamp.h"
83
extern uint32 bootstrap_data_checksum_version;

/* Unsupported old recovery command file names (relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"

/* User-settable parameters */
int			max_wal_size_mb = 1024; /* 1 GB */
int			min_wal_size_mb = 80;	/* 80 MB */
int			wal_keep_size_mb = 0;	/* in megabytes */
int			XLOGbuffers = -1;		/* -1 means "auto-tune at startup" */
int			XLogArchiveTimeout = 0;
int			XLogArchiveMode = ARCHIVE_MODE_OFF;
char	   *XLogArchiveCommand = NULL;
bool		EnableHotStandby = false;
bool		fullPageWrites = true;
bool		wal_log_hints = false;
bool		wal_compression = false;
char	   *wal_consistency_checking_string = NULL;
bool	   *wal_consistency_checking = NULL;
bool		wal_init_zero = true;
bool		wal_recycle = true;
bool		log_checkpoints = false;
int			sync_method = DEFAULT_SYNC_METHOD;
int			wal_level = WAL_LEVEL_MINIMAL;
int			CommitDelay = 0;	/* precommit delay in microseconds */
int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int			wal_retrieve_retry_interval = 5000; /* presumably milliseconds —
												 * confirm against guc.c */
int			max_slot_wal_keep_size_mb = -1; /* -1 means "no limit" */
bool		track_wal_io_timing = false;

#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif

int			wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
120
121 /*
122 * Number of WAL insertion locks to use. A higher value allows more insertions
123 * to happen concurrently, but adds some CPU overhead to flushing the WAL,
124 * which needs to iterate all the locks.
125 */
126 #define NUM_XLOGINSERT_LOCKS 8
127
128 /*
129 * Max distance from last checkpoint, before triggering a new xlog-based
130 * checkpoint.
131 */
132 int CheckPointSegments;
133
134 /* Estimated distance between checkpoints, in bytes */
135 static double CheckPointDistanceEstimate = 0;
136 static double PrevCheckPointDistance = 0;
137
138 /*
139 * GUC support
140 */
141 const struct config_enum_entry sync_method_options[] = {
142 {"fsync", SYNC_METHOD_FSYNC, false},
143 #ifdef HAVE_FSYNC_WRITETHROUGH
144 {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
145 #endif
146 #ifdef HAVE_FDATASYNC
147 {"fdatasync", SYNC_METHOD_FDATASYNC, false},
148 #endif
149 #ifdef OPEN_SYNC_FLAG
150 {"open_sync", SYNC_METHOD_OPEN, false},
151 #endif
152 #ifdef OPEN_DATASYNC_FLAG
153 {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
154 #endif
155 {NULL, 0, false}
156 };
157
158
159 /*
160 * Although only "on", "off", and "always" are documented,
161 * we accept all the likely variants of "on" and "off".
162 */
163 const struct config_enum_entry archive_mode_options[] = {
164 {"always", ARCHIVE_MODE_ALWAYS, false},
165 {"on", ARCHIVE_MODE_ON, false},
166 {"off", ARCHIVE_MODE_OFF, false},
167 {"true", ARCHIVE_MODE_ON, true},
168 {"false", ARCHIVE_MODE_OFF, true},
169 {"yes", ARCHIVE_MODE_ON, true},
170 {"no", ARCHIVE_MODE_OFF, true},
171 {"1", ARCHIVE_MODE_ON, true},
172 {"0", ARCHIVE_MODE_OFF, true},
173 {NULL, 0, false}
174 };
175
176 const struct config_enum_entry recovery_target_action_options[] = {
177 {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
178 {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
179 {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
180 {NULL, 0, false}
181 };
182
183 /*
184 * Statistics for current checkpoint are collected in this global struct.
185 * Because only the checkpointer or a stand-alone backend can perform
186 * checkpoints, this will be unused in normal backends.
187 */
188 CheckpointStatsData CheckpointStats;
189
190 /*
191 * ThisTimeLineID will be same in all backends --- it identifies current
192 * WAL timeline for the database system.
193 */
194 TimeLineID ThisTimeLineID = 0;
195
196 /*
197 * Are we doing recovery from XLOG?
198 *
199 * This is only ever true in the startup process; it should be read as meaning
200 * "this process is replaying WAL records", rather than "the system is in
201 * recovery mode". It should be examined primarily by functions that need
202 * to act differently when called from a WAL redo function (e.g., to skip WAL
203 * logging). To check whether the system is in recovery regardless of which
204 * process you're running in, use RecoveryInProgress() but only after shared
205 * memory startup and lock initialization.
206 */
207 bool InRecovery = false;
208
209 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
210 HotStandbyState standbyState = STANDBY_DISABLED;
211
212 static XLogRecPtr LastRec;
213
214 /* Local copy of WalRcv->flushedUpto */
215 static XLogRecPtr flushedUpto = 0;
216 static TimeLineID receiveTLI = 0;
217
218 /*
219 * abortedRecPtr is the start pointer of a broken record at end of WAL when
220 * recovery completes; missingContrecPtr is the location of the first
221 * contrecord that went missing. See CreateOverwriteContrecordRecord for
222 * details.
223 */
224 static XLogRecPtr abortedRecPtr;
225 static XLogRecPtr missingContrecPtr;
226
227 /*
228 * During recovery, lastFullPageWrites keeps track of full_page_writes that
229 * the replayed WAL records indicate. It's initialized with full_page_writes
230 * that the recovery starting checkpoint record indicates, and then updated
231 * each time XLOG_FPW_CHANGE record is replayed.
232 */
233 static bool lastFullPageWrites;
234
235 /*
236 * Local copy of the state tracked by SharedRecoveryState in shared memory,
237 * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually
238 * means "not known, need to check the shared state".
239 */
240 static bool LocalRecoveryInProgress = true;
241
242 /*
243 * Local copy of SharedHotStandbyActive variable. False actually means "not
244 * known, need to check the shared state".
245 */
246 static bool LocalHotStandbyActive = false;
247
248 /*
249 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
250 * known, need to check the shared state".
251 */
252 static bool LocalPromoteIsTriggered = false;
253
254 /*
255 * Local state for XLogInsertAllowed():
256 * 1: unconditionally allowed to insert XLOG
257 * 0: unconditionally not allowed to insert XLOG
258 * -1: must check RecoveryInProgress(); disallow until it is false
259 * Most processes start with -1 and transition to 1 after seeing that recovery
260 * is not in progress. But we can also force the value for special cases.
261 * The coding in XLogInsertAllowed() depends on the first two of these states
262 * being numerically the same as bool true and false.
263 */
264 static int LocalXLogInsertAllowed = -1;
265
266 /*
267 * When ArchiveRecoveryRequested is set, archive recovery was requested,
268 * ie. signal files were present. When InArchiveRecovery is set, we are
269 * currently recovering using offline XLOG archives. These variables are only
270 * valid in the startup process.
271 *
272 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
273 * currently performing crash recovery using only XLOG files in pg_wal, but
274 * will switch to using offline XLOG archives as soon as we reach the end of
275 * WAL in pg_wal.
276 */
277 bool ArchiveRecoveryRequested = false;
278 bool InArchiveRecovery = false;
279
280 static bool standby_signal_file_found = false;
281 static bool recovery_signal_file_found = false;
282
283 /* Was the last xlog file restored from archive, or local? */
284 static bool restoredFromArchive = false;
285
286 /* Buffers dedicated to consistency checks of size BLCKSZ */
287 static char *replay_image_masked = NULL;
288 static char *primary_image_masked = NULL;
289
290 /* options formerly taken from recovery.conf for archive recovery */
291 char *recoveryRestoreCommand = NULL;
292 char *recoveryEndCommand = NULL;
293 char *archiveCleanupCommand = NULL;
294 RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
295 bool recoveryTargetInclusive = true;
296 int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
297 TransactionId recoveryTargetXid;
298 char *recovery_target_time_string;
299 static TimestampTz recoveryTargetTime;
300 const char *recoveryTargetName;
301 XLogRecPtr recoveryTargetLSN;
302 int recovery_min_apply_delay = 0;
303
304 /* options formerly taken from recovery.conf for XLOG streaming */
305 bool StandbyModeRequested = false;
306 char *PrimaryConnInfo = NULL;
307 char *PrimarySlotName = NULL;
308 char *PromoteTriggerFile = NULL;
309 bool wal_receiver_create_temp_slot = false;
310
311 /* are we currently in standby mode? */
312 bool StandbyMode = false;
313
314 /*
315 * if recoveryStopsBefore/After returns true, it saves information of the stop
316 * point here
317 */
318 static TransactionId recoveryStopXid;
319 static TimestampTz recoveryStopTime;
320 static XLogRecPtr recoveryStopLSN;
321 static char recoveryStopName[MAXFNAMELEN];
322 static bool recoveryStopAfter;
323
324 /*
325 * During normal operation, the only timeline we care about is ThisTimeLineID.
326 * During recovery, however, things are more complicated. To simplify life
327 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
328 * scan through the WAL history (that is, it is the line that was active when
329 * the currently-scanned WAL record was generated). We also need these
330 * timeline values:
331 *
332 * recoveryTargetTimeLineGoal: what the user requested, if any
333 *
334 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
335 *
336 * recoveryTargetTLI: the currently understood target timeline; changes
337 *
338 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
339 * its known parents, newest first (so recoveryTargetTLI is always the
340 * first list member). Only these TLIs are expected to be seen in the WAL
341 * segments we read, and indeed only these TLIs will be considered as
342 * candidate WAL files to open at all.
343 *
344 * curFileTLI: the TLI appearing in the name of the current input WAL file.
345 * (This is not necessarily the same as ThisTimeLineID, because we could
346 * be scanning data that was copied from an ancestor timeline when the current
347 * file was created.) During a sequential scan we do not allow this value
348 * to decrease.
349 */
350 RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
351 TimeLineID recoveryTargetTLIRequested = 0;
352 TimeLineID recoveryTargetTLI = 0;
353 static List *expectedTLEs;
354 static TimeLineID curFileTLI;
355
356 /*
357 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
358 * current backend. It is updated for all inserts. XactLastRecEnd points to
359 * end+1 of the last record, and is reset when we end a top-level transaction,
360 * or start a new one; so it can be used to tell if the current transaction has
361 * created any XLOG records.
362 *
363 * While in parallel mode, this may not be fully up to date. When committing,
364 * a transaction can assume this covers all xlog records written either by the
365 * user backend or by any parallel worker which was present at any point during
366 * the transaction. But when aborting, or when still in parallel mode, other
367 * parallel backends may have written WAL records at later LSNs than the value
368 * stored here. The parallel leader advances its own copy, when necessary,
369 * in WaitForParallelWorkersToFinish.
370 */
371 XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
372 XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr;
373 XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr;
374
375 /*
376 * RedoRecPtr is this backend's local copy of the REDO record pointer
377 * (which is almost but not quite the same as a pointer to the most recent
378 * CHECKPOINT record). We update this from the shared-memory copy,
379 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
380 * hold an insertion lock). See XLogInsertRecord for details. We are also
381 * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
382 * see GetRedoRecPtr. A freshly spawned backend obtains the value during
383 * InitXLOGAccess.
384 */
385 static XLogRecPtr RedoRecPtr;
386
387 /*
388 * doPageWrites is this backend's local copy of (forcePageWrites ||
389 * fullPageWrites). It is used together with RedoRecPtr to decide whether
390 * a full-page image of a page need to be taken.
391 */
392 static bool doPageWrites;
393
394 /* Has the recovery code requested a walreceiver wakeup? */
395 static bool doRequestWalReceiverReply;
396
397 /*
398 * RedoStartLSN points to the checkpoint's REDO location which is specified
399 * in a backup label file, backup history file or control file. In standby
400 * mode, XLOG streaming usually starts from the position where an invalid
401 * record was found. But if we fail to read even the initial checkpoint
402 * record, we use the REDO location instead of the checkpoint location as
403 * the start position of XLOG streaming. Otherwise we would have to jump
404 * backwards to the REDO location after reading the checkpoint record,
405 * because the REDO record can precede the checkpoint record.
406 */
407 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
408
409 /*----------
410 * Shared-memory data structures for XLOG control
411 *
412 * LogwrtRqst indicates a byte position that we need to write and/or fsync
413 * the log up to (all records before that point must be written or fsynced).
414 * LogwrtResult indicates the byte positions we have already written/fsynced.
415 * These structs are identical but are declared separately to indicate their
416 * slightly different functions.
417 *
418 * To read XLogCtl->LogwrtResult, you must hold either info_lck or
419 * WALWriteLock. To update it, you need to hold both locks. The point of
420 * this arrangement is that the value can be examined by code that already
421 * holds WALWriteLock without needing to grab info_lck as well. In addition
422 * to the shared variable, each backend has a private copy of LogwrtResult,
423 * which is updated when convenient.
424 *
425 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
426 * (protected by info_lck), but we don't need to cache any copies of it.
427 *
428 * info_lck is only held long enough to read/update the protected variables,
429 * so it's a plain spinlock. The other locks are held longer (potentially
430 * over I/O operations), so we use LWLocks for them. These locks are:
431 *
432 * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
433 * It is only held while initializing and changing the mapping. If the
434 * contents of the buffer being replaced haven't been written yet, the mapping
435 * lock is released while the write is done, and reacquired afterwards.
436 *
437 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
438 * XLogFlush).
439 *
440 * ControlFileLock: must be held to read/update control file or create
441 * new log file.
442 *
443 *----------
444 */
445
446 typedef struct XLogwrtRqst
447 {
448 XLogRecPtr Write; /* last byte + 1 to write out */
449 XLogRecPtr Flush; /* last byte + 1 to flush */
450 } XLogwrtRqst;
451
452 typedef struct XLogwrtResult
453 {
454 XLogRecPtr Write; /* last byte + 1 written out */
455 XLogRecPtr Flush; /* last byte + 1 flushed */
456 } XLogwrtResult;
457
458 /*
459 * Inserting to WAL is protected by a small fixed number of WAL insertion
460 * locks. To insert to the WAL, you must hold one of the locks - it doesn't
461 * matter which one. To lock out other concurrent insertions, you must hold
462 * of them. Each WAL insertion lock consists of a lightweight lock, plus an
463 * indicator of how far the insertion has progressed (insertingAt).
464 *
465 * The insertingAt values are read when a process wants to flush WAL from
466 * the in-memory buffers to disk, to check that all the insertions to the
467 * region the process is about to write out have finished. You could simply
468 * wait for all currently in-progress insertions to finish, but the
469 * insertingAt indicator allows you to ignore insertions to later in the WAL,
470 * so that you only wait for the insertions that are modifying the buffers
471 * you're about to write out.
472 *
473 * This isn't just an optimization. If all the WAL buffers are dirty, an
474 * inserter that's holding a WAL insert lock might need to evict an old WAL
475 * buffer, which requires flushing the WAL. If it's possible for an inserter
476 * to block on another inserter unnecessarily, deadlock can arise when two
477 * inserters holding a WAL insert lock wait for each other to finish their
478 * insertion.
479 *
480 * Small WAL records that don't cross a page boundary never update the value,
481 * the WAL record is just copied to the page and the lock is released. But
482 * to avoid the deadlock-scenario explained above, the indicator is always
483 * updated before sleeping while holding an insertion lock.
484 *
485 * lastImportantAt contains the LSN of the last important WAL record inserted
486 * using a given lock. This value is used to detect if there has been
487 * important WAL activity since the last time some action, like a checkpoint,
488 * was performed - allowing to not repeat the action if not. The LSN is
489 * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
490 * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
491 * records. Tracking the WAL activity directly in WALInsertLock has the
492 * advantage of not needing any additional locks to update the value.
493 */
494 typedef struct
495 {
496 LWLock lock;
497 XLogRecPtr insertingAt;
498 XLogRecPtr lastImportantAt;
499 } WALInsertLock;
500
501 /*
502 * All the WAL insertion locks are allocated as an array in shared memory. We
503 * force the array stride to be a power of 2, which saves a few cycles in
504 * indexing, but more importantly also ensures that individual slots don't
505 * cross cache line boundaries. (Of course, we have to also ensure that the
506 * array start address is suitably aligned.)
507 */
508 typedef union WALInsertLockPadded
509 {
510 WALInsertLock l;
511 char pad[PG_CACHE_LINE_SIZE];
512 } WALInsertLockPadded;
513
514 /*
515 * State of an exclusive backup, necessary to control concurrent activities
516 * across sessions when working on exclusive backups.
517 *
518 * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
519 * running, to be more precise pg_start_backup() is not being executed for
520 * an exclusive backup and there is no exclusive backup in progress.
521 * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
522 * exclusive backup.
523 * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
524 * running and an exclusive backup is in progress. pg_stop_backup() is
525 * needed to finish it.
526 * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
527 * exclusive backup.
528 */
529 typedef enum ExclusiveBackupState
530 {
531 EXCLUSIVE_BACKUP_NONE = 0,
532 EXCLUSIVE_BACKUP_STARTING,
533 EXCLUSIVE_BACKUP_IN_PROGRESS,
534 EXCLUSIVE_BACKUP_STOPPING
535 } ExclusiveBackupState;
536
537 /*
538 * Session status of running backup, used for sanity checks in SQL-callable
539 * functions to start and stop backups.
540 */
541 static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
542
543 /*
544 * Shared state data for WAL insertion.
545 */
546 typedef struct XLogCtlInsert
547 {
548 slock_t insertpos_lck; /* protects CurrBytePos and PrevBytePos */
549
550 /*
551 * CurrBytePos is the end of reserved WAL. The next record will be
552 * inserted at that position. PrevBytePos is the start position of the
553 * previously inserted (or rather, reserved) record - it is copied to the
554 * prev-link of the next record. These are stored as "usable byte
555 * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
556 */
557 uint64 CurrBytePos;
558 uint64 PrevBytePos;
559
560 /*
561 * Make sure the above heavily-contended spinlock and byte positions are
562 * on their own cache line. In particular, the RedoRecPtr and full page
563 * write variables below should be on a different cache line. They are
564 * read on every WAL insertion, but updated rarely, and we don't want
565 * those reads to steal the cache line containing Curr/PrevBytePos.
566 */
567 char pad[PG_CACHE_LINE_SIZE];
568
569 /*
570 * fullPageWrites is the authoritative value used by all backends to
571 * determine whether to write full-page image to WAL. This shared value,
572 * instead of the process-local fullPageWrites, is required because, when
573 * full_page_writes is changed by SIGHUP, we must WAL-log it before it
574 * actually affects WAL-logging by backends. Checkpointer sets at startup
575 * or after SIGHUP.
576 *
577 * To read these fields, you must hold an insertion lock. To modify them,
578 * you must hold ALL the locks.
579 */
580 XLogRecPtr RedoRecPtr; /* current redo point for insertions */
581 bool forcePageWrites; /* forcing full-page writes for PITR? */
582 bool fullPageWrites;
583
584 /*
585 * exclusiveBackupState indicates the state of an exclusive backup (see
586 * comments of ExclusiveBackupState for more details). nonExclusiveBackups
587 * is a counter indicating the number of streaming base backups currently
588 * in progress. forcePageWrites is set to true when either of these is
589 * non-zero. lastBackupStart is the latest checkpoint redo location used
590 * as a starting point for an online backup.
591 */
592 ExclusiveBackupState exclusiveBackupState;
593 int nonExclusiveBackups;
594 XLogRecPtr lastBackupStart;
595
596 /*
597 * WAL insertion locks.
598 */
599 WALInsertLockPadded *WALInsertLocks;
600 } XLogCtlInsert;
601
602 /*
603 * Total shared-memory state for XLOG.
604 */
605 typedef struct XLogCtlData
606 {
607 XLogCtlInsert Insert;
608
609 /* Protected by info_lck: */
610 XLogwrtRqst LogwrtRqst;
611 XLogRecPtr RedoRecPtr; /* a recent copy of Insert->RedoRecPtr */
612 FullTransactionId ckptFullXid; /* nextXid of latest checkpoint */
613 XLogRecPtr asyncXactLSN; /* LSN of newest async commit/abort */
614 XLogRecPtr replicationSlotMinLSN; /* oldest LSN needed by any slot */
615
616 XLogSegNo lastRemovedSegNo; /* latest removed/recycled XLOG segment */
617
618 /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
619 XLogRecPtr unloggedLSN;
620 slock_t ulsn_lck;
621
622 /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
623 pg_time_t lastSegSwitchTime;
624 XLogRecPtr lastSegSwitchLSN;
625
626 /*
627 * Protected by info_lck and WALWriteLock (you must hold either lock to
628 * read it, but both to update)
629 */
630 XLogwrtResult LogwrtResult;
631
632 /*
633 * Latest initialized page in the cache (last byte position + 1).
634 *
635 * To change the identity of a buffer (and InitializedUpTo), you need to
636 * hold WALBufMappingLock. To change the identity of a buffer that's
637 * still dirty, the old page needs to be written out first, and for that
638 * you need WALWriteLock, and you need to ensure that there are no
639 * in-progress insertions to the page by calling
640 * WaitXLogInsertionsToFinish().
641 */
642 XLogRecPtr InitializedUpTo;
643
644 /*
645 * These values do not change after startup, although the pointed-to pages
646 * and xlblocks values certainly do. xlblocks values are protected by
647 * WALBufMappingLock.
648 */
649 char *pages; /* buffers for unwritten XLOG pages */
650 XLogRecPtr *xlblocks; /* 1st byte ptr-s + XLOG_BLCKSZ */
651 int XLogCacheBlck; /* highest allocated xlog buffer index */
652
653 /*
654 * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
655 * If we created a new timeline when the system was started up,
656 * PrevTimeLineID is the old timeline's ID that we forked off from.
657 * Otherwise it's equal to ThisTimeLineID.
658 */
659 TimeLineID ThisTimeLineID;
660 TimeLineID PrevTimeLineID;
661
662 /*
663 * SharedRecoveryState indicates if we're still in crash or archive
664 * recovery. Protected by info_lck.
665 */
666 RecoveryState SharedRecoveryState;
667
668 /*
669 * SharedHotStandbyActive indicates if we allow hot standby queries to be
670 * run. Protected by info_lck.
671 */
672 bool SharedHotStandbyActive;
673
674 /*
675 * SharedPromoteIsTriggered indicates if a standby promotion has been
676 * triggered. Protected by info_lck.
677 */
678 bool SharedPromoteIsTriggered;
679
680 /*
681 * WalWriterSleeping indicates whether the WAL writer is currently in
682 * low-power mode (and hence should be nudged if an async commit occurs).
683 * Protected by info_lck.
684 */
685 bool WalWriterSleeping;
686
687 /*
688 * recoveryWakeupLatch is used to wake up the startup process to continue
689 * WAL replay, if it is waiting for WAL to arrive or failover trigger file
690 * to appear.
691 *
692 * Note that the startup process also uses another latch, its procLatch,
693 * to wait for recovery conflict. If we get rid of recoveryWakeupLatch for
694 * signaling the startup process in favor of using its procLatch, which
695 * comports better with possible generic signal handlers using that latch.
696 * But we should not do that because the startup process doesn't assume
697 * that it's waken up by walreceiver process or SIGHUP signal handler
698 * while it's waiting for recovery conflict. The separate latches,
699 * recoveryWakeupLatch and procLatch, should be used for inter-process
700 * communication for WAL replay and recovery conflict, respectively.
701 */
702 Latch recoveryWakeupLatch;
703
704 /*
705 * During recovery, we keep a copy of the latest checkpoint record here.
706 * lastCheckPointRecPtr points to start of checkpoint record and
707 * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
708 * checkpointer when it wants to create a restartpoint.
709 *
710 * Protected by info_lck.
711 */
712 XLogRecPtr lastCheckPointRecPtr;
713 XLogRecPtr lastCheckPointEndPtr;
714 CheckPoint lastCheckPoint;
715
716 /*
717 * lastReplayedEndRecPtr points to end+1 of the last record successfully
718 * replayed. When we're currently replaying a record, ie. in a redo
719 * function, replayEndRecPtr points to the end+1 of the record being
720 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
721 */
722 XLogRecPtr lastReplayedEndRecPtr;
723 TimeLineID lastReplayedTLI;
724 XLogRecPtr replayEndRecPtr;
725 TimeLineID replayEndTLI;
726 /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
727 TimestampTz recoveryLastXTime;
728
729 /*
730 * timestamp of when we started replaying the current chunk of WAL data,
731 * only relevant for replication or archive recovery
732 */
733 TimestampTz currentChunkStartTime;
734 /* Recovery pause state */
735 RecoveryPauseState recoveryPauseState;
736 ConditionVariable recoveryNotPausedCV;
737
738 /*
739 * lastFpwDisableRecPtr points to the start of the last replayed
740 * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
741 */
742 XLogRecPtr lastFpwDisableRecPtr;
743
744 slock_t info_lck; /* locks shared variables shown above */
745 } XLogCtlData;
746
747 static XLogCtlData *XLogCtl = NULL;
748
749 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
750 static WALInsertLockPadded *WALInsertLocks = NULL;
751
752 /*
753 * We maintain an image of pg_control in shared memory.
754 */
755 static ControlFileData *ControlFile = NULL;
756
757 /*
758 * Calculate the amount of space left on the page after 'endptr'. Beware
759 * multiple evaluation!
760 */
761 #define INSERT_FREESPACE(endptr) \
762 (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
763
764 /* Macro to advance to next buffer index. */
765 #define NextBufIdx(idx) \
766 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
767
768 /*
769 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
770 * would hold if it was in cache, the page containing 'recptr'.
771 */
772 #define XLogRecPtrToBufIdx(recptr) \
773 (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
774
775 /*
776 * These are the number of bytes in a WAL page usable for WAL data.
777 */
778 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
779
780 /*
781 * Convert values of GUCs measured in megabytes to equiv. segment count.
782 * Rounds down.
783 */
784 #define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize))
785
786 /* The number of bytes in a WAL segment usable for WAL data. */
787 static int UsableBytesInSegment;
788
789 /*
790 * Private, possibly out-of-date copy of shared LogwrtResult.
791 * See discussion above.
792 */
793 static XLogwrtResult LogwrtResult = {0, 0};
794
795 /*
796 * Codes indicating where we got a WAL file from during recovery, or where
797 * to attempt to get one.
798 */
799 typedef enum
800 {
801 XLOG_FROM_ANY = 0, /* request to read WAL from any source */
802 XLOG_FROM_ARCHIVE, /* restored using restore_command */
803 XLOG_FROM_PG_WAL, /* existing file in pg_wal */
804 XLOG_FROM_STREAM /* streamed from primary */
805 } XLogSource;
806
807 /* human-readable names for XLogSources, for debugging output */
808 static const char *const xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
809
810 /*
811 * openLogFile is -1 or a kernel FD for an open log file segment.
812 * openLogSegNo identifies the segment. These variables are only used to
813 * write the XLOG, and so will normally refer to the active segment.
814 * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
815 */
816 static int openLogFile = -1;
817 static XLogSegNo openLogSegNo = 0;
818
819 /*
820 * These variables are used similarly to the ones above, but for reading
821 * the XLOG. readOff is the offset of the page just read, readLen
822 * indicates how much of it has been read into readBuf, and readSource
823 * indicates where we got the currently open file from.
824 * Note: we could use Reserve/ReleaseExternalFD to track consumption of
825 * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
826 * not read by general-purpose sessions.
827 */
828 static int readFile = -1;
829 static XLogSegNo readSegNo = 0;
830 static uint32 readOff = 0;
831 static uint32 readLen = 0;
832 static XLogSource readSource = XLOG_FROM_ANY;
833
834 /*
835 * Keeps track of which source we're currently reading from. This is
836 * different from readSource in that this is always set, even when we don't
837 * currently have a WAL file open. If lastSourceFailed is set, our last
838 * attempt to read from currentSource failed, and we should try another source
839 * next.
840 *
841 * pendingWalRcvRestart is set when a config change occurs that requires a
842 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
843 */
844 static XLogSource currentSource = XLOG_FROM_ANY;
845 static bool lastSourceFailed = false;
846 static bool pendingWalRcvRestart = false;
847
848 typedef struct XLogPageReadPrivate
849 {
850 int emode;
851 bool fetching_ckpt; /* are we fetching a checkpoint record? */
852 bool randAccess;
853 } XLogPageReadPrivate;
854
855 /*
856 * These variables track when we last obtained some WAL data to process,
857 * and where we got it from. (XLogReceiptSource is initially the same as
858 * readSource, but readSource gets reset to zero when we don't have data
859 * to process right now. It is also different from currentSource, which
860 * also changes when we try to read from a source and fail, while
861 * XLogReceiptSource tracks where we last successfully read some WAL.)
862 */
863 static TimestampTz XLogReceiptTime = 0;
864 static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
865
866 /* State information for XLOG reading */
867 static XLogRecPtr ReadRecPtr; /* start of last record read */
868 static XLogRecPtr EndRecPtr; /* end+1 of last record read */
869
870 /*
871 * Local copies of equivalent fields in the control file. When running
872 * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
873 * expect to replay all the WAL available, and updateMinRecoveryPoint is
874 * switched to false to prevent any updates while replaying records.
875 * Those values are kept consistent as long as crash recovery runs.
876 */
877 static XLogRecPtr minRecoveryPoint;
878 static TimeLineID minRecoveryPointTLI;
879 static bool updateMinRecoveryPoint = true;
880
881 /*
882 * Have we reached a consistent database state? In crash recovery, we have
883 * to replay all the WAL, so reachedConsistency is never set. During archive
884 * recovery, the database is consistent once minRecoveryPoint is reached.
885 */
886 bool reachedConsistency = false;
887
888 static bool InRedo = false;
889
890 /* Have we launched bgwriter during recovery? */
891 static bool bgwriterLaunched = false;
892
893 /* For WALInsertLockAcquire/Release functions */
894 static int MyLockNo = 0;
895 static bool holdingAllLocks = false;
896
897 #ifdef WAL_DEBUG
898 static MemoryContext walDebugCxt = NULL;
899 #endif
900
/* Forward declarations: recovery control and checkpoint-related routines */
static void readRecoverySignalFile(void);
static void validateRecoveryParameters(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
static bool recoveryStopsBefore(XLogReaderState *record);
static bool recoveryStopsAfter(XLogReaderState *record);
static void ConfirmRecoveryPaused(void);
static void recoveryPausesHere(bool endOfRecovery);
static bool recoveryApplyDelay(XLogReaderState *record);
static void SetLatestXTime(TimestampTz xtime);
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
								TimeLineID prevTLI);
static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
									  XLogReaderState *state);
static void LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);

/* Forward declarations: WAL file management and page reading */
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
								   bool find_free, XLogSegNo max_segno,
								   bool use_lock);
static int	XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
						 XLogSource source, bool notfoundOk);
static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
static int	XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
						 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
										bool fetching_ckpt, XLogRecPtr tliRecPtr);
static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveTempXlogFiles(void);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr);
static void RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
						   XLogSegNo *endlogSegNo);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
							  int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
										XLogRecPtr RecPtr, int whichChkpt, bool report);
static bool rescanLatestTimeLine(void);
static void InitControlFile(uint64 sysidentifier);
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(pg_time_t tnow);
static void SetPromoteIsTriggered(void);
static bool CheckForStandbyTrigger(void);

#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
#endif
static void xlog_block_info(StringInfo buf, XLogReaderState *record);
static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
static void pg_start_backup_callback(int code, Datum arg);
static void pg_stop_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
							  bool *backupEndRequired, bool *backupFromStandby);
static bool read_tablespace_map(List **tablespaces);

static void rm_redo_error_callback(void *arg);
static int	get_sync_bit(int method);

/* Forward declarations: low-level WAL insertion primitives */
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
								XLogRecData *rdata,
								XLogRecPtr StartPos, XLogRecPtr EndPos);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
									  XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
							  XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
static void checkXLogConsistency(XLogReaderState *record);

/* Forward declarations: WAL insertion lock management */
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
993
/*
 * Insert an XLOG record represented by an already-constructed chain of data
 * chunks.  This is a low-level routine; to construct the WAL record header
 * and data, use the higher-level routines in xloginsert.c.
 *
 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 * WAL record applies to, that were not included in the record as full page
 * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
 * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 * record is always inserted.
 *
 * 'flags' gives more in-depth control on the record being inserted. See
 * XLogSetRecordFlags() for details.
 *
 * 'num_fpi' is the number of full-page images included in the record; it is
 * only used to update the WAL usage instrumentation counters.
 *
 * The first XLogRecData in the chain must be for the record header, and its
 * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 * xl_crc fields in the header, the rest of the header must already be filled
 * by the caller.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata,
				 XLogRecPtr fpw_lsn,
				 uint8 flags,
				 int num_fpi)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	pg_crc32c	rdata_crc;
	bool		inserted;
	XLogRecord *rechdr = (XLogRecord *) rdata->data;
	uint8		info = rechdr->xl_info & ~XLR_INFO_MASK;
	bool		isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
							   info == XLOG_SWITCH);
	XLogRecPtr	StartPos;
	XLogRecPtr	EndPos;
	bool		prevDoPageWrites = doPageWrites;

	/* we assume that all of the record header is in the first chunk */
	Assert(rdata->len >= SizeOfXLogRecord);

	/* cross-check on whether we should be here or not */
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");

	/*----------
	 *
	 * We have now done all the preparatory work we can without holding a
	 * lock or modifying shared state. From here on, inserting the new WAL
	 * record to the shared WAL buffer cache is a two-step process:
	 *
	 * 1. Reserve the right amount of space from the WAL. The current head of
	 *	  reserved space is kept in Insert->CurrBytePos, and is protected by
	 *	  insertpos_lck.
	 *
	 * 2. Copy the record to the reserved WAL space. This involves finding the
	 *	  correct WAL buffer containing the reserved space, and copying the
	 *	  record in place. This can be done concurrently in multiple processes.
	 *
	 * To keep track of which insertions are still in-progress, each concurrent
	 * inserter acquires an insertion lock. In addition to just indicating that
	 * an insertion is in progress, the lock tells others how far the inserter
	 * has progressed. There is a small fixed number of insertion locks,
	 * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
	 * boundary, it updates the value stored in the lock to the how far it has
	 * inserted, to allow the previous buffer to be flushed.
	 *
	 * Holding onto an insertion lock also protects RedoRecPtr and
	 * fullPageWrites from changing until the insertion is finished.
	 *
	 * Step 2 can usually be done completely in parallel. If the required WAL
	 * page is not initialized yet, you have to grab WALBufMappingLock to
	 * initialize it, but the WAL writer tries to do that ahead of insertions
	 * to avoid that from happening in the critical path.
	 *
	 *----------
	 */
	START_CRIT_SECTION();
	if (isLogSwitch)
		WALInsertLockAcquireExclusive();	/* switch must block all inserters */
	else
		WALInsertLockAcquire();

	/*
	 * Check to see if my copy of RedoRecPtr is out of date. If so, may have
	 * to go back and have the caller recompute everything. This can only
	 * happen just after a checkpoint, so it's better to be slow in this case
	 * and fast otherwise.
	 *
	 * Also check to see if fullPageWrites or forcePageWrites was just turned
	 * on; if we weren't already doing full-page writes then go back and
	 * recompute.
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.  (If doPageWrites was just turned off,
	 * we could recompute the record without full pages, but we choose not to
	 * bother.)
	 */
	if (RedoRecPtr != Insert->RedoRecPtr)
	{
		Assert(RedoRecPtr < Insert->RedoRecPtr);
		RedoRecPtr = Insert->RedoRecPtr;
	}
	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

	if (doPageWrites &&
		(!prevDoPageWrites ||
		 (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
	{
		/*
		 * Oops, some buffer now needs to be backed up that the caller didn't
		 * back up.  Start over.
		 */
		WALInsertLockRelease();
		END_CRIT_SECTION();
		return InvalidXLogRecPtr;
	}

	/*
	 * Reserve space for the record in the WAL. This also sets the xl_prev
	 * pointer.
	 */
	if (isLogSwitch)
		inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
	else
	{
		ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
								  &rechdr->xl_prev);
		inserted = true;
	}

	if (inserted)
	{
		/*
		 * Now that xl_prev has been filled in, calculate CRC of the record
		 * header.
		 */
		rdata_crc = rechdr->xl_crc;
		COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
		FIN_CRC32C(rdata_crc);
		rechdr->xl_crc = rdata_crc;

		/*
		 * All the record data, including the header, is now ready to be
		 * inserted. Copy the record in the space reserved.
		 */
		CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
							StartPos, EndPos);

		/*
		 * Unless record is flagged as not important, update LSN of last
		 * important record in the current slot. When holding all locks, just
		 * update the first one.
		 */
		if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
		{
			int			lockno = holdingAllLocks ? 0 : MyLockNo;

			WALInsertLocks[lockno].l.lastImportantAt = StartPos;
		}
	}
	else
	{
		/*
		 * This was an xlog-switch record, but the current insert location was
		 * already exactly at the beginning of a segment, so there was no need
		 * to do anything.
		 */
	}

	/*
	 * Done! Let others know that we're finished.
	 */
	WALInsertLockRelease();

	MarkCurrentTransactionIdLoggedIfAny();

	END_CRIT_SECTION();

	/*
	 * Update shared LogwrtRqst.Write, if we crossed page boundary.
	 */
	if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		/* advance global request to include new block(s) */
		if (XLogCtl->LogwrtRqst.Write < EndPos)
			XLogCtl->LogwrtRqst.Write = EndPos;
		/* update local result copy while I have the chance */
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);
	}

	/*
	 * If this was an XLOG_SWITCH record, flush the record and the empty
	 * padding space that fills the rest of the segment, and perform
	 * end-of-segment actions (eg, notifying archiver).
	 */
	if (isLogSwitch)
	{
		TRACE_POSTGRESQL_WAL_SWITCH();
		XLogFlush(EndPos);

		/*
		 * Even though we reserved the rest of the segment for us, which is
		 * reflected in EndPos, we return a pointer to just the end of the
		 * xlog-switch record.
		 */
		if (inserted)
		{
			EndPos = StartPos + SizeOfXLogRecord;
			if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
			{
				/* record header spilled onto the next page: account for its
				 * page header (long variant at a segment boundary) */
				uint64		offset = XLogSegmentOffset(EndPos, wal_segment_size);

				if (offset == EndPos % XLOG_BLCKSZ)
					EndPos += SizeOfXLogLongPHD;
				else
					EndPos += SizeOfXLogShortPHD;
			}
		}
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
	{
		static XLogReaderState *debug_reader = NULL;
		StringInfoData buf;
		StringInfoData recordBuf;
		char	   *errormsg = NULL;
		MemoryContext oldCxt;

		oldCxt = MemoryContextSwitchTo(walDebugCxt);

		initStringInfo(&buf);
		appendStringInfo(&buf, "INSERT @ %X/%X: ", LSN_FORMAT_ARGS(EndPos));

		/*
		 * We have to piece together the WAL record data from the XLogRecData
		 * entries, so that we can pass it to the rm_desc function as one
		 * contiguous chunk.
		 */
		initStringInfo(&recordBuf);
		for (; rdata != NULL; rdata = rdata->next)
			appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

		if (!debug_reader)
			debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
											  XL_ROUTINE(), NULL);

		if (!debug_reader)
		{
			appendStringInfoString(&buf, "error decoding record: out of memory");
		}
		else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
								   &errormsg))
		{
			appendStringInfo(&buf, "error decoding record: %s",
							 errormsg ? errormsg : "no error message");
		}
		else
		{
			appendStringInfoString(&buf, " - ");
			xlog_outdesc(&buf, debug_reader);
		}
		elog(LOG, "%s", buf.data);

		pfree(buf.data);
		pfree(recordBuf.data);
		MemoryContextSwitchTo(oldCxt);
	}
#endif

	/*
	 * Update our global variables
	 */
	ProcLastRecPtr = StartPos;
	XactLastRecEnd = EndPos;

	/* Report WAL traffic to the instrumentation. */
	if (inserted)
	{
		pgWalUsage.wal_bytes += rechdr->xl_tot_len;
		pgWalUsage.wal_records++;
		pgWalUsage.wal_fpi += num_fpi;
	}

	return EndPos;
}
1289
1290 /*
1291 * Reserves the right amount of space for a record of given size from the WAL.
1292 * *StartPos is set to the beginning of the reserved section, *EndPos to
1293 * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1294 * used to set the xl_prev of this record.
1295 *
1296 * This is the performance critical part of XLogInsert that must be serialized
1297 * across backends. The rest can happen mostly in parallel. Try to keep this
1298 * section as short as possible, insertpos_lck can be heavily contended on a
1299 * busy system.
1300 *
1301 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1302 * where we actually copy the record to the reserved space.
1303 */
1304 static void
ReserveXLogInsertLocation(int size,XLogRecPtr * StartPos,XLogRecPtr * EndPos,XLogRecPtr * PrevPtr)1305 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1306 XLogRecPtr *PrevPtr)
1307 {
1308 XLogCtlInsert *Insert = &XLogCtl->Insert;
1309 uint64 startbytepos;
1310 uint64 endbytepos;
1311 uint64 prevbytepos;
1312
1313 size = MAXALIGN(size);
1314
1315 /* All (non xlog-switch) records should contain data. */
1316 Assert(size > SizeOfXLogRecord);
1317
1318 /*
1319 * The duration the spinlock needs to be held is minimized by minimizing
1320 * the calculations that have to be done while holding the lock. The
1321 * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1322 * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1323 * page headers. The mapping between "usable" byte positions and physical
1324 * positions (XLogRecPtrs) can be done outside the locked region, and
1325 * because the usable byte position doesn't include any headers, reserving
1326 * X bytes from WAL is almost as simple as "CurrBytePos += X".
1327 */
1328 SpinLockAcquire(&Insert->insertpos_lck);
1329
1330 startbytepos = Insert->CurrBytePos;
1331 endbytepos = startbytepos + size;
1332 prevbytepos = Insert->PrevBytePos;
1333 Insert->CurrBytePos = endbytepos;
1334 Insert->PrevBytePos = startbytepos;
1335
1336 SpinLockRelease(&Insert->insertpos_lck);
1337
1338 *StartPos = XLogBytePosToRecPtr(startbytepos);
1339 *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1340 *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1341
1342 /*
1343 * Check that the conversions between "usable byte positions" and
1344 * XLogRecPtrs work consistently in both directions.
1345 */
1346 Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1347 Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1348 Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1349 }
1350
1351 /*
1352 * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1353 *
1354 * A log-switch record is handled slightly differently. The rest of the
1355 * segment will be reserved for this insertion, as indicated by the returned
1356 * *EndPos value. However, if we are already at the beginning of the current
1357 * segment, *StartPos and *EndPos are set to the current location without
1358 * reserving any space, and the function returns false.
1359 */
1360 static bool
ReserveXLogSwitch(XLogRecPtr * StartPos,XLogRecPtr * EndPos,XLogRecPtr * PrevPtr)1361 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1362 {
1363 XLogCtlInsert *Insert = &XLogCtl->Insert;
1364 uint64 startbytepos;
1365 uint64 endbytepos;
1366 uint64 prevbytepos;
1367 uint32 size = MAXALIGN(SizeOfXLogRecord);
1368 XLogRecPtr ptr;
1369 uint32 segleft;
1370
1371 /*
1372 * These calculations are a bit heavy-weight to be done while holding a
1373 * spinlock, but since we're holding all the WAL insertion locks, there
1374 * are no other inserters competing for it. GetXLogInsertRecPtr() does
1375 * compete for it, but that's not called very frequently.
1376 */
1377 SpinLockAcquire(&Insert->insertpos_lck);
1378
1379 startbytepos = Insert->CurrBytePos;
1380
1381 ptr = XLogBytePosToEndRecPtr(startbytepos);
1382 if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
1383 {
1384 SpinLockRelease(&Insert->insertpos_lck);
1385 *EndPos = *StartPos = ptr;
1386 return false;
1387 }
1388
1389 endbytepos = startbytepos + size;
1390 prevbytepos = Insert->PrevBytePos;
1391
1392 *StartPos = XLogBytePosToRecPtr(startbytepos);
1393 *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1394
1395 segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
1396 if (segleft != wal_segment_size)
1397 {
1398 /* consume the rest of the segment */
1399 *EndPos += segleft;
1400 endbytepos = XLogRecPtrToBytePos(*EndPos);
1401 }
1402 Insert->CurrBytePos = endbytepos;
1403 Insert->PrevBytePos = startbytepos;
1404
1405 SpinLockRelease(&Insert->insertpos_lck);
1406
1407 *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1408
1409 Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
1410 Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1411 Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1412 Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1413
1414 return true;
1415 }
1416
1417 /*
1418 * Checks whether the current buffer page and backup page stored in the
1419 * WAL record are consistent or not. Before comparing the two pages, a
1420 * masking can be applied to the pages to ignore certain areas like hint bits,
1421 * unused space between pd_lower and pd_upper among other things. This
1422 * function should be called once WAL replay has been completed for a
1423 * given record.
1424 */
1425 static void
checkXLogConsistency(XLogReaderState * record)1426 checkXLogConsistency(XLogReaderState *record)
1427 {
1428 RmgrId rmid = XLogRecGetRmid(record);
1429 RelFileNode rnode;
1430 ForkNumber forknum;
1431 BlockNumber blkno;
1432 int block_id;
1433
1434 /* Records with no backup blocks have no need for consistency checks. */
1435 if (!XLogRecHasAnyBlockRefs(record))
1436 return;
1437
1438 Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
1439
1440 for (block_id = 0; block_id <= record->max_block_id; block_id++)
1441 {
1442 Buffer buf;
1443 Page page;
1444
1445 if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
1446 {
1447 /*
1448 * WAL record doesn't contain a block reference with the given id.
1449 * Do nothing.
1450 */
1451 continue;
1452 }
1453
1454 Assert(XLogRecHasBlockImage(record, block_id));
1455
1456 if (XLogRecBlockImageApply(record, block_id))
1457 {
1458 /*
1459 * WAL record has already applied the page, so bypass the
1460 * consistency check as that would result in comparing the full
1461 * page stored in the record with itself.
1462 */
1463 continue;
1464 }
1465
1466 /*
1467 * Read the contents from the current buffer and store it in a
1468 * temporary page.
1469 */
1470 buf = XLogReadBufferExtended(rnode, forknum, blkno,
1471 RBM_NORMAL_NO_LOG);
1472 if (!BufferIsValid(buf))
1473 continue;
1474
1475 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1476 page = BufferGetPage(buf);
1477
1478 /*
1479 * Take a copy of the local page where WAL has been applied to have a
1480 * comparison base before masking it...
1481 */
1482 memcpy(replay_image_masked, page, BLCKSZ);
1483
1484 /* No need for this page anymore now that a copy is in. */
1485 UnlockReleaseBuffer(buf);
1486
1487 /*
1488 * If the block LSN is already ahead of this WAL record, we can't
1489 * expect contents to match. This can happen if recovery is
1490 * restarted.
1491 */
1492 if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
1493 continue;
1494
1495 /*
1496 * Read the contents from the backup copy, stored in WAL record and
1497 * store it in a temporary page. There is no need to allocate a new
1498 * page here, a local buffer is fine to hold its contents and a mask
1499 * can be directly applied on it.
1500 */
1501 if (!RestoreBlockImage(record, block_id, primary_image_masked))
1502 elog(ERROR, "failed to restore block image");
1503
1504 /*
1505 * If masking function is defined, mask both the primary and replay
1506 * images
1507 */
1508 if (RmgrTable[rmid].rm_mask != NULL)
1509 {
1510 RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
1511 RmgrTable[rmid].rm_mask(primary_image_masked, blkno);
1512 }
1513
1514 /* Time to compare the primary and replay images. */
1515 if (memcmp(replay_image_masked, primary_image_masked, BLCKSZ) != 0)
1516 {
1517 elog(FATAL,
1518 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
1519 rnode.spcNode, rnode.dbNode, rnode.relNode,
1520 forknum, blkno);
1521 }
1522 }
1523 }
1524
/*
 * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
 * area in the WAL.
 *
 * 'write_len' is the total record length; 'rdata' is the chain of data
 * chunks making up the record.  StartPos/EndPos delimit the space that was
 * reserved by ReserveXLogInsertLocation or ReserveXLogSwitch; this routine
 * must fill exactly that much (it PANICs otherwise).
 */
static void
CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
					XLogRecPtr StartPos, XLogRecPtr EndPos)
{
	char	   *currpos;
	int			freespace;
	int			written;
	XLogRecPtr	CurrPos;
	XLogPageHeader pagehdr;

	/*
	 * Get a pointer to the right place in the right WAL buffer to start
	 * inserting to.
	 */
	CurrPos = StartPos;
	currpos = GetXLogBuffer(CurrPos);
	freespace = INSERT_FREESPACE(CurrPos);

	/*
	 * there should be enough space for at least the first field (xl_tot_len)
	 * on this page.
	 */
	Assert(freespace >= sizeof(uint32));

	/* Copy record data */
	written = 0;
	while (rdata != NULL)
	{
		char	   *rdata_data = rdata->data;
		int			rdata_len = rdata->len;

		while (rdata_len > freespace)
		{
			/*
			 * Write what fits on this page, and continue on the next page.
			 */
			Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
			memcpy(currpos, rdata_data, freespace);
			rdata_data += freespace;
			rdata_len -= freespace;
			written += freespace;
			CurrPos += freespace;

			/*
			 * Get pointer to beginning of next page, and set the xlp_rem_len
			 * in the page header. Set XLP_FIRST_IS_CONTRECORD.
			 *
			 * It's safe to set the contrecord flag and xlp_rem_len without a
			 * lock on the page. All the other flags were already set when the
			 * page was initialized, in AdvanceXLInsertBuffer, and we're the
			 * only backend that needs to set the contrecord flag.
			 */
			currpos = GetXLogBuffer(CurrPos);
			pagehdr = (XLogPageHeader) currpos;
			pagehdr->xlp_rem_len = write_len - written;
			pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;

			/* skip over the page header (long variant on a segment's first page) */
			if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
			{
				CurrPos += SizeOfXLogLongPHD;
				currpos += SizeOfXLogLongPHD;
			}
			else
			{
				CurrPos += SizeOfXLogShortPHD;
				currpos += SizeOfXLogShortPHD;
			}
			freespace = INSERT_FREESPACE(CurrPos);
		}

		/* the remainder of this chunk fits entirely on the current page */
		Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
		memcpy(currpos, rdata_data, rdata_len);
		currpos += rdata_len;
		CurrPos += rdata_len;
		freespace -= rdata_len;
		written += rdata_len;

		rdata = rdata->next;
	}
	Assert(written == write_len);

	/*
	 * If this was an xlog-switch, it's not enough to write the switch record,
	 * we also have to consume all the remaining space in the WAL segment.  We
	 * have already reserved that space, but we need to actually fill it.
	 */
	if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
	{
		/* An xlog-switch record doesn't contain any data besides the header */
		Assert(write_len == SizeOfXLogRecord);

		/* Assert that we did reserve the right amount of space */
		Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);

		/* Use up all the remaining space on the current page */
		CurrPos += freespace;

		/*
		 * Cause all remaining pages in the segment to be flushed, leaving the
		 * XLog position where it should be, at the start of the next segment.
		 * We do this one page at a time, to make sure we don't deadlock
		 * against ourselves if wal_buffers < wal_segment_size.
		 */
		while (CurrPos < EndPos)
		{
			/*
			 * The minimal actions to flush the page would be to call
			 * WALInsertLockUpdateInsertingAt(CurrPos) followed by
			 * AdvanceXLInsertBuffer(...).  The page would be left initialized
			 * mostly to zeros, except for the page header (always the short
			 * variant, as this is never a segment's first page).
			 *
			 * The large vistas of zeros are good for compressibility, but the
			 * headers interrupting them every XLOG_BLCKSZ (with values that
			 * differ from page to page) are not.  The effect varies with
			 * compression tool, but bzip2 for instance compresses about an
			 * order of magnitude worse if those headers are left in place.
			 *
			 * Rather than complicating AdvanceXLInsertBuffer itself (which is
			 * called in heavily-loaded circumstances as well as this lightly-
			 * loaded one) with variant behavior, we just use GetXLogBuffer
			 * (which itself calls the two methods we need) to get the pointer
			 * and zero most of the page.  Then we just zero the page header.
			 */
			currpos = GetXLogBuffer(CurrPos);
			MemSet(currpos, 0, SizeOfXLogShortPHD);

			CurrPos += XLOG_BLCKSZ;
		}
	}
	else
	{
		/* Align the end position, so that the next record starts aligned */
		CurrPos = MAXALIGN64(CurrPos);
	}

	if (CurrPos != EndPos)
		elog(PANIC, "space reserved for WAL record does not match what was written");
}
1669
1670 /*
1671 * Acquire a WAL insertion lock, for inserting to WAL.
1672 */
1673 static void
WALInsertLockAcquire(void)1674 WALInsertLockAcquire(void)
1675 {
1676 bool immed;
1677
1678 /*
1679 * It doesn't matter which of the WAL insertion locks we acquire, so try
1680 * the one we used last time. If the system isn't particularly busy, it's
1681 * a good bet that it's still available, and it's good to have some
1682 * affinity to a particular lock so that you don't unnecessarily bounce
1683 * cache lines between processes when there's no contention.
1684 *
1685 * If this is the first time through in this backend, pick a lock
1686 * (semi-)randomly. This allows the locks to be used evenly if you have a
1687 * lot of very short connections.
1688 */
1689 static int lockToTry = -1;
1690
1691 if (lockToTry == -1)
1692 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1693 MyLockNo = lockToTry;
1694
1695 /*
1696 * The insertingAt value is initially set to 0, as we don't know our
1697 * insert location yet.
1698 */
1699 immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1700 if (!immed)
1701 {
1702 /*
1703 * If we couldn't get the lock immediately, try another lock next
1704 * time. On a system with more insertion locks than concurrent
1705 * inserters, this causes all the inserters to eventually migrate to a
1706 * lock that no-one else is using. On a system with more inserters
1707 * than locks, it still helps to distribute the inserters evenly
1708 * across the locks.
1709 */
1710 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1711 }
1712 }
1713
1714 /*
1715 * Acquire all WAL insertion locks, to prevent other backends from inserting
1716 * to WAL.
1717 */
1718 static void
WALInsertLockAcquireExclusive(void)1719 WALInsertLockAcquireExclusive(void)
1720 {
1721 int i;
1722
1723 /*
1724 * When holding all the locks, all but the last lock's insertingAt
1725 * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1726 * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1727 */
1728 for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1729 {
1730 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1731 LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1732 &WALInsertLocks[i].l.insertingAt,
1733 PG_UINT64_MAX);
1734 }
1735 /* Variable value reset to 0 at release */
1736 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1737
1738 holdingAllLocks = true;
1739 }
1740
1741 /*
1742 * Release our insertion lock (or locks, if we're holding them all).
1743 *
1744 * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1745 * next time the lock is acquired.
1746 */
1747 static void
WALInsertLockRelease(void)1748 WALInsertLockRelease(void)
1749 {
1750 if (holdingAllLocks)
1751 {
1752 int i;
1753
1754 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1755 LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1756 &WALInsertLocks[i].l.insertingAt,
1757 0);
1758
1759 holdingAllLocks = false;
1760 }
1761 else
1762 {
1763 LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1764 &WALInsertLocks[MyLockNo].l.insertingAt,
1765 0);
1766 }
1767 }
1768
1769 /*
1770 * Update our insertingAt value, to let others know that we've finished
1771 * inserting up to that point.
1772 */
1773 static void
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)1774 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1775 {
1776 if (holdingAllLocks)
1777 {
1778 /*
1779 * We use the last lock to mark our actual position, see comments in
1780 * WALInsertLockAcquireExclusive.
1781 */
1782 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1783 &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1784 insertingAt);
1785 }
1786 else
1787 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1788 &WALInsertLocks[MyLockNo].l.insertingAt,
1789 insertingAt);
1790 }
1791
1792 /*
1793 * Wait for any WAL insertions < upto to finish.
1794 *
1795 * Returns the location of the oldest insertion that is still in-progress.
1796 * Any WAL prior to that point has been fully copied into WAL buffers, and
1797 * can be flushed out to disk. Because this waits for any insertions older
1798 * than 'upto' to finish, the return value is always >= 'upto'.
1799 *
1800 * Note: When you are about to write out WAL, you must call this function
1801 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1802 * need to wait for an insertion to finish (or at least advance to next
1803 * uninitialized page), and the inserter might need to evict an old WAL buffer
1804 * to make room for a new one, which in turn requires WALWriteLock.
1805 */
1806 static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)1807 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1808 {
1809 uint64 bytepos;
1810 XLogRecPtr reservedUpto;
1811 XLogRecPtr finishedUpto;
1812 XLogCtlInsert *Insert = &XLogCtl->Insert;
1813 int i;
1814
1815 if (MyProc == NULL)
1816 elog(PANIC, "cannot wait without a PGPROC structure");
1817
1818 /* Read the current insert position */
1819 SpinLockAcquire(&Insert->insertpos_lck);
1820 bytepos = Insert->CurrBytePos;
1821 SpinLockRelease(&Insert->insertpos_lck);
1822 reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1823
1824 /*
1825 * No-one should request to flush a piece of WAL that hasn't even been
1826 * reserved yet. However, it can happen if there is a block with a bogus
1827 * LSN on disk, for example. XLogFlush checks for that situation and
1828 * complains, but only after the flush. Here we just assume that to mean
1829 * that all WAL that has been reserved needs to be finished. In this
1830 * corner-case, the return value can be smaller than 'upto' argument.
1831 */
1832 if (upto > reservedUpto)
1833 {
1834 ereport(LOG,
1835 (errmsg("request to flush past end of generated WAL; request %X/%X, current position %X/%X",
1836 LSN_FORMAT_ARGS(upto), LSN_FORMAT_ARGS(reservedUpto))));
1837 upto = reservedUpto;
1838 }
1839
1840 /*
1841 * Loop through all the locks, sleeping on any in-progress insert older
1842 * than 'upto'.
1843 *
1844 * finishedUpto is our return value, indicating the point upto which all
1845 * the WAL insertions have been finished. Initialize it to the head of
1846 * reserved WAL, and as we iterate through the insertion locks, back it
1847 * out for any insertion that's still in progress.
1848 */
1849 finishedUpto = reservedUpto;
1850 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1851 {
1852 XLogRecPtr insertingat = InvalidXLogRecPtr;
1853
1854 do
1855 {
1856 /*
1857 * See if this insertion is in progress. LWLockWaitForVar will
1858 * wait for the lock to be released, or for the 'value' to be set
1859 * by a LWLockUpdateVar call. When a lock is initially acquired,
1860 * its value is 0 (InvalidXLogRecPtr), which means that we don't
1861 * know where it's inserting yet. We will have to wait for it. If
1862 * it's a small insertion, the record will most likely fit on the
1863 * same page and the inserter will release the lock without ever
1864 * calling LWLockUpdateVar. But if it has to sleep, it will
1865 * advertise the insertion point with LWLockUpdateVar before
1866 * sleeping.
1867 */
1868 if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1869 &WALInsertLocks[i].l.insertingAt,
1870 insertingat, &insertingat))
1871 {
1872 /* the lock was free, so no insertion in progress */
1873 insertingat = InvalidXLogRecPtr;
1874 break;
1875 }
1876
1877 /*
1878 * This insertion is still in progress. Have to wait, unless the
1879 * inserter has proceeded past 'upto'.
1880 */
1881 } while (insertingat < upto);
1882
1883 if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1884 finishedUpto = insertingat;
1885 }
1886 return finishedUpto;
1887 }
1888
1889 /*
1890 * Get a pointer to the right location in the WAL buffer containing the
1891 * given XLogRecPtr.
1892 *
1893 * If the page is not initialized yet, it is initialized. That might require
1894 * evicting an old dirty buffer from the buffer cache, which means I/O.
1895 *
1896 * The caller must ensure that the page containing the requested location
1897 * isn't evicted yet, and won't be evicted. The way to ensure that is to
1898 * hold onto a WAL insertion lock with the insertingAt position set to
1899 * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1900 * to evict an old page from the buffer. (This means that once you call
1901 * GetXLogBuffer() with a given 'ptr', you must not access anything before
1902 * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1903 * later, because older buffers might be recycled already)
1904 */
1905 static char *
GetXLogBuffer(XLogRecPtr ptr)1906 GetXLogBuffer(XLogRecPtr ptr)
1907 {
1908 int idx;
1909 XLogRecPtr endptr;
1910 static uint64 cachedPage = 0;
1911 static char *cachedPos = NULL;
1912 XLogRecPtr expectedEndPtr;
1913
1914 /*
1915 * Fast path for the common case that we need to access again the same
1916 * page as last time.
1917 */
1918 if (ptr / XLOG_BLCKSZ == cachedPage)
1919 {
1920 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1921 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1922 return cachedPos + ptr % XLOG_BLCKSZ;
1923 }
1924
1925 /*
1926 * The XLog buffer cache is organized so that a page is always loaded to a
1927 * particular buffer. That way we can easily calculate the buffer a given
1928 * page must be loaded into, from the XLogRecPtr alone.
1929 */
1930 idx = XLogRecPtrToBufIdx(ptr);
1931
1932 /*
1933 * See what page is loaded in the buffer at the moment. It could be the
1934 * page we're looking for, or something older. It can't be anything newer
1935 * - that would imply the page we're looking for has already been written
1936 * out to disk and evicted, and the caller is responsible for making sure
1937 * that doesn't happen.
1938 *
1939 * However, we don't hold a lock while we read the value. If someone has
1940 * just initialized the page, it's possible that we get a "torn read" of
1941 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1942 * that case we will see a bogus value. That's ok, we'll grab the mapping
1943 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1944 * the page we're looking for. But it means that when we do this unlocked
1945 * read, we might see a value that appears to be ahead of the page we're
1946 * looking for. Don't PANIC on that, until we've verified the value while
1947 * holding the lock.
1948 */
1949 expectedEndPtr = ptr;
1950 expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1951
1952 endptr = XLogCtl->xlblocks[idx];
1953 if (expectedEndPtr != endptr)
1954 {
1955 XLogRecPtr initializedUpto;
1956
1957 /*
1958 * Before calling AdvanceXLInsertBuffer(), which can block, let others
1959 * know how far we're finished with inserting the record.
1960 *
1961 * NB: If 'ptr' points to just after the page header, advertise a
1962 * position at the beginning of the page rather than 'ptr' itself. If
1963 * there are no other insertions running, someone might try to flush
1964 * up to our advertised location. If we advertised a position after
1965 * the page header, someone might try to flush the page header, even
1966 * though page might actually not be initialized yet. As the first
1967 * inserter on the page, we are effectively responsible for making
1968 * sure that it's initialized, before we let insertingAt to move past
1969 * the page header.
1970 */
1971 if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1972 XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
1973 initializedUpto = ptr - SizeOfXLogShortPHD;
1974 else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1975 XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
1976 initializedUpto = ptr - SizeOfXLogLongPHD;
1977 else
1978 initializedUpto = ptr;
1979
1980 WALInsertLockUpdateInsertingAt(initializedUpto);
1981
1982 AdvanceXLInsertBuffer(ptr, false);
1983 endptr = XLogCtl->xlblocks[idx];
1984
1985 if (expectedEndPtr != endptr)
1986 elog(PANIC, "could not find WAL buffer for %X/%X",
1987 LSN_FORMAT_ARGS(ptr));
1988 }
1989 else
1990 {
1991 /*
1992 * Make sure the initialization of the page is visible to us, and
1993 * won't arrive later to overwrite the WAL data we write on the page.
1994 */
1995 pg_memory_barrier();
1996 }
1997
1998 /*
1999 * Found the buffer holding this page. Return a pointer to the right
2000 * offset within the page.
2001 */
2002 cachedPage = ptr / XLOG_BLCKSZ;
2003 cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
2004
2005 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
2006 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
2007
2008 return cachedPos + ptr % XLOG_BLCKSZ;
2009 }
2010
2011 /*
2012 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2013 * is the position starting from the beginning of WAL, excluding all WAL
2014 * page headers.
2015 */
2016 static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)2017 XLogBytePosToRecPtr(uint64 bytepos)
2018 {
2019 uint64 fullsegs;
2020 uint64 fullpages;
2021 uint64 bytesleft;
2022 uint32 seg_offset;
2023 XLogRecPtr result;
2024
2025 fullsegs = bytepos / UsableBytesInSegment;
2026 bytesleft = bytepos % UsableBytesInSegment;
2027
2028 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2029 {
2030 /* fits on first page of segment */
2031 seg_offset = bytesleft + SizeOfXLogLongPHD;
2032 }
2033 else
2034 {
2035 /* account for the first page on segment with long header */
2036 seg_offset = XLOG_BLCKSZ;
2037 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2038
2039 fullpages = bytesleft / UsableBytesInPage;
2040 bytesleft = bytesleft % UsableBytesInPage;
2041
2042 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2043 }
2044
2045 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2046
2047 return result;
2048 }
2049
2050 /*
2051 * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2052 * returns a pointer to the beginning of the page (ie. before page header),
2053 * not to where the first xlog record on that page would go to. This is used
2054 * when converting a pointer to the end of a record.
2055 */
2056 static XLogRecPtr
XLogBytePosToEndRecPtr(uint64 bytepos)2057 XLogBytePosToEndRecPtr(uint64 bytepos)
2058 {
2059 uint64 fullsegs;
2060 uint64 fullpages;
2061 uint64 bytesleft;
2062 uint32 seg_offset;
2063 XLogRecPtr result;
2064
2065 fullsegs = bytepos / UsableBytesInSegment;
2066 bytesleft = bytepos % UsableBytesInSegment;
2067
2068 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2069 {
2070 /* fits on first page of segment */
2071 if (bytesleft == 0)
2072 seg_offset = 0;
2073 else
2074 seg_offset = bytesleft + SizeOfXLogLongPHD;
2075 }
2076 else
2077 {
2078 /* account for the first page on segment with long header */
2079 seg_offset = XLOG_BLCKSZ;
2080 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2081
2082 fullpages = bytesleft / UsableBytesInPage;
2083 bytesleft = bytesleft % UsableBytesInPage;
2084
2085 if (bytesleft == 0)
2086 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2087 else
2088 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2089 }
2090
2091 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2092
2093 return result;
2094 }
2095
2096 /*
2097 * Convert an XLogRecPtr to a "usable byte position".
2098 */
2099 static uint64
XLogRecPtrToBytePos(XLogRecPtr ptr)2100 XLogRecPtrToBytePos(XLogRecPtr ptr)
2101 {
2102 uint64 fullsegs;
2103 uint32 fullpages;
2104 uint32 offset;
2105 uint64 result;
2106
2107 XLByteToSeg(ptr, fullsegs, wal_segment_size);
2108
2109 fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
2110 offset = ptr % XLOG_BLCKSZ;
2111
2112 if (fullpages == 0)
2113 {
2114 result = fullsegs * UsableBytesInSegment;
2115 if (offset > 0)
2116 {
2117 Assert(offset >= SizeOfXLogLongPHD);
2118 result += offset - SizeOfXLogLongPHD;
2119 }
2120 }
2121 else
2122 {
2123 result = fullsegs * UsableBytesInSegment +
2124 (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2125 (fullpages - 1) * UsableBytesInPage; /* full pages */
2126 if (offset > 0)
2127 {
2128 Assert(offset >= SizeOfXLogShortPHD);
2129 result += offset - SizeOfXLogShortPHD;
2130 }
2131 }
2132
2133 return result;
2134 }
2135
2136 /*
2137 * Initialize XLOG buffers, writing out old buffers if they still contain
2138 * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2139 * true, initialize as many pages as we can without having to write out
2140 * unwritten data. Any new pages are initialized to zeros, with pages headers
2141 * initialized properly.
2142 */
2143 static void
AdvanceXLInsertBuffer(XLogRecPtr upto,bool opportunistic)2144 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2145 {
2146 XLogCtlInsert *Insert = &XLogCtl->Insert;
2147 int nextidx;
2148 XLogRecPtr OldPageRqstPtr;
2149 XLogwrtRqst WriteRqst;
2150 XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
2151 XLogRecPtr NewPageBeginPtr;
2152 XLogPageHeader NewPage;
2153 int npages = 0;
2154
2155 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2156
2157 /*
2158 * Now that we have the lock, check if someone initialized the page
2159 * already.
2160 */
2161 while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2162 {
2163 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2164
2165 /*
2166 * Get ending-offset of the buffer page we need to replace (this may
2167 * be zero if the buffer hasn't been used yet). Fall through if it's
2168 * already written out.
2169 */
2170 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2171 if (LogwrtResult.Write < OldPageRqstPtr)
2172 {
2173 /*
2174 * Nope, got work to do. If we just want to pre-initialize as much
2175 * as we can without flushing, give up now.
2176 */
2177 if (opportunistic)
2178 break;
2179
2180 /* Before waiting, get info_lck and update LogwrtResult */
2181 SpinLockAcquire(&XLogCtl->info_lck);
2182 if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2183 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2184 LogwrtResult = XLogCtl->LogwrtResult;
2185 SpinLockRelease(&XLogCtl->info_lck);
2186
2187 /*
2188 * Now that we have an up-to-date LogwrtResult value, see if we
2189 * still need to write it or if someone else already did.
2190 */
2191 if (LogwrtResult.Write < OldPageRqstPtr)
2192 {
2193 /*
2194 * Must acquire write lock. Release WALBufMappingLock first,
2195 * to make sure that all insertions that we need to wait for
2196 * can finish (up to this same position). Otherwise we risk
2197 * deadlock.
2198 */
2199 LWLockRelease(WALBufMappingLock);
2200
2201 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2202
2203 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2204
2205 LogwrtResult = XLogCtl->LogwrtResult;
2206 if (LogwrtResult.Write >= OldPageRqstPtr)
2207 {
2208 /* OK, someone wrote it already */
2209 LWLockRelease(WALWriteLock);
2210 }
2211 else
2212 {
2213 /* Have to write it ourselves */
2214 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2215 WriteRqst.Write = OldPageRqstPtr;
2216 WriteRqst.Flush = 0;
2217 XLogWrite(WriteRqst, false);
2218 LWLockRelease(WALWriteLock);
2219 WalStats.m_wal_buffers_full++;
2220 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2221 }
2222 /* Re-acquire WALBufMappingLock and retry */
2223 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2224 continue;
2225 }
2226 }
2227
2228 /*
2229 * Now the next buffer slot is free and we can set it up to be the
2230 * next output page.
2231 */
2232 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2233 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2234
2235 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2236
2237 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2238
2239 /*
2240 * Be sure to re-zero the buffer so that bytes beyond what we've
2241 * written will look like zeroes and not valid XLOG records...
2242 */
2243 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2244
2245 /*
2246 * Fill the new page's header
2247 */
2248 NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2249
2250 /* NewPage->xlp_info = 0; */ /* done by memset */
2251 NewPage->xlp_tli = ThisTimeLineID;
2252 NewPage->xlp_pageaddr = NewPageBeginPtr;
2253
2254 /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2255
2256 /*
2257 * If online backup is not in progress, mark the header to indicate
2258 * that WAL records beginning in this page have removable backup
2259 * blocks. This allows the WAL archiver to know whether it is safe to
2260 * compress archived WAL data by transforming full-block records into
2261 * the non-full-block format. It is sufficient to record this at the
2262 * page level because we force a page switch (in fact a segment
2263 * switch) when starting a backup, so the flag will be off before any
2264 * records can be written during the backup. At the end of a backup,
2265 * the last page will be marked as all unsafe when perhaps only part
2266 * is unsafe, but at worst the archiver would miss the opportunity to
2267 * compress a few records.
2268 */
2269 if (!Insert->forcePageWrites)
2270 NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2271
2272 /*
2273 * If a record was found to be broken at the end of recovery, and
2274 * we're going to write on the page where its first contrecord was
2275 * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page
2276 * header. See CreateOverwriteContrecordRecord().
2277 */
2278 if (missingContrecPtr == NewPageBeginPtr)
2279 {
2280 NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
2281 missingContrecPtr = InvalidXLogRecPtr;
2282 }
2283
2284 /*
2285 * If first page of an XLOG segment file, make it a long header.
2286 */
2287 if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
2288 {
2289 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2290
2291 NewLongPage->xlp_sysid = ControlFile->system_identifier;
2292 NewLongPage->xlp_seg_size = wal_segment_size;
2293 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2294 NewPage->xlp_info |= XLP_LONG_HEADER;
2295 }
2296
2297 /*
2298 * Make sure the initialization of the page becomes visible to others
2299 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2300 * holding a lock.
2301 */
2302 pg_write_barrier();
2303
2304 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2305
2306 XLogCtl->InitializedUpTo = NewPageEndPtr;
2307
2308 npages++;
2309 }
2310 LWLockRelease(WALBufMappingLock);
2311
2312 #ifdef WAL_DEBUG
2313 if (XLOG_DEBUG && npages > 0)
2314 {
2315 elog(DEBUG1, "initialized %d pages, up to %X/%X",
2316 npages, LSN_FORMAT_ARGS(NewPageEndPtr));
2317 }
2318 #endif
2319 }
2320
2321 /*
2322 * Calculate CheckPointSegments based on max_wal_size_mb and
2323 * checkpoint_completion_target.
2324 */
2325 static void
CalculateCheckpointSegments(void)2326 CalculateCheckpointSegments(void)
2327 {
2328 double target;
2329
2330 /*-------
2331 * Calculate the distance at which to trigger a checkpoint, to avoid
2332 * exceeding max_wal_size_mb. This is based on two assumptions:
2333 *
2334 * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2335 * WAL for two checkpoint cycles to allow us to recover from the
2336 * secondary checkpoint if the first checkpoint failed, though we
2337 * only did this on the primary anyway, not on standby. Keeping just
2338 * one checkpoint simplifies processing and reduces disk space in
2339 * many smaller databases.)
2340 * b) during checkpoint, we consume checkpoint_completion_target *
2341 * number of segments consumed between checkpoints.
2342 *-------
2343 */
2344 target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2345 (1.0 + CheckPointCompletionTarget);
2346
2347 /* round down */
2348 CheckPointSegments = (int) target;
2349
2350 if (CheckPointSegments < 1)
2351 CheckPointSegments = 1;
2352 }
2353
/*
 * GUC assign hook for max_wal_size: store the new value, then recompute the
 * derived CheckPointSegments setting, which depends on it.
 */
void
assign_max_wal_size(int newval, void *extra)
{
	max_wal_size_mb = newval;
	CalculateCheckpointSegments();
}
2360
/*
 * GUC assign hook for checkpoint_completion_target: store the new value,
 * then recompute the derived CheckPointSegments setting, which depends on it.
 */
void
assign_checkpoint_completion_target(double newval, void *extra)
{
	CheckPointCompletionTarget = newval;
	CalculateCheckpointSegments();
}
2367
2368 /*
2369 * At a checkpoint, how many WAL segments to recycle as preallocated future
2370 * XLOG segments? Returns the highest segment that should be preallocated.
2371 */
2372 static XLogSegNo
XLOGfileslop(XLogRecPtr lastredoptr)2373 XLOGfileslop(XLogRecPtr lastredoptr)
2374 {
2375 XLogSegNo minSegNo;
2376 XLogSegNo maxSegNo;
2377 double distance;
2378 XLogSegNo recycleSegNo;
2379
2380 /*
2381 * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2382 * correspond to. Always recycle enough segments to meet the minimum, and
2383 * remove enough segments to stay below the maximum.
2384 */
2385 minSegNo = lastredoptr / wal_segment_size +
2386 ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2387 maxSegNo = lastredoptr / wal_segment_size +
2388 ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2389
2390 /*
2391 * Between those limits, recycle enough segments to get us through to the
2392 * estimated end of next checkpoint.
2393 *
2394 * To estimate where the next checkpoint will finish, assume that the
2395 * system runs steadily consuming CheckPointDistanceEstimate bytes between
2396 * every checkpoint.
2397 */
2398 distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2399 /* add 10% for good measure. */
2400 distance *= 1.10;
2401
2402 recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
2403 wal_segment_size);
2404
2405 if (recycleSegNo < minSegNo)
2406 recycleSegNo = minSegNo;
2407 if (recycleSegNo > maxSegNo)
2408 recycleSegNo = maxSegNo;
2409
2410 return recycleSegNo;
2411 }
2412
2413 /*
2414 * Check whether we've consumed enough xlog space that a checkpoint is needed.
2415 *
2416 * new_segno indicates a log file that has just been filled up (or read
2417 * during recovery). We measure the distance from RedoRecPtr to new_segno
2418 * and see if that exceeds CheckPointSegments.
2419 *
2420 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2421 */
2422 static bool
XLogCheckpointNeeded(XLogSegNo new_segno)2423 XLogCheckpointNeeded(XLogSegNo new_segno)
2424 {
2425 XLogSegNo old_segno;
2426
2427 XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2428
2429 if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2430 return true;
2431 return false;
2432 }
2433
2434 /*
2435 * Write and/or fsync the log at least as far as WriteRqst indicates.
2436 *
2437 * If flexible == true, we don't have to write as far as WriteRqst, but
2438 * may stop at any convenient boundary (such as a cache or logfile boundary).
2439 * This option allows us to avoid uselessly issuing multiple writes when a
2440 * single one would do.
2441 *
2442 * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2443 * must be called before grabbing the lock, to make sure the data is ready to
2444 * write.
2445 */
2446 static void
XLogWrite(XLogwrtRqst WriteRqst,bool flexible)2447 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2448 {
2449 bool ispartialpage;
2450 bool last_iteration;
2451 bool finishing_seg;
2452 bool use_existent;
2453 int curridx;
2454 int npages;
2455 int startidx;
2456 uint32 startoffset;
2457
2458 /* We should always be inside a critical section here */
2459 Assert(CritSectionCount > 0);
2460
2461 /*
2462 * Update local LogwrtResult (caller probably did this already, but...)
2463 */
2464 LogwrtResult = XLogCtl->LogwrtResult;
2465
2466 /*
2467 * Since successive pages in the xlog cache are consecutively allocated,
2468 * we can usually gather multiple pages together and issue just one
2469 * write() call. npages is the number of pages we have determined can be
2470 * written together; startidx is the cache block index of the first one,
2471 * and startoffset is the file offset at which it should go. The latter
2472 * two variables are only valid when npages > 0, but we must initialize
2473 * all of them to keep the compiler quiet.
2474 */
2475 npages = 0;
2476 startidx = 0;
2477 startoffset = 0;
2478
2479 /*
2480 * Within the loop, curridx is the cache block index of the page to
2481 * consider writing. Begin at the buffer containing the next unwritten
2482 * page, or last partially written page.
2483 */
2484 curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2485
2486 while (LogwrtResult.Write < WriteRqst.Write)
2487 {
2488 /*
2489 * Make sure we're not ahead of the insert process. This could happen
2490 * if we're passed a bogus WriteRqst.Write that is past the end of the
2491 * last page that's been initialized by AdvanceXLInsertBuffer.
2492 */
2493 XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2494
2495 if (LogwrtResult.Write >= EndPtr)
2496 elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2497 LSN_FORMAT_ARGS(LogwrtResult.Write),
2498 LSN_FORMAT_ARGS(EndPtr));
2499
2500 /* Advance LogwrtResult.Write to end of current buffer page */
2501 LogwrtResult.Write = EndPtr;
2502 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2503
2504 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2505 wal_segment_size))
2506 {
2507 /*
2508 * Switch to new logfile segment. We cannot have any pending
2509 * pages here (since we dump what we have at segment end).
2510 */
2511 Assert(npages == 0);
2512 if (openLogFile >= 0)
2513 XLogFileClose();
2514 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2515 wal_segment_size);
2516
2517 /* create/use new log file */
2518 use_existent = true;
2519 openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2520 ReserveExternalFD();
2521 }
2522
2523 /* Make sure we have the current logfile open */
2524 if (openLogFile < 0)
2525 {
2526 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2527 wal_segment_size);
2528 openLogFile = XLogFileOpen(openLogSegNo);
2529 ReserveExternalFD();
2530 }
2531
2532 /* Add current page to the set of pending pages-to-dump */
2533 if (npages == 0)
2534 {
2535 /* first of group */
2536 startidx = curridx;
2537 startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
2538 wal_segment_size);
2539 }
2540 npages++;
2541
2542 /*
2543 * Dump the set if this will be the last loop iteration, or if we are
2544 * at the last page of the cache area (since the next page won't be
2545 * contiguous in memory), or if we are at the end of the logfile
2546 * segment.
2547 */
2548 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2549
2550 finishing_seg = !ispartialpage &&
2551 (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
2552
2553 if (last_iteration ||
2554 curridx == XLogCtl->XLogCacheBlck ||
2555 finishing_seg)
2556 {
2557 char *from;
2558 Size nbytes;
2559 Size nleft;
2560 int written;
2561 instr_time start;
2562
2563 /* OK to write the page(s) */
2564 from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2565 nbytes = npages * (Size) XLOG_BLCKSZ;
2566 nleft = nbytes;
2567 do
2568 {
2569 errno = 0;
2570
2571 /* Measure I/O timing to write WAL data */
2572 if (track_wal_io_timing)
2573 INSTR_TIME_SET_CURRENT(start);
2574
2575 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
2576 written = pg_pwrite(openLogFile, from, nleft, startoffset);
2577 pgstat_report_wait_end();
2578
2579 /*
2580 * Increment the I/O timing and the number of times WAL data
2581 * were written out to disk.
2582 */
2583 if (track_wal_io_timing)
2584 {
2585 instr_time duration;
2586
2587 INSTR_TIME_SET_CURRENT(duration);
2588 INSTR_TIME_SUBTRACT(duration, start);
2589 WalStats.m_wal_write_time += INSTR_TIME_GET_MICROSEC(duration);
2590 }
2591
2592 WalStats.m_wal_write++;
2593
2594 if (written <= 0)
2595 {
2596 char xlogfname[MAXFNAMELEN];
2597 int save_errno;
2598
2599 if (errno == EINTR)
2600 continue;
2601
2602 save_errno = errno;
2603 XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
2604 wal_segment_size);
2605 errno = save_errno;
2606 ereport(PANIC,
2607 (errcode_for_file_access(),
2608 errmsg("could not write to log file %s "
2609 "at offset %u, length %zu: %m",
2610 xlogfname, startoffset, nleft)));
2611 }
2612 nleft -= written;
2613 from += written;
2614 startoffset += written;
2615 } while (nleft > 0);
2616
2617 npages = 0;
2618
2619 /*
2620 * If we just wrote the whole last page of a logfile segment,
2621 * fsync the segment immediately. This avoids having to go back
2622 * and re-open prior segments when an fsync request comes along
2623 * later. Doing it here ensures that one and only one backend will
2624 * perform this fsync.
2625 *
2626 * This is also the right place to notify the Archiver that the
2627 * segment is ready to copy to archival storage, and to update the
2628 * timer for archive_timeout, and to signal for a checkpoint if
2629 * too many logfile segments have been used since the last
2630 * checkpoint.
2631 */
2632 if (finishing_seg)
2633 {
2634 issue_xlog_fsync(openLogFile, openLogSegNo);
2635
2636 /* signal that we need to wakeup walsenders later */
2637 WalSndWakeupRequest();
2638
2639 LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
2640
2641 if (XLogArchivingActive())
2642 XLogArchiveNotifySeg(openLogSegNo);
2643
2644 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2645 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2646
2647 /*
2648 * Request a checkpoint if we've consumed too much xlog since
2649 * the last one. For speed, we first check using the local
2650 * copy of RedoRecPtr, which might be out of date; if it looks
2651 * like a checkpoint is needed, forcibly update RedoRecPtr and
2652 * recheck.
2653 */
2654 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2655 {
2656 (void) GetRedoRecPtr();
2657 if (XLogCheckpointNeeded(openLogSegNo))
2658 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2659 }
2660 }
2661 }
2662
2663 if (ispartialpage)
2664 {
2665 /* Only asked to write a partial page */
2666 LogwrtResult.Write = WriteRqst.Write;
2667 break;
2668 }
2669 curridx = NextBufIdx(curridx);
2670
2671 /* If flexible, break out of loop as soon as we wrote something */
2672 if (flexible && npages == 0)
2673 break;
2674 }
2675
2676 Assert(npages == 0);
2677
2678 /*
2679 * If asked to flush, do so
2680 */
2681 if (LogwrtResult.Flush < WriteRqst.Flush &&
2682 LogwrtResult.Flush < LogwrtResult.Write)
2683
2684 {
2685 /*
2686 * Could get here without iterating above loop, in which case we might
2687 * have no open file or the wrong one. However, we do not need to
2688 * fsync more than one file.
2689 */
2690 if (sync_method != SYNC_METHOD_OPEN &&
2691 sync_method != SYNC_METHOD_OPEN_DSYNC)
2692 {
2693 if (openLogFile >= 0 &&
2694 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2695 wal_segment_size))
2696 XLogFileClose();
2697 if (openLogFile < 0)
2698 {
2699 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2700 wal_segment_size);
2701 openLogFile = XLogFileOpen(openLogSegNo);
2702 ReserveExternalFD();
2703 }
2704
2705 issue_xlog_fsync(openLogFile, openLogSegNo);
2706 }
2707
2708 /* signal that we need to wakeup walsenders later */
2709 WalSndWakeupRequest();
2710
2711 LogwrtResult.Flush = LogwrtResult.Write;
2712 }
2713
2714 /*
2715 * Update shared-memory status
2716 *
2717 * We make sure that the shared 'request' values do not fall behind the
2718 * 'result' values. This is not absolutely essential, but it saves some
2719 * code in a couple of places.
2720 */
2721 {
2722 SpinLockAcquire(&XLogCtl->info_lck);
2723 XLogCtl->LogwrtResult = LogwrtResult;
2724 if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2725 XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2726 if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2727 XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2728 SpinLockRelease(&XLogCtl->info_lck);
2729 }
2730 }
2731
2732 /*
2733 * Record the LSN for an asynchronous transaction commit/abort
2734 * and nudge the WALWriter if there is work for it to do.
2735 * (This should not be called for synchronous commits.)
2736 */
2737 void
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)2738 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2739 {
2740 XLogRecPtr WriteRqstPtr = asyncXactLSN;
2741 bool sleeping;
2742
2743 SpinLockAcquire(&XLogCtl->info_lck);
2744 LogwrtResult = XLogCtl->LogwrtResult;
2745 sleeping = XLogCtl->WalWriterSleeping;
2746 if (XLogCtl->asyncXactLSN < asyncXactLSN)
2747 XLogCtl->asyncXactLSN = asyncXactLSN;
2748 SpinLockRelease(&XLogCtl->info_lck);
2749
2750 /*
2751 * If the WALWriter is sleeping, we should kick it to make it come out of
2752 * low-power mode. Otherwise, determine whether there's a full page of
2753 * WAL available to write.
2754 */
2755 if (!sleeping)
2756 {
2757 /* back off to last completed page boundary */
2758 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2759
2760 /* if we have already flushed that far, we're done */
2761 if (WriteRqstPtr <= LogwrtResult.Flush)
2762 return;
2763 }
2764
2765 /*
2766 * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2767 * to come out of low-power mode so that this async commit will reach disk
2768 * within the expected amount of time.
2769 */
2770 if (ProcGlobal->walwriterLatch)
2771 SetLatch(ProcGlobal->walwriterLatch);
2772 }
2773
/*
 * Record the LSN up to which we can remove WAL because it's not required by
 * any replication slot.
 *
 * The value is stored in shared memory and read back via
 * XLogGetReplicationSlotMinimumLSN().
 */
void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
	/* info_lck protects this field, as for the other shared LSN fields */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->replicationSlotMinLSN = lsn;
	SpinLockRelease(&XLogCtl->info_lck);
}
2785
2786
2787 /*
2788 * Return the oldest LSN we must retain to satisfy the needs of some
2789 * replication slot.
2790 */
2791 static XLogRecPtr
XLogGetReplicationSlotMinimumLSN(void)2792 XLogGetReplicationSlotMinimumLSN(void)
2793 {
2794 XLogRecPtr retval;
2795
2796 SpinLockAcquire(&XLogCtl->info_lck);
2797 retval = XLogCtl->replicationSlotMinLSN;
2798 SpinLockRelease(&XLogCtl->info_lck);
2799
2800 return retval;
2801 }
2802
/*
 * Advance minRecoveryPoint in control file.
 *
 * If we crash during recovery, we must reach this point again before the
 * database is consistent.
 *
 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
 * is only updated if it's not already greater than or equal to 'lsn'.
 *
 * Caller must not hold ControlFileLock; it is acquired and released here.
 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Quick check using our local copy of the variable */
	if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
		return;

	/*
	 * An invalid minRecoveryPoint means that we need to recover all the WAL,
	 * i.e., we're doing crash recovery.  We never modify the control file's
	 * value in that case, so we can short-circuit future checks here too. The
	 * local values of minRecoveryPoint and minRecoveryPointTLI should not be
	 * updated until crash recovery finishes.  We only do this for the startup
	 * process as it should not update its own reference of minRecoveryPoint
	 * until it has finished crash recovery to make sure that all WAL
	 * available is replayed in this case.  This also saves from extra locks
	 * taken on the control file from the startup process.
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
	{
		updateMinRecoveryPoint = false;
		return;
	}

	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/*
	 * Update local copy: some other process may have advanced the control
	 * file's value since we last copied it, so re-read under the lock.
	 */
	minRecoveryPoint = ControlFile->minRecoveryPoint;
	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		updateMinRecoveryPoint = false;
	else if (force || minRecoveryPoint < lsn)
	{
		XLogRecPtr	newMinRecoveryPoint;
		TimeLineID	newMinRecoveryPointTLI;

		/*
		 * To avoid having to update the control file too often, we update it
		 * all the way to the last record being replayed, even though 'lsn'
		 * would suffice for correctness.  This also allows the 'force' case
		 * to not need a valid 'lsn' value.
		 *
		 * Another important reason for doing it this way is that the passed
		 * 'lsn' value could be bogus, i.e., past the end of available WAL, if
		 * the caller got it from a corrupted heap page.  Accepting such a
		 * value as the min recovery point would prevent us from coming up at
		 * all.  Instead, we just log a warning and continue with recovery.
		 * (See also the comments about corrupt LSNs in XLogFlush.)
		 */
		SpinLockAcquire(&XLogCtl->info_lck);
		newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
		newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
		SpinLockRelease(&XLogCtl->info_lck);

		if (!force && newMinRecoveryPoint < lsn)
			elog(WARNING,
				 "xlog min recovery request %X/%X is past current point %X/%X",
				 LSN_FORMAT_ARGS(lsn), LSN_FORMAT_ARGS(newMinRecoveryPoint));

		/* update control file, and our local copies, only if it advances */
		if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
		{
			ControlFile->minRecoveryPoint = newMinRecoveryPoint;
			ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
			UpdateControlFile();
			minRecoveryPoint = newMinRecoveryPoint;
			minRecoveryPointTLI = newMinRecoveryPointTLI;

			ereport(DEBUG2,
					(errmsg_internal("updated min recovery point to %X/%X on timeline %u",
									 LSN_FORMAT_ARGS(minRecoveryPoint),
									 newMinRecoveryPointTLI)));
		}
	}
	LWLockRelease(ControlFileLock);
}
2889
/*
 * Ensure that all XLOG data through the given position is flushed to disk.
 *
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
 * already held, and we try to avoid acquiring it if possible.
 *
 * During recovery we cannot write WAL, so the request is turned into a
 * minRecoveryPoint update instead.  If the flush request cannot be
 * satisfied (most likely a corrupt LSN past end of WAL), this reports
 * ERROR, not PANIC; see the comment at the bottom.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;
	XLogwrtRqst WriteRqst;

	/*
	 * During REDO, we are reading not writing WAL.  Therefore, instead of
	 * trying to flush the WAL, we should update minRecoveryPoint instead. We
	 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
	 * to act this way too, and because when it tries to write the
	 * end-of-recovery checkpoint, it should indeed flush.
	 */
	if (!XLogInsertAllowed())
	{
		UpdateMinRecoveryPoint(record, false);
		return;
	}

	/* Quick exit if already known flushed */
	if (record <= LogwrtResult.Flush)
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
			 LSN_FORMAT_ARGS(record),
			 LSN_FORMAT_ARGS(LogwrtResult.Write),
			 LSN_FORMAT_ARGS(LogwrtResult.Flush));
#endif

	START_CRIT_SECTION();

	/*
	 * Since fsync is usually a horribly expensive operation, we try to
	 * piggyback as much data as we can on each fsync: if we see any more data
	 * entered into the xlog buffer, we'll write and fsync that too, so that
	 * the final value of LogwrtResult.Flush is as large as possible. This
	 * gives us some chance of avoiding another fsync immediately after.
	 */

	/* initialize to given target; may increase below */
	WriteRqstPtr = record;

	/*
	 * Now wait until we get the write lock, or someone else does the flush
	 * for us.
	 */
	for (;;)
	{
		XLogRecPtr	insertpos;

		/* read LogwrtResult and update local state */
		SpinLockAcquire(&XLogCtl->info_lck);
		if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
			WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);

		/* done already? (someone else may have flushed for us) */
		if (record <= LogwrtResult.Flush)
			break;

		/*
		 * Before actually performing the write, wait for all in-flight
		 * insertions to the pages we're about to write to finish.
		 */
		insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);

		/*
		 * Try to get the write lock.  If we can't get it immediately, wait
		 * until it's released, and recheck if we still need to do the flush
		 * or if the backend that held the lock did it for us already.  This
		 * helps to maintain a good rate of group committing when the system
		 * is bottlenecked by the speed of fsyncing.
		 */
		if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
		{
			/*
			 * The lock is now free, but we didn't acquire it yet.  Before we
			 * do, loop back to check if someone else flushed the record for
			 * us already.
			 */
			continue;
		}

		/* Got the lock; recheck whether request is satisfied */
		LogwrtResult = XLogCtl->LogwrtResult;
		if (record <= LogwrtResult.Flush)
		{
			LWLockRelease(WALWriteLock);
			break;
		}

		/*
		 * Sleep before flush! By adding a delay here, we may give further
		 * backends the opportunity to join the backlog of group commit
		 * followers; this can significantly improve transaction throughput,
		 * at the risk of increasing transaction latency.
		 *
		 * We do not sleep if enableFsync is not turned on, nor if there are
		 * fewer than CommitSiblings other backends with active transactions.
		 */
		if (CommitDelay > 0 && enableFsync &&
			MinimumActiveBackends(CommitSiblings))
		{
			pg_usleep(CommitDelay);

			/*
			 * Re-check how far we can now flush the WAL. It's generally not
			 * safe to call WaitXLogInsertionsToFinish while holding
			 * WALWriteLock, because an in-progress insertion might need to
			 * also grab WALWriteLock to make progress. But we know that all
			 * the insertions up to insertpos have already finished, because
			 * that's what the earlier WaitXLogInsertionsToFinish() returned.
			 * We're only calling it again to allow insertpos to be moved
			 * further forward, not to actually wait for anyone.
			 */
			insertpos = WaitXLogInsertionsToFinish(insertpos);
		}

		/* try to write/flush later additions to XLOG as well */
		WriteRqst.Write = insertpos;
		WriteRqst.Flush = insertpos;

		XLogWrite(WriteRqst, false);

		LWLockRelease(WALWriteLock);
		/* done */
		break;
	}

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * If we still haven't flushed to the request point then we have a
	 * problem; most likely, the requested flush point is past end of XLOG.
	 * This has been seen to occur when a disk page has a corrupted LSN.
	 *
	 * Formerly we treated this as a PANIC condition, but that hurts the
	 * system's robustness rather than helping it: we do not want to take down
	 * the whole system due to corruption on one data page.  In particular, if
	 * the bad page is encountered again during recovery then we would be
	 * unable to restart the database at all!  (This scenario actually
	 * happened in the field several times with 7.1 releases.)  As of 8.4, bad
	 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
	 * the only time we can reach here during recovery is while flushing the
	 * end-of-recovery checkpoint record, and we don't expect that to have a
	 * bad LSN.
	 *
	 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
	 * since xact.c calls this routine inside a critical section.  However,
	 * calls from bufmgr.c are not within critical sections and so we will not
	 * force a restart for a bad LSN on a data page.
	 */
	if (LogwrtResult.Flush < record)
		elog(ERROR,
			 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
			 LSN_FORMAT_ARGS(record),
			 LSN_FORMAT_ARGS(LogwrtResult.Flush));
}
3060
/*
 * Write & flush xlog, but without specifying exactly where to.
 *
 * We normally write only completed blocks; but if there is nothing to do on
 * that basis, we check for unwritten async commits in the current incomplete
 * block, and write through the latest one of those.  Thus, if async commits
 * are not being used, we will write complete blocks only.
 *
 * If, based on the above, there's anything to write we do so immediately. But
 * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
 * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
 * more than wal_writer_flush_after unflushed blocks.
 *
 * We can guarantee that async commits reach disk after at most three
 * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
 * to write "flexibly", meaning it can stop at the end of the buffer ring;
 * this makes a difference only with very high load or long wal_writer_delay,
 * but imposes one extra cycle for the worst case for async commits.)
 *
 * This routine is invoked periodically by the background walwriter process.
 *
 * Returns true if there was any work to do, even if we skipped flushing due
 * to wal_writer_delay/wal_writer_flush_after.
 */
bool
XLogBackgroundFlush(void)
{
	XLogwrtRqst WriteRqst;
	bool		flexible = true;
	static TimestampTz lastflush;	/* time of last flush, persists calls */
	TimestampTz now;
	int			flushbytes;

	/* XLOG doesn't need flushing during recovery */
	if (RecoveryInProgress())
		return false;

	/* read LogwrtResult and update local state */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	WriteRqst = XLogCtl->LogwrtRqst;
	SpinLockRelease(&XLogCtl->info_lck);

	/* back off to last completed page boundary */
	WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;

	/* if we have already flushed that far, consider async commit records */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		WriteRqst.Write = XLogCtl->asyncXactLSN;
		SpinLockRelease(&XLogCtl->info_lck);
		flexible = false;		/* ensure it all gets written */
	}

	/*
	 * If already known flushed, we're done.  Just need to check if we are
	 * holding an open file handle to a logfile that's no longer in use,
	 * preventing the file from being deleted.
	 */
	if (WriteRqst.Write <= LogwrtResult.Flush)
	{
		if (openLogFile >= 0)
		{
			if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
								 wal_segment_size))
			{
				XLogFileClose();
			}
		}
		return false;
	}

	/*
	 * Determine how far to flush WAL, based on the wal_writer_delay and
	 * wal_writer_flush_after GUCs.
	 */
	now = GetCurrentTimestamp();
	/* number of complete blocks written but not yet flushed */
	flushbytes =
		WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;

	if (WalWriterFlushAfter == 0 || lastflush == 0)
	{
		/* first call, or block based limits disabled */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
	{
		/*
		 * Flush the writes at least every WalWriterDelay ms. This is
		 * important to bound the amount of time it takes for an asynchronous
		 * commit to hit disk.
		 */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else if (flushbytes >= WalWriterFlushAfter)
	{
		/* exceeded wal_writer_flush_after blocks, flush */
		WriteRqst.Flush = WriteRqst.Write;
		lastflush = now;
	}
	else
	{
		/* no flushing, this time round */
		WriteRqst.Flush = 0;
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
			 LSN_FORMAT_ARGS(WriteRqst.Write),
			 LSN_FORMAT_ARGS(WriteRqst.Flush),
			 LSN_FORMAT_ARGS(LogwrtResult.Write),
			 LSN_FORMAT_ARGS(LogwrtResult.Flush));
#endif

	START_CRIT_SECTION();

	/* now wait for any in-progress insertions to finish and get write lock */
	WaitXLogInsertionsToFinish(WriteRqst.Write);
	LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
	LogwrtResult = XLogCtl->LogwrtResult;
	if (WriteRqst.Write > LogwrtResult.Write ||
		WriteRqst.Flush > LogwrtResult.Flush)
	{
		XLogWrite(WriteRqst, flexible);
	}
	LWLockRelease(WALWriteLock);

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * Great, done. To take some work off the critical path, try to initialize
	 * as many of the no-longer-needed WAL buffers for future use as we can.
	 */
	AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);

	/*
	 * If we determined that we need to write data, but somebody else
	 * wrote/flushed already, it should be considered as being active, to
	 * avoid hibernating too early.
	 */
	return true;
}
3210
/*
 * Test whether XLOG data has been flushed up to (at least) the given position.
 *
 * Returns true if a flush is still needed.  (It may be that someone else
 * is already in process of flushing that far, however.)
 */
bool
XLogNeedsFlush(XLogRecPtr record)
{
	/*
	 * During recovery, we don't flush WAL but update minRecoveryPoint
	 * instead.  So "needs flush" is taken to mean whether minRecoveryPoint
	 * would need to be updated.
	 */
	if (RecoveryInProgress())
	{
		/*
		 * An invalid minRecoveryPoint means that we need to recover all the
		 * WAL, i.e., we're doing crash recovery.  We never modify the control
		 * file's value in that case, so we can short-circuit future checks
		 * here too.  This triggers a quick exit path for the startup process,
		 * which cannot update its local copy of minRecoveryPoint as long as
		 * it has not replayed all WAL available when doing crash recovery.
		 */
		if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
			updateMinRecoveryPoint = false;

		/* Quick exit if already known to be updated or cannot be updated */
		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
			return false;

		/*
		 * Update local copy of minRecoveryPoint.  But if the lock is busy,
		 * just return a conservative guess ("flush needed") rather than
		 * waiting for it.
		 */
		if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
			return true;
		minRecoveryPoint = ControlFile->minRecoveryPoint;
		minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		LWLockRelease(ControlFileLock);

		/*
		 * Check minRecoveryPoint for any other process than the startup
		 * process doing crash recovery, which should not update the control
		 * file value if crash recovery is still running.
		 */
		if (XLogRecPtrIsInvalid(minRecoveryPoint))
			updateMinRecoveryPoint = false;

		/* check again with the freshly-read value */
		if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
			return false;
		else
			return true;
	}

	/* Quick exit if already known flushed */
	if (record <= LogwrtResult.Flush)
		return false;

	/* read LogwrtResult and update local state */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	/* check again with the freshly-read value */
	if (record <= LogwrtResult.Flush)
		return false;

	return true;
}
3282
/*
 * Create a new XLOG file segment, or open a pre-existing one.
 *
 * logsegno: identify segment to be created/opened.
 *
 * *use_existent: if true, OK to use a pre-existing file (else, any
 * pre-existing file will be deleted).  On return, true if a pre-existing
 * file was used.
 *
 * use_lock: if true, acquire ControlFileLock while moving file into
 * place.  This should be true except during bootstrap log creation.  The
 * caller must *not* hold the lock at call.
 *
 * Returns FD of opened file.
 *
 * Note: errors here are ERROR not PANIC because we might or might not be
 * inside a critical section (eg, during checkpoint there is no reason to
 * take down the system on failure).  They will promote to PANIC if we are
 * in a critical section.
 */
int
XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
{
	char		path[MAXPGPATH];
	char		tmppath[MAXPGPATH];
	PGAlignedXLogBlock zbuffer;
	XLogSegNo	installed_segno;
	XLogSegNo	max_segno;
	int			fd;
	int			save_errno;

	XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);

	/*
	 * Try to use existent file (checkpoint maker may have created it already)
	 */
	if (*use_existent)
	{
		fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
		if (fd < 0)
		{
			/* ENOENT just means we must create the segment below */
			if (errno != ENOENT)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m", path)));
		}
		else
			return fd;
	}

	/*
	 * Initialize an empty (all zeroes) segment.  NOTE: it is possible that
	 * another process is doing the same thing.  If so, we will end up
	 * pre-creating an extra log segment.  That seems OK, and better than
	 * holding the lock throughout this lengthy process.
	 */
	elog(DEBUG2, "creating and filling new WAL file");

	snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());

	unlink(tmppath);

	/* do not use get_sync_bit() here --- want to fsync only at end of fill */
	fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m", tmppath)));

	memset(zbuffer.data, 0, XLOG_BLCKSZ);

	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
	save_errno = 0;				/* nonzero below signals a write failure */
	if (wal_init_zero)
	{
		struct iovec iov[PG_IOV_MAX];
		int			blocks;

		/*
		 * Zero-fill the file.  With this setting, we do this the hard way to
		 * ensure that all the file space has really been allocated.  On
		 * platforms that allow "holes" in files, just seeking to the end
		 * doesn't allocate intermediate space.  This way, we know that we
		 * have all the space and (after the fsync below) that all the
		 * indirect blocks are down on disk.  Therefore, fdatasync(2) or
		 * O_DSYNC will be sufficient to sync future writes to the log file.
		 */

		/* Prepare to write out a lot of copies of our zero buffer at once. */
		for (int i = 0; i < lengthof(iov); ++i)
		{
			iov[i].iov_base = zbuffer.data;
			iov[i].iov_len = XLOG_BLCKSZ;
		}

		/* Loop, writing as many blocks as we can for each system call. */
		blocks = wal_segment_size / XLOG_BLCKSZ;
		for (int i = 0; i < blocks;)
		{
			int			iovcnt = Min(blocks - i, lengthof(iov));
			off_t		offset = i * XLOG_BLCKSZ;

			if (pg_pwritev_with_retry(fd, iov, iovcnt, offset) < 0)
			{
				save_errno = errno;
				break;
			}

			i += iovcnt;
		}
	}
	else
	{
		/*
		 * Otherwise, seeking to the end and writing a solitary byte is
		 * enough.
		 */
		errno = 0;
		if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
		{
			/* if write didn't set errno, assume no disk space */
			save_errno = errno ? errno : ENOSPC;
		}
	}
	pgstat_report_wait_end();

	if (save_errno)
	{
		/*
		 * If we fail to make the file, delete it to release disk space
		 */
		unlink(tmppath);

		close(fd);

		errno = save_errno;

		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m", tmppath)));
	}

	pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
	if (pg_fsync(fd) != 0)
	{
		/* preserve errno across the close() before reporting */
		int			save_errno = errno;

		close(fd);
		errno = save_errno;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m", tmppath)));
	}
	pgstat_report_wait_end();

	if (close(fd) != 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m", tmppath)));

	/*
	 * Now move the segment into place with its final name.
	 *
	 * If caller didn't want to use a pre-existing file, get rid of any
	 * pre-existing file.  Otherwise, cope with possibility that someone else
	 * has created the file while we were filling ours: if so, use ours to
	 * pre-create a future log segment.
	 */
	installed_segno = logsegno;

	/*
	 * XXX: What should we use as max_segno? We used to use XLOGfileslop when
	 * that was a constant, but that was always a bit dubious: normally, at a
	 * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
	 * here, it was the offset from the insert location.  We can't do the
	 * normal XLOGfileslop calculation here because we don't have access to
	 * the prior checkpoint's redo location.  So somewhat arbitrarily, just
	 * use CheckPointSegments.
	 */
	max_segno = logsegno + CheckPointSegments;
	if (!InstallXLogFileSegment(&installed_segno, tmppath,
								*use_existent, max_segno,
								use_lock))
	{
		/*
		 * No need for any more future segments, or InstallXLogFileSegment()
		 * failed to rename the file into place.  If the rename failed,
		 * opening the file below will fail.
		 */
		unlink(tmppath);
	}

	/* Set flag to tell caller there was no existent file */
	*use_existent = false;

	/* Now open original target segment (might not be file I just made) */
	fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
	if (fd < 0)
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", path)));

	elog(DEBUG2, "done creating and filling new WAL file");

	return fd;
}
3489
3490 /*
3491 * Create a new XLOG file segment by copying a pre-existing one.
3492 *
3493 * destsegno: identify segment to be created.
3494 *
3495 * srcTLI, srcsegno: identify segment to be copied (could be from
3496 * a different timeline)
3497 *
3498 * upto: how much of the source file to copy (the rest is filled with
3499 * zeros)
3500 *
3501 * Currently this is only used during recovery, and so there are no locking
3502 * considerations. But we should be just as tense as XLogFileInit to avoid
3503 * emplacing a bogus file.
3504 */
3505 static void
XLogFileCopy(XLogSegNo destsegno,TimeLineID srcTLI,XLogSegNo srcsegno,int upto)3506 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3507 int upto)
3508 {
3509 char path[MAXPGPATH];
3510 char tmppath[MAXPGPATH];
3511 PGAlignedXLogBlock buffer;
3512 int srcfd;
3513 int fd;
3514 int nbytes;
3515
3516 /*
3517 * Open the source file
3518 */
3519 XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3520 srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3521 if (srcfd < 0)
3522 ereport(ERROR,
3523 (errcode_for_file_access(),
3524 errmsg("could not open file \"%s\": %m", path)));
3525
3526 /*
3527 * Copy into a temp file name.
3528 */
3529 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3530
3531 unlink(tmppath);
3532
3533 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3534 fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3535 if (fd < 0)
3536 ereport(ERROR,
3537 (errcode_for_file_access(),
3538 errmsg("could not create file \"%s\": %m", tmppath)));
3539
3540 /*
3541 * Do the data copying.
3542 */
3543 for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3544 {
3545 int nread;
3546
3547 nread = upto - nbytes;
3548
3549 /*
3550 * The part that is not read from the source file is filled with
3551 * zeros.
3552 */
3553 if (nread < sizeof(buffer))
3554 memset(buffer.data, 0, sizeof(buffer));
3555
3556 if (nread > 0)
3557 {
3558 int r;
3559
3560 if (nread > sizeof(buffer))
3561 nread = sizeof(buffer);
3562 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3563 r = read(srcfd, buffer.data, nread);
3564 if (r != nread)
3565 {
3566 if (r < 0)
3567 ereport(ERROR,
3568 (errcode_for_file_access(),
3569 errmsg("could not read file \"%s\": %m",
3570 path)));
3571 else
3572 ereport(ERROR,
3573 (errcode(ERRCODE_DATA_CORRUPTED),
3574 errmsg("could not read file \"%s\": read %d of %zu",
3575 path, r, (Size) nread)));
3576 }
3577 pgstat_report_wait_end();
3578 }
3579 errno = 0;
3580 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3581 if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3582 {
3583 int save_errno = errno;
3584
3585 /*
3586 * If we fail to make the file, delete it to release disk space
3587 */
3588 unlink(tmppath);
3589 /* if write didn't set errno, assume problem is no disk space */
3590 errno = save_errno ? save_errno : ENOSPC;
3591
3592 ereport(ERROR,
3593 (errcode_for_file_access(),
3594 errmsg("could not write to file \"%s\": %m", tmppath)));
3595 }
3596 pgstat_report_wait_end();
3597 }
3598
3599 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3600 if (pg_fsync(fd) != 0)
3601 ereport(data_sync_elevel(ERROR),
3602 (errcode_for_file_access(),
3603 errmsg("could not fsync file \"%s\": %m", tmppath)));
3604 pgstat_report_wait_end();
3605
3606 if (CloseTransientFile(fd) != 0)
3607 ereport(ERROR,
3608 (errcode_for_file_access(),
3609 errmsg("could not close file \"%s\": %m", tmppath)));
3610
3611 if (CloseTransientFile(srcfd) != 0)
3612 ereport(ERROR,
3613 (errcode_for_file_access(),
3614 errmsg("could not close file \"%s\": %m", path)));
3615
3616 /*
3617 * Now move the segment into place with its final name.
3618 */
3619 if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3620 elog(ERROR, "InstallXLogFileSegment should not have failed");
3621 }
3622
3623 /*
3624 * Install a new XLOG segment file as a current or future log segment.
3625 *
3626 * This is used both to install a newly-created segment (which has a temp
3627 * filename while it's being created) and to recycle an old segment.
3628 *
3629 * *segno: identify segment to install as (or first possible target).
3630 * When find_free is true, this is modified on return to indicate the
3631 * actual installation location or last segment searched.
3632 *
3633 * tmppath: initial name of file to install. It will be renamed into place.
3634 *
3635 * find_free: if true, install the new segment at the first empty segno
3636 * number at or after the passed numbers. If false, install the new segment
3637 * exactly where specified, deleting any existing segment file there.
3638 *
3639 * max_segno: maximum segment number to install the new file as. Fail if no
3640 * free slot is found between *segno and max_segno. (Ignored when find_free
3641 * is false.)
3642 *
3643 * use_lock: if true, acquire ControlFileLock while moving file into
3644 * place. This should be true except during bootstrap log creation. The
3645 * caller must *not* hold the lock at call.
3646 *
3647 * Returns true if the file was installed successfully. false indicates that
3648 * max_segno limit was exceeded, or an error occurred while renaming the
3649 * file into place.
3650 */
3651 static bool
InstallXLogFileSegment(XLogSegNo * segno,char * tmppath,bool find_free,XLogSegNo max_segno,bool use_lock)3652 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3653 bool find_free, XLogSegNo max_segno,
3654 bool use_lock)
3655 {
3656 char path[MAXPGPATH];
3657 struct stat stat_buf;
3658
3659 XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3660
3661 /*
3662 * We want to be sure that only one process does this at a time.
3663 */
3664 if (use_lock)
3665 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3666
3667 if (!find_free)
3668 {
3669 /* Force installation: get rid of any pre-existing segment file */
3670 durable_unlink(path, DEBUG1);
3671 }
3672 else
3673 {
3674 /* Find a free slot to put it in */
3675 while (stat(path, &stat_buf) == 0)
3676 {
3677 if ((*segno) >= max_segno)
3678 {
3679 /* Failed to find a free slot within specified range */
3680 if (use_lock)
3681 LWLockRelease(ControlFileLock);
3682 return false;
3683 }
3684 (*segno)++;
3685 XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3686 }
3687 }
3688
3689 /*
3690 * Perform the rename using link if available, paranoidly trying to avoid
3691 * overwriting an existing file (there shouldn't be one).
3692 */
3693 if (durable_rename_excl(tmppath, path, LOG) != 0)
3694 {
3695 if (use_lock)
3696 LWLockRelease(ControlFileLock);
3697 /* durable_rename_excl already emitted log message */
3698 return false;
3699 }
3700
3701 if (use_lock)
3702 LWLockRelease(ControlFileLock);
3703
3704 return true;
3705 }
3706
3707 /*
3708 * Open a pre-existing logfile segment for writing.
3709 */
3710 int
XLogFileOpen(XLogSegNo segno)3711 XLogFileOpen(XLogSegNo segno)
3712 {
3713 char path[MAXPGPATH];
3714 int fd;
3715
3716 XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
3717
3718 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3719 if (fd < 0)
3720 ereport(PANIC,
3721 (errcode_for_file_access(),
3722 errmsg("could not open file \"%s\": %m", path)));
3723
3724 return fd;
3725 }
3726
3727 /*
3728 * Open a logfile segment for reading (during recovery).
3729 *
3730 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3731 * Otherwise, it's assumed to be already available in pg_wal.
3732 */
3733 static int
XLogFileRead(XLogSegNo segno,int emode,TimeLineID tli,XLogSource source,bool notfoundOk)3734 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3735 XLogSource source, bool notfoundOk)
3736 {
3737 char xlogfname[MAXFNAMELEN];
3738 char activitymsg[MAXFNAMELEN + 16];
3739 char path[MAXPGPATH];
3740 int fd;
3741
3742 XLogFileName(xlogfname, tli, segno, wal_segment_size);
3743
3744 switch (source)
3745 {
3746 case XLOG_FROM_ARCHIVE:
3747 /* Report recovery progress in PS display */
3748 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3749 xlogfname);
3750 set_ps_display(activitymsg);
3751
3752 restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3753 "RECOVERYXLOG",
3754 wal_segment_size,
3755 InRedo);
3756 if (!restoredFromArchive)
3757 return -1;
3758 break;
3759
3760 case XLOG_FROM_PG_WAL:
3761 case XLOG_FROM_STREAM:
3762 XLogFilePath(path, tli, segno, wal_segment_size);
3763 restoredFromArchive = false;
3764 break;
3765
3766 default:
3767 elog(ERROR, "invalid XLogFileRead source %d", source);
3768 }
3769
3770 /*
3771 * If the segment was fetched from archival storage, replace the existing
3772 * xlog segment (if any) with the archival version.
3773 */
3774 if (source == XLOG_FROM_ARCHIVE)
3775 {
3776 KeepFileRestoredFromArchive(path, xlogfname);
3777
3778 /*
3779 * Set path to point at the new file in pg_wal.
3780 */
3781 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3782 }
3783
3784 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
3785 if (fd >= 0)
3786 {
3787 /* Success! */
3788 curFileTLI = tli;
3789
3790 /* Report recovery progress in PS display */
3791 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3792 xlogfname);
3793 set_ps_display(activitymsg);
3794
3795 /* Track source of data in assorted state variables */
3796 readSource = source;
3797 XLogReceiptSource = source;
3798 /* In FROM_STREAM case, caller tracks receipt time, not me */
3799 if (source != XLOG_FROM_STREAM)
3800 XLogReceiptTime = GetCurrentTimestamp();
3801
3802 return fd;
3803 }
3804 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3805 ereport(PANIC,
3806 (errcode_for_file_access(),
3807 errmsg("could not open file \"%s\": %m", path)));
3808 return -1;
3809 }
3810
3811 /*
3812 * Open a logfile segment for reading (during recovery).
3813 *
3814 * This version searches for the segment with any TLI listed in expectedTLEs.
3815 */
3816 static int
XLogFileReadAnyTLI(XLogSegNo segno,int emode,XLogSource source)3817 XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
3818 {
3819 char path[MAXPGPATH];
3820 ListCell *cell;
3821 int fd;
3822 List *tles;
3823
3824 /*
3825 * Loop looking for a suitable timeline ID: we might need to read any of
3826 * the timelines listed in expectedTLEs.
3827 *
3828 * We expect curFileTLI on entry to be the TLI of the preceding file in
3829 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
3830 * to go backwards; this prevents us from picking up the wrong file when a
3831 * parent timeline extends to higher segment numbers than the child we
3832 * want to read.
3833 *
3834 * If we haven't read the timeline history file yet, read it now, so that
3835 * we know which TLIs to scan. We don't save the list in expectedTLEs,
3836 * however, unless we actually find a valid segment. That way if there is
3837 * neither a timeline history file nor a WAL segment in the archive, and
3838 * streaming replication is set up, we'll read the timeline history file
3839 * streamed from the primary when we start streaming, instead of
3840 * recovering with a dummy history generated here.
3841 */
3842 if (expectedTLEs)
3843 tles = expectedTLEs;
3844 else
3845 tles = readTimeLineHistory(recoveryTargetTLI);
3846
3847 foreach(cell, tles)
3848 {
3849 TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
3850 TimeLineID tli = hent->tli;
3851
3852 if (tli < curFileTLI)
3853 break; /* don't bother looking at too-old TLIs */
3854
3855 /*
3856 * Skip scanning the timeline ID that the logfile segment to read
3857 * doesn't belong to
3858 */
3859 if (hent->begin != InvalidXLogRecPtr)
3860 {
3861 XLogSegNo beginseg = 0;
3862
3863 XLByteToSeg(hent->begin, beginseg, wal_segment_size);
3864
3865 /*
3866 * The logfile segment that doesn't belong to the timeline is
3867 * older or newer than the segment that the timeline started or
3868 * ended at, respectively. It's sufficient to check only the
3869 * starting segment of the timeline here. Since the timelines are
3870 * scanned in descending order in this loop, any segments newer
3871 * than the ending segment should belong to newer timeline and
3872 * have already been read before. So it's not necessary to check
3873 * the ending segment of the timeline here.
3874 */
3875 if (segno < beginseg)
3876 continue;
3877 }
3878
3879 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3880 {
3881 fd = XLogFileRead(segno, emode, tli,
3882 XLOG_FROM_ARCHIVE, true);
3883 if (fd != -1)
3884 {
3885 elog(DEBUG1, "got WAL segment from archive");
3886 if (!expectedTLEs)
3887 expectedTLEs = tles;
3888 return fd;
3889 }
3890 }
3891
3892 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
3893 {
3894 fd = XLogFileRead(segno, emode, tli,
3895 XLOG_FROM_PG_WAL, true);
3896 if (fd != -1)
3897 {
3898 if (!expectedTLEs)
3899 expectedTLEs = tles;
3900 return fd;
3901 }
3902 }
3903 }
3904
3905 /* Couldn't find it. For simplicity, complain about front timeline */
3906 XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
3907 errno = ENOENT;
3908 ereport(emode,
3909 (errcode_for_file_access(),
3910 errmsg("could not open file \"%s\": %m", path)));
3911 return -1;
3912 }
3913
3914 /*
3915 * Close the current logfile segment for writing.
3916 */
3917 static void
XLogFileClose(void)3918 XLogFileClose(void)
3919 {
3920 Assert(openLogFile >= 0);
3921
3922 /*
3923 * WAL segment files will not be re-read in normal operation, so we advise
3924 * the OS to release any cached pages. But do not do so if WAL archiving
3925 * or streaming is active, because archiver and walsender process could
3926 * use the cache to read the WAL segment.
3927 */
3928 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3929 if (!XLogIsNeeded())
3930 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3931 #endif
3932
3933 if (close(openLogFile) != 0)
3934 {
3935 char xlogfname[MAXFNAMELEN];
3936 int save_errno = errno;
3937
3938 XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, wal_segment_size);
3939 errno = save_errno;
3940 ereport(PANIC,
3941 (errcode_for_file_access(),
3942 errmsg("could not close file \"%s\": %m", xlogfname)));
3943 }
3944
3945 openLogFile = -1;
3946 ReleaseExternalFD();
3947 }
3948
3949 /*
3950 * Preallocate log files beyond the specified log endpoint.
3951 *
3952 * XXX this is currently extremely conservative, since it forces only one
3953 * future log segment to exist, and even that only if we are 75% done with
3954 * the current one. This is only appropriate for very low-WAL-volume systems.
3955 * High-volume systems will be OK once they've built up a sufficient set of
3956 * recycled log segments, but the startup transient is likely to include
3957 * a lot of segment creations by foreground processes, which is not so good.
3958 */
3959 static void
PreallocXlogFiles(XLogRecPtr endptr)3960 PreallocXlogFiles(XLogRecPtr endptr)
3961 {
3962 XLogSegNo _logSegNo;
3963 int lf;
3964 bool use_existent;
3965 uint64 offset;
3966
3967 XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3968 offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3969 if (offset >= (uint32) (0.75 * wal_segment_size))
3970 {
3971 _logSegNo++;
3972 use_existent = true;
3973 lf = XLogFileInit(_logSegNo, &use_existent, true);
3974 close(lf);
3975 if (!use_existent)
3976 CheckpointStats.ckpt_segs_added++;
3977 }
3978 }
3979
3980 /*
3981 * Throws an error if the given log segment has already been removed or
3982 * recycled. The caller should only pass a segment that it knows to have
3983 * existed while the server has been running, as this function always
3984 * succeeds if no WAL segments have been removed since startup.
3985 * 'tli' is only used in the error message.
3986 *
3987 * Note: this function guarantees to keep errno unchanged on return.
3988 * This supports callers that use this to possibly deliver a better
3989 * error message about a missing file, while still being able to throw
3990 * a normal file-access error afterwards, if this does return.
3991 */
3992 void
CheckXLogRemoved(XLogSegNo segno,TimeLineID tli)3993 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3994 {
3995 int save_errno = errno;
3996 XLogSegNo lastRemovedSegNo;
3997
3998 SpinLockAcquire(&XLogCtl->info_lck);
3999 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
4000 SpinLockRelease(&XLogCtl->info_lck);
4001
4002 if (segno <= lastRemovedSegNo)
4003 {
4004 char filename[MAXFNAMELEN];
4005
4006 XLogFileName(filename, tli, segno, wal_segment_size);
4007 errno = save_errno;
4008 ereport(ERROR,
4009 (errcode_for_file_access(),
4010 errmsg("requested WAL segment %s has already been removed",
4011 filename)));
4012 }
4013 errno = save_errno;
4014 }
4015
4016 /*
4017 * Return the last WAL segment removed, or 0 if no segment has been removed
4018 * since startup.
4019 *
4020 * NB: the result can be out of date arbitrarily fast, the caller has to deal
4021 * with that.
4022 */
4023 XLogSegNo
XLogGetLastRemovedSegno(void)4024 XLogGetLastRemovedSegno(void)
4025 {
4026 XLogSegNo lastRemovedSegNo;
4027
4028 SpinLockAcquire(&XLogCtl->info_lck);
4029 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
4030 SpinLockRelease(&XLogCtl->info_lck);
4031
4032 return lastRemovedSegNo;
4033 }
4034
4035
4036 /*
4037 * Update the last removed segno pointer in shared memory, to reflect that the
4038 * given XLOG file has been removed.
4039 */
4040 static void
UpdateLastRemovedPtr(char * filename)4041 UpdateLastRemovedPtr(char *filename)
4042 {
4043 uint32 tli;
4044 XLogSegNo segno;
4045
4046 XLogFromFileName(filename, &tli, &segno, wal_segment_size);
4047
4048 SpinLockAcquire(&XLogCtl->info_lck);
4049 if (segno > XLogCtl->lastRemovedSegNo)
4050 XLogCtl->lastRemovedSegNo = segno;
4051 SpinLockRelease(&XLogCtl->info_lck);
4052 }
4053
4054 /*
4055 * Remove all temporary log files in pg_wal
4056 *
4057 * This is called at the beginning of recovery after a previous crash,
4058 * at a point where no other processes write fresh WAL data.
4059 */
4060 static void
RemoveTempXlogFiles(void)4061 RemoveTempXlogFiles(void)
4062 {
4063 DIR *xldir;
4064 struct dirent *xlde;
4065
4066 elog(DEBUG2, "removing all temporary WAL segments");
4067
4068 xldir = AllocateDir(XLOGDIR);
4069 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4070 {
4071 char path[MAXPGPATH];
4072
4073 if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
4074 continue;
4075
4076 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4077 unlink(path);
4078 elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
4079 }
4080 FreeDir(xldir);
4081 }
4082
4083 /*
4084 * Recycle or remove all log files older or equal to passed segno.
4085 *
4086 * endptr is current (or recent) end of xlog, and lastredoptr is the
4087 * redo pointer of the last checkpoint. These are used to determine
4088 * whether we want to recycle rather than delete no-longer-wanted log files.
4089 */
4090 static void
RemoveOldXlogFiles(XLogSegNo segno,XLogRecPtr lastredoptr,XLogRecPtr endptr)4091 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr)
4092 {
4093 DIR *xldir;
4094 struct dirent *xlde;
4095 char lastoff[MAXFNAMELEN];
4096 XLogSegNo endlogSegNo;
4097 XLogSegNo recycleSegNo;
4098
4099 /* Initialize info about where to try to recycle to */
4100 XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
4101 recycleSegNo = XLOGfileslop(lastredoptr);
4102
4103 /*
4104 * Construct a filename of the last segment to be kept. The timeline ID
4105 * doesn't matter, we ignore that in the comparison. (During recovery,
4106 * ThisTimeLineID isn't set, so we can't use that.)
4107 */
4108 XLogFileName(lastoff, 0, segno, wal_segment_size);
4109
4110 elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4111 lastoff);
4112
4113 xldir = AllocateDir(XLOGDIR);
4114
4115 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4116 {
4117 /* Ignore files that are not XLOG segments */
4118 if (!IsXLogFileName(xlde->d_name) &&
4119 !IsPartialXLogFileName(xlde->d_name))
4120 continue;
4121
4122 /*
4123 * We ignore the timeline part of the XLOG segment identifiers in
4124 * deciding whether a segment is still needed. This ensures that we
4125 * won't prematurely remove a segment from a parent timeline. We could
4126 * probably be a little more proactive about removing segments of
4127 * non-parent timelines, but that would be a whole lot more
4128 * complicated.
4129 *
4130 * We use the alphanumeric sorting property of the filenames to decide
4131 * which ones are earlier than the lastoff segment.
4132 */
4133 if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4134 {
4135 if (XLogArchiveCheckDone(xlde->d_name))
4136 {
4137 /* Update the last removed location in shared memory first */
4138 UpdateLastRemovedPtr(xlde->d_name);
4139
4140 RemoveXlogFile(xlde->d_name, recycleSegNo, &endlogSegNo);
4141 }
4142 }
4143 }
4144
4145 FreeDir(xldir);
4146 }
4147
4148 /*
4149 * Remove WAL files that are not part of the given timeline's history.
4150 *
4151 * This is called during recovery, whenever we switch to follow a new
4152 * timeline, and at the end of recovery when we create a new timeline. We
4153 * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
4154 * might be leftover pre-allocated or recycled WAL segments on the old timeline
4155 * that we haven't used yet, and contain garbage. If we just leave them in
4156 * pg_wal, they will eventually be archived, and we can't let that happen.
4157 * Files that belong to our timeline history are valid, because we have
4158 * successfully replayed them, but from others we can't be sure.
4159 *
4160 * 'switchpoint' is the current point in WAL where we switch to new timeline,
4161 * and 'newTLI' is the new timeline we switch to.
4162 */
4163 static void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint,TimeLineID newTLI)4164 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
4165 {
4166 DIR *xldir;
4167 struct dirent *xlde;
4168 char switchseg[MAXFNAMELEN];
4169 XLogSegNo endLogSegNo;
4170 XLogSegNo switchLogSegNo;
4171 XLogSegNo recycleSegNo;
4172
4173 /*
4174 * Initialize info about where to begin the work. This will recycle,
4175 * somewhat arbitrarily, 10 future segments.
4176 */
4177 XLByteToPrevSeg(switchpoint, switchLogSegNo, wal_segment_size);
4178 XLByteToSeg(switchpoint, endLogSegNo, wal_segment_size);
4179 recycleSegNo = endLogSegNo + 10;
4180
4181 /*
4182 * Construct a filename of the last segment to be kept.
4183 */
4184 XLogFileName(switchseg, newTLI, switchLogSegNo, wal_segment_size);
4185
4186 elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
4187 switchseg);
4188
4189 xldir = AllocateDir(XLOGDIR);
4190
4191 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4192 {
4193 /* Ignore files that are not XLOG segments */
4194 if (!IsXLogFileName(xlde->d_name))
4195 continue;
4196
4197 /*
4198 * Remove files that are on a timeline older than the new one we're
4199 * switching to, but with a segment number >= the first segment on the
4200 * new timeline.
4201 */
4202 if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
4203 strcmp(xlde->d_name + 8, switchseg + 8) > 0)
4204 {
4205 /*
4206 * If the file has already been marked as .ready, however, don't
4207 * remove it yet. It should be OK to remove it - files that are
4208 * not part of our timeline history are not required for recovery
4209 * - but seems safer to let them be archived and removed later.
4210 */
4211 if (!XLogArchiveIsReady(xlde->d_name))
4212 RemoveXlogFile(xlde->d_name, recycleSegNo, &endLogSegNo);
4213 }
4214 }
4215
4216 FreeDir(xldir);
4217 }
4218
4219 /*
4220 * Recycle or remove a log file that's no longer needed.
4221 *
4222 * segname is the name of the segment to recycle or remove. recycleSegNo
4223 * is the segment number to recycle up to. endlogSegNo is the segment
4224 * number of the current (or recent) end of WAL.
4225 *
4226 * endlogSegNo gets incremented if the segment is recycled so as it is not
4227 * checked again with future callers of this function.
4228 */
4229 static void
RemoveXlogFile(const char * segname,XLogSegNo recycleSegNo,XLogSegNo * endlogSegNo)4230 RemoveXlogFile(const char *segname, XLogSegNo recycleSegNo,
4231 XLogSegNo *endlogSegNo)
4232 {
4233 char path[MAXPGPATH];
4234 #ifdef WIN32
4235 char newpath[MAXPGPATH];
4236 #endif
4237 struct stat statbuf;
4238
4239 snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4240
4241 /*
4242 * Before deleting the file, see if it can be recycled as a future log
4243 * segment. Only recycle normal files, because we don't want to recycle
4244 * symbolic links pointing to a separate archive directory.
4245 */
4246 if (wal_recycle &&
4247 *endlogSegNo <= recycleSegNo &&
4248 lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4249 InstallXLogFileSegment(endlogSegNo, path,
4250 true, recycleSegNo, true))
4251 {
4252 ereport(DEBUG2,
4253 (errmsg_internal("recycled write-ahead log file \"%s\"",
4254 segname)));
4255 CheckpointStats.ckpt_segs_recycled++;
4256 /* Needn't recheck that slot on future iterations */
4257 (*endlogSegNo)++;
4258 }
4259 else
4260 {
4261 /* No need for any more future segments... */
4262 int rc;
4263
4264 ereport(DEBUG2,
4265 (errmsg_internal("removing write-ahead log file \"%s\"",
4266 segname)));
4267
4268 #ifdef WIN32
4269
4270 /*
4271 * On Windows, if another process (e.g another backend) holds the file
4272 * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4273 * will still show up in directory listing until the last handle is
4274 * closed. To avoid confusing the lingering deleted file for a live
4275 * WAL file that needs to be archived, rename it before deleting it.
4276 *
4277 * If another process holds the file open without FILE_SHARE_DELETE
4278 * flag, rename will fail. We'll try again at the next checkpoint.
4279 */
4280 snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4281 if (rename(path, newpath) != 0)
4282 {
4283 ereport(LOG,
4284 (errcode_for_file_access(),
4285 errmsg("could not rename file \"%s\": %m",
4286 path)));
4287 return;
4288 }
4289 rc = durable_unlink(newpath, LOG);
4290 #else
4291 rc = durable_unlink(path, LOG);
4292 #endif
4293 if (rc != 0)
4294 {
4295 /* Message already logged by durable_unlink() */
4296 return;
4297 }
4298 CheckpointStats.ckpt_segs_removed++;
4299 }
4300
4301 XLogArchiveCleanup(segname);
4302 }
4303
4304 /*
4305 * Verify whether pg_wal and pg_wal/archive_status exist.
4306 * If the latter does not exist, recreate it.
4307 *
4308 * It is not the goal of this function to verify the contents of these
4309 * directories, but to help in cases where someone has performed a cluster
4310 * copy for PITR purposes but omitted pg_wal from the copy.
4311 *
4312 * We could also recreate pg_wal if it doesn't exist, but a deliberate
4313 * policy decision was made not to. It is fairly common for pg_wal to be
4314 * a symlink, and if that was the DBA's intent then automatically making a
4315 * plain directory would result in degraded performance with no notice.
4316 */
4317 static void
ValidateXLOGDirectoryStructure(void)4318 ValidateXLOGDirectoryStructure(void)
4319 {
4320 char path[MAXPGPATH];
4321 struct stat stat_buf;
4322
4323 /* Check for pg_wal; if it doesn't exist, error out */
4324 if (stat(XLOGDIR, &stat_buf) != 0 ||
4325 !S_ISDIR(stat_buf.st_mode))
4326 ereport(FATAL,
4327 (errmsg("required WAL directory \"%s\" does not exist",
4328 XLOGDIR)));
4329
4330 /* Check for archive_status */
4331 snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4332 if (stat(path, &stat_buf) == 0)
4333 {
4334 /* Check for weird cases where it exists but isn't a directory */
4335 if (!S_ISDIR(stat_buf.st_mode))
4336 ereport(FATAL,
4337 (errmsg("required WAL directory \"%s\" does not exist",
4338 path)));
4339 }
4340 else
4341 {
4342 ereport(LOG,
4343 (errmsg("creating missing WAL directory \"%s\"", path)));
4344 if (MakePGDirectory(path) < 0)
4345 ereport(FATAL,
4346 (errmsg("could not create missing directory \"%s\": %m",
4347 path)));
4348 }
4349 }
4350
4351 /*
4352 * Remove previous backup history files. This also retries creation of
4353 * .ready files for any backup history files for which XLogArchiveNotify
4354 * failed earlier.
4355 */
4356 static void
CleanupBackupHistory(void)4357 CleanupBackupHistory(void)
4358 {
4359 DIR *xldir;
4360 struct dirent *xlde;
4361 char path[MAXPGPATH + sizeof(XLOGDIR)];
4362
4363 xldir = AllocateDir(XLOGDIR);
4364
4365 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4366 {
4367 if (IsBackupHistoryFileName(xlde->d_name))
4368 {
4369 if (XLogArchiveCheckDone(xlde->d_name))
4370 {
4371 elog(DEBUG2, "removing WAL backup history file \"%s\"",
4372 xlde->d_name);
4373 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4374 unlink(path);
4375 XLogArchiveCleanup(xlde->d_name);
4376 }
4377 }
4378 }
4379
4380 FreeDir(xldir);
4381 }
4382
4383 /*
4384 * Attempt to read the next XLOG record.
4385 *
4386 * Before first call, the reader needs to be positioned to the first record
4387 * by calling XLogBeginRead().
4388 *
4389 * If no valid record is available, returns NULL, or fails if emode is PANIC.
4390 * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4391 * record is available.
4392 */
4393 static XLogRecord *
ReadRecord(XLogReaderState * xlogreader,int emode,bool fetching_ckpt)4394 ReadRecord(XLogReaderState *xlogreader, int emode,
4395 bool fetching_ckpt)
4396 {
4397 XLogRecord *record;
4398 XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4399
4400 /* Pass through parameters to XLogPageRead */
4401 private->fetching_ckpt = fetching_ckpt;
4402 private->emode = emode;
4403 private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
4404
4405 /* This is the first attempt to read this page. */
4406 lastSourceFailed = false;
4407
4408 for (;;)
4409 {
4410 char *errormsg;
4411
4412 record = XLogReadRecord(xlogreader, &errormsg);
4413 ReadRecPtr = xlogreader->ReadRecPtr;
4414 EndRecPtr = xlogreader->EndRecPtr;
4415 if (record == NULL)
4416 {
4417 /*
4418 * When not in standby mode we find that WAL ends in an incomplete
4419 * record, keep track of that record. After recovery is done,
4420 * we'll write a record to indicate downstream WAL readers that
4421 * that portion is to be ignored.
4422 */
4423 if (!StandbyMode &&
4424 !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
4425 {
4426 abortedRecPtr = xlogreader->abortedRecPtr;
4427 missingContrecPtr = xlogreader->missingContrecPtr;
4428 }
4429
4430 if (readFile >= 0)
4431 {
4432 close(readFile);
4433 readFile = -1;
4434 }
4435
4436 /*
4437 * We only end up here without a message when XLogPageRead()
4438 * failed - in that case we already logged something. In
4439 * StandbyMode that only happens if we have been triggered, so we
4440 * shouldn't loop anymore in that case.
4441 */
4442 if (errormsg)
4443 ereport(emode_for_corrupt_record(emode, EndRecPtr),
4444 (errmsg_internal("%s", errormsg) /* already translated */ ));
4445 }
4446
4447 /*
4448 * Check page TLI is one of the expected values.
4449 */
4450 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4451 {
4452 char fname[MAXFNAMELEN];
4453 XLogSegNo segno;
4454 int32 offset;
4455
4456 XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
4457 offset = XLogSegmentOffset(xlogreader->latestPagePtr,
4458 wal_segment_size);
4459 XLogFileName(fname, xlogreader->seg.ws_tli, segno,
4460 wal_segment_size);
4461 ereport(emode_for_corrupt_record(emode, EndRecPtr),
4462 (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4463 xlogreader->latestPageTLI,
4464 fname,
4465 offset)));
4466 record = NULL;
4467 }
4468
4469 if (record)
4470 {
4471 /* Great, got a record */
4472 return record;
4473 }
4474 else
4475 {
4476 /* No valid record available from this source */
4477 lastSourceFailed = true;
4478
4479 /*
4480 * If archive recovery was requested, but we were still doing
4481 * crash recovery, switch to archive recovery and retry using the
4482 * offline archive. We have now replayed all the valid WAL in
4483 * pg_wal, so we are presumably now consistent.
4484 *
4485 * We require that there's at least some valid WAL present in
4486 * pg_wal, however (!fetching_ckpt). We could recover using the
4487 * WAL from the archive, even if pg_wal is completely empty, but
4488 * we'd have no idea how far we'd have to replay to reach
4489 * consistency. So err on the safe side and give up.
4490 */
4491 if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4492 !fetching_ckpt)
4493 {
4494 ereport(DEBUG1,
4495 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
4496 InArchiveRecovery = true;
4497 if (StandbyModeRequested)
4498 StandbyMode = true;
4499
4500 /* initialize minRecoveryPoint to this record */
4501 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4502 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4503 if (ControlFile->minRecoveryPoint < EndRecPtr)
4504 {
4505 ControlFile->minRecoveryPoint = EndRecPtr;
4506 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4507 }
4508 /* update local copy */
4509 minRecoveryPoint = ControlFile->minRecoveryPoint;
4510 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4511
4512 /*
4513 * The startup process can update its local copy of
4514 * minRecoveryPoint from this point.
4515 */
4516 updateMinRecoveryPoint = true;
4517
4518 UpdateControlFile();
4519
4520 /*
4521 * We update SharedRecoveryState while holding the lock on
4522 * ControlFileLock so both states are consistent in shared
4523 * memory.
4524 */
4525 SpinLockAcquire(&XLogCtl->info_lck);
4526 XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
4527 SpinLockRelease(&XLogCtl->info_lck);
4528
4529 LWLockRelease(ControlFileLock);
4530
4531 CheckRecoveryConsistency();
4532
4533 /*
4534 * Before we retry, reset lastSourceFailed and currentSource
4535 * so that we will check the archive next.
4536 */
4537 lastSourceFailed = false;
4538 currentSource = XLOG_FROM_ANY;
4539
4540 continue;
4541 }
4542
4543 /* In standby mode, loop back to retry. Otherwise, give up. */
4544 if (StandbyMode && !CheckForStandbyTrigger())
4545 continue;
4546 else
4547 return NULL;
4548 }
4549 }
4550 }
4551
4552 /*
4553 * Scan for new timelines that might have appeared in the archive since we
4554 * started recovery.
4555 *
4556 * If there are any, the function changes recovery target TLI to the latest
4557 * one and returns 'true'.
4558 */
4559 static bool
rescanLatestTimeLine(void)4560 rescanLatestTimeLine(void)
4561 {
4562 List *newExpectedTLEs;
4563 bool found;
4564 ListCell *cell;
4565 TimeLineID newtarget;
4566 TimeLineID oldtarget = recoveryTargetTLI;
4567 TimeLineHistoryEntry *currentTle = NULL;
4568
4569 newtarget = findNewestTimeLine(recoveryTargetTLI);
4570 if (newtarget == recoveryTargetTLI)
4571 {
4572 /* No new timelines found */
4573 return false;
4574 }
4575
4576 /*
4577 * Determine the list of expected TLIs for the new TLI
4578 */
4579
4580 newExpectedTLEs = readTimeLineHistory(newtarget);
4581
4582 /*
4583 * If the current timeline is not part of the history of the new timeline,
4584 * we cannot proceed to it.
4585 */
4586 found = false;
4587 foreach(cell, newExpectedTLEs)
4588 {
4589 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4590
4591 if (currentTle->tli == recoveryTargetTLI)
4592 {
4593 found = true;
4594 break;
4595 }
4596 }
4597 if (!found)
4598 {
4599 ereport(LOG,
4600 (errmsg("new timeline %u is not a child of database system timeline %u",
4601 newtarget,
4602 ThisTimeLineID)));
4603 return false;
4604 }
4605
4606 /*
4607 * The current timeline was found in the history file, but check that the
4608 * next timeline was forked off from it *after* the current recovery
4609 * location.
4610 */
4611 if (currentTle->end < EndRecPtr)
4612 {
4613 ereport(LOG,
4614 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4615 newtarget,
4616 ThisTimeLineID,
4617 LSN_FORMAT_ARGS(EndRecPtr))));
4618 return false;
4619 }
4620
4621 /* The new timeline history seems valid. Switch target */
4622 recoveryTargetTLI = newtarget;
4623 list_free_deep(expectedTLEs);
4624 expectedTLEs = newExpectedTLEs;
4625
4626 /*
4627 * As in StartupXLOG(), try to ensure we have all the history files
4628 * between the old target and new target in pg_wal.
4629 */
4630 restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4631
4632 ereport(LOG,
4633 (errmsg("new target timeline is %u",
4634 recoveryTargetTLI)));
4635
4636 return true;
4637 }
4638
4639 /*
4640 * I/O routines for pg_control
4641 *
4642 * *ControlFile is a buffer in shared memory that holds an image of the
4643 * contents of pg_control. WriteControlFile() initializes pg_control
4644 * given a preloaded buffer, ReadControlFile() loads the buffer from
4645 * the pg_control file (during postmaster or standalone-backend startup),
4646 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4647 * InitControlFile() fills the buffer with initial values.
4648 *
4649 * For simplicity, WriteControlFile() initializes the fields of pg_control
4650 * that are related to checking backend/database compatibility, and
4651 * ReadControlFile() verifies they are correct. We could split out the
4652 * I/O and compatibility-check functions, but there seems no need currently.
4653 */
4654
4655 static void
InitControlFile(uint64 sysidentifier)4656 InitControlFile(uint64 sysidentifier)
4657 {
4658 char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
4659
4660 /*
4661 * Generate a random nonce. This is used for authentication requests that
4662 * will fail because the user does not exist. The nonce is used to create
4663 * a genuine-looking password challenge for the non-existent user, in lieu
4664 * of an actual stored password.
4665 */
4666 if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
4667 ereport(PANIC,
4668 (errcode(ERRCODE_INTERNAL_ERROR),
4669 errmsg("could not generate secret authorization token")));
4670
4671 memset(ControlFile, 0, sizeof(ControlFileData));
4672 /* Initialize pg_control status fields */
4673 ControlFile->system_identifier = sysidentifier;
4674 memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
4675 ControlFile->state = DB_SHUTDOWNED;
4676 ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
4677
4678 /* Set important parameter values for use when replaying WAL */
4679 ControlFile->MaxConnections = MaxConnections;
4680 ControlFile->max_worker_processes = max_worker_processes;
4681 ControlFile->max_wal_senders = max_wal_senders;
4682 ControlFile->max_prepared_xacts = max_prepared_xacts;
4683 ControlFile->max_locks_per_xact = max_locks_per_xact;
4684 ControlFile->wal_level = wal_level;
4685 ControlFile->wal_log_hints = wal_log_hints;
4686 ControlFile->track_commit_timestamp = track_commit_timestamp;
4687 ControlFile->data_checksum_version = bootstrap_data_checksum_version;
4688 }
4689
/*
 * Write out the very first pg_control file, from the in-memory ControlFile
 * image.  This also fills in the version- and layout-compatibility fields
 * that ReadControlFile() later verifies.  Used only at bootstrap; routine
 * updates go through UpdateControlFile().  Any failure is a PANIC, since
 * the cluster is unusable without a valid pg_control.
 */
static void
WriteControlFile(void)
{
	int			fd;
	char		buffer[PG_CONTROL_FILE_SIZE];	/* need not be aligned */

	/*
	 * Ensure that the size of the pg_control data structure is sane.  See the
	 * comments for these symbols in pg_control.h.
	 */
	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
					 "pg_control is too large for atomic disk writes");
	StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
					 "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");

	/*
	 * Initialize version and compatibility-check fields
	 */
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
	ControlFile->xlog_seg_size = wal_segment_size;

	ControlFile->nameDataLen = NAMEDATALEN;
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;

	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
	ControlFile->loblksize = LOBLKSIZE;

	ControlFile->float8ByVal = FLOAT8PASSBYVAL;

	/*
	 * Contents are protected with a CRC.  Note this must be computed after
	 * all the fields above are final, and covers everything before the crc
	 * member itself.
	 */
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(ControlFile->crc);

	/*
	 * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
	 * the excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
	 */
	memset(buffer, 0, PG_CONTROL_FILE_SIZE);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

	/* O_EXCL: the file must not already exist at bootstrap time */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not create file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
	if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write to file \"%s\": %m",
						XLOG_CONTROL_FILE)));
	}
	pgstat_report_wait_end();

	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
	if (pg_fsync(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync file \"%s\": %m",
						XLOG_CONTROL_FILE)));
	pgstat_report_wait_end();

	if (close(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close file \"%s\": %m",
						XLOG_CONTROL_FILE)));
}
4780
/*
 * Read pg_control into the ControlFile buffer and verify that the cluster
 * is compatible with this server binary: format version, CRC, and all the
 * compile-time layout constants.  Also adopts wal_segment_size (and values
 * derived from it) from the file, and publishes initdb-time settings as
 * GUCs.  Incompatibilities are FATAL; any I/O failure is a PANIC.
 */
static void
ReadControlFile(void)
{
	pg_crc32c	crc;
	int			fd;
	static char wal_segsz_str[20];
	int			r;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
	r = read(fd, ControlFile, sizeof(ControlFileData));
	if (r != sizeof(ControlFileData))
	{
		/* distinguish I/O error from a short (truncated/corrupt) read */
		if (r < 0)
			ereport(PANIC,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							XLOG_CONTROL_FILE)));
		else
			ereport(PANIC,
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read file \"%s\": read %d of %zu",
							XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
	}
	pgstat_report_wait_end();

	close(fd);

	/*
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
	 */

	/*
	 * A version whose low 16 bits are zero but high 16 bits are not looks
	 * like a byte-swapped version number, i.e. a cluster initialized on a
	 * platform of the opposite endianness; report that specifically.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
						   " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
						   ControlFile->pg_control_version, ControlFile->pg_control_version,
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
						   " but the server was compiled with PG_CONTROL_VERSION %d.",
						   ControlFile->pg_control_version, PG_CONTROL_VERSION),
				 errhint("It looks like you need to initdb.")));

	/* Now check the CRC. */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(crc);

	if (!EQ_CRC32C(crc, ControlFile->crc))
		ereport(FATAL,
				(errmsg("incorrect checksum in control file")));

	/*
	 * Do compatibility checking immediately.  If the database isn't
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
	 */
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
						   " but the server was compiled with CATALOG_VERSION_NO %d.",
						   ControlFile->catalog_version_no, CATALOG_VERSION_NO),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with MAXALIGN %d,"
						   " but the server was compiled with MAXALIGN %d.",
						   ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->blcksz != BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with BLCKSZ %d,"
						   " but the server was compiled with BLCKSZ %d.",
						   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->relseg_size != RELSEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
						   " but the server was compiled with RELSEG_SIZE %d.",
						   ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
						   " but the server was compiled with XLOG_BLCKSZ %d.",
						   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->nameDataLen != NAMEDATALEN)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
						   " but the server was compiled with NAMEDATALEN %d.",
						   ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
						   " but the server was compiled with INDEX_MAX_KEYS %d.",
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
						   " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
						   ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->loblksize != LOBLKSIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with LOBLKSIZE %d,"
						   " but the server was compiled with LOBLKSIZE %d.",
						   ControlFile->loblksize, (int) LOBLKSIZE),
				 errhint("It looks like you need to recompile or initdb.")));

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
						   " but the server was compiled with USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
						   " but the server was compiled without USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

	/* WAL segment size comes from the control file, not compile time */
	wal_segment_size = ControlFile->xlog_seg_size;

	if (!IsValidWalSegSize(wal_segment_size))
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
									  "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
									  wal_segment_size,
									  wal_segment_size)));

	snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
	SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
					PGC_S_OVERRIDE);

	/* check and update variables dependent on wal_segment_size */
	if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));

	if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
		ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
						errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));

	UsableBytesInSegment =
		(wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
		(SizeOfXLogLongPHD - SizeOfXLogShortPHD);

	CalculateCheckpointSegments();

	/* Make the initdb settings visible as GUC variables, too */
	SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
					PGC_INTERNAL, PGC_S_OVERRIDE);
}
4976
4977 /*
4978 * Utility wrapper to update the control file. Note that the control
4979 * file gets flushed.
4980 */
void
UpdateControlFile(void)
{
	/* third argument 'true' requests an fsync of pg_control */
	update_controlfile(DataDir, ControlFile, true);
}
4986
4987 /*
4988 * Returns the unique system identifier from control file.
4989 */
uint64
GetSystemIdentifier(void)
{
	/* ControlFile must already be set up (shared or local copy) */
	Assert(ControlFile != NULL);
	return ControlFile->system_identifier;
}
4996
4997 /*
4998 * Returns the random nonce from control file.
4999 */
char *
GetMockAuthenticationNonce(void)
{
	Assert(ControlFile != NULL);
	/* pointer into the ControlFile image; MOCK_AUTH_NONCE_LEN raw bytes */
	return ControlFile->mock_authentication_nonce;
}
5006
5007 /*
5008 * Are checksums enabled for data pages?
5009 */
bool
DataChecksumsEnabled(void)
{
	Assert(ControlFile != NULL);
	/* version 0 means checksums disabled; any nonzero version is enabled */
	return (ControlFile->data_checksum_version > 0);
}
5016
5017 /*
5018 * Returns a fake LSN for unlogged relations.
5019 *
5020 * Each call generates an LSN that is greater than any previous value
5021 * returned. The current counter value is saved and restored across clean
5022 * shutdowns, but like unlogged relations, does not survive a crash. This can
5023 * be used in lieu of real LSN values returned by XLogInsert, if you need an
5024 * LSN-like increasing sequence of numbers without writing any WAL.
5025 */
XLogRecPtr
GetFakeLSNForUnloggedRel(void)
{
	XLogRecPtr	nextUnloggedLSN;

	/* increment the unloggedLSN counter, need SpinLock */
	SpinLockAcquire(&XLogCtl->ulsn_lck);
	/* post-increment: return the current value, advance for the next caller */
	nextUnloggedLSN = XLogCtl->unloggedLSN++;
	SpinLockRelease(&XLogCtl->ulsn_lck);

	return nextUnloggedLSN;
}
5038
5039 /*
5040 * Auto-tune the number of XLOG buffers.
5041 *
5042 * The preferred setting for wal_buffers is about 3% of shared_buffers, with
5043 * a maximum of one XLOG segment (there is little reason to think that more
5044 * is helpful, at least so long as we force an fsync when switching log files)
5045 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
5046 * 9.1, when auto-tuning was added).
5047 *
5048 * This should not be called until NBuffers has received its final value.
5049 */
5050 static int
XLOGChooseNumBuffers(void)5051 XLOGChooseNumBuffers(void)
5052 {
5053 int xbuffers;
5054
5055 xbuffers = NBuffers / 32;
5056 if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
5057 xbuffers = (wal_segment_size / XLOG_BLCKSZ);
5058 if (xbuffers < 8)
5059 xbuffers = 8;
5060 return xbuffers;
5061 }
5062
5063 /*
5064 * GUC check_hook for wal_buffers
5065 */
5066 bool
check_wal_buffers(int * newval,void ** extra,GucSource source)5067 check_wal_buffers(int *newval, void **extra, GucSource source)
5068 {
5069 /*
5070 * -1 indicates a request for auto-tune.
5071 */
5072 if (*newval == -1)
5073 {
5074 /*
5075 * If we haven't yet changed the boot_val default of -1, just let it
5076 * be. We'll fix it when XLOGShmemSize is called.
5077 */
5078 if (XLOGbuffers == -1)
5079 return true;
5080
5081 /* Otherwise, substitute the auto-tune value */
5082 *newval = XLOGChooseNumBuffers();
5083 }
5084
5085 /*
5086 * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
5087 * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
5088 * the case, we just silently treat such values as a request for the
5089 * minimum. (We could throw an error instead, but that doesn't seem very
5090 * helpful.)
5091 */
5092 if (*newval < 4)
5093 *newval = 4;
5094
5095 return true;
5096 }
5097
5098 /*
5099 * Read the control file, set respective GUCs.
5100 *
5101 * This is to be called during startup, including a crash recovery cycle,
5102 * unless in bootstrap mode, where no control file yet exists. As there's no
5103 * usable shared memory yet (its sizing can depend on the contents of the
5104 * control file!), first store the contents in local memory. XLOGShmemInit()
5105 * will then copy it to shared memory later.
5106 *
5107 * reset just controls whether previous contents are to be expected (in the
5108 * reset case, there's a dangling pointer into old shared memory), or not.
5109 */
void
LocalProcessControlFile(bool reset)
{
	Assert(reset || ControlFile == NULL);
	/* local (palloc'd) copy; XLOGShmemInit() later moves it to shared memory */
	ControlFile = palloc(sizeof(ControlFileData));
	ReadControlFile();
}
5117
5118 /*
5119 * Initialization of shared memory for XLOG
5120 */
/*
 * Compute the amount of shared memory needed for the XLOG control structure,
 * insertion locks, xlblocks array, and the WAL page buffers.  As a side
 * effect, resolves a wal_buffers setting of -1 to the auto-tuned value so
 * that XLOGbuffers is final before sizing.
 */
Size
XLOGShmemSize(void)
{
	Size		size;

	/*
	 * If the value of wal_buffers is -1, use the preferred auto-tune value.
	 * This isn't an amazingly clean place to do this, but we must wait till
	 * NBuffers has received its final value, and must do it before using the
	 * value of XLOGbuffers to do anything important.
	 */
	if (XLOGbuffers == -1)
	{
		char		buf[32];

		snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
		SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
	}
	Assert(XLOGbuffers > 0);

	/* XLogCtl */
	size = sizeof(XLogCtlData);

	/* WAL insertion locks, plus alignment */
	size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
	/* xlblocks array */
	size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
	/* extra alignment padding for XLOG I/O buffers */
	size = add_size(size, XLOG_BLCKSZ);
	/* and the buffers themselves */
	size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));

	/*
	 * Note: we don't count ControlFileData, it comes out of the "slop factor"
	 * added by CreateSharedMemoryAndSemaphores.  This lets us use this
	 * routine again below to compute the actual allocation size.
	 */

	return size;
}
5161
/*
 * Initialize the XLOG shared-memory area: the XLogCtl struct, the xlblocks
 * array, the WAL insertion locks, and the aligned WAL page buffers, all
 * carved out of a single ShmemInitStruct allocation sized by XLOGShmemSize().
 * Also moves the locally-read pg_control image into shared memory.  In an
 * already-initialized segment (postmaster restart of a backend), only the
 * process-local pointers are set up.
 */
void
XLOGShmemInit(void)
{
	bool		foundCFile,
				foundXLog;
	char	   *allocptr;
	int			i;
	ControlFileData *localControlFile;

#ifdef WAL_DEBUG

	/*
	 * Create a memory context for WAL debugging that's exempt from the normal
	 * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
	 * an allocation fails, but wal_debug is not for production use anyway.
	 */
	if (walDebugCxt == NULL)
	{
		walDebugCxt = AllocSetContextCreate(TopMemoryContext,
											"WAL Debug",
											ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(walDebugCxt, true);
	}
#endif


	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);

	/* Save the local pg_control image before repointing at shared memory */
	localControlFile = ControlFile;
	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);

	if (foundCFile || foundXLog)
	{
		/* both should be present or neither */
		Assert(foundCFile && foundXLog);

		/* Initialize local copy of WALInsertLocks */
		WALInsertLocks = XLogCtl->Insert.WALInsertLocks;

		if (localControlFile)
			pfree(localControlFile);
		return;
	}
	memset(XLogCtl, 0, sizeof(XLogCtlData));

	/*
	 * Already have read control file locally, unless in bootstrap mode. Move
	 * contents into shared memory.
	 */
	if (localControlFile)
	{
		memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
		pfree(localControlFile);
	}

	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
	 */
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;


	/* WAL insertion locks. Ensure they're aligned to the full padded size */
	allocptr += sizeof(WALInsertLockPadded) -
		((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
	WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
		(WALInsertLockPadded *) allocptr;
	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;

	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
		WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
		WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
	}

	/*
	 * Align the start of the page buffers to a full xlog block size boundary.
	 * This simplifies some calculations in XLOG insertion. It is also
	 * required for O_DIRECT.
	 */
	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
	XLogCtl->pages = allocptr;
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

	/*
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
	XLogCtl->SharedHotStandbyActive = false;
	XLogCtl->SharedPromoteIsTriggered = false;
	XLogCtl->WalWriterSleeping = false;

	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
	SpinLockInit(&XLogCtl->info_lck);
	SpinLockInit(&XLogCtl->ulsn_lck);
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
	ConditionVariableInit(&XLogCtl->recoveryNotPausedCV);
}
5269
5270 /*
5271 * This func must be called ONCE on system install. It creates pg_control
5272 * and the initial XLOG segment.
5273 */
void
BootStrapXLOG(void)
{
	CheckPoint	checkPoint;
	char	   *buffer;
	XLogPageHeader page;
	XLogLongPageHeader longpage;
	XLogRecord *record;
	char	   *recptr;
	bool		use_existent;
	uint64		sysidentifier;
	struct timeval tv;
	pg_crc32c	crc;

	/*
	 * Select a hopefully-unique system identifier code for this installation.
	 * We use the result of gettimeofday(), including the fractional seconds
	 * field, as being about as unique as we can easily get. (Think not to
	 * use random(), since it hasn't been seeded and there's no portable way
	 * to seed it other than the system clock value...) The upper half of the
	 * uint64 value is just the tv_sec part, while the lower half contains the
	 * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
	 * PID for a little extra uniqueness. A person knowing this encoding can
	 * determine the initialization time of the installation, which could
	 * perhaps be useful sometimes.
	 */
	gettimeofday(&tv, NULL);
	sysidentifier = ((uint64) tv.tv_sec) << 32;
	sysidentifier |= ((uint64) tv.tv_usec) << 12;
	sysidentifier |= getpid() & 0xFFF;

	/* First timeline ID is always 1 */
	ThisTimeLineID = 1;

	/* page buffer must be aligned suitably for O_DIRECT */
	buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
	page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
	memset(page, 0, XLOG_BLCKSZ);

	/*
	 * Set up information for the initial checkpoint record
	 *
	 * The initial checkpoint record is written to the beginning of the WAL
	 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
	 * used, so that we can use 0/0 to mean "before any valid WAL segment".
	 */
	checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
	checkPoint.ThisTimeLineID = ThisTimeLineID;
	checkPoint.PrevTimeLineID = ThisTimeLineID;
	checkPoint.fullPageWrites = fullPageWrites;
	checkPoint.nextXid =
		FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
	checkPoint.nextOid = FirstBootstrapObjectId;
	checkPoint.nextMulti = FirstMultiXactId;
	checkPoint.nextMultiOffset = 0;
	checkPoint.oldestXid = FirstNormalTransactionId;
	checkPoint.oldestXidDB = TemplateDbOid;
	checkPoint.oldestMulti = FirstMultiXactId;
	checkPoint.oldestMultiDB = TemplateDbOid;
	checkPoint.oldestCommitTsXid = InvalidTransactionId;
	checkPoint.newestCommitTsXid = InvalidTransactionId;
	checkPoint.time = (pg_time_t) time(NULL);
	checkPoint.oldestActiveXid = InvalidTransactionId;

	/* Seed the in-memory transaction bookkeeping from the checkpoint values */
	ShmemVariableCache->nextXid = checkPoint.nextXid;
	ShmemVariableCache->nextOid = checkPoint.nextOid;
	ShmemVariableCache->oidCount = 0;
	MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
	AdvanceOldestClogXid(checkPoint.oldestXid);
	SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
	SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
	SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);

	/* Set up the XLOG page header */
	page->xlp_magic = XLOG_PAGE_MAGIC;
	page->xlp_info = XLP_LONG_HEADER;
	page->xlp_tli = ThisTimeLineID;
	page->xlp_pageaddr = wal_segment_size;
	longpage = (XLogLongPageHeader) page;
	longpage->xlp_sysid = sysidentifier;
	longpage->xlp_seg_size = wal_segment_size;
	longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;

	/* Insert the initial checkpoint record, built by hand on the page */
	recptr = ((char *) page + SizeOfXLogLongPHD);
	record = (XLogRecord *) recptr;
	record->xl_prev = 0;
	record->xl_xid = InvalidTransactionId;
	record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
	record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
	record->xl_rmid = RM_XLOG_ID;
	recptr += SizeOfXLogRecord;
	/* fill the XLogRecordDataHeaderShort struct */
	*(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
	*(recptr++) = sizeof(checkPoint);
	memcpy(recptr, &checkPoint, sizeof(checkPoint));
	recptr += sizeof(checkPoint);
	Assert(recptr - (char *) record == record->xl_tot_len);

	/* CRC covers the record data first, then the header up to xl_crc */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
	COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
	FIN_CRC32C(crc);
	record->xl_crc = crc;

	/* Create first XLOG segment file */
	use_existent = false;
	openLogFile = XLogFileInit(1, &use_existent, false);

	/*
	 * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
	 * close the file again in a moment.
	 */

	/* Write the first page with the initial record */
	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
	if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write bootstrap write-ahead log file: %m")));
	}
	pgstat_report_wait_end();

	pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
	if (pg_fsync(openLogFile) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync bootstrap write-ahead log file: %m")));
	pgstat_report_wait_end();

	if (close(openLogFile) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close bootstrap write-ahead log file: %m")));

	openLogFile = -1;

	/* Now create pg_control */
	InitControlFile(sysidentifier);
	ControlFile->time = checkPoint.time;
	ControlFile->checkPoint = checkPoint.redo;
	ControlFile->checkPointCopy = checkPoint;

	/* some additional ControlFile fields are set in WriteControlFile() */
	WriteControlFile();

	/* Bootstrap the commit log, too */
	BootStrapCLOG();
	BootStrapCommitTs();
	BootStrapSUBTRANS();
	BootStrapMultiXact();

	pfree(buffer);

	/*
	 * Force control file to be read - in contrast to normal processing we'd
	 * otherwise never run the checks and GUC related initializations therein.
	 */
	ReadControlFile();
}
5439
5440 static char *
str_time(pg_time_t tnow)5441 str_time(pg_time_t tnow)
5442 {
5443 static char buf[128];
5444
5445 pg_strftime(buf, sizeof(buf),
5446 "%Y-%m-%d %H:%M:%S %Z",
5447 pg_localtime(&tnow, log_timezone));
5448
5449 return buf;
5450 }
5451
5452 /*
5453 * See if there are any recovery signal files and if so, set state for
5454 * recovery.
5455 *
5456 * See if there is a recovery command file (recovery.conf), and if so
5457 * throw an ERROR since as of PG12 we no longer recognize that.
5458 */
5459 static void
readRecoverySignalFile(void)5460 readRecoverySignalFile(void)
5461 {
5462 struct stat stat_buf;
5463
5464 if (IsBootstrapProcessingMode())
5465 return;
5466
5467 /*
5468 * Check for old recovery API file: recovery.conf
5469 */
5470 if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
5471 ereport(FATAL,
5472 (errcode_for_file_access(),
5473 errmsg("using recovery command file \"%s\" is not supported",
5474 RECOVERY_COMMAND_FILE)));
5475
5476 /*
5477 * Remove unused .done file, if present. Ignore if absent.
5478 */
5479 unlink(RECOVERY_COMMAND_DONE);
5480
5481 /*
5482 * Check for recovery signal files and if found, fsync them since they
5483 * represent server state information. We don't sweat too much about the
5484 * possibility of fsync failure, however.
5485 *
5486 * If present, standby signal file takes precedence. If neither is present
5487 * then we won't enter archive recovery.
5488 */
5489 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
5490 {
5491 int fd;
5492
5493 fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5494 S_IRUSR | S_IWUSR);
5495 if (fd >= 0)
5496 {
5497 (void) pg_fsync(fd);
5498 close(fd);
5499 }
5500 standby_signal_file_found = true;
5501 }
5502 else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
5503 {
5504 int fd;
5505
5506 fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5507 S_IRUSR | S_IWUSR);
5508 if (fd >= 0)
5509 {
5510 (void) pg_fsync(fd);
5511 close(fd);
5512 }
5513 recovery_signal_file_found = true;
5514 }
5515
5516 StandbyModeRequested = false;
5517 ArchiveRecoveryRequested = false;
5518 if (standby_signal_file_found)
5519 {
5520 StandbyModeRequested = true;
5521 ArchiveRecoveryRequested = true;
5522 }
5523 else if (recovery_signal_file_found)
5524 {
5525 StandbyModeRequested = false;
5526 ArchiveRecoveryRequested = true;
5527 }
5528 else
5529 return;
5530
5531 /*
5532 * We don't support standby mode in standalone backends; that requires
5533 * other processes such as the WAL receiver to be alive.
5534 */
5535 if (StandbyModeRequested && !IsUnderPostmaster)
5536 ereport(FATAL,
5537 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5538 errmsg("standby mode is not supported by single-user servers")));
5539 }
5540
5541 static void
validateRecoveryParameters(void)5542 validateRecoveryParameters(void)
5543 {
5544 if (!ArchiveRecoveryRequested)
5545 return;
5546
5547 /*
5548 * Check for compulsory parameters
5549 */
5550 if (StandbyModeRequested)
5551 {
5552 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
5553 (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
5554 ereport(WARNING,
5555 (errmsg("specified neither primary_conninfo nor restore_command"),
5556 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
5557 }
5558 else
5559 {
5560 if (recoveryRestoreCommand == NULL ||
5561 strcmp(recoveryRestoreCommand, "") == 0)
5562 ereport(FATAL,
5563 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5564 errmsg("must specify restore_command when standby mode is not enabled")));
5565 }
5566
5567 /*
5568 * Override any inconsistent requests. Note that this is a change of
5569 * behaviour in 9.5; prior to this we simply ignored a request to pause if
5570 * hot_standby = off, which was surprising behaviour.
5571 */
5572 if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5573 !EnableHotStandby)
5574 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5575
5576 /*
5577 * Final parsing of recovery_target_time string; see also
5578 * check_recovery_target_time().
5579 */
5580 if (recoveryTarget == RECOVERY_TARGET_TIME)
5581 {
5582 recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5583 CStringGetDatum(recovery_target_time_string),
5584 ObjectIdGetDatum(InvalidOid),
5585 Int32GetDatum(-1)));
5586 }
5587
5588 /*
5589 * If user specified recovery_target_timeline, validate it or compute the
5590 * "latest" value. We can't do this until after we've gotten the restore
5591 * command and set InArchiveRecovery, because we need to fetch timeline
5592 * history files from the archive.
5593 */
5594 if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5595 {
5596 TimeLineID rtli = recoveryTargetTLIRequested;
5597
5598 /* Timeline 1 does not have a history file, all else should */
5599 if (rtli != 1 && !existsTimeLineHistory(rtli))
5600 ereport(FATAL,
5601 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5602 errmsg("recovery target timeline %u does not exist",
5603 rtli)));
5604 recoveryTargetTLI = rtli;
5605 }
5606 else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
5607 {
5608 /* We start the "latest" search from pg_control's timeline */
5609 recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5610 }
5611 else
5612 {
5613 /*
5614 * else we just use the recoveryTargetTLI as already read from
5615 * ControlFile
5616 */
5617 Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
5618 }
5619 }
5620
5621 /*
5622 * Exit archive-recovery state
5623 */
5624 static void
exitArchiveRecovery(TimeLineID endTLI,XLogRecPtr endOfLog)5625 exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
5626 {
5627 char xlogfname[MAXFNAMELEN];
5628 XLogSegNo endLogSegNo;
5629 XLogSegNo startLogSegNo;
5630
5631 /* we always switch to a new timeline after archive recovery */
5632 Assert(endTLI != ThisTimeLineID);
5633
5634 /*
5635 * We are no longer in archive recovery state.
5636 */
5637 InArchiveRecovery = false;
5638
5639 /*
5640 * Update min recovery point one last time.
5641 */
5642 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5643
5644 /*
5645 * If the ending log segment is still open, close it (to avoid problems on
5646 * Windows with trying to rename or delete an open file).
5647 */
5648 if (readFile >= 0)
5649 {
5650 close(readFile);
5651 readFile = -1;
5652 }
5653
5654 /*
5655 * Calculate the last segment on the old timeline, and the first segment
5656 * on the new timeline. If the switch happens in the middle of a segment,
5657 * they are the same, but if the switch happens exactly at a segment
5658 * boundary, startLogSegNo will be endLogSegNo + 1.
5659 */
5660 XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
5661 XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
5662
5663 /*
5664 * Initialize the starting WAL segment for the new timeline. If the switch
5665 * happens in the middle of a segment, copy data from the last WAL segment
5666 * of the old timeline up to the switch point, to the starting WAL segment
5667 * on the new timeline.
5668 */
5669 if (endLogSegNo == startLogSegNo)
5670 {
5671 /*
5672 * Make a copy of the file on the new timeline.
5673 *
5674 * Writing WAL isn't allowed yet, so there are no locking
5675 * considerations. But we should be just as tense as XLogFileInit to
5676 * avoid emplacing a bogus file.
5677 */
5678 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
5679 XLogSegmentOffset(endOfLog, wal_segment_size));
5680 }
5681 else
5682 {
5683 /*
5684 * The switch happened at a segment boundary, so just create the next
5685 * segment on the new timeline.
5686 */
5687 bool use_existent = true;
5688 int fd;
5689
5690 fd = XLogFileInit(startLogSegNo, &use_existent, true);
5691
5692 if (close(fd) != 0)
5693 {
5694 char xlogfname[MAXFNAMELEN];
5695 int save_errno = errno;
5696
5697 XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo,
5698 wal_segment_size);
5699 errno = save_errno;
5700 ereport(ERROR,
5701 (errcode_for_file_access(),
5702 errmsg("could not close file \"%s\": %m", xlogfname)));
5703 }
5704 }
5705
5706 /*
5707 * Let's just make real sure there are not .ready or .done flags posted
5708 * for the new segment.
5709 */
5710 XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
5711 XLogArchiveCleanup(xlogfname);
5712
5713 /*
5714 * Remove the signal files out of the way, so that we don't accidentally
5715 * re-enter archive recovery mode in a subsequent crash.
5716 */
5717 if (standby_signal_file_found)
5718 durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
5719
5720 if (recovery_signal_file_found)
5721 durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
5722
5723 ereport(LOG,
5724 (errmsg("archive recovery complete")));
5725 }
5726
5727 /*
5728 * Extract timestamp from WAL record.
5729 *
5730 * If the record contains a timestamp, returns true, and saves the timestamp
5731 * in *recordXtime. If the record type has no timestamp, returns false.
5732 * Currently, only transaction commit/abort records and restore points contain
5733 * timestamps.
5734 */
5735 static bool
getRecordTimestamp(XLogReaderState * record,TimestampTz * recordXtime)5736 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5737 {
5738 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5739 uint8 xact_info = info & XLOG_XACT_OPMASK;
5740 uint8 rmid = XLogRecGetRmid(record);
5741
5742 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5743 {
5744 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5745 return true;
5746 }
5747 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5748 xact_info == XLOG_XACT_COMMIT_PREPARED))
5749 {
5750 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5751 return true;
5752 }
5753 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5754 xact_info == XLOG_XACT_ABORT_PREPARED))
5755 {
5756 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5757 return true;
5758 }
5759 return false;
5760 }
5761
5762 /*
5763 * For point-in-time recovery, this function decides whether we want to
5764 * stop applying the XLOG before the current record.
5765 *
5766 * Returns true if we are stopping, false otherwise. If stopping, some
5767 * information is saved in recoveryStopXid et al for use in annotating the
5768 * new timeline's history file.
5769 */
static bool
recoveryStopsBefore(XLogReaderState *record)
{
	bool		stopsHere = false;
	uint8		xact_info;
	bool		isCommit;
	TimestampTz recordXtime = 0;
	TransactionId recordXid;

	/*
	 * Ignore recovery target settings when not in archive recovery (meaning
	 * we are in crash recovery).
	 */
	if (!ArchiveRecoveryRequested)
		return false;

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg("recovery stopping after reaching consistency")));

		/* Clear all stop-position fields except the "stopping" decision */
		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		return true;
	}

	/*
	 * Check if target LSN has been reached.  The exclusive
	 * (!recoveryTargetInclusive) case stops *before* the record at or past
	 * the target; the inclusive case is handled in recoveryStopsAfter().
	 */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		!recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
						LSN_FORMAT_ARGS(recoveryStopLSN))));
		return true;
	}

	/* Otherwise we only consider stopping before COMMIT or ABORT records. */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	/*
	 * Classify the record as commit or abort, and determine the XID that the
	 * record ends.  For the *_PREPARED variants, the XID of the prepared
	 * transaction lives in the record payload (parsed.twophase_xid), not in
	 * the record header, so the record must be parsed to obtain it.
	 */
	if (xact_info == XLOG_XACT_COMMIT)
	{
		isCommit = true;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
	{
		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
		xl_xact_parsed_commit parsed;

		isCommit = true;
		ParseCommitRecord(XLogRecGetInfo(record),
						  xlrec,
						  &parsed);
		recordXid = parsed.twophase_xid;
	}
	else if (xact_info == XLOG_XACT_ABORT)
	{
		isCommit = false;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
		xl_xact_parsed_abort parsed;

		isCommit = false;
		ParseAbortRecord(XLogRecGetInfo(record),
						 xlrec,
						 &parsed);
		recordXid = parsed.twophase_xid;
	}
	else
		return false;			/* not a transaction-end record */

	if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
	{
		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		stopsHere = (recordXid == recoveryTargetXid);
	}

	if (recoveryTarget == RECOVERY_TARGET_TIME &&
		getRecordTimestamp(record, &recordXtime))
	{
		/*
		 * There can be many transactions that share the same commit time, so
		 * we stop after the last one, if we are inclusive, or stop at the
		 * first one if we are exclusive
		 */
		if (recoveryTargetInclusive)
			stopsHere = (recordXtime > recoveryTargetTime);
		else
			stopsHere = (recordXtime >= recoveryTargetTime);
	}

	if (stopsHere)
	{
		/* Record stop position info for the new timeline's history file */
		recoveryStopAfter = false;
		recoveryStopXid = recordXid;
		recoveryStopTime = recordXtime;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';

		if (isCommit)
		{
			ereport(LOG,
					(errmsg("recovery stopping before commit of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
		else
		{
			ereport(LOG,
					(errmsg("recovery stopping before abort of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
	}

	return stopsHere;
}
5911
5912 /*
5913 * Same as recoveryStopsBefore, but called after applying the record.
5914 *
5915 * We also track the timestamp of the latest applied COMMIT/ABORT
5916 * record in XLogCtl->recoveryLastXTime.
5917 */
static bool
recoveryStopsAfter(XLogReaderState *record)
{
	uint8		info;
	uint8		xact_info;
	uint8		rmid;
	TimestampTz recordXtime;

	/*
	 * Ignore recovery target settings when not in archive recovery (meaning
	 * we are in crash recovery).
	 */
	if (!ArchiveRecoveryRequested)
		return false;

	info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	rmid = XLogRecGetRmid(record);

	/*
	 * There can be many restore points that share the same name; we stop at
	 * the first one.
	 */
	if (recoveryTarget == RECOVERY_TARGET_NAME &&
		rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
	{
		xl_restore_point *recordRestorePointData;

		recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);

		if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
		{
			/* Record stop position info for the new timeline's history file */
			recoveryStopAfter = true;
			recoveryStopXid = InvalidTransactionId;
			recoveryStopLSN = InvalidXLogRecPtr;
			(void) getRecordTimestamp(record, &recoveryStopTime);
			strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);

			ereport(LOG,
					(errmsg("recovery stopping at restore point \"%s\", time %s",
							recoveryStopName,
							timestamptz_to_str(recoveryStopTime))));
			return true;
		}
	}

	/*
	 * Check if the target LSN has been reached.  The inclusive case stops
	 * *after* applying the record at or past the target; the exclusive case
	 * is handled in recoveryStopsBefore().
	 */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
						LSN_FORMAT_ARGS(recoveryStopLSN))));
		return true;
	}

	/* Remaining checks apply only to transaction records */
	if (rmid != RM_XACT_ID)
		return false;

	xact_info = info & XLOG_XACT_OPMASK;

	if (xact_info == XLOG_XACT_COMMIT ||
		xact_info == XLOG_XACT_COMMIT_PREPARED ||
		xact_info == XLOG_XACT_ABORT ||
		xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		TransactionId recordXid;

		/* Update the last applied transaction timestamp */
		if (getRecordTimestamp(record, &recordXtime))
			SetLatestXTime(recordXtime);

		/*
		 * Extract the XID of the committed/aborted transaction.  For the
		 * *_PREPARED variants the XID of interest is in the record payload
		 * (parsed.twophase_xid), not in the record header, so the record
		 * must be parsed.
		 */
		if (xact_info == XLOG_XACT_COMMIT_PREPARED)
		{
			xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
			xl_xact_parsed_commit parsed;

			ParseCommitRecord(XLogRecGetInfo(record),
							  xlrec,
							  &parsed);
			recordXid = parsed.twophase_xid;
		}
		else if (xact_info == XLOG_XACT_ABORT_PREPARED)
		{
			xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
			xl_xact_parsed_abort parsed;

			ParseAbortRecord(XLogRecGetInfo(record),
							 xlrec,
							 &parsed);
			recordXid = parsed.twophase_xid;
		}
		else
			recordXid = XLogRecGetXid(record);

		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
			recordXid == recoveryTargetXid)
		{
			/* Record stop position info for the new timeline's history file */
			recoveryStopAfter = true;
			recoveryStopXid = recordXid;
			recoveryStopTime = recordXtime;
			recoveryStopLSN = InvalidXLogRecPtr;
			recoveryStopName[0] = '\0';

			if (xact_info == XLOG_XACT_COMMIT ||
				xact_info == XLOG_XACT_COMMIT_PREPARED)
			{
				ereport(LOG,
						(errmsg("recovery stopping after commit of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			else if (xact_info == XLOG_XACT_ABORT ||
					 xact_info == XLOG_XACT_ABORT_PREPARED)
			{
				ereport(LOG,
						(errmsg("recovery stopping after abort of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			return true;
		}
	}

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg("recovery stopping after reaching consistency")));

		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopTime = 0;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';
		return true;
	}

	return false;
}
6073
6074 /*
6075 * Wait until shared recoveryPauseState is set to RECOVERY_NOT_PAUSED.
6076 *
6077 * endOfRecovery is true if the recovery target is reached and
6078 * the paused state starts at the end of recovery because of
6079 * recovery_target_action=pause, and false otherwise.
6080 */
6081 static void
recoveryPausesHere(bool endOfRecovery)6082 recoveryPausesHere(bool endOfRecovery)
6083 {
6084 /* Don't pause unless users can connect! */
6085 if (!LocalHotStandbyActive)
6086 return;
6087
6088 /* Don't pause after standby promotion has been triggered */
6089 if (LocalPromoteIsTriggered)
6090 return;
6091
6092 if (endOfRecovery)
6093 ereport(LOG,
6094 (errmsg("pausing at the end of recovery"),
6095 errhint("Execute pg_wal_replay_resume() to promote.")));
6096 else
6097 ereport(LOG,
6098 (errmsg("recovery has paused"),
6099 errhint("Execute pg_wal_replay_resume() to continue.")));
6100
6101 /* loop until recoveryPauseState is set to RECOVERY_NOT_PAUSED */
6102 while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
6103 {
6104 HandleStartupProcInterrupts();
6105 if (CheckForStandbyTrigger())
6106 return;
6107
6108 /*
6109 * If recovery pause is requested then set it paused. While we are in
6110 * the loop, user might resume and pause again so set this every time.
6111 */
6112 ConfirmRecoveryPaused();
6113
6114 /*
6115 * We wait on a condition variable that will wake us as soon as the
6116 * pause ends, but we use a timeout so we can check the above exit
6117 * condition periodically too.
6118 */
6119 ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000,
6120 WAIT_EVENT_RECOVERY_PAUSE);
6121 }
6122 ConditionVariableCancelSleep();
6123 }
6124
6125 /*
6126 * Get the current state of the recovery pause request.
6127 */
6128 RecoveryPauseState
GetRecoveryPauseState(void)6129 GetRecoveryPauseState(void)
6130 {
6131 RecoveryPauseState state;
6132
6133 SpinLockAcquire(&XLogCtl->info_lck);
6134 state = XLogCtl->recoveryPauseState;
6135 SpinLockRelease(&XLogCtl->info_lck);
6136
6137 return state;
6138 }
6139
6140 /*
6141 * Set the recovery pause state.
6142 *
6143 * If recovery pause is requested then sets the recovery pause state to
6144 * 'pause requested' if it is not already 'paused'. Otherwise, sets it
6145 * to 'not paused' to resume the recovery. The recovery pause will be
6146 * confirmed by the ConfirmRecoveryPaused.
6147 */
6148 void
SetRecoveryPause(bool recoveryPause)6149 SetRecoveryPause(bool recoveryPause)
6150 {
6151 SpinLockAcquire(&XLogCtl->info_lck);
6152
6153 if (!recoveryPause)
6154 XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
6155 else if (XLogCtl->recoveryPauseState == RECOVERY_NOT_PAUSED)
6156 XLogCtl->recoveryPauseState = RECOVERY_PAUSE_REQUESTED;
6157
6158 SpinLockRelease(&XLogCtl->info_lck);
6159
6160 if (!recoveryPause)
6161 ConditionVariableBroadcast(&XLogCtl->recoveryNotPausedCV);
6162 }
6163
6164 /*
6165 * Confirm the recovery pause by setting the recovery pause state to
6166 * RECOVERY_PAUSED.
6167 */
static void
ConfirmRecoveryPaused(void)
{
	/*
	 * If recovery pause is requested then set it paused.  The check-and-set
	 * is done atomically under info_lck so we never overwrite a concurrent
	 * transition back to RECOVERY_NOT_PAUSED.
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	if (XLogCtl->recoveryPauseState == RECOVERY_PAUSE_REQUESTED)
		XLogCtl->recoveryPauseState = RECOVERY_PAUSED;
	SpinLockRelease(&XLogCtl->info_lck);
}
6177
6178 /*
6179 * When recovery_min_apply_delay is set, we wait long enough to make sure
6180 * certain record types are applied at least that interval behind the primary.
6181 *
6182 * Returns true if we waited.
6183 *
6184 * Note that the delay is calculated between the WAL record log time and
6185 * the current time on standby. We would prefer to keep track of when this
6186 * standby received each WAL record, which would allow a more consistent
6187 * approach and one not affected by time synchronisation issues, but that
6188 * is significantly more effort and complexity for little actual gain in
6189 * usability.
6190 */
static bool
recoveryApplyDelay(XLogReaderState *record)
{
	uint8		xact_info;
	TimestampTz xtime;
	TimestampTz delayUntil;
	long		msecs;

	/* nothing to do if no delay configured */
	if (recovery_min_apply_delay <= 0)
		return false;

	/* no delay is applied on a database not yet consistent */
	if (!reachedConsistency)
		return false;

	/* nothing to do if crash recovery is requested */
	if (!ArchiveRecoveryRequested)
		return false;

	/*
	 * Is it a COMMIT record?
	 *
	 * We deliberately choose not to delay aborts since they have no effect on
	 * MVCC. We already allow replay of records that don't have a timestamp,
	 * so there is already opportunity for issues caused by early conflicts on
	 * standbys.
	 */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	if (xact_info != XLOG_XACT_COMMIT &&
		xact_info != XLOG_XACT_COMMIT_PREPARED)
		return false;

	/* Records without a timestamp are replayed without delay. */
	if (!getRecordTimestamp(record, &xtime))
		return false;

	delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

	/*
	 * Exit without arming the latch if it's already past time to apply this
	 * record
	 */
	msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
	if (msecs <= 0)
		return false;

	while (true)
	{
		/* Reset the latch before re-checking, to avoid missing a wakeup. */
		ResetLatch(&XLogCtl->recoveryWakeupLatch);

		/*
		 * This might change recovery_min_apply_delay or the trigger file's
		 * location.
		 */
		HandleStartupProcInterrupts();

		/* Promotion cancels any further delay. */
		if (CheckForStandbyTrigger())
			break;

		/*
		 * Recalculate delayUntil as recovery_min_apply_delay could have
		 * changed while waiting in this loop.
		 */
		delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

		/*
		 * Wait for difference between GetCurrentTimestamp() and delayUntil.
		 */
		msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
												delayUntil);

		if (msecs <= 0)
			break;

		elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);

		(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
						 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
						 msecs,
						 WAIT_EVENT_RECOVERY_APPLY_DELAY);
	}
	return true;
}
6278
6279 /*
6280 * Save timestamp of latest processed commit/abort record.
6281 *
6282 * We keep this in XLogCtl, not a simple static variable, so that it can be
6283 * seen by processes other than the startup process. Note in particular
6284 * that CreateRestartPoint is executed in the checkpointer.
6285 */
static void
SetLatestXTime(TimestampTz xtime)
{
	/* Single shared-memory store, protected by info_lck. */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->recoveryLastXTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6293
6294 /*
6295 * Fetch timestamp of latest processed commit/abort record.
6296 */
6297 TimestampTz
GetLatestXTime(void)6298 GetLatestXTime(void)
6299 {
6300 TimestampTz xtime;
6301
6302 SpinLockAcquire(&XLogCtl->info_lck);
6303 xtime = XLogCtl->recoveryLastXTime;
6304 SpinLockRelease(&XLogCtl->info_lck);
6305
6306 return xtime;
6307 }
6308
6309 /*
6310 * Save timestamp of the next chunk of WAL records to apply.
6311 *
6312 * We keep this in XLogCtl, not a simple static variable, so that it can be
6313 * seen by all backends.
6314 */
static void
SetCurrentChunkStartTime(TimestampTz xtime)
{
	/* Single shared-memory store, protected by info_lck. */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->currentChunkStartTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6322
6323 /*
6324 * Fetch timestamp of latest processed commit/abort record.
6325 * Startup process maintains an accurate local copy in XLogReceiptTime
6326 */
6327 TimestampTz
GetCurrentChunkReplayStartTime(void)6328 GetCurrentChunkReplayStartTime(void)
6329 {
6330 TimestampTz xtime;
6331
6332 SpinLockAcquire(&XLogCtl->info_lck);
6333 xtime = XLogCtl->currentChunkStartTime;
6334 SpinLockRelease(&XLogCtl->info_lck);
6335
6336 return xtime;
6337 }
6338
6339 /*
6340 * Returns time of receipt of current chunk of XLOG data, as well as
6341 * whether it was received from streaming replication or from archives.
6342 */
6343 void
GetXLogReceiptTime(TimestampTz * rtime,bool * fromStream)6344 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6345 {
6346 /*
6347 * This must be executed in the startup process, since we don't export the
6348 * relevant state to shared memory.
6349 */
6350 Assert(InRecovery);
6351
6352 *rtime = XLogReceiptTime;
6353 *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6354 }
6355
6356 /*
6357 * Note that text field supplied is a parameter name and does not require
6358 * translation
6359 */
static void
RecoveryRequiresIntParameter(const char *param_name, int currValue, int minValue)
{
	/* Setting is sufficient; nothing to do. */
	if (currValue < minValue)
	{
		/*
		 * If hot standby is active, pause recovery instead of dying
		 * immediately, so connected users get a chance to react; the FATAL
		 * below is reached only once the pause is released.
		 */
		if (LocalHotStandbyActive)
		{
			bool		warned_for_promote = false;

			ereport(WARNING,
					(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
					 errmsg("hot standby is not possible because of insufficient parameter settings"),
					 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
							   param_name,
							   currValue,
							   minValue)));

			SetRecoveryPause(true);

			ereport(LOG,
					(errmsg("recovery has paused"),
					 errdetail("If recovery is unpaused, the server will shut down."),
					 errhint("You can then restart the server after making the necessary configuration changes.")));

			/* Wait until the user releases the pause (or promotes). */
			while (GetRecoveryPauseState() != RECOVERY_NOT_PAUSED)
			{
				HandleStartupProcInterrupts();

				if (CheckForStandbyTrigger())
				{
					/* Promotion is refused too; warn just once. */
					if (!warned_for_promote)
						ereport(WARNING,
								(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
								 errmsg("promotion is not possible because of insufficient parameter settings"),

						/*
						 * Repeat the detail from above so it's easy to find
						 * in the log.
						 */
								 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
										   param_name,
										   currValue,
										   minValue),
								 errhint("Restart the server after making the necessary configuration changes.")));
					warned_for_promote = true;
				}

				/*
				 * If recovery pause is requested then set it paused. While
				 * we are in the loop, user might resume and pause again so
				 * set this every time.
				 */
				ConfirmRecoveryPaused();

				/*
				 * We wait on a condition variable that will wake us as soon
				 * as the pause ends, but we use a timeout so we can check the
				 * above conditions periodically too.
				 */
				ConditionVariableTimedSleep(&XLogCtl->recoveryNotPausedCV, 1000,
											WAIT_EVENT_RECOVERY_PAUSE);
			}
			ConditionVariableCancelSleep();
		}

		ereport(FATAL,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("recovery aborted because of insufficient parameter settings"),
		/* Repeat the detail from above so it's easy to find in the log. */
				 errdetail("%s = %d is a lower setting than on the primary server, where its value was %d.",
						   param_name,
						   currValue,
						   minValue),
				 errhint("You can restart the server after making the necessary configuration changes.")));
	}
}
6436
6437 /*
6438 * Check to see if required parameters are set high enough on this server
6439 * for various aspects of recovery operation.
6440 *
6441 * Note that all the parameters which this function tests need to be
6442 * listed in Administrator's Overview section in high-availability.sgml.
6443 * If you change them, don't forget to update the list.
6444 */
6445 static void
CheckRequiredParameterValues(void)6446 CheckRequiredParameterValues(void)
6447 {
6448 /*
6449 * For archive recovery, the WAL must be generated with at least 'replica'
6450 * wal_level.
6451 */
6452 if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6453 {
6454 ereport(FATAL,
6455 (errmsg("WAL was generated with wal_level=minimal, cannot continue recovering"),
6456 errdetail("This happens if you temporarily set wal_level=minimal on the server."),
6457 errhint("Use a backup taken after setting wal_level to higher than minimal.")));
6458 }
6459
6460 /*
6461 * For Hot Standby, the WAL must be generated with 'replica' mode, and we
6462 * must have at least as many backend slots as the primary.
6463 */
6464 if (ArchiveRecoveryRequested && EnableHotStandby)
6465 {
6466 /* We ignore autovacuum_max_workers when we make this test. */
6467 RecoveryRequiresIntParameter("max_connections",
6468 MaxConnections,
6469 ControlFile->MaxConnections);
6470 RecoveryRequiresIntParameter("max_worker_processes",
6471 max_worker_processes,
6472 ControlFile->max_worker_processes);
6473 RecoveryRequiresIntParameter("max_wal_senders",
6474 max_wal_senders,
6475 ControlFile->max_wal_senders);
6476 RecoveryRequiresIntParameter("max_prepared_transactions",
6477 max_prepared_xacts,
6478 ControlFile->max_prepared_xacts);
6479 RecoveryRequiresIntParameter("max_locks_per_transaction",
6480 max_locks_per_xact,
6481 ControlFile->max_locks_per_xact);
6482 }
6483 }
6484
6485 /*
6486 * This must be called ONCE during postmaster or standalone-backend startup
6487 */
6488 void
StartupXLOG(void)6489 StartupXLOG(void)
6490 {
6491 XLogCtlInsert *Insert;
6492 CheckPoint checkPoint;
6493 bool wasShutdown;
6494 bool reachedRecoveryTarget = false;
6495 bool haveBackupLabel = false;
6496 bool haveTblspcMap = false;
6497 XLogRecPtr RecPtr,
6498 checkPointLoc,
6499 EndOfLog;
6500 TimeLineID EndOfLogTLI;
6501 TimeLineID PrevTimeLineID;
6502 XLogRecord *record;
6503 TransactionId oldestActiveXID;
6504 bool backupEndRequired = false;
6505 bool backupFromStandby = false;
6506 DBState dbstate_at_startup;
6507 XLogReaderState *xlogreader;
6508 XLogPageReadPrivate private;
6509 bool promoted = false;
6510 struct stat st;
6511
6512 /*
6513 * We should have an aux process resource owner to use, and we should not
6514 * be in a transaction that's installed some other resowner.
6515 */
6516 Assert(AuxProcessResourceOwner != NULL);
6517 Assert(CurrentResourceOwner == NULL ||
6518 CurrentResourceOwner == AuxProcessResourceOwner);
6519 CurrentResourceOwner = AuxProcessResourceOwner;
6520
6521 /*
6522 * Check that contents look valid.
6523 */
6524 if (!XRecOffIsValid(ControlFile->checkPoint))
6525 ereport(FATAL,
6526 (errmsg("control file contains invalid checkpoint location")));
6527
6528 switch (ControlFile->state)
6529 {
6530 case DB_SHUTDOWNED:
6531
6532 /*
6533 * This is the expected case, so don't be chatty in standalone
6534 * mode
6535 */
6536 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6537 (errmsg("database system was shut down at %s",
6538 str_time(ControlFile->time))));
6539 break;
6540
6541 case DB_SHUTDOWNED_IN_RECOVERY:
6542 ereport(LOG,
6543 (errmsg("database system was shut down in recovery at %s",
6544 str_time(ControlFile->time))));
6545 break;
6546
6547 case DB_SHUTDOWNING:
6548 ereport(LOG,
6549 (errmsg("database system shutdown was interrupted; last known up at %s",
6550 str_time(ControlFile->time))));
6551 break;
6552
6553 case DB_IN_CRASH_RECOVERY:
6554 ereport(LOG,
6555 (errmsg("database system was interrupted while in recovery at %s",
6556 str_time(ControlFile->time)),
6557 errhint("This probably means that some data is corrupted and"
6558 " you will have to use the last backup for recovery.")));
6559 break;
6560
6561 case DB_IN_ARCHIVE_RECOVERY:
6562 ereport(LOG,
6563 (errmsg("database system was interrupted while in recovery at log time %s",
6564 str_time(ControlFile->checkPointCopy.time)),
6565 errhint("If this has occurred more than once some data might be corrupted"
6566 " and you might need to choose an earlier recovery target.")));
6567 break;
6568
6569 case DB_IN_PRODUCTION:
6570 ereport(LOG,
6571 (errmsg("database system was interrupted; last known up at %s",
6572 str_time(ControlFile->time))));
6573 break;
6574
6575 default:
6576 ereport(FATAL,
6577 (errmsg("control file contains invalid database cluster state")));
6578 }
6579
6580 /* This is just to allow attaching to startup process with a debugger */
6581 #ifdef XLOG_REPLAY_DELAY
6582 if (ControlFile->state != DB_SHUTDOWNED)
6583 pg_usleep(60000000L);
6584 #endif
6585
6586 /*
6587 * Verify that pg_wal and pg_wal/archive_status exist. In cases where
6588 * someone has performed a copy for PITR, these directories may have been
6589 * excluded and need to be re-created.
6590 */
6591 ValidateXLOGDirectoryStructure();
6592
6593 /*----------
6594 * If we previously crashed, perform a couple of actions:
6595 *
6596 * - The pg_wal directory may still include some temporary WAL segments
6597 * used when creating a new segment, so perform some clean up to not
6598 * bloat this path. This is done first as there is no point to sync
6599 * this temporary data.
6600 *
6601 * - There might be data which we had written, intending to fsync it, but
6602 * which we had not actually fsync'd yet. Therefore, a power failure in
6603 * the near future might cause earlier unflushed writes to be lost, even
6604 * though more recent data written to disk from here on would be
6605 * persisted. To avoid that, fsync the entire data directory.
6606 */
6607 if (ControlFile->state != DB_SHUTDOWNED &&
6608 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6609 {
6610 RemoveTempXlogFiles();
6611 SyncDataDirectory();
6612 }
6613
6614 /*
6615 * Initialize on the assumption we want to recover to the latest timeline
6616 * that's active according to pg_control.
6617 */
6618 if (ControlFile->minRecoveryPointTLI >
6619 ControlFile->checkPointCopy.ThisTimeLineID)
6620 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6621 else
6622 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6623
6624 /*
6625 * Check for signal files, and if so set up state for offline recovery
6626 */
6627 readRecoverySignalFile();
6628 validateRecoveryParameters();
6629
6630 if (ArchiveRecoveryRequested)
6631 {
6632 if (StandbyModeRequested)
6633 ereport(LOG,
6634 (errmsg("entering standby mode")));
6635 else if (recoveryTarget == RECOVERY_TARGET_XID)
6636 ereport(LOG,
6637 (errmsg("starting point-in-time recovery to XID %u",
6638 recoveryTargetXid)));
6639 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6640 ereport(LOG,
6641 (errmsg("starting point-in-time recovery to %s",
6642 timestamptz_to_str(recoveryTargetTime))));
6643 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6644 ereport(LOG,
6645 (errmsg("starting point-in-time recovery to \"%s\"",
6646 recoveryTargetName)));
6647 else if (recoveryTarget == RECOVERY_TARGET_LSN)
6648 ereport(LOG,
6649 (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6650 LSN_FORMAT_ARGS(recoveryTargetLSN))));
6651 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6652 ereport(LOG,
6653 (errmsg("starting point-in-time recovery to earliest consistent point")));
6654 else
6655 ereport(LOG,
6656 (errmsg("starting archive recovery")));
6657 }
6658
6659 /*
6660 * Take ownership of the wakeup latch if we're going to sleep during
6661 * recovery.
6662 */
6663 if (ArchiveRecoveryRequested)
6664 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6665
6666 /* Set up XLOG reader facility */
6667 MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6668 xlogreader =
6669 XLogReaderAllocate(wal_segment_size, NULL,
6670 XL_ROUTINE(.page_read = &XLogPageRead,
6671 .segment_open = NULL,
6672 .segment_close = wal_segment_close),
6673 &private);
6674 if (!xlogreader)
6675 ereport(ERROR,
6676 (errcode(ERRCODE_OUT_OF_MEMORY),
6677 errmsg("out of memory"),
6678 errdetail("Failed while allocating a WAL reading processor.")));
6679 xlogreader->system_identifier = ControlFile->system_identifier;
6680
6681 /*
6682 * Allocate two page buffers dedicated to WAL consistency checks. We do
6683 * it this way, rather than just making static arrays, for two reasons:
6684 * (1) no need to waste the storage in most instantiations of the backend;
6685 * (2) a static char array isn't guaranteed to have any particular
6686 * alignment, whereas palloc() will provide MAXALIGN'd storage.
6687 */
6688 replay_image_masked = (char *) palloc(BLCKSZ);
6689 primary_image_masked = (char *) palloc(BLCKSZ);
6690
6691 if (read_backup_label(&checkPointLoc, &backupEndRequired,
6692 &backupFromStandby))
6693 {
6694 List *tablespaces = NIL;
6695
6696 /*
6697 * Archive recovery was requested, and thanks to the backup label
6698 * file, we know how far we need to replay to reach consistency. Enter
6699 * archive recovery directly.
6700 */
6701 InArchiveRecovery = true;
6702 if (StandbyModeRequested)
6703 StandbyMode = true;
6704
6705 /*
6706 * When a backup_label file is present, we want to roll forward from
6707 * the checkpoint it identifies, rather than using pg_control.
6708 */
6709 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6710 if (record != NULL)
6711 {
6712 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6713 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6714 ereport(DEBUG1,
6715 (errmsg_internal("checkpoint record is at %X/%X",
6716 LSN_FORMAT_ARGS(checkPointLoc))));
6717 InRecovery = true; /* force recovery even if SHUTDOWNED */
6718
6719 /*
6720 * Make sure that REDO location exists. This may not be the case
6721 * if there was a crash during an online backup, which left a
6722 * backup_label around that references a WAL segment that's
6723 * already been archived.
6724 */
6725 if (checkPoint.redo < checkPointLoc)
6726 {
6727 XLogBeginRead(xlogreader, checkPoint.redo);
6728 if (!ReadRecord(xlogreader, LOG, false))
6729 ereport(FATAL,
6730 (errmsg("could not find redo location referenced by checkpoint record"),
6731 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6732 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6733 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6734 DataDir, DataDir, DataDir)));
6735 }
6736 }
6737 else
6738 {
6739 ereport(FATAL,
6740 (errmsg("could not locate required checkpoint record"),
6741 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6742 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6743 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6744 DataDir, DataDir, DataDir)));
6745 wasShutdown = false; /* keep compiler quiet */
6746 }
6747
6748 /* read the tablespace_map file if present and create symlinks. */
6749 if (read_tablespace_map(&tablespaces))
6750 {
6751 ListCell *lc;
6752
6753 foreach(lc, tablespaces)
6754 {
6755 tablespaceinfo *ti = lfirst(lc);
6756 char *linkloc;
6757
6758 linkloc = psprintf("pg_tblspc/%s", ti->oid);
6759
6760 /*
6761 * Remove the existing symlink if any and Create the symlink
6762 * under PGDATA.
6763 */
6764 remove_tablespace_symlink(linkloc);
6765
6766 if (symlink(ti->path, linkloc) < 0)
6767 ereport(ERROR,
6768 (errcode_for_file_access(),
6769 errmsg("could not create symbolic link \"%s\": %m",
6770 linkloc)));
6771
6772 pfree(ti->oid);
6773 pfree(ti->path);
6774 pfree(ti);
6775 }
6776
6777 /* set flag to delete it later */
6778 haveTblspcMap = true;
6779 }
6780
6781 /* set flag to delete it later */
6782 haveBackupLabel = true;
6783 }
6784 else
6785 {
6786 /*
6787 * If tablespace_map file is present without backup_label file, there
6788 * is no use of such file. There is no harm in retaining it, but it
6789 * is better to get rid of the map file so that we don't have any
6790 * redundant file in data directory and it will avoid any sort of
6791 * confusion. It seems prudent though to just rename the file out of
6792 * the way rather than delete it completely, also we ignore any error
6793 * that occurs in rename operation as even if map file is present
6794 * without backup_label file, it is harmless.
6795 */
6796 if (stat(TABLESPACE_MAP, &st) == 0)
6797 {
6798 unlink(TABLESPACE_MAP_OLD);
6799 if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6800 ereport(LOG,
6801 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6802 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6803 errdetail("File \"%s\" was renamed to \"%s\".",
6804 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6805 else
6806 ereport(LOG,
6807 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6808 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6809 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6810 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6811 }
6812
6813 /*
6814 * It's possible that archive recovery was requested, but we don't
6815 * know how far we need to replay the WAL before we reach consistency.
6816 * This can happen for example if a base backup is taken from a
6817 * running server using an atomic filesystem snapshot, without calling
6818 * pg_start/stop_backup. Or if you just kill a running primary server
6819 * and put it into archive recovery by creating a recovery signal
6820 * file.
6821 *
6822 * Our strategy in that case is to perform crash recovery first,
6823 * replaying all the WAL present in pg_wal, and only enter archive
6824 * recovery after that.
6825 *
6826 * But usually we already know how far we need to replay the WAL (up
6827 * to minRecoveryPoint, up to backupEndPoint, or until we see an
6828 * end-of-backup record), and we can enter archive recovery directly.
6829 */
6830 if (ArchiveRecoveryRequested &&
6831 (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6832 ControlFile->backupEndRequired ||
6833 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6834 ControlFile->state == DB_SHUTDOWNED))
6835 {
6836 InArchiveRecovery = true;
6837 if (StandbyModeRequested)
6838 StandbyMode = true;
6839 }
6840
6841 /* Get the last valid checkpoint record. */
6842 checkPointLoc = ControlFile->checkPoint;
6843 RedoStartLSN = ControlFile->checkPointCopy.redo;
6844 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6845 if (record != NULL)
6846 {
6847 ereport(DEBUG1,
6848 (errmsg_internal("checkpoint record is at %X/%X",
6849 LSN_FORMAT_ARGS(checkPointLoc))));
6850 }
6851 else
6852 {
6853 /*
6854 * We used to attempt to go back to a secondary checkpoint record
6855 * here, but only when not in standby mode. We now just fail if we
6856 * can't read the last checkpoint because this allows us to
6857 * simplify processing around checkpoints.
6858 */
6859 ereport(PANIC,
6860 (errmsg("could not locate a valid checkpoint record")));
6861 }
6862 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6863 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6864 }
6865
6866 /*
6867 * Clear out any old relcache cache files. This is *necessary* if we do
6868 * any WAL replay, since that would probably result in the cache files
6869 * being out of sync with database reality. In theory we could leave them
6870 * in place if the database had been cleanly shut down, but it seems
6871 * safest to just remove them always and let them be rebuilt during the
6872 * first backend startup. These files needs to be removed from all
6873 * directories including pg_tblspc, however the symlinks are created only
6874 * after reading tablespace_map file in case of archive recovery from
6875 * backup, so needs to clear old relcache files here after creating
6876 * symlinks.
6877 */
6878 RelationCacheInitFileRemove();
6879
6880 /*
6881 * If the location of the checkpoint record is not on the expected
6882 * timeline in the history of the requested timeline, we cannot proceed:
6883 * the backup is not part of the history of the requested timeline.
6884 */
6885 Assert(expectedTLEs); /* was initialized by reading checkpoint
6886 * record */
6887 if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6888 checkPoint.ThisTimeLineID)
6889 {
6890 XLogRecPtr switchpoint;
6891
6892 /*
6893 * tliSwitchPoint will throw an error if the checkpoint's timeline is
6894 * not in expectedTLEs at all.
6895 */
6896 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6897 ereport(FATAL,
6898 (errmsg("requested timeline %u is not a child of this server's history",
6899 recoveryTargetTLI),
6900 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6901 LSN_FORMAT_ARGS(ControlFile->checkPoint),
6902 ControlFile->checkPointCopy.ThisTimeLineID,
6903 LSN_FORMAT_ARGS(switchpoint))));
6904 }
6905
6906 /*
6907 * The min recovery point should be part of the requested timeline's
6908 * history, too.
6909 */
6910 if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6911 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6912 ControlFile->minRecoveryPointTLI)
6913 ereport(FATAL,
6914 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6915 recoveryTargetTLI,
6916 LSN_FORMAT_ARGS(ControlFile->minRecoveryPoint),
6917 ControlFile->minRecoveryPointTLI)));
6918
6919 LastRec = RecPtr = checkPointLoc;
6920
6921 ereport(DEBUG1,
6922 (errmsg_internal("redo record is at %X/%X; shutdown %s",
6923 LSN_FORMAT_ARGS(checkPoint.redo),
6924 wasShutdown ? "true" : "false")));
6925 ereport(DEBUG1,
6926 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
6927 U64FromFullTransactionId(checkPoint.nextXid),
6928 checkPoint.nextOid)));
6929 ereport(DEBUG1,
6930 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6931 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6932 ereport(DEBUG1,
6933 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6934 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6935 ereport(DEBUG1,
6936 (errmsg_internal("oldest MultiXactId: %u, in database %u",
6937 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6938 ereport(DEBUG1,
6939 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6940 checkPoint.oldestCommitTsXid,
6941 checkPoint.newestCommitTsXid)));
6942 if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextXid)))
6943 ereport(PANIC,
6944 (errmsg("invalid next transaction ID")));
6945
6946 /* initialize shared memory variables from the checkpoint record */
6947 ShmemVariableCache->nextXid = checkPoint.nextXid;
6948 ShmemVariableCache->nextOid = checkPoint.nextOid;
6949 ShmemVariableCache->oidCount = 0;
6950 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6951 AdvanceOldestClogXid(checkPoint.oldestXid);
6952 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6953 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6954 SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6955 checkPoint.newestCommitTsXid);
6956 XLogCtl->ckptFullXid = checkPoint.nextXid;
6957
6958 /*
6959 * Initialize replication slots, before there's a chance to remove
6960 * required resources.
6961 */
6962 StartupReplicationSlots();
6963
6964 /*
6965 * Startup logical state, needs to be setup now so we have proper data
6966 * during crash recovery.
6967 */
6968 StartupReorderBuffer();
6969
6970 /*
6971 * Startup CLOG. This must be done after ShmemVariableCache->nextXid has
6972 * been initialized and before we accept connections or begin WAL replay.
6973 */
6974 StartupCLOG();
6975
6976 /*
6977 * Startup MultiXact. We need to do this early to be able to replay
6978 * truncations.
6979 */
6980 StartupMultiXact();
6981
6982 /*
6983 * Ditto for commit timestamps. Activate the facility if the setting is
6984 * enabled in the control file, as there should be no tracking of commit
6985 * timestamps done when the setting was disabled. This facility can be
6986 * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
6987 */
6988 if (ControlFile->track_commit_timestamp)
6989 StartupCommitTs();
6990
6991 /*
6992 * Recover knowledge about replay progress of known replication partners.
6993 */
6994 StartupReplicationOrigin();
6995
6996 /*
6997 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6998 * control file. On recovery, all unlogged relations are blown away, so
6999 * the unlogged LSN counter can be reset too.
7000 */
7001 if (ControlFile->state == DB_SHUTDOWNED)
7002 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
7003 else
7004 XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
7005
7006 /*
7007 * We must replay WAL entries using the same TimeLineID they were created
7008 * under, so temporarily adopt the TLI indicated by the checkpoint (see
7009 * also xlog_redo()).
7010 */
7011 ThisTimeLineID = checkPoint.ThisTimeLineID;
7012
7013 /*
7014 * Copy any missing timeline history files between 'now' and the recovery
7015 * target timeline from archive to pg_wal. While we don't need those files
7016 * ourselves - the history file of the recovery target timeline covers all
7017 * the previous timelines in the history too - a cascading standby server
7018 * might be interested in them. Or, if you archive the WAL from this
7019 * server to a different archive than the primary, it'd be good for all
7020 * the history files to get archived there after failover, so that you can
7021 * use one of the old timelines as a PITR target. Timeline history files
7022 * are small, so it's better to copy them unnecessarily than not copy them
7023 * and regret later.
7024 */
7025 restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
7026
7027 /*
7028 * Before running in recovery, scan pg_twophase and fill in its status to
7029 * be able to work on entries generated by redo. Doing a scan before
7030 * taking any recovery action has the merit to discard any 2PC files that
7031 * are newer than the first record to replay, saving from any conflicts at
7032 * replay. This avoids as well any subsequent scans when doing recovery
7033 * of the on-disk two-phase data.
7034 */
7035 restoreTwoPhaseData();
7036
7037 lastFullPageWrites = checkPoint.fullPageWrites;
7038
7039 RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
7040 doPageWrites = lastFullPageWrites;
7041
7042 if (RecPtr < checkPoint.redo)
7043 ereport(PANIC,
7044 (errmsg("invalid redo in checkpoint record")));
7045
7046 /*
7047 * Check whether we need to force recovery from WAL. If it appears to
7048 * have been a clean shutdown and we did not have a recovery signal file,
7049 * then assume no recovery needed.
7050 */
7051 if (checkPoint.redo < RecPtr)
7052 {
7053 if (wasShutdown)
7054 ereport(PANIC,
7055 (errmsg("invalid redo record in shutdown checkpoint")));
7056 InRecovery = true;
7057 }
7058 else if (ControlFile->state != DB_SHUTDOWNED)
7059 InRecovery = true;
7060 else if (ArchiveRecoveryRequested)
7061 {
7062 /* force recovery due to presence of recovery signal file */
7063 InRecovery = true;
7064 }
7065
7066 /*
7067 * Start recovery assuming that the final record isn't lost.
7068 */
7069 abortedRecPtr = InvalidXLogRecPtr;
7070 missingContrecPtr = InvalidXLogRecPtr;
7071
7072 /* REDO */
7073 if (InRecovery)
7074 {
7075 int rmid;
7076
7077 /*
7078 * Update pg_control to show that we are recovering and to show the
7079 * selected checkpoint as the place we are starting from. We also mark
7080 * pg_control with any minimum recovery stop point obtained from a
7081 * backup history file.
7082 */
7083 dbstate_at_startup = ControlFile->state;
7084 if (InArchiveRecovery)
7085 {
7086 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
7087
7088 SpinLockAcquire(&XLogCtl->info_lck);
7089 XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
7090 SpinLockRelease(&XLogCtl->info_lck);
7091 }
7092 else
7093 {
7094 ereport(LOG,
7095 (errmsg("database system was not properly shut down; "
7096 "automatic recovery in progress")));
7097 if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
7098 ereport(LOG,
7099 (errmsg("crash recovery starts in timeline %u "
7100 "and has target timeline %u",
7101 ControlFile->checkPointCopy.ThisTimeLineID,
7102 recoveryTargetTLI)));
7103 ControlFile->state = DB_IN_CRASH_RECOVERY;
7104
7105 SpinLockAcquire(&XLogCtl->info_lck);
7106 XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
7107 SpinLockRelease(&XLogCtl->info_lck);
7108 }
7109 ControlFile->checkPoint = checkPointLoc;
7110 ControlFile->checkPointCopy = checkPoint;
7111 if (InArchiveRecovery)
7112 {
7113 /* initialize minRecoveryPoint if not set yet */
7114 if (ControlFile->minRecoveryPoint < checkPoint.redo)
7115 {
7116 ControlFile->minRecoveryPoint = checkPoint.redo;
7117 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
7118 }
7119 }
7120
7121 /*
7122 * Set backupStartPoint if we're starting recovery from a base backup.
7123 *
7124 * Also set backupEndPoint and use minRecoveryPoint as the backup end
7125 * location if we're starting recovery from a base backup which was
7126 * taken from a standby. In this case, the database system status in
7127 * pg_control must indicate that the database was already in recovery.
7128 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
7129 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
7130 * before reaching this point; e.g. because restore_command or
7131 * primary_conninfo were faulty.
7132 *
7133 * Any other state indicates that the backup somehow became corrupted
7134 * and we can't sensibly continue with recovery.
7135 */
7136 if (haveBackupLabel)
7137 {
7138 ControlFile->backupStartPoint = checkPoint.redo;
7139 ControlFile->backupEndRequired = backupEndRequired;
7140
7141 if (backupFromStandby)
7142 {
7143 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
7144 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
7145 ereport(FATAL,
7146 (errmsg("backup_label contains data inconsistent with control file"),
7147 errhint("This means that the backup is corrupted and you will "
7148 "have to use another backup for recovery.")));
7149 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
7150 }
7151 }
7152 ControlFile->time = (pg_time_t) time(NULL);
7153 /* No need to hold ControlFileLock yet, we aren't up far enough */
7154 UpdateControlFile();
7155
7156 /*
7157 * Initialize our local copy of minRecoveryPoint. When doing crash
7158 * recovery we want to replay up to the end of WAL. Particularly, in
7159 * the case of a promoted standby minRecoveryPoint value in the
7160 * control file is only updated after the first checkpoint. However,
7161 * if the instance crashes before the first post-recovery checkpoint
7162 * is completed then recovery will use a stale location causing the
7163 * startup process to think that there are still invalid page
7164 * references when checking for data consistency.
7165 */
7166 if (InArchiveRecovery)
7167 {
7168 minRecoveryPoint = ControlFile->minRecoveryPoint;
7169 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
7170 }
7171 else
7172 {
7173 minRecoveryPoint = InvalidXLogRecPtr;
7174 minRecoveryPointTLI = 0;
7175 }
7176
7177 /*
7178 * Reset pgstat data, because it may be invalid after recovery.
7179 */
7180 pgstat_reset_all();
7181
7182 /*
7183 * If there was a backup label file, it's done its job and the info
7184 * has now been propagated into pg_control. We must get rid of the
7185 * label file so that if we crash during recovery, we'll pick up at
7186 * the latest recovery restartpoint instead of going all the way back
7187 * to the backup start point. It seems prudent though to just rename
7188 * the file out of the way rather than delete it completely.
7189 */
7190 if (haveBackupLabel)
7191 {
7192 unlink(BACKUP_LABEL_OLD);
7193 durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
7194 }
7195
7196 /*
7197 * If there was a tablespace_map file, it's done its job and the
7198 * symlinks have been created. We must get rid of the map file so
7199 * that if we crash during recovery, we don't create symlinks again.
7200 * It seems prudent though to just rename the file out of the way
7201 * rather than delete it completely.
7202 */
7203 if (haveTblspcMap)
7204 {
7205 unlink(TABLESPACE_MAP_OLD);
7206 durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
7207 }
7208
7209 /* Check that the GUCs used to generate the WAL allow recovery */
7210 CheckRequiredParameterValues();
7211
7212 /*
7213 * We're in recovery, so unlogged relations may be trashed and must be
7214 * reset. This should be done BEFORE allowing Hot Standby
7215 * connections, so that read-only backends don't try to read whatever
7216 * garbage is left over from before.
7217 */
7218 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
7219
7220 /*
7221 * Likewise, delete any saved transaction snapshot files that got left
7222 * behind by crashed backends.
7223 */
7224 DeleteAllExportedSnapshotFiles();
7225
7226 /*
7227 * Initialize for Hot Standby, if enabled. We won't let backends in
7228 * yet, not until we've reached the min recovery point specified in
7229 * control file and we've established a recovery snapshot from a
7230 * running-xacts WAL record.
7231 */
7232 if (ArchiveRecoveryRequested && EnableHotStandby)
7233 {
7234 TransactionId *xids;
7235 int nxids;
7236
7237 ereport(DEBUG1,
7238 (errmsg_internal("initializing for hot standby")));
7239
7240 InitRecoveryTransactionEnvironment();
7241
7242 if (wasShutdown)
7243 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7244 else
7245 oldestActiveXID = checkPoint.oldestActiveXid;
7246 Assert(TransactionIdIsValid(oldestActiveXID));
7247
7248 /* Tell procarray about the range of xids it has to deal with */
7249 ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextXid));
7250
7251 /*
7252 * Startup subtrans only. CLOG, MultiXact and commit timestamp
7253 * have already been started up and other SLRUs are not maintained
7254 * during recovery and need not be started yet.
7255 */
7256 StartupSUBTRANS(oldestActiveXID);
7257
7258 /*
7259 * If we're beginning at a shutdown checkpoint, we know that
7260 * nothing was running on the primary at this point. So fake-up an
7261 * empty running-xacts record and use that here and now. Recover
7262 * additional standby state for prepared transactions.
7263 */
7264 if (wasShutdown)
7265 {
7266 RunningTransactionsData running;
7267 TransactionId latestCompletedXid;
7268
7269 /*
7270 * Construct a RunningTransactions snapshot representing a
7271 * shut down server, with only prepared transactions still
7272 * alive. We're never overflowed at this point because all
7273 * subxids are listed with their parent prepared transactions.
7274 */
7275 running.xcnt = nxids;
7276 running.subxcnt = 0;
7277 running.subxid_overflow = false;
7278 running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
7279 running.oldestRunningXid = oldestActiveXID;
7280 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
7281 TransactionIdRetreat(latestCompletedXid);
7282 Assert(TransactionIdIsNormal(latestCompletedXid));
7283 running.latestCompletedXid = latestCompletedXid;
7284 running.xids = xids;
7285
7286 ProcArrayApplyRecoveryInfo(&running);
7287
7288 StandbyRecoverPreparedTransactions();
7289 }
7290 }
7291
7292 /* Initialize resource managers */
7293 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7294 {
7295 if (RmgrTable[rmid].rm_startup != NULL)
7296 RmgrTable[rmid].rm_startup();
7297 }
7298
7299 /*
7300 * Initialize shared variables for tracking progress of WAL replay, as
7301 * if we had just replayed the record before the REDO location (or the
7302 * checkpoint record itself, if it's a shutdown checkpoint).
7303 */
7304 SpinLockAcquire(&XLogCtl->info_lck);
7305 if (checkPoint.redo < RecPtr)
7306 XLogCtl->replayEndRecPtr = checkPoint.redo;
7307 else
7308 XLogCtl->replayEndRecPtr = EndRecPtr;
7309 XLogCtl->replayEndTLI = ThisTimeLineID;
7310 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
7311 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
7312 XLogCtl->recoveryLastXTime = 0;
7313 XLogCtl->currentChunkStartTime = 0;
7314 XLogCtl->recoveryPauseState = RECOVERY_NOT_PAUSED;
7315 SpinLockRelease(&XLogCtl->info_lck);
7316
7317 /* Also ensure XLogReceiptTime has a sane value */
7318 XLogReceiptTime = GetCurrentTimestamp();
7319
7320 /*
7321 * Let postmaster know we've started redo now, so that it can launch
7322 * checkpointer to perform restartpoints. We don't bother during
7323 * crash recovery as restartpoints can only be performed during
7324 * archive recovery. And we'd like to keep crash recovery simple, to
7325 * avoid introducing bugs that could affect you when recovering after
7326 * crash.
7327 *
7328 * After this point, we can no longer assume that we're the only
7329 * process in addition to postmaster! Also, fsync requests are
7330 * subsequently to be handled by the checkpointer, not locally.
7331 */
7332 if (ArchiveRecoveryRequested && IsUnderPostmaster)
7333 {
7334 PublishStartupProcessInformation();
7335 EnableSyncRequestForwarding();
7336 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7337 bgwriterLaunched = true;
7338 }
7339
7340 /*
7341 * Allow read-only connections immediately if we're consistent
7342 * already.
7343 */
7344 CheckRecoveryConsistency();
7345
7346 /*
7347 * Find the first record that logically follows the checkpoint --- it
7348 * might physically precede it, though.
7349 */
7350 if (checkPoint.redo < RecPtr)
7351 {
7352 /* back up to find the record */
7353 XLogBeginRead(xlogreader, checkPoint.redo);
7354 record = ReadRecord(xlogreader, PANIC, false);
7355 }
7356 else
7357 {
7358 /* just have to read next record after CheckPoint */
7359 record = ReadRecord(xlogreader, LOG, false);
7360 }
7361
7362 if (record != NULL)
7363 {
7364 ErrorContextCallback errcallback;
7365 TimestampTz xtime;
7366 PGRUsage ru0;
7367
7368 pg_rusage_init(&ru0);
7369
7370 InRedo = true;
7371
7372 ereport(LOG,
7373 (errmsg("redo starts at %X/%X",
7374 LSN_FORMAT_ARGS(ReadRecPtr))));
7375
7376 /*
7377 * main redo apply loop
7378 */
7379 do
7380 {
7381 bool switchedTLI = false;
7382
7383 #ifdef WAL_DEBUG
7384 if (XLOG_DEBUG ||
7385 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7386 (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7387 {
7388 StringInfoData buf;
7389
7390 initStringInfo(&buf);
7391 appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7392 LSN_FORMAT_ARGS(ReadRecPtr),
7393 LSN_FORMAT_ARGS(EndRecPtr));
7394 xlog_outrec(&buf, xlogreader);
7395 appendStringInfoString(&buf, " - ");
7396 xlog_outdesc(&buf, xlogreader);
7397 elog(LOG, "%s", buf.data);
7398 pfree(buf.data);
7399 }
7400 #endif
7401
7402 /* Handle interrupt signals of startup process */
7403 HandleStartupProcInterrupts();
7404
7405 /*
7406 * Pause WAL replay, if requested by a hot-standby session via
7407 * SetRecoveryPause().
7408 *
7409 * Note that we intentionally don't take the info_lck spinlock
7410 * here. We might therefore read a slightly stale value of
7411 * the recoveryPause flag, but it can't be very stale (no
7412 * worse than the last spinlock we did acquire). Since a
7413 * pause request is a pretty asynchronous thing anyway,
7414 * possibly responding to it one WAL record later than we
7415 * otherwise would is a minor issue, so it doesn't seem worth
7416 * adding another spinlock cycle to prevent that.
7417 */
7418 if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
7419 RECOVERY_NOT_PAUSED)
7420 recoveryPausesHere(false);
7421
7422 /*
7423 * Have we reached our recovery target?
7424 */
7425 if (recoveryStopsBefore(xlogreader))
7426 {
7427 reachedRecoveryTarget = true;
7428 break;
7429 }
7430
7431 /*
7432 * If we've been asked to lag the primary, wait on latch until
7433 * enough time has passed.
7434 */
7435 if (recoveryApplyDelay(xlogreader))
7436 {
7437 /*
7438 * We test for paused recovery again here. If user sets
7439 * delayed apply, it may be because they expect to pause
7440 * recovery in case of problems, so we must test again
7441 * here otherwise pausing during the delay-wait wouldn't
7442 * work.
7443 */
7444 if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
7445 RECOVERY_NOT_PAUSED)
7446 recoveryPausesHere(false);
7447 }
7448
7449 /* Setup error traceback support for ereport() */
7450 errcallback.callback = rm_redo_error_callback;
7451 errcallback.arg = (void *) xlogreader;
7452 errcallback.previous = error_context_stack;
7453 error_context_stack = &errcallback;
7454
7455 /*
7456 * ShmemVariableCache->nextXid must be beyond record's xid.
7457 */
7458 AdvanceNextFullTransactionIdPastXid(record->xl_xid);
7459
7460 /*
7461 * Before replaying this record, check if this record causes
7462 * the current timeline to change. The record is already
7463 * considered to be part of the new timeline, so we update
7464 * ThisTimeLineID before replaying it. That's important so
7465 * that replayEndTLI, which is recorded as the minimum
7466 * recovery point's TLI if recovery stops after this record,
7467 * is set correctly.
7468 */
7469 if (record->xl_rmid == RM_XLOG_ID)
7470 {
7471 TimeLineID newTLI = ThisTimeLineID;
7472 TimeLineID prevTLI = ThisTimeLineID;
7473 uint8 info = record->xl_info & ~XLR_INFO_MASK;
7474
7475 if (info == XLOG_CHECKPOINT_SHUTDOWN)
7476 {
7477 CheckPoint checkPoint;
7478
7479 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7480 newTLI = checkPoint.ThisTimeLineID;
7481 prevTLI = checkPoint.PrevTimeLineID;
7482 }
7483 else if (info == XLOG_END_OF_RECOVERY)
7484 {
7485 xl_end_of_recovery xlrec;
7486
7487 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7488 newTLI = xlrec.ThisTimeLineID;
7489 prevTLI = xlrec.PrevTimeLineID;
7490 }
7491
7492 if (newTLI != ThisTimeLineID)
7493 {
7494 /* Check that it's OK to switch to this TLI */
7495 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7496
7497 /* Following WAL records should be run with new TLI */
7498 ThisTimeLineID = newTLI;
7499 switchedTLI = true;
7500 }
7501 }
7502
7503 /*
7504 * Update shared replayEndRecPtr before replaying this record,
7505 * so that XLogFlush will update minRecoveryPoint correctly.
7506 */
7507 SpinLockAcquire(&XLogCtl->info_lck);
7508 XLogCtl->replayEndRecPtr = EndRecPtr;
7509 XLogCtl->replayEndTLI = ThisTimeLineID;
7510 SpinLockRelease(&XLogCtl->info_lck);
7511
7512 /*
7513 * If we are attempting to enter Hot Standby mode, process
7514 * XIDs we see
7515 */
7516 if (standbyState >= STANDBY_INITIALIZED &&
7517 TransactionIdIsValid(record->xl_xid))
7518 RecordKnownAssignedTransactionIds(record->xl_xid);
7519
7520 /* Now apply the WAL record itself */
7521 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7522
7523 /*
7524 * After redo, check whether the backup pages associated with
7525 * the WAL record are consistent with the existing pages. This
7526 * check is done only if consistency check is enabled for this
7527 * record.
7528 */
7529 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7530 checkXLogConsistency(xlogreader);
7531
7532 /* Pop the error context stack */
7533 error_context_stack = errcallback.previous;
7534
7535 /*
7536 * Update lastReplayedEndRecPtr after this record has been
7537 * successfully replayed.
7538 */
7539 SpinLockAcquire(&XLogCtl->info_lck);
7540 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7541 XLogCtl->lastReplayedTLI = ThisTimeLineID;
7542 SpinLockRelease(&XLogCtl->info_lck);
7543
7544 /*
7545 * If rm_redo called XLogRequestWalReceiverReply, then we wake
7546 * up the receiver so that it notices the updated
7547 * lastReplayedEndRecPtr and sends a reply to the primary.
7548 */
7549 if (doRequestWalReceiverReply)
7550 {
7551 doRequestWalReceiverReply = false;
7552 WalRcvForceReply();
7553 }
7554
7555 /* Remember this record as the last-applied one */
7556 LastRec = ReadRecPtr;
7557
7558 /* Allow read-only connections if we're consistent now */
7559 CheckRecoveryConsistency();
7560
7561 /* Is this a timeline switch? */
7562 if (switchedTLI)
7563 {
7564 /*
7565 * Before we continue on the new timeline, clean up any
7566 * (possibly bogus) future WAL segments on the old
7567 * timeline.
7568 */
7569 RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7570
7571 /*
7572 * Wake up any walsenders to notice that we are on a new
7573 * timeline.
7574 */
7575 if (AllowCascadeReplication())
7576 WalSndWakeup();
7577 }
7578
7579 /* Exit loop if we reached inclusive recovery target */
7580 if (recoveryStopsAfter(xlogreader))
7581 {
7582 reachedRecoveryTarget = true;
7583 break;
7584 }
7585
7586 /* Else, try to fetch the next WAL record */
7587 record = ReadRecord(xlogreader, LOG, false);
7588 } while (record != NULL);
7589
7590 /*
7591 * end of main redo apply loop
7592 */
7593
7594 if (reachedRecoveryTarget)
7595 {
7596 if (!reachedConsistency)
7597 ereport(FATAL,
7598 (errmsg("requested recovery stop point is before consistent recovery point")));
7599
7600 /*
7601 * This is the last point where we can restart recovery with a
7602 * new recovery target, if we shutdown and begin again. After
7603 * this, Resource Managers may choose to do permanent
7604 * corrective actions at end of recovery.
7605 */
7606 switch (recoveryTargetAction)
7607 {
7608 case RECOVERY_TARGET_ACTION_SHUTDOWN:
7609
7610 /*
7611 * exit with special return code to request shutdown
7612 * of postmaster. Log messages issued from
7613 * postmaster.
7614 */
7615 proc_exit(3);
7616
7617 case RECOVERY_TARGET_ACTION_PAUSE:
7618 SetRecoveryPause(true);
7619 recoveryPausesHere(true);
7620
7621 /* drop into promote */
7622
7623 case RECOVERY_TARGET_ACTION_PROMOTE:
7624 break;
7625 }
7626 }
7627
7628 /* Allow resource managers to do any required cleanup. */
7629 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7630 {
7631 if (RmgrTable[rmid].rm_cleanup != NULL)
7632 RmgrTable[rmid].rm_cleanup();
7633 }
7634
7635 ereport(LOG,
7636 (errmsg("redo done at %X/%X system usage: %s",
7637 LSN_FORMAT_ARGS(ReadRecPtr),
7638 pg_rusage_show(&ru0))));
7639 xtime = GetLatestXTime();
7640 if (xtime)
7641 ereport(LOG,
7642 (errmsg("last completed transaction was at log time %s",
7643 timestamptz_to_str(xtime))));
7644
7645 InRedo = false;
7646 }
7647 else
7648 {
7649 /* there are no WAL records following the checkpoint */
7650 ereport(LOG,
7651 (errmsg("redo is not required")));
7652
7653 }
7654
7655 /*
7656 * This check is intentionally after the above log messages that
7657 * indicate how far recovery went.
7658 */
7659 if (ArchiveRecoveryRequested &&
7660 recoveryTarget != RECOVERY_TARGET_UNSET &&
7661 !reachedRecoveryTarget)
7662 ereport(FATAL,
7663 (errmsg("recovery ended before configured recovery target was reached")));
7664 }
7665
7666 /*
7667 * Kill WAL receiver, if it's still running, before we continue to write
7668 * the startup checkpoint and aborted-contrecord records. It will trump
7669 * over these records and subsequent ones if it's still alive when we
7670 * start writing WAL.
7671 */
7672 ShutdownWalRcv();
7673
7674 /*
7675 * Reset unlogged relations to the contents of their INIT fork. This is
7676 * done AFTER recovery is complete so as to include any unlogged relations
7677 * created during recovery, but BEFORE recovery is marked as having
7678 * completed successfully. Otherwise we'd not retry if any of the post
7679 * end-of-recovery steps fail.
7680 */
7681 if (InRecovery)
7682 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7683
7684 /*
7685 * We don't need the latch anymore. It's not strictly necessary to disown
7686 * it, but let's do it for the sake of tidiness.
7687 */
7688 if (ArchiveRecoveryRequested)
7689 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7690
7691 /*
7692 * We are now done reading the xlog from stream. Turn off streaming
7693 * recovery to force fetching the files (which would be required at end of
7694 * recovery, e.g., timeline history file) from archive or pg_wal.
7695 *
7696 * Note that standby mode must be turned off after killing WAL receiver,
7697 * i.e., calling ShutdownWalRcv().
7698 */
7699 Assert(!WalRcvStreaming());
7700 StandbyMode = false;
7701
7702 /*
7703 * Determine where to start writing WAL next.
7704 *
7705 * When recovery ended in an incomplete record, write a WAL record about
7706 * that and continue after it. In all other cases, re-fetch the last
7707 * valid or last applied record, so we can identify the exact endpoint of
7708 * what we consider the valid portion of WAL.
7709 */
7710 XLogBeginRead(xlogreader, LastRec);
7711 record = ReadRecord(xlogreader, PANIC, false);
7712 EndOfLog = EndRecPtr;
7713
7714 /*
7715 * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7716 * the end-of-log. It could be different from the timeline that EndOfLog
7717 * nominally belongs to, if there was a timeline switch in that segment,
7718 * and we were reading the old WAL from a segment belonging to a higher
7719 * timeline.
7720 */
7721 EndOfLogTLI = xlogreader->seg.ws_tli;
7722
7723 /*
7724 * Complain if we did not roll forward far enough to render the backup
7725 * dump consistent. Note: it is indeed okay to look at the local variable
7726 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7727 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7728 * advanced beyond the WAL we processed.
7729 */
7730 if (InRecovery &&
7731 (EndOfLog < minRecoveryPoint ||
7732 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7733 {
7734 /*
7735 * Ran off end of WAL before reaching end-of-backup WAL record, or
7736 * minRecoveryPoint. That's usually a bad sign, indicating that you
7737 * tried to recover from an online backup but never called
7738 * pg_stop_backup(), or you didn't archive all the WAL up to that
7739 * point. However, this also happens in crash recovery, if the system
7740 * crashes while an online backup is in progress. We must not treat
7741 * that as an error, or the database will refuse to start up.
7742 */
7743 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7744 {
7745 if (ControlFile->backupEndRequired)
7746 ereport(FATAL,
7747 (errmsg("WAL ends before end of online backup"),
7748 errhint("All WAL generated while online backup was taken must be available at recovery.")));
7749 else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7750 ereport(FATAL,
7751 (errmsg("WAL ends before end of online backup"),
7752 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7753 else
7754 ereport(FATAL,
7755 (errmsg("WAL ends before consistent recovery point")));
7756 }
7757 }
7758
7759 /*
7760 * Pre-scan prepared transactions to find out the range of XIDs present.
7761 * This information is not quite needed yet, but it is positioned here so
7762 * as potential problems are detected before any on-disk change is done.
7763 */
7764 oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7765
7766 /*
7767 * Consider whether we need to assign a new timeline ID.
7768 *
7769 * If we are doing an archive recovery, we always assign a new ID. This
7770 * handles a couple of issues. If we stopped short of the end of WAL
7771 * during recovery, then we are clearly generating a new timeline and must
7772 * assign it a unique new ID. Even if we ran to the end, modifying the
7773 * current last segment is problematic because it may result in trying to
7774 * overwrite an already-archived copy of that segment, and we encourage
7775 * DBAs to make their archive_commands reject that. We can dodge the
7776 * problem by making the new active segment have a new timeline ID.
7777 *
7778 * In a normal crash recovery, we can just extend the timeline we were in.
7779 */
7780 PrevTimeLineID = ThisTimeLineID;
7781 if (ArchiveRecoveryRequested)
7782 {
7783 char reason[200];
7784 char recoveryPath[MAXPGPATH];
7785
7786 Assert(InArchiveRecovery);
7787
7788 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7789 ereport(LOG,
7790 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7791
7792 /*
7793 * Create a comment for the history file to explain why and where
7794 * timeline changed.
7795 */
7796 if (recoveryTarget == RECOVERY_TARGET_XID)
7797 snprintf(reason, sizeof(reason),
7798 "%s transaction %u",
7799 recoveryStopAfter ? "after" : "before",
7800 recoveryStopXid);
7801 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7802 snprintf(reason, sizeof(reason),
7803 "%s %s\n",
7804 recoveryStopAfter ? "after" : "before",
7805 timestamptz_to_str(recoveryStopTime));
7806 else if (recoveryTarget == RECOVERY_TARGET_LSN)
7807 snprintf(reason, sizeof(reason),
7808 "%s LSN %X/%X\n",
7809 recoveryStopAfter ? "after" : "before",
7810 LSN_FORMAT_ARGS(recoveryStopLSN));
7811 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7812 snprintf(reason, sizeof(reason),
7813 "at restore point \"%s\"",
7814 recoveryStopName);
7815 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7816 snprintf(reason, sizeof(reason), "reached consistency");
7817 else
7818 snprintf(reason, sizeof(reason), "no recovery target specified");
7819
7820 /*
7821 * We are now done reading the old WAL. Turn off archive fetching if
7822 * it was active, and make a writable copy of the last WAL segment.
7823 * (Note that we also have a copy of the last block of the old WAL in
7824 * readBuf; we will use that below.)
7825 */
7826 exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7827
7828 /*
7829 * Write the timeline history file, and have it archived. After this
7830 * point (or rather, as soon as the file is archived), the timeline
7831 * will appear as "taken" in the WAL archive and to any standby
7832 * servers. If we crash before actually switching to the new
7833 * timeline, standby servers will nevertheless think that we switched
7834 * to the new timeline, and will try to connect to the new timeline.
7835 * To minimize the window for that, try to do as little as possible
7836 * between here and writing the end-of-recovery record.
7837 */
7838 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7839 EndRecPtr, reason);
7840
7841 /*
7842 * Since there might be a partial WAL segment named RECOVERYXLOG, get
7843 * rid of it.
7844 */
7845 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
7846 unlink(recoveryPath); /* ignore any error */
7847
7848 /* Get rid of any remaining recovered timeline-history file, too */
7849 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
7850 unlink(recoveryPath); /* ignore any error */
7851 }
7852
7853 /* Save the selected TimeLineID in shared memory, too */
7854 XLogCtl->ThisTimeLineID = ThisTimeLineID;
7855 XLogCtl->PrevTimeLineID = PrevTimeLineID;
7856
7857 /*
7858 * Actually, if WAL ended in an incomplete record, skip the parts that
7859 * made it through and start writing after the portion that persisted.
7860 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
7861 * we'll do as soon as we're open for writing new WAL.)
7862 */
7863 if (!XLogRecPtrIsInvalid(missingContrecPtr))
7864 {
7865 Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
7866 EndOfLog = missingContrecPtr;
7867 }
7868
7869 /*
7870 * Prepare to write WAL starting at EndOfLog location, and init xlog
7871 * buffer cache using the block containing the last record from the
7872 * previous incarnation.
7873 */
7874 Insert = &XLogCtl->Insert;
7875 Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7876 Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7877
7878 /*
7879 * Tricky point here: readBuf contains the *last* block that the LastRec
7880 * record spans, not the one it starts in. The last block is indeed the
7881 * one we want to use.
7882 */
7883 if (EndOfLog % XLOG_BLCKSZ != 0)
7884 {
7885 char *page;
7886 int len;
7887 int firstIdx;
7888 XLogRecPtr pageBeginPtr;
7889
7890 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7891 Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
7892
7893 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7894
7895 /* Copy the valid part of the last block, and zero the rest */
7896 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7897 len = EndOfLog % XLOG_BLCKSZ;
7898 memcpy(page, xlogreader->readBuf, len);
7899 memset(page + len, 0, XLOG_BLCKSZ - len);
7900
7901 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7902 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7903 }
7904 else
7905 {
7906 /*
7907 * There is no partial block to copy. Just set InitializedUpTo, and
7908 * let the first attempt to insert a log record to initialize the next
7909 * buffer.
7910 */
7911 XLogCtl->InitializedUpTo = EndOfLog;
7912 }
7913
7914 LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7915
7916 XLogCtl->LogwrtResult = LogwrtResult;
7917
7918 XLogCtl->LogwrtRqst.Write = EndOfLog;
7919 XLogCtl->LogwrtRqst.Flush = EndOfLog;
7920
7921 LocalSetXLogInsertAllowed();
7922
7923 /* If necessary, write overwrite-contrecord before doing anything else */
7924 if (!XLogRecPtrIsInvalid(abortedRecPtr))
7925 {
7926 Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
7927 CreateOverwriteContrecordRecord(abortedRecPtr);
7928 abortedRecPtr = InvalidXLogRecPtr;
7929 missingContrecPtr = InvalidXLogRecPtr;
7930 }
7931
7932 /*
7933 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7934 * record before resource manager writes cleanup WAL records or checkpoint
7935 * record is written.
7936 */
7937 Insert->fullPageWrites = lastFullPageWrites;
7938 UpdateFullPageWrites();
7939 LocalXLogInsertAllowed = -1;
7940
7941 if (InRecovery)
7942 {
7943 /*
7944 * Perform a checkpoint to update all our recovery activity to disk.
7945 *
7946 * Note that we write a shutdown checkpoint rather than an on-line
7947 * one. This is not particularly critical, but since we may be
7948 * assigning a new TLI, using a shutdown checkpoint allows us to have
7949 * the rule that TLI only changes in shutdown checkpoints, which
7950 * allows some extra error checking in xlog_redo.
7951 *
7952 * In promotion, only create a lightweight end-of-recovery record
7953 * instead of a full checkpoint. A checkpoint is requested later,
7954 * after we're fully out of recovery mode and already accepting
7955 * queries.
7956 */
7957 if (bgwriterLaunched)
7958 {
7959 if (LocalPromoteIsTriggered)
7960 {
7961 checkPointLoc = ControlFile->checkPoint;
7962
7963 /*
7964 * Confirm the last checkpoint is available for us to recover
7965 * from if we fail.
7966 */
7967 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7968 if (record != NULL)
7969 {
7970 promoted = true;
7971
7972 /*
7973 * Insert a special WAL record to mark the end of
7974 * recovery, since we aren't doing a checkpoint. That
7975 * means that the checkpointer process may likely be in
7976 * the middle of a time-smoothed restartpoint and could
7977 * continue to be for minutes after this. That sounds
7978 * strange, but the effect is roughly the same and it
7979 * would be stranger to try to come out of the
7980 * restartpoint and then checkpoint. We request a
7981 * checkpoint later anyway, just for safety.
7982 */
7983 CreateEndOfRecoveryRecord();
7984 }
7985 }
7986
7987 if (!promoted)
7988 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7989 CHECKPOINT_IMMEDIATE |
7990 CHECKPOINT_WAIT);
7991 }
7992 else
7993 CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7994 }
7995
7996 if (ArchiveRecoveryRequested)
7997 {
7998 /*
7999 * And finally, execute the recovery_end_command, if any.
8000 */
8001 if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
8002 ExecuteRecoveryCommand(recoveryEndCommand,
8003 "recovery_end_command",
8004 true);
8005
8006 /*
8007 * We switched to a new timeline. Clean up segments on the old
8008 * timeline.
8009 *
8010 * If there are any higher-numbered segments on the old timeline,
8011 * remove them. They might contain valid WAL, but they might also be
8012 * pre-allocated files containing garbage. In any case, they are not
8013 * part of the new timeline's history so we don't need them.
8014 */
8015 RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
8016
8017 /*
8018 * If the switch happened in the middle of a segment, what to do with
8019 * the last, partial segment on the old timeline? If we don't archive
8020 * it, and the server that created the WAL never archives it either
8021 * (e.g. because it was hit by a meteor), it will never make it to the
8022 * archive. That's OK from our point of view, because the new segment
8023 * that we created with the new TLI contains all the WAL from the old
8024 * timeline up to the switch point. But if you later try to do PITR to
8025 * the "missing" WAL on the old timeline, recovery won't find it in
8026 * the archive. It's physically present in the new file with new TLI,
8027 * but recovery won't look there when it's recovering to the older
8028 * timeline. On the other hand, if we archive the partial segment, and
8029 * the original server on that timeline is still running and archives
8030 * the completed version of the same segment later, it will fail. (We
8031 * used to do that in 9.4 and below, and it caused such problems).
8032 *
8033 * As a compromise, we rename the last segment with the .partial
8034 * suffix, and archive it. Archive recovery will never try to read
8035 * .partial segments, so they will normally go unused. But in the odd
8036 * PITR case, the administrator can copy them manually to the pg_wal
8037 * directory (removing the suffix). They can be useful in debugging,
8038 * too.
8039 *
8040 * If a .done or .ready file already exists for the old timeline,
8041 * however, we had already determined that the segment is complete, so
8042 * we can let it be archived normally. (In particular, if it was
8043 * restored from the archive to begin with, it's expected to have a
8044 * .done file).
8045 */
8046 if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
8047 XLogArchivingActive())
8048 {
8049 char origfname[MAXFNAMELEN];
8050 XLogSegNo endLogSegNo;
8051
8052 XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
8053 XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
8054
8055 if (!XLogArchiveIsReadyOrDone(origfname))
8056 {
8057 char origpath[MAXPGPATH];
8058 char partialfname[MAXFNAMELEN];
8059 char partialpath[MAXPGPATH];
8060
8061 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
8062 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
8063 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
8064
8065 /*
8066 * Make sure there's no .done or .ready file for the .partial
8067 * file.
8068 */
8069 XLogArchiveCleanup(partialfname);
8070
8071 durable_rename(origpath, partialpath, ERROR);
8072 XLogArchiveNotify(partialfname);
8073 }
8074 }
8075 }
8076
8077 /*
8078 * Preallocate additional log files, if wanted.
8079 */
8080 PreallocXlogFiles(EndOfLog);
8081
8082 /*
8083 * Okay, we're officially UP.
8084 */
8085 InRecovery = false;
8086
8087 /* start the archive_timeout timer and LSN running */
8088 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
8089 XLogCtl->lastSegSwitchLSN = EndOfLog;
8090
8091 /* also initialize latestCompletedXid, to nextXid - 1 */
8092 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
8093 ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
8094 FullTransactionIdRetreat(&ShmemVariableCache->latestCompletedXid);
8095 LWLockRelease(ProcArrayLock);
8096
8097 /*
8098 * Start up subtrans, if not already done for hot standby. (commit
8099 * timestamps are started below, if necessary.)
8100 */
8101 if (standbyState == STANDBY_DISABLED)
8102 StartupSUBTRANS(oldestActiveXID);
8103
8104 /*
8105 * Perform end of recovery actions for any SLRUs that need it.
8106 */
8107 TrimCLOG();
8108 TrimMultiXact();
8109
8110 /* Reload shared-memory state for prepared transactions */
8111 RecoverPreparedTransactions();
8112
8113 /* Shut down xlogreader */
8114 if (readFile >= 0)
8115 {
8116 close(readFile);
8117 readFile = -1;
8118 }
8119 XLogReaderFree(xlogreader);
8120
8121 /*
8122 * If any of the critical GUCs have changed, log them before we allow
8123 * backends to write WAL.
8124 */
8125 LocalSetXLogInsertAllowed();
8126 XLogReportParameters();
8127
8128 /*
8129 * Local WAL inserts enabled, so it's time to finish initialization of
8130 * commit timestamp.
8131 */
8132 CompleteCommitTsInitialization();
8133
8134 /*
8135 * All done with end-of-recovery actions.
8136 *
8137 * Now allow backends to write WAL and update the control file status in
8138 * consequence. SharedRecoveryState, that controls if backends can write
8139 * WAL, is updated while holding ControlFileLock to prevent other backends
8140 * to look at an inconsistent state of the control file in shared memory.
8141 * There is still a small window during which backends can write WAL and
8142 * the control file is still referring to a system not in DB_IN_PRODUCTION
8143 * state while looking at the on-disk control file.
8144 *
8145 * Also, we use info_lck to update SharedRecoveryState to ensure that
8146 * there are no race conditions concerning visibility of other recent
8147 * updates to shared memory.
8148 */
8149 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8150 ControlFile->state = DB_IN_PRODUCTION;
8151 ControlFile->time = (pg_time_t) time(NULL);
8152
8153 SpinLockAcquire(&XLogCtl->info_lck);
8154 XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
8155 SpinLockRelease(&XLogCtl->info_lck);
8156
8157 UpdateControlFile();
8158 LWLockRelease(ControlFileLock);
8159
8160 /*
8161 * Shutdown the recovery environment. This must occur after
8162 * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
8163 * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
8164 * any session building a snapshot will not rely on KnownAssignedXids as
8165 * RecoveryInProgress() would return false at this stage. This is
8166 * particularly critical for prepared 2PC transactions, that would still
8167 * need to be included in snapshots once recovery has ended.
8168 */
8169 if (standbyState != STANDBY_DISABLED)
8170 ShutdownRecoveryTransactionEnvironment();
8171
8172 /*
8173 * If there were cascading standby servers connected to us, nudge any wal
8174 * sender processes to notice that we've been promoted.
8175 */
8176 WalSndWakeup();
8177
8178 /*
8179 * If this was a promotion, request an (online) checkpoint now. This isn't
8180 * required for consistency, but the last restartpoint might be far back,
8181 * and in case of a crash, recovering from it might take a longer than is
8182 * appropriate now that we're not in standby mode anymore.
8183 */
8184 if (promoted)
8185 RequestCheckpoint(CHECKPOINT_FORCE);
8186 }
8187
/*
 * Checks if recovery has reached a consistent state. When consistency is
 * reached and we have a valid starting standby snapshot, tell postmaster
 * that it can start accepting read-only connections.
 *
 * This runs in the startup process only (see the no-lock read of
 * lastReplayedEndRecPtr below, which relies on that).
 */
static void
CheckRecoveryConsistency(void)
{
	XLogRecPtr	lastReplayedEndRecPtr;

	/*
	 * During crash recovery, we don't reach a consistent state until we've
	 * replayed all the WAL.  (An invalid minRecoveryPoint indicates crash
	 * recovery rather than archive recovery.)
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		return;

	Assert(InArchiveRecovery);

	/*
	 * assume that we are called in the startup process, and hence don't need
	 * a lock to read lastReplayedEndRecPtr
	 */
	lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;

	/*
	 * Have we reached the point where our base backup was completed?
	 */
	if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
		ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
	{
		/*
		 * We have reached the end of base backup, as indicated by pg_control.
		 * The data on disk is now consistent. Reset backupStartPoint and
		 * backupEndPoint, and update minRecoveryPoint to make sure we don't
		 * allow starting up at an earlier point even if recovery is stopped
		 * and restarted soon after this.
		 */
		elog(DEBUG1, "end of backup reached");

		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

		/* Only advance minRecoveryPoint, never move it backwards. */
		if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
			ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;

		ControlFile->backupStartPoint = InvalidXLogRecPtr;
		ControlFile->backupEndPoint = InvalidXLogRecPtr;
		ControlFile->backupEndRequired = false;
		UpdateControlFile();

		LWLockRelease(ControlFileLock);
	}

	/*
	 * Have we passed our safe starting point? Note that minRecoveryPoint is
	 * known to be incorrectly set if ControlFile->backupEndRequired, until
	 * the XLOG_BACKUP_END arrives to advise us of the correct
	 * minRecoveryPoint. All we know prior to that is that we're not
	 * consistent yet.
	 */
	if (!reachedConsistency && !ControlFile->backupEndRequired &&
		minRecoveryPoint <= lastReplayedEndRecPtr &&
		XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
	{
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

		reachedConsistency = true;
		ereport(LOG,
				(errmsg("consistent recovery state reached at %X/%X",
						LSN_FORMAT_ARGS(lastReplayedEndRecPtr))));
	}

	/*
	 * Have we got a valid starting snapshot that will allow queries to be
	 * run? If so, we can tell postmaster that the database is consistent now,
	 * enabling connections.
	 */
	if (standbyState == STANDBY_SNAPSHOT_READY &&
		!LocalHotStandbyActive &&
		reachedConsistency &&
		IsUnderPostmaster)
	{
		/* Publish the shared flag first, then signal the postmaster. */
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->SharedHotStandbyActive = true;
		SpinLockRelease(&XLogCtl->info_lck);

		LocalHotStandbyActive = true;

		SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
	}
}
8283
/*
 * Is the system still in recovery?
 *
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
 *
 * Returns true while recovery is in progress; once it has returned false
 * in a given process, it returns false forever after (cached locally).
 */
bool
RecoveryInProgress(void)
{
	/*
	 * We check shared state each time only until we leave recovery mode. We
	 * can't re-enter recovery, so there's no need to keep checking after the
	 * shared variable has once been seen false.
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
	{
		/*
		 * use volatile pointer to make sure we make a fresh read of the
		 * shared variable.
		 */
		volatile XLogCtlData *xlogctl = XLogCtl;

		/* No spinlock: a single read of the state flag is sufficient here. */
		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);

		/*
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
		 * is finished. InitPostgres() relies upon this behaviour to ensure
		 * that InitXLOGAccess() is called at backend startup. (If you change
		 * this, see also LocalSetXLogInsertAllowed.)
		 */
		if (!LocalRecoveryInProgress)
		{
			/*
			 * If we just exited recovery, make sure we read TimeLineID and
			 * RedoRecPtr after SharedRecoveryState (for machines with weak
			 * memory ordering).
			 */
			pg_memory_barrier();
			InitXLOGAccess();
		}

		/*
		 * Note: We don't need a memory barrier when we're still in recovery.
		 * We might exit recovery immediately after return, so the caller
		 * can't rely on 'true' meaning that we're still in recovery anyway.
		 */

		return LocalRecoveryInProgress;
	}
}
8339
8340 /*
8341 * Returns current recovery state from shared memory.
8342 *
8343 * This returned state is kept consistent with the contents of the control
8344 * file. See details about the possible values of RecoveryState in xlog.h.
8345 */
8346 RecoveryState
GetRecoveryState(void)8347 GetRecoveryState(void)
8348 {
8349 RecoveryState retval;
8350
8351 SpinLockAcquire(&XLogCtl->info_lck);
8352 retval = XLogCtl->SharedRecoveryState;
8353 SpinLockRelease(&XLogCtl->info_lck);
8354
8355 return retval;
8356 }
8357
8358 /*
8359 * Is HotStandby active yet? This is only important in special backends
8360 * since normal backends won't ever be able to connect until this returns
8361 * true. Postmaster knows this by way of signal, not via shared memory.
8362 *
8363 * Unlike testing standbyState, this works in any process that's connected to
8364 * shared memory. (And note that standbyState alone doesn't tell the truth
8365 * anyway.)
8366 */
8367 bool
HotStandbyActive(void)8368 HotStandbyActive(void)
8369 {
8370 /*
8371 * We check shared state each time only until Hot Standby is active. We
8372 * can't de-activate Hot Standby, so there's no need to keep checking
8373 * after the shared variable has once been seen true.
8374 */
8375 if (LocalHotStandbyActive)
8376 return true;
8377 else
8378 {
8379 /* spinlock is essential on machines with weak memory ordering! */
8380 SpinLockAcquire(&XLogCtl->info_lck);
8381 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
8382 SpinLockRelease(&XLogCtl->info_lck);
8383
8384 return LocalHotStandbyActive;
8385 }
8386 }
8387
/*
 * Like HotStandbyActive(), but to be used only in WAL replay code,
 * where we don't need to ask any other process what the state is.
 *
 * Returns the local (startup-process) view of Hot Standby activity; no
 * shared-memory access is needed because replay code maintains the flag.
 */
bool
HotStandbyActiveInReplay(void)
{
	/* Only legal in the startup process or a standalone backend. */
	Assert(AmStartupProcess() || !IsPostmasterEnvironment);
	return LocalHotStandbyActive;
}
8398
8399 /*
8400 * Is this process allowed to insert new WAL records?
8401 *
8402 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
8403 * But we also have provisions for forcing the result "true" or "false"
8404 * within specific processes regardless of the global state.
8405 */
8406 bool
XLogInsertAllowed(void)8407 XLogInsertAllowed(void)
8408 {
8409 /*
8410 * If value is "unconditionally true" or "unconditionally false", just
8411 * return it. This provides the normal fast path once recovery is known
8412 * done.
8413 */
8414 if (LocalXLogInsertAllowed >= 0)
8415 return (bool) LocalXLogInsertAllowed;
8416
8417 /*
8418 * Else, must check to see if we're still in recovery.
8419 */
8420 if (RecoveryInProgress())
8421 return false;
8422
8423 /*
8424 * On exit from recovery, reset to "unconditionally true", since there is
8425 * no need to keep checking.
8426 */
8427 LocalXLogInsertAllowed = 1;
8428 return true;
8429 }
8430
/*
 * Make XLogInsertAllowed() return true in the current process only.
 *
 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
 * and even call LocalSetXLogInsertAllowed() again after that.
 *
 * Used (for example) to let an end-of-recovery checkpoint be written before
 * the shared recovery state is marked done.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	/* Must currently be in the "unset" state; see XLogInsertAllowed(). */
	Assert(LocalXLogInsertAllowed == -1);
	LocalXLogInsertAllowed = 1;

	/* Initialize as RecoveryInProgress() would do when switching state */
	InitXLOGAccess();
}
8446
8447 /*
8448 * Subroutine to try to fetch and validate a prior checkpoint record.
8449 *
8450 * whichChkpt identifies the checkpoint (merely for reporting purposes).
8451 * 1 for "primary", 0 for "other" (backup_label)
8452 */
8453 static XLogRecord *
ReadCheckpointRecord(XLogReaderState * xlogreader,XLogRecPtr RecPtr,int whichChkpt,bool report)8454 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8455 int whichChkpt, bool report)
8456 {
8457 XLogRecord *record;
8458 uint8 info;
8459
8460 if (!XRecOffIsValid(RecPtr))
8461 {
8462 if (!report)
8463 return NULL;
8464
8465 switch (whichChkpt)
8466 {
8467 case 1:
8468 ereport(LOG,
8469 (errmsg("invalid primary checkpoint link in control file")));
8470 break;
8471 default:
8472 ereport(LOG,
8473 (errmsg("invalid checkpoint link in backup_label file")));
8474 break;
8475 }
8476 return NULL;
8477 }
8478
8479 XLogBeginRead(xlogreader, RecPtr);
8480 record = ReadRecord(xlogreader, LOG, true);
8481
8482 if (record == NULL)
8483 {
8484 if (!report)
8485 return NULL;
8486
8487 switch (whichChkpt)
8488 {
8489 case 1:
8490 ereport(LOG,
8491 (errmsg("invalid primary checkpoint record")));
8492 break;
8493 default:
8494 ereport(LOG,
8495 (errmsg("invalid checkpoint record")));
8496 break;
8497 }
8498 return NULL;
8499 }
8500 if (record->xl_rmid != RM_XLOG_ID)
8501 {
8502 switch (whichChkpt)
8503 {
8504 case 1:
8505 ereport(LOG,
8506 (errmsg("invalid resource manager ID in primary checkpoint record")));
8507 break;
8508 default:
8509 ereport(LOG,
8510 (errmsg("invalid resource manager ID in checkpoint record")));
8511 break;
8512 }
8513 return NULL;
8514 }
8515 info = record->xl_info & ~XLR_INFO_MASK;
8516 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8517 info != XLOG_CHECKPOINT_ONLINE)
8518 {
8519 switch (whichChkpt)
8520 {
8521 case 1:
8522 ereport(LOG,
8523 (errmsg("invalid xl_info in primary checkpoint record")));
8524 break;
8525 default:
8526 ereport(LOG,
8527 (errmsg("invalid xl_info in checkpoint record")));
8528 break;
8529 }
8530 return NULL;
8531 }
8532 if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8533 {
8534 switch (whichChkpt)
8535 {
8536 case 1:
8537 ereport(LOG,
8538 (errmsg("invalid length of primary checkpoint record")));
8539 break;
8540 default:
8541 ereport(LOG,
8542 (errmsg("invalid length of checkpoint record")));
8543 break;
8544 }
8545 return NULL;
8546 }
8547 return record;
8548 }
8549
/*
 * This must be called in a backend process before creating WAL records
 * (except in a standalone backend, which does StartupXLOG instead). We need
 * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
 *
 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
 * process's copies of ThisTimeLineID and RedoRecPtr valid too. This was
 * unnecessary however, since the postmaster itself never touches XLOG anyway.
 */
void
InitXLOGAccess(void)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;

	/* ThisTimeLineID doesn't change so we need no lock to copy it */
	ThisTimeLineID = XLogCtl->ThisTimeLineID;
	Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());

	/* set wal_segment_size */
	wal_segment_size = ControlFile->xlog_seg_size;

	/* Use GetRedoRecPtr to copy the RedoRecPtr safely */
	(void) GetRedoRecPtr();
	/* Also update our copy of doPageWrites. */
	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

	/* Also initialize the working areas for constructing WAL records */
	InitXLogInsert();
}
8579
8580 /*
8581 * Return the current Redo pointer from shared memory.
8582 *
8583 * As a side-effect, the local RedoRecPtr copy is updated.
8584 */
8585 XLogRecPtr
GetRedoRecPtr(void)8586 GetRedoRecPtr(void)
8587 {
8588 XLogRecPtr ptr;
8589
8590 /*
8591 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8592 * grabbed a WAL insertion lock to read the authoritative value in
8593 * Insert->RedoRecPtr, someone might update it just after we've released
8594 * the lock.
8595 */
8596 SpinLockAcquire(&XLogCtl->info_lck);
8597 ptr = XLogCtl->RedoRecPtr;
8598 SpinLockRelease(&XLogCtl->info_lck);
8599
8600 if (RedoRecPtr < ptr)
8601 RedoRecPtr = ptr;
8602
8603 return RedoRecPtr;
8604 }
8605
/*
 * Return information needed to decide whether a modified block needs a
 * full-page image to be included in the WAL record.
 *
 * The returned values are cached copies from backend-private memory, and
 * possibly out-of-date. XLogInsertRecord will re-check them against
 * up-to-date values, while holding the WAL insert lock.
 *
 * RedoRecPtr_p: receives the cached redo pointer.
 * doPageWrites_p: receives the cached full-page-writes flag.
 */
void
GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
{
	*RedoRecPtr_p = RedoRecPtr;
	*doPageWrites_p = doPageWrites;
}
8620
8621 /*
8622 * GetInsertRecPtr -- Returns the current insert position.
8623 *
8624 * NOTE: The value *actually* returned is the position of the last full
8625 * xlog page. It lags behind the real insert position by at most 1 page.
8626 * For that, we don't need to scan through WAL insertion locks, and an
8627 * approximation is enough for the current usage of this function.
8628 */
8629 XLogRecPtr
GetInsertRecPtr(void)8630 GetInsertRecPtr(void)
8631 {
8632 XLogRecPtr recptr;
8633
8634 SpinLockAcquire(&XLogCtl->info_lck);
8635 recptr = XLogCtl->LogwrtRqst.Write;
8636 SpinLockRelease(&XLogCtl->info_lck);
8637
8638 return recptr;
8639 }
8640
/*
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
 * position known to be fsync'd to disk.
 *
 * As a side effect, refreshes this backend's cached LogwrtResult copy from
 * shared memory.
 */
XLogRecPtr
GetFlushRecPtr(void)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Flush;
}
8654
8655 /*
8656 * GetLastImportantRecPtr -- Returns the LSN of the last important record
8657 * inserted. All records not explicitly marked as unimportant are considered
8658 * important.
8659 *
8660 * The LSN is determined by computing the maximum of
8661 * WALInsertLocks[i].lastImportantAt.
8662 */
8663 XLogRecPtr
GetLastImportantRecPtr(void)8664 GetLastImportantRecPtr(void)
8665 {
8666 XLogRecPtr res = InvalidXLogRecPtr;
8667 int i;
8668
8669 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8670 {
8671 XLogRecPtr last_important;
8672
8673 /*
8674 * Need to take a lock to prevent torn reads of the LSN, which are
8675 * possible on some of the supported platforms. WAL insert locks only
8676 * support exclusive mode, so we have to use that.
8677 */
8678 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8679 last_important = WALInsertLocks[i].l.lastImportantAt;
8680 LWLockRelease(&WALInsertLocks[i].l.lock);
8681
8682 if (res < last_important)
8683 res = last_important;
8684 }
8685
8686 return res;
8687 }
8688
8689 /*
8690 * Get the time and LSN of the last xlog segment switch
8691 */
8692 pg_time_t
GetLastSegSwitchData(XLogRecPtr * lastSwitchLSN)8693 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8694 {
8695 pg_time_t result;
8696
8697 /* Need WALWriteLock, but shared lock is sufficient */
8698 LWLockAcquire(WALWriteLock, LW_SHARED);
8699 result = XLogCtl->lastSegSwitchTime;
8700 *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8701 LWLockRelease(WALWriteLock);
8702
8703 return result;
8704 }
8705
/*
 * This must be called ONCE during postmaster or standalone-backend shutdown
 *
 * Performs the final checkpoint (or restartpoint, if still in recovery)
 * after quiescing WAL senders; code and arg are unused and exist to match
 * the on_shmem_exit-style callback signature.
 */
void
ShutdownXLOG(int code, Datum arg)
{
	/*
	 * We should have an aux process resource owner to use, and we should not
	 * be in a transaction that's installed some other resowner.
	 */
	Assert(AuxProcessResourceOwner != NULL);
	Assert(CurrentResourceOwner == NULL ||
		   CurrentResourceOwner == AuxProcessResourceOwner);
	CurrentResourceOwner = AuxProcessResourceOwner;

	/* Don't be chatty in standalone mode */
	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
			(errmsg("shutting down")));

	/*
	 * Signal walsenders to move to stopping state.
	 */
	WalSndInitStopping();

	/*
	 * Wait for WAL senders to be in stopping state.  This prevents commands
	 * from writing new WAL.
	 */
	WalSndWaitStopping();

	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch(false);

		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	}
}
8752
/*
 * Log start of a checkpoint.
 *
 * flags is the CHECKPOINT_* bitmask being acted on; restartpoint selects
 * the restartpoint vs. checkpoint message.  The two ereport calls are kept
 * as separate full messages (rather than assembling one string) so that
 * translators see each complete sentence.
 */
static void
LogCheckpointStart(int flags, bool restartpoint)
{
	if (restartpoint)
		ereport(LOG,
		/* translator: the placeholders show checkpoint options */
				(errmsg("restartpoint starting:%s%s%s%s%s%s%s%s",
						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
						(flags & CHECKPOINT_FORCE) ? " force" : "",
						(flags & CHECKPOINT_WAIT) ? " wait" : "",
						(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
						(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
						(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
	else
		ereport(LOG,
		/* translator: the placeholders show checkpoint options */
				(errmsg("checkpoint starting:%s%s%s%s%s%s%s%s",
						(flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
						(flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
						(flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
						(flags & CHECKPOINT_FORCE) ? " force" : "",
						(flags & CHECKPOINT_WAIT) ? " wait" : "",
						(flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
						(flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
						(flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "")));
}
8784
/*
 * Log end of a checkpoint.
 *
 * Always accumulates checkpoint timing statistics into BgWriterStats; emits
 * the LOG message only when log_checkpoints is enabled.  Expects the
 * CheckpointStats timestamps (ckpt_start_t, ckpt_write_t, ckpt_sync_t,
 * ckpt_sync_end_t) to have been filled in by the checkpoint code.
 */
static void
LogCheckpointEnd(bool restartpoint)
{
	long		write_msecs,
				sync_msecs,
				total_msecs,
				longest_msecs,
				average_msecs;
	uint64		average_sync_time;

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	/* Phase durations, derived from the recorded phase timestamps. */
	write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
												  CheckpointStats.ckpt_sync_t);

	sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
												 CheckpointStats.ckpt_sync_end_t);

	/* Accumulate checkpoint timing summary data, in milliseconds. */
	BgWriterStats.m_checkpoint_write_time += write_msecs;
	BgWriterStats.m_checkpoint_sync_time += sync_msecs;

	/*
	 * All of the published timing statistics are accounted for.  Only
	 * continue if a log message is to be written.
	 */
	if (!log_checkpoints)
		return;

	total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
												  CheckpointStats.ckpt_end_t);

	/*
	 * Timing values returned from CheckpointStats are in microseconds.
	 * Convert to milliseconds for consistent printing.  (The +999 rounds
	 * up so sub-millisecond syncs don't report as zero.)
	 */
	longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);

	average_sync_time = 0;
	if (CheckpointStats.ckpt_sync_rels > 0)
		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
			CheckpointStats.ckpt_sync_rels;
	average_msecs = (long) ((average_sync_time + 999) / 1000);

	/*
	 * Two complete ereport calls (rather than one with a variable word) so
	 * translators get each full message.
	 */
	if (restartpoint)
		ereport(LOG,
				(errmsg("restartpoint complete: wrote %d buffers (%.1f%%); "
						"%d WAL file(s) added, %d removed, %d recycled; "
						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
						"distance=%d kB, estimate=%d kB",
						CheckpointStats.ckpt_bufs_written,
						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
						CheckpointStats.ckpt_segs_added,
						CheckpointStats.ckpt_segs_removed,
						CheckpointStats.ckpt_segs_recycled,
						write_msecs / 1000, (int) (write_msecs % 1000),
						sync_msecs / 1000, (int) (sync_msecs % 1000),
						total_msecs / 1000, (int) (total_msecs % 1000),
						CheckpointStats.ckpt_sync_rels,
						longest_msecs / 1000, (int) (longest_msecs % 1000),
						average_msecs / 1000, (int) (average_msecs % 1000),
						(int) (PrevCheckPointDistance / 1024.0),
						(int) (CheckPointDistanceEstimate / 1024.0))));
	else
		ereport(LOG,
				(errmsg("checkpoint complete: wrote %d buffers (%.1f%%); "
						"%d WAL file(s) added, %d removed, %d recycled; "
						"write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
						"sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
						"distance=%d kB, estimate=%d kB",
						CheckpointStats.ckpt_bufs_written,
						(double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
						CheckpointStats.ckpt_segs_added,
						CheckpointStats.ckpt_segs_removed,
						CheckpointStats.ckpt_segs_recycled,
						write_msecs / 1000, (int) (write_msecs % 1000),
						sync_msecs / 1000, (int) (sync_msecs % 1000),
						total_msecs / 1000, (int) (total_msecs % 1000),
						CheckpointStats.ckpt_sync_rels,
						longest_msecs / 1000, (int) (longest_msecs % 1000),
						average_msecs / 1000, (int) (average_msecs % 1000),
						(int) (PrevCheckPointDistance / 1024.0),
						(int) (CheckPointDistanceEstimate / 1024.0))));
}
8873
8874 /*
8875 * Update the estimate of distance between checkpoints.
8876 *
8877 * The estimate is used to calculate the number of WAL segments to keep
8878 * preallocated, see XLOGfileslop().
8879 */
8880 static void
UpdateCheckPointDistanceEstimate(uint64 nbytes)8881 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8882 {
8883 /*
8884 * To estimate the number of segments consumed between checkpoints, keep a
8885 * moving average of the amount of WAL generated in previous checkpoint
8886 * cycles. However, if the load is bursty, with quiet periods and busy
8887 * periods, we want to cater for the peak load. So instead of a plain
8888 * moving average, let the average decline slowly if the previous cycle
8889 * used less WAL than estimated, but bump it up immediately if it used
8890 * more.
8891 *
8892 * When checkpoints are triggered by max_wal_size, this should converge to
8893 * CheckpointSegments * wal_segment_size,
8894 *
8895 * Note: This doesn't pay any attention to what caused the checkpoint.
8896 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8897 * starting a base backup, are counted the same as those created
8898 * automatically. The slow-decline will largely mask them out, if they are
8899 * not frequent. If they are frequent, it seems reasonable to count them
8900 * in as any others; if you issue a manual checkpoint every 5 minutes and
8901 * never let a timed checkpoint happen, it makes sense to base the
8902 * preallocation on that 5 minute interval rather than whatever
8903 * checkpoint_timeout is set to.
8904 */
8905 PrevCheckPointDistance = nbytes;
8906 if (CheckPointDistanceEstimate < nbytes)
8907 CheckPointDistanceEstimate = nbytes;
8908 else
8909 CheckPointDistanceEstimate =
8910 (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8911 }
8912
8913 /*
8914 * Update the ps display for a process running a checkpoint. Note that
8915 * this routine should not do any allocations so as it can be called
8916 * from a critical section.
8917 */
8918 static void
update_checkpoint_display(int flags,bool restartpoint,bool reset)8919 update_checkpoint_display(int flags, bool restartpoint, bool reset)
8920 {
8921 /*
8922 * The status is reported only for end-of-recovery and shutdown
8923 * checkpoints or shutdown restartpoints. Updating the ps display is
8924 * useful in those situations as it may not be possible to rely on
8925 * pg_stat_activity to see the status of the checkpointer or the startup
8926 * process.
8927 */
8928 if ((flags & (CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IS_SHUTDOWN)) == 0)
8929 return;
8930
8931 if (reset)
8932 set_ps_display("");
8933 else
8934 {
8935 char activitymsg[128];
8936
8937 snprintf(activitymsg, sizeof(activitymsg), "performing %s%s%s",
8938 (flags & CHECKPOINT_END_OF_RECOVERY) ? "end-of-recovery " : "",
8939 (flags & CHECKPOINT_IS_SHUTDOWN) ? "shutdown " : "",
8940 restartpoint ? "restartpoint" : "checkpoint");
8941 set_ps_display(activitymsg);
8942 }
8943 }
8944
8945
8946 /*
8947 * Perform a checkpoint --- either during shutdown, or on-the-fly
8948 *
8949 * flags is a bitwise OR of the following:
8950 * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8951 * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8952 * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8953 * ignoring checkpoint_completion_target parameter.
8954 * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8955 * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8956 * CHECKPOINT_END_OF_RECOVERY).
8957 * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8958 *
8959 * Note: flags contains other bits, of interest here only for logging purposes.
8960 * In particular note that this routine is synchronous and does not pay
8961 * attention to CHECKPOINT_WAIT.
8962 *
8963 * If !shutdown then we are writing an online checkpoint. This is a very special
8964 * kind of operation and WAL record because the checkpoint action occurs over
8965 * a period of time yet logically occurs at just a single LSN. The logical
8966 * position of the WAL record (redo ptr) is the same or earlier than the
8967 * physical position. When we replay WAL we locate the checkpoint via its
8968 * physical position then read the redo ptr and actually start replay at the
8969 * earlier logical position. Note that we don't write *anything* to WAL at
8970 * the logical position, so that location could be any other kind of WAL record.
8971 * All of this mechanism allows us to continue working while we checkpoint.
8972 * As a result, timing of actions is critical here and be careful to note that
8973 * this function will likely take minutes to execute on a busy system.
8974 */
8975 void
CreateCheckPoint(int flags)8976 CreateCheckPoint(int flags)
8977 {
8978 bool shutdown;
8979 CheckPoint checkPoint;
8980 XLogRecPtr recptr;
8981 XLogSegNo _logSegNo;
8982 XLogCtlInsert *Insert = &XLogCtl->Insert;
8983 uint32 freespace;
8984 XLogRecPtr PriorRedoPtr;
8985 XLogRecPtr curInsert;
8986 XLogRecPtr last_important_lsn;
8987 VirtualTransactionId *vxids;
8988 int nvxids;
8989
8990 /*
8991 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8992 * issued at a different time.
8993 */
8994 if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8995 shutdown = true;
8996 else
8997 shutdown = false;
8998
8999 /* sanity check */
9000 if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
9001 elog(ERROR, "can't create a checkpoint during recovery");
9002
9003 /*
9004 * Initialize InitXLogInsert working areas before entering the critical
9005 * section. Normally, this is done by the first call to
9006 * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
9007 * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
9008 * done below in a critical section, and InitXLogInsert cannot be called
9009 * in a critical section.
9010 */
9011 InitXLogInsert();
9012
9013 /*
9014 * Prepare to accumulate statistics.
9015 *
9016 * Note: because it is possible for log_checkpoints to change while a
9017 * checkpoint proceeds, we always accumulate stats, even if
9018 * log_checkpoints is currently off.
9019 */
9020 MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
9021 CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
9022
9023 /*
9024 * Use a critical section to force system panic if we have trouble.
9025 */
9026 START_CRIT_SECTION();
9027
9028 if (shutdown)
9029 {
9030 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9031 ControlFile->state = DB_SHUTDOWNING;
9032 ControlFile->time = (pg_time_t) time(NULL);
9033 UpdateControlFile();
9034 LWLockRelease(ControlFileLock);
9035 }
9036
9037 /*
9038 * Let smgr prepare for checkpoint; this has to happen before we determine
9039 * the REDO pointer. Note that smgr must not do anything that'd have to
9040 * be undone if we decide no checkpoint is needed.
9041 */
9042 SyncPreCheckpoint();
9043
9044 /* Begin filling in the checkpoint WAL record */
9045 MemSet(&checkPoint, 0, sizeof(checkPoint));
9046 checkPoint.time = (pg_time_t) time(NULL);
9047
9048 /*
9049 * For Hot Standby, derive the oldestActiveXid before we fix the redo
9050 * pointer. This allows us to begin accumulating changes to assemble our
9051 * starting snapshot of locks and transactions.
9052 */
9053 if (!shutdown && XLogStandbyInfoActive())
9054 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
9055 else
9056 checkPoint.oldestActiveXid = InvalidTransactionId;
9057
9058 /*
9059 * Get location of last important record before acquiring insert locks (as
9060 * GetLastImportantRecPtr() also locks WAL locks).
9061 */
9062 last_important_lsn = GetLastImportantRecPtr();
9063
9064 /*
9065 * We must block concurrent insertions while examining insert state to
9066 * determine the checkpoint REDO pointer.
9067 */
9068 WALInsertLockAcquireExclusive();
9069 curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
9070
9071 /*
9072 * If this isn't a shutdown or forced checkpoint, and if there has been no
9073 * WAL activity requiring a checkpoint, skip it. The idea here is to
9074 * avoid inserting duplicate checkpoints when the system is idle.
9075 */
9076 if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
9077 CHECKPOINT_FORCE)) == 0)
9078 {
9079 if (last_important_lsn == ControlFile->checkPoint)
9080 {
9081 WALInsertLockRelease();
9082 END_CRIT_SECTION();
9083 ereport(DEBUG1,
9084 (errmsg_internal("checkpoint skipped because system is idle")));
9085 return;
9086 }
9087 }
9088
9089 /*
9090 * An end-of-recovery checkpoint is created before anyone is allowed to
9091 * write WAL. To allow us to write the checkpoint record, temporarily
9092 * enable XLogInsertAllowed. (This also ensures ThisTimeLineID is
9093 * initialized, which we need here and in AdvanceXLInsertBuffer.)
9094 */
9095 if (flags & CHECKPOINT_END_OF_RECOVERY)
9096 LocalSetXLogInsertAllowed();
9097
9098 checkPoint.ThisTimeLineID = ThisTimeLineID;
9099 if (flags & CHECKPOINT_END_OF_RECOVERY)
9100 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
9101 else
9102 checkPoint.PrevTimeLineID = ThisTimeLineID;
9103
9104 checkPoint.fullPageWrites = Insert->fullPageWrites;
9105
9106 /*
9107 * Compute new REDO record ptr = location of next XLOG record.
9108 *
9109 * NB: this is NOT necessarily where the checkpoint record itself will be,
9110 * since other backends may insert more XLOG records while we're off doing
9111 * the buffer flush work. Those XLOG records are logically after the
9112 * checkpoint, even though physically before it. Got that?
9113 */
9114 freespace = INSERT_FREESPACE(curInsert);
9115 if (freespace == 0)
9116 {
9117 if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
9118 curInsert += SizeOfXLogLongPHD;
9119 else
9120 curInsert += SizeOfXLogShortPHD;
9121 }
9122 checkPoint.redo = curInsert;
9123
9124 /*
9125 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
9126 * must be done while holding all the insertion locks.
9127 *
9128 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
9129 * pointing past where it really needs to point. This is okay; the only
9130 * consequence is that XLogInsert might back up whole buffers that it
9131 * didn't really need to. We can't postpone advancing RedoRecPtr because
9132 * XLogInserts that happen while we are dumping buffers must assume that
9133 * their buffer changes are not included in the checkpoint.
9134 */
9135 RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
9136
9137 /*
9138 * Now we can release the WAL insertion locks, allowing other xacts to
9139 * proceed while we are flushing disk buffers.
9140 */
9141 WALInsertLockRelease();
9142
9143 /* Update the info_lck-protected copy of RedoRecPtr as well */
9144 SpinLockAcquire(&XLogCtl->info_lck);
9145 XLogCtl->RedoRecPtr = checkPoint.redo;
9146 SpinLockRelease(&XLogCtl->info_lck);
9147
9148 /*
9149 * If enabled, log checkpoint start. We postpone this until now so as not
9150 * to log anything if we decided to skip the checkpoint.
9151 */
9152 if (log_checkpoints)
9153 LogCheckpointStart(flags, false);
9154
9155 /* Update the process title */
9156 update_checkpoint_display(flags, false, false);
9157
9158 TRACE_POSTGRESQL_CHECKPOINT_START(flags);
9159
9160 /*
9161 * Get the other info we need for the checkpoint record.
9162 *
9163 * We don't need to save oldestClogXid in the checkpoint, it only matters
9164 * for the short period in which clog is being truncated, and if we crash
9165 * during that we'll redo the clog truncation and fix up oldestClogXid
9166 * there.
9167 */
9168 LWLockAcquire(XidGenLock, LW_SHARED);
9169 checkPoint.nextXid = ShmemVariableCache->nextXid;
9170 checkPoint.oldestXid = ShmemVariableCache->oldestXid;
9171 checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
9172 LWLockRelease(XidGenLock);
9173
9174 LWLockAcquire(CommitTsLock, LW_SHARED);
9175 checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
9176 checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
9177 LWLockRelease(CommitTsLock);
9178
9179 LWLockAcquire(OidGenLock, LW_SHARED);
9180 checkPoint.nextOid = ShmemVariableCache->nextOid;
9181 if (!shutdown)
9182 checkPoint.nextOid += ShmemVariableCache->oidCount;
9183 LWLockRelease(OidGenLock);
9184
9185 MultiXactGetCheckptMulti(shutdown,
9186 &checkPoint.nextMulti,
9187 &checkPoint.nextMultiOffset,
9188 &checkPoint.oldestMulti,
9189 &checkPoint.oldestMultiDB);
9190
9191 /*
9192 * Having constructed the checkpoint record, ensure all shmem disk buffers
9193 * and commit-log buffers are flushed to disk.
9194 *
9195 * This I/O could fail for various reasons. If so, we will fail to
9196 * complete the checkpoint, but there is no reason to force a system
9197 * panic. Accordingly, exit critical section while doing it.
9198 */
9199 END_CRIT_SECTION();
9200
9201 /*
9202 * In some cases there are groups of actions that must all occur on one
9203 * side or the other of a checkpoint record. Before flushing the
9204 * checkpoint record we must explicitly wait for any backend currently
9205 * performing those groups of actions.
9206 *
9207 * One example is end of transaction, so we must wait for any transactions
9208 * that are currently in commit critical sections. If an xact inserted
9209 * its commit record into XLOG just before the REDO point, then a crash
9210 * restart from the REDO point would not replay that record, which means
9211 * that our flushing had better include the xact's update of pg_xact. So
9212 * we wait till he's out of his commit critical section before proceeding.
9213 * See notes in RecordTransactionCommit().
9214 *
9215 * Because we've already released the insertion locks, this test is a bit
9216 * fuzzy: it is possible that we will wait for xacts we didn't really need
9217 * to wait for. But the delay should be short and it seems better to make
9218 * checkpoint take a bit longer than to hold off insertions longer than
9219 * necessary. (In fact, the whole reason we have this issue is that xact.c
9220 * does commit record XLOG insertion and clog update as two separate steps
9221 * protected by different locks, but again that seems best on grounds of
9222 * minimizing lock contention.)
9223 *
9224 * A transaction that has not yet set delayChkpt when we look cannot be at
9225 * risk, since he's not inserted his commit record yet; and one that's
9226 * already cleared it is not at risk either, since he's done fixing clog
9227 * and we will correctly flush the update below. So we cannot miss any
9228 * xacts we need to wait for.
9229 */
9230 vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
9231 if (nvxids > 0)
9232 {
9233 do
9234 {
9235 pg_usleep(10000L); /* wait for 10 msec */
9236 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
9237 }
9238 pfree(vxids);
9239
9240 CheckPointGuts(checkPoint.redo, flags);
9241
9242 /*
9243 * Take a snapshot of running transactions and write this to WAL. This
9244 * allows us to reconstruct the state of running transactions during
9245 * archive recovery, if required. Skip, if this info disabled.
9246 *
9247 * If we are shutting down, or Startup process is completing crash
9248 * recovery we don't need to write running xact data.
9249 */
9250 if (!shutdown && XLogStandbyInfoActive())
9251 LogStandbySnapshot();
9252
9253 START_CRIT_SECTION();
9254
9255 /*
9256 * Now insert the checkpoint record into XLOG.
9257 */
9258 XLogBeginInsert();
9259 XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
9260 recptr = XLogInsert(RM_XLOG_ID,
9261 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
9262 XLOG_CHECKPOINT_ONLINE);
9263
9264 XLogFlush(recptr);
9265
9266 /*
9267 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
9268 * overwritten at next startup. No-one should even try, this just allows
9269 * sanity-checking. In the case of an end-of-recovery checkpoint, we want
9270 * to just temporarily disable writing until the system has exited
9271 * recovery.
9272 */
9273 if (shutdown)
9274 {
9275 if (flags & CHECKPOINT_END_OF_RECOVERY)
9276 LocalXLogInsertAllowed = -1; /* return to "check" state */
9277 else
9278 LocalXLogInsertAllowed = 0; /* never again write WAL */
9279 }
9280
9281 /*
9282 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
9283 * = end of actual checkpoint record.
9284 */
9285 if (shutdown && checkPoint.redo != ProcLastRecPtr)
9286 ereport(PANIC,
9287 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
9288
9289 /*
9290 * Remember the prior checkpoint's redo ptr for
9291 * UpdateCheckPointDistanceEstimate()
9292 */
9293 PriorRedoPtr = ControlFile->checkPointCopy.redo;
9294
9295 /*
9296 * Update the control file.
9297 */
9298 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9299 if (shutdown)
9300 ControlFile->state = DB_SHUTDOWNED;
9301 ControlFile->checkPoint = ProcLastRecPtr;
9302 ControlFile->checkPointCopy = checkPoint;
9303 ControlFile->time = (pg_time_t) time(NULL);
9304 /* crash recovery should always recover to the end of WAL */
9305 ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
9306 ControlFile->minRecoveryPointTLI = 0;
9307
9308 /*
9309 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
9310 * unused on non-shutdown checkpoints, but seems useful to store it always
9311 * for debugging purposes.
9312 */
9313 SpinLockAcquire(&XLogCtl->ulsn_lck);
9314 ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
9315 SpinLockRelease(&XLogCtl->ulsn_lck);
9316
9317 UpdateControlFile();
9318 LWLockRelease(ControlFileLock);
9319
9320 /* Update shared-memory copy of checkpoint XID/epoch */
9321 SpinLockAcquire(&XLogCtl->info_lck);
9322 XLogCtl->ckptFullXid = checkPoint.nextXid;
9323 SpinLockRelease(&XLogCtl->info_lck);
9324
9325 /*
9326 * We are now done with critical updates; no need for system panic if we
9327 * have trouble while fooling with old log segments.
9328 */
9329 END_CRIT_SECTION();
9330
9331 /*
9332 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
9333 */
9334 SyncPostCheckpoint();
9335
9336 /*
9337 * Update the average distance between checkpoints if the prior checkpoint
9338 * exists.
9339 */
9340 if (PriorRedoPtr != InvalidXLogRecPtr)
9341 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9342
9343 /*
9344 * Delete old log files, those no longer needed for last checkpoint to
9345 * prevent the disk holding the xlog from growing full.
9346 */
9347 XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9348 KeepLogSeg(recptr, &_logSegNo);
9349 if (InvalidateObsoleteReplicationSlots(_logSegNo))
9350 {
9351 /*
9352 * Some slots have been invalidated; recalculate the old-segment
9353 * horizon, starting again from RedoRecPtr.
9354 */
9355 XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9356 KeepLogSeg(recptr, &_logSegNo);
9357 }
9358 _logSegNo--;
9359 RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
9360
9361 /*
9362 * Make more log segments if needed. (Do this after recycling old log
9363 * segments, since that may supply some of the needed files.)
9364 */
9365 if (!shutdown)
9366 PreallocXlogFiles(recptr);
9367
9368 /*
9369 * Truncate pg_subtrans if possible. We can throw away all data before
9370 * the oldest XMIN of any running transaction. No future transaction will
9371 * attempt to reference any pg_subtrans entry older than that (see Asserts
9372 * in subtrans.c). During recovery, though, we mustn't do this because
9373 * StartupSUBTRANS hasn't been called yet.
9374 */
9375 if (!RecoveryInProgress())
9376 TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());
9377
9378 /* Real work is done; log and update stats. */
9379 LogCheckpointEnd(false);
9380
9381 /* Reset the process title */
9382 update_checkpoint_display(flags, false, true);
9383
9384 TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
9385 NBuffers,
9386 CheckpointStats.ckpt_segs_added,
9387 CheckpointStats.ckpt_segs_removed,
9388 CheckpointStats.ckpt_segs_recycled);
9389 }
9390
9391 /*
9392 * Mark the end of recovery in WAL though without running a full checkpoint.
9393 * We can expect that a restartpoint is likely to be in progress as we
9394 * do this, though we are unwilling to wait for it to complete.
9395 *
9396 * CreateRestartPoint() allows for the case where recovery may end before
9397 * the restartpoint completes so there is no concern of concurrent behaviour.
9398 */
static void
CreateEndOfRecoveryRecord(void)
{
	xl_end_of_recovery xlrec;
	XLogRecPtr	recptr;

	/* sanity check: this record only makes sense while still in recovery */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used to end recovery");

	xlrec.end_time = GetCurrentTimestamp();

	/*
	 * Capture the current and previous timeline IDs under the WAL insertion
	 * locks, so they cannot change while we copy them into the record.
	 */
	WALInsertLockAcquireExclusive();
	xlrec.ThisTimeLineID = ThisTimeLineID;
	xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	WALInsertLockRelease();

	/* We are still "in recovery", so temporarily permit WAL insertion. */
	LocalSetXLogInsertAllowed();

	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
	recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);

	/* Force the record to disk before updating the control file below. */
	XLogFlush(recptr);

	/*
	 * Update the control file so that crash recovery can follow the timeline
	 * changes to this point.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	ControlFile->time = (pg_time_t) time(NULL);
	ControlFile->minRecoveryPoint = recptr;
	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	END_CRIT_SECTION();

	LocalXLogInsertAllowed = -1;	/* return to "check" state */
}
9441
9442 /*
9443 * Write an OVERWRITE_CONTRECORD message.
9444 *
9445 * When on WAL replay we expect a continuation record at the start of a page
9446 * that is not there, recovery ends and WAL writing resumes at that point.
9447 * But it's wrong to resume writing new WAL back at the start of the record
9448 * that was broken, because downstream consumers of that WAL (physical
9449 * replicas) are not prepared to "rewind". So the first action after
9450 * finishing replay of all valid WAL must be to write a record of this type
9451 * at the point where the contrecord was missing; to support xlogreader
9452 * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
9453 * to the page header where the record occurs. xlogreader has an ad-hoc
9454 * mechanism to report metadata about the broken record, which is what we
9455 * use here.
9456 *
9457 * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
9458 * skip the record it was reading, and pass back the LSN of the skipped
9459 * record, so that its caller can verify (on "replay" of that record) that the
9460 * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
9461 */
static XLogRecPtr
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)
{
	xl_overwrite_contrecord xlrec;
	XLogRecPtr	recptr;

	/* sanity check: only valid while finishing recovery */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used at end of recovery");

	/* Record which LSN is being overwritten, and when. */
	xlrec.overwritten_lsn = aborted_lsn;
	xlrec.overwrite_time = GetCurrentTimestamp();

	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));

	recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);

	/* Make sure the record is durable before anyone can depend on it. */
	XLogFlush(recptr);

	END_CRIT_SECTION();

	/* Return the end+1 position of the inserted record. */
	return recptr;
}
9488
9489 /*
9490 * Flush all data in shared memory to disk, and fsync
9491 *
9492 * This is the common code shared between regular checkpoints and
9493 * recovery restartpoints.
9494 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	/*
	 * First flush auxiliary state that lives outside the main buffer pool:
	 * the relation map, replication slots, logical-decoding snapshot and
	 * rewrite-heap state, and replication origins.
	 */
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointReplicationOrigin();

	/* Write out all dirty data in SLRUs and the main buffer pool */
	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_START(flags);
	CheckpointStats.ckpt_write_t = GetCurrentTimestamp();	/* stats: write phase start */
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointBuffers(flags);

	/* Perform all queued up fsyncs */
	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_SYNC_START();
	CheckpointStats.ckpt_sync_t = GetCurrentTimestamp();	/* stats: sync phase start */
	ProcessSyncRequests();
	CheckpointStats.ckpt_sync_end_t = GetCurrentTimestamp();	/* stats: sync phase end */
	TRACE_POSTGRESQL_BUFFER_CHECKPOINT_DONE();

	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
9524
9525 /*
9526 * Save a checkpoint for recovery restart if appropriate
9527 *
9528 * This function is called each time a checkpoint record is read from XLOG.
9529 * It must determine whether the checkpoint represents a safe restartpoint or
9530 * not. If so, the checkpoint record is stashed in shared memory so that
9531 * CreateRestartPoint can consult it. (Note that the latter function is
9532 * executed by the checkpointer, while this one will be executed by the
9533 * startup process.)
9534 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
	/*
	 * Refrain from creating a restartpoint if we have seen any references to
	 * non-existent pages.  Restarting recovery from the restartpoint would
	 * not see the references, so we would lose the cross-check that the
	 * pages belonged to a relation that was dropped later.
	 */
	if (XLogHaveInvalidPages())
	{
		elog(trace_recovery(DEBUG2),
			 "could not record restart point at %X/%X because there "
			 "are unresolved references to invalid pages",
			 LSN_FORMAT_ARGS(checkPoint->redo));
		return;
	}

	/*
	 * Copy the checkpoint record to shared memory, so that checkpointer can
	 * work out the next time it wants to perform a restartpoint.
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr; /* start LSN of the record */
	XLogCtl->lastCheckPointEndPtr = EndRecPtr;	/* end+1 LSN of the record */
	XLogCtl->lastCheckPoint = *checkPoint;
	SpinLockRelease(&XLogCtl->info_lck);
}
9564
9565 /*
9566 * Establish a restartpoint if possible.
9567 *
9568 * This is similar to CreateCheckPoint, but is used during WAL recovery
9569 * to establish a point from which recovery can roll forward without
9570 * replaying the entire recovery log.
9571 *
9572 * Returns true if a new restartpoint was established. We can only establish
9573 * a restartpoint if we have replayed a safe checkpoint record since last
9574 * restartpoint.
9575 */
bool
CreateRestartPoint(int flags)
{
	XLogRecPtr	lastCheckPointRecPtr;	/* start LSN of last safe checkpoint */
	XLogRecPtr	lastCheckPointEndPtr;	/* end+1 LSN of last safe checkpoint */
	CheckPoint	lastCheckPoint; /* copy of the checkpoint record's payload */
	XLogRecPtr	PriorRedoPtr;	/* previous checkpoint's redo pointer */
	XLogRecPtr	receivePtr;
	XLogRecPtr	replayPtr;
	TimeLineID	replayTLI;
	XLogRecPtr	endptr;
	XLogSegNo	_logSegNo;
	TimestampTz xtime;

	/* Get a local copy of the last safe checkpoint record. */
	SpinLockAcquire(&XLogCtl->info_lck);
	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
	lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
	lastCheckPoint = XLogCtl->lastCheckPoint;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
				(errmsg_internal("skipping restartpoint, recovery has already ended")));
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
	 * necessary, but when hot standby is enabled, it would be rather weird if
	 * the database opened up for read-only connections at a point-in-time
	 * before the last shutdown. Such time travel is still possible in case of
	 * immediate shutdown, though.
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
	 */
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
	{
		ereport(DEBUG2,
				(errmsg_internal("skipping restartpoint, already performed at %X/%X",
								 LSN_FORMAT_ARGS(lastCheckPoint.redo))));

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
		if (flags & CHECKPOINT_IS_SHUTDOWN)
		{
			/* Still record a clean shutdown-in-recovery state. */
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
			ControlFile->time = (pg_time_t) time(NULL);
			UpdateControlFile();
			LWLockRelease(ControlFileLock);
		}
		return false;
	}

	/*
	 * Update the shared RedoRecPtr so that the startup process can calculate
	 * the number of segments replayed since last restartpoint, and request a
	 * restartpoint if it exceeds CheckPointSegments.
	 *
	 * Like in CreateCheckPoint(), hold off insertions to update it, although
	 * during recovery this is just pro forma, because no WAL insertions are
	 * happening.
	 */
	WALInsertLockAcquireExclusive();
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
	WALInsertLockRelease();

	/* Also update the info_lck-protected copy */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	if (log_checkpoints)
		LogCheckpointStart(flags, true);

	/* Update the process title */
	update_checkpoint_display(flags, true, false);

	/* Flush all dirty buffers, SLRUs etc. to disk (the real work). */
	CheckPointGuts(lastCheckPoint.redo, flags);

	/*
	 * Remember the prior checkpoint's redo ptr for
	 * UpdateCheckPointDistanceEstimate()
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update pg_control, using current time.  Check that it still shows
	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
	 * this is a quick hack to make sure nothing really bad happens if somehow
	 * we get here after the end-of-recovery checkpoint.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
	{
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);

		/*
		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
		 * this will have happened already while writing out dirty buffers,
		 * but not necessarily - e.g. because no buffers were dirtied.  We do
		 * this because a non-exclusive base backup uses minRecoveryPoint to
		 * determine which WAL files must be included in the backup, and the
		 * file (or files) containing the checkpoint record must be included,
		 * at a minimum.  Note that for an ordinary restart of recovery there's
		 * no value in having the minimum recovery point any earlier than this
		 * anyway, because redo will begin just after the checkpoint record.
		 */
		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
		{
			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
			ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;

			/* update local copy */
			minRecoveryPoint = ControlFile->minRecoveryPoint;
			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		}
		if (flags & CHECKPOINT_IS_SHUTDOWN)
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
		UpdateControlFile();
	}
	LWLockRelease(ControlFileLock);

	/*
	 * Update the average distance between checkpoints/restartpoints if the
	 * prior checkpoint exists.
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

	/*
	 * Delete old log files, those no longer needed for last restartpoint to
	 * prevent the disk holding the xlog from growing full.
	 */
	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);

	/*
	 * Retreat _logSegNo using the current end of xlog replayed or received,
	 * whichever is later.
	 */
	receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
	replayPtr = GetXLogReplayRecPtr(&replayTLI);
	endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
	KeepLogSeg(endptr, &_logSegNo);
	if (InvalidateObsoleteReplicationSlots(_logSegNo))
	{
		/*
		 * Some slots have been invalidated; recalculate the old-segment
		 * horizon, starting again from RedoRecPtr.
		 */
		XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
		KeepLogSeg(endptr, &_logSegNo);
	}
	_logSegNo--;				/* last segment to *keep*, so remove up to the one before it */

	/*
	 * Try to recycle segments on a useful timeline. If we've been promoted
	 * since the beginning of this restartpoint, use the new timeline chosen
	 * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
	 * case). If we're still in recovery, use the timeline we're currently
	 * replaying.
	 *
	 * There is no guarantee that the WAL segments will be useful on the
	 * current timeline; if recovery proceeds to a new timeline right after
	 * this, the pre-allocated WAL segments on this timeline will not be used,
	 * and will go wasted until recycled on the next restartpoint. We'll live
	 * with that.
	 */
	if (RecoveryInProgress())
		ThisTimeLineID = replayTLI;

	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);

	/*
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
	 */
	PreallocXlogFiles(endptr);

	/*
	 * ThisTimeLineID is normally not set when we're still in recovery.
	 * However, recycling/preallocating segments above needed ThisTimeLineID
	 * to determine which timeline to install the segments on. Reset it now,
	 * to restore the normal state of affairs for debugging purposes.
	 */
	if (RecoveryInProgress())
		ThisTimeLineID = 0;

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.  No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).  When hot standby is disabled, though, we mustn't do
	 * this because StartupSUBTRANS hasn't been called yet.
	 */
	if (EnableHotStandby)
		TruncateSUBTRANS(GetOldestTransactionIdConsideredRunning());

	/* Real work is done; log and update stats. */
	LogCheckpointEnd(true);

	/* Reset the process title */
	update_checkpoint_display(flags, true, true);

	xtime = GetLatestXTime();
	ereport((log_checkpoints ? LOG : DEBUG2),
			(errmsg("recovery restart point at %X/%X",
					LSN_FORMAT_ARGS(lastCheckPoint.redo)),
			 xtime ? errdetail("Last completed transaction was at log time %s.",
							   timestamptz_to_str(xtime)) : 0));

	/*
	 * Finally, execute archive_cleanup_command, if any.
	 */
	if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
		ExecuteRecoveryCommand(archiveCleanupCommand,
							   "archive_cleanup_command",
							   false);

	return true;
}
9821
9822 /*
9823 * Report availability of WAL for the given target LSN
9824 * (typically a slot's restart_lsn)
9825 *
9826 * Returns one of the following enum values:
9827 *
9828 * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
9829 * max_wal_size.
9830 *
9831 * * WALAVAIL_EXTENDED means it is still available by preserving extra
9832 * segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
9833 * than max_wal_size, this state is not returned.
9834 *
9835 * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
9836 * remove reserved segments. The walsender using this slot may return to the
9837 * above.
9838 *
9839 * * WALAVAIL_REMOVED means it has been removed. A replication stream on
9840 * a slot with this LSN cannot continue after a restart.
9841 *
9842 * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
9843 */
9844 WALAvailability
GetWALAvailability(XLogRecPtr targetLSN)9845 GetWALAvailability(XLogRecPtr targetLSN)
9846 {
9847 XLogRecPtr currpos; /* current write LSN */
9848 XLogSegNo currSeg; /* segid of currpos */
9849 XLogSegNo targetSeg; /* segid of targetLSN */
9850 XLogSegNo oldestSeg; /* actual oldest segid */
9851 XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */
9852 XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */
9853 uint64 keepSegs;
9854
9855 /*
9856 * slot does not reserve WAL. Either deactivated, or has never been active
9857 */
9858 if (XLogRecPtrIsInvalid(targetLSN))
9859 return WALAVAIL_INVALID_LSN;
9860
9861 /*
9862 * Calculate the oldest segment currently reserved by all slots,
9863 * considering wal_keep_size and max_slot_wal_keep_size. Initialize
9864 * oldestSlotSeg to the current segment.
9865 */
9866 currpos = GetXLogWriteRecPtr();
9867 XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
9868 KeepLogSeg(currpos, &oldestSlotSeg);
9869
9870 /*
9871 * Find the oldest extant segment file. We get 1 until checkpoint removes
9872 * the first WAL segment file since startup, which causes the status being
9873 * wrong under certain abnormal conditions but that doesn't actually harm.
9874 */
9875 oldestSeg = XLogGetLastRemovedSegno() + 1;
9876
9877 /* calculate oldest segment by max_wal_size */
9878 XLByteToSeg(currpos, currSeg, wal_segment_size);
9879 keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
9880
9881 if (currSeg > keepSegs)
9882 oldestSegMaxWalSize = currSeg - keepSegs;
9883 else
9884 oldestSegMaxWalSize = 1;
9885
9886 /* the segment we care about */
9887 XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
9888
9889 /*
9890 * No point in returning reserved or extended status values if the
9891 * targetSeg is known to be lost.
9892 */
9893 if (targetSeg >= oldestSlotSeg)
9894 {
9895 /* show "reserved" when targetSeg is within max_wal_size */
9896 if (targetSeg >= oldestSegMaxWalSize)
9897 return WALAVAIL_RESERVED;
9898
9899 /* being retained by slots exceeding max_wal_size */
9900 return WALAVAIL_EXTENDED;
9901 }
9902
9903 /* WAL segments are no longer retained but haven't been removed yet */
9904 if (targetSeg >= oldestSeg)
9905 return WALAVAIL_UNRESERVED;
9906
9907 /* Definitely lost */
9908 return WALAVAIL_REMOVED;
9909 }
9910
9911
9912 /*
9913 * Retreat *logSegNo to the last segment that we need to retain because of
9914 * either wal_keep_size or replication slots.
9915 *
9916 * This is calculated by subtracting wal_keep_size from the given xlog
9917 * location, recptr and by making sure that that result is below the
9918 * requirement of replication slots. For the latter criterion we do consider
9919 * the effects of max_slot_wal_keep_size: reserve at most that much space back
9920 * from recptr.
9921 *
9922 * Note about replication slots: if this function calculates a value
9923 * that's further ahead than what slots need reserved, then affected
9924 * slots need to be invalidated and this function invoked again.
9925 * XXX it might be a good idea to rewrite this function so that
9926 * invalidation is optionally done here, instead.
9927 */
9928 static void
KeepLogSeg(XLogRecPtr recptr,XLogSegNo * logSegNo)9929 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9930 {
9931 XLogSegNo currSegNo;
9932 XLogSegNo segno;
9933 XLogRecPtr keep;
9934
9935 XLByteToSeg(recptr, currSegNo, wal_segment_size);
9936 segno = currSegNo;
9937
9938 /*
9939 * Calculate how many segments are kept by slots first, adjusting for
9940 * max_slot_wal_keep_size.
9941 */
9942 keep = XLogGetReplicationSlotMinimumLSN();
9943 if (keep != InvalidXLogRecPtr)
9944 {
9945 XLByteToSeg(keep, segno, wal_segment_size);
9946
9947 /* Cap by max_slot_wal_keep_size ... */
9948 if (max_slot_wal_keep_size_mb >= 0)
9949 {
9950 uint64 slot_keep_segs;
9951
9952 slot_keep_segs =
9953 ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
9954
9955 if (currSegNo - segno > slot_keep_segs)
9956 segno = currSegNo - slot_keep_segs;
9957 }
9958 }
9959
9960 /* but, keep at least wal_keep_size if that's set */
9961 if (wal_keep_size_mb > 0)
9962 {
9963 uint64 keep_segs;
9964
9965 keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
9966 if (currSegNo - segno < keep_segs)
9967 {
9968 /* avoid underflow, don't go below 1 */
9969 if (currSegNo <= keep_segs)
9970 segno = 1;
9971 else
9972 segno = currSegNo - keep_segs;
9973 }
9974 }
9975
9976 /* don't delete WAL segments newer than the calculated segment */
9977 if (segno < *logSegNo)
9978 *logSegNo = segno;
9979 }
9980
9981 /*
9982 * Write a NEXTOID log record
9983 */
void
XLogPutNextOid(Oid nextOid)
{
	/* Emit an XLOG_NEXTOID record carrying the new OID counter value. */
	XLogBeginInsert();
	XLogRegisterData((char *) (&nextOid), sizeof(Oid));
	(void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);

	/*
	 * We need not flush the NEXTOID record immediately, because any of the
	 * just-allocated OIDs could only reach disk as part of a tuple insert or
	 * update that would have its own XLOG record that must follow the NEXTOID
	 * record.  Therefore, the standard buffer LSN interlock applied to those
	 * records will ensure no such OID reaches disk before the NEXTOID record
	 * does.
	 *
	 * Note, however, that the above statement only covers state "within" the
	 * database.  When we use a generated OID as a file or directory name, we
	 * are in a sense violating the basic WAL rule, because that filesystem
	 * change may reach disk before the NEXTOID WAL record does.  The impact
	 * of this is that if a database crash occurs immediately afterward, we
	 * might after restart re-generate the same OID and find that it conflicts
	 * with the leftover file or directory.  But since for safety's sake we
	 * always loop until finding a nonconflicting filename, this poses no real
	 * problem in practice.  See pgsql-hackers discussion 27-Sep-2006.
	 */
}
10010
10011 /*
10012 * Write an XLOG SWITCH record.
10013 *
10014 * Here we just blindly issue an XLogInsert request for the record.
10015 * All the magic happens inside XLogInsert.
10016 *
10017 * The return value is either the end+1 address of the switch record,
10018 * or the end+1 address of the prior segment if we did not need to
10019 * write a switch record because we are already at segment start.
10020 */
10021 XLogRecPtr
RequestXLogSwitch(bool mark_unimportant)10022 RequestXLogSwitch(bool mark_unimportant)
10023 {
10024 XLogRecPtr RecPtr;
10025
10026 /* XLOG SWITCH has no data */
10027 XLogBeginInsert();
10028
10029 if (mark_unimportant)
10030 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
10031 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
10032
10033 return RecPtr;
10034 }
10035
10036 /*
10037 * Write a RESTORE POINT record
10038 */
10039 XLogRecPtr
XLogRestorePoint(const char * rpName)10040 XLogRestorePoint(const char *rpName)
10041 {
10042 XLogRecPtr RecPtr;
10043 xl_restore_point xlrec;
10044
10045 xlrec.rp_time = GetCurrentTimestamp();
10046 strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
10047
10048 XLogBeginInsert();
10049 XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
10050
10051 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
10052
10053 ereport(LOG,
10054 (errmsg("restore point \"%s\" created at %X/%X",
10055 rpName, LSN_FORMAT_ARGS(RecPtr))));
10056
10057 return RecPtr;
10058 }
10059
10060 /*
10061 * Check if any of the GUC parameters that are critical for hot standby
10062 * have changed, and update the value in pg_control file if necessary.
10063 */
static void
XLogReportParameters(void)
{
	/* Only do anything if at least one tracked parameter changed. */
	if (wal_level != ControlFile->wal_level ||
		wal_log_hints != ControlFile->wal_log_hints ||
		MaxConnections != ControlFile->MaxConnections ||
		max_worker_processes != ControlFile->max_worker_processes ||
		max_wal_senders != ControlFile->max_wal_senders ||
		max_prepared_xacts != ControlFile->max_prepared_xacts ||
		max_locks_per_xact != ControlFile->max_locks_per_xact ||
		track_commit_timestamp != ControlFile->track_commit_timestamp)
	{
		/*
		 * The change in number of backend slots doesn't need to be WAL-logged
		 * if archiving is not enabled, as you can't start archive recovery
		 * with wal_level=minimal anyway.  We don't really care about the
		 * values in pg_control either if wal_level=minimal, but seems better
		 * to keep them up-to-date to avoid confusion.
		 */
		if (wal_level != ControlFile->wal_level || XLogIsNeeded())
		{
			xl_parameter_change xlrec;
			XLogRecPtr	recptr;

			/* Snapshot the current settings into the WAL record payload. */
			xlrec.MaxConnections = MaxConnections;
			xlrec.max_worker_processes = max_worker_processes;
			xlrec.max_wal_senders = max_wal_senders;
			xlrec.max_prepared_xacts = max_prepared_xacts;
			xlrec.max_locks_per_xact = max_locks_per_xact;
			xlrec.wal_level = wal_level;
			xlrec.wal_log_hints = wal_log_hints;
			xlrec.track_commit_timestamp = track_commit_timestamp;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, sizeof(xlrec));

			/* Flush so the record is durable before pg_control is updated. */
			recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
			XLogFlush(recptr);
		}

		/* Now copy the new values into pg_control, under the usual lock. */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

		ControlFile->MaxConnections = MaxConnections;
		ControlFile->max_worker_processes = max_worker_processes;
		ControlFile->max_wal_senders = max_wal_senders;
		ControlFile->max_prepared_xacts = max_prepared_xacts;
		ControlFile->max_locks_per_xact = max_locks_per_xact;
		ControlFile->wal_level = wal_level;
		ControlFile->wal_log_hints = wal_log_hints;
		ControlFile->track_commit_timestamp = track_commit_timestamp;
		UpdateControlFile();

		LWLockRelease(ControlFileLock);
	}
}
10119
10120 /*
10121 * Update full_page_writes in shared memory, and write an
10122 * XLOG_FPW_CHANGE record if necessary.
10123 *
10124 * Note: this function assumes there is no other process running
10125 * concurrently that could update it.
10126 */
void
UpdateFullPageWrites(void)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	bool		recoveryInProgress;

	/*
	 * Do nothing if full_page_writes has not been changed.
	 *
	 * It's safe to check the shared full_page_writes without the lock,
	 * because we assume that there is no concurrently running process which
	 * can update it.
	 */
	if (fullPageWrites == Insert->fullPageWrites)
		return;

	/*
	 * Perform this outside critical section so that the WAL insert
	 * initialization done by RecoveryInProgress() doesn't trigger an
	 * assertion failure.
	 */
	recoveryInProgress = RecoveryInProgress();

	START_CRIT_SECTION();

	/*
	 * It's always safe to take full page images, even when not strictly
	 * required, but not the other way round.  So if we're setting
	 * full_page_writes to true, first set it true and then write the WAL
	 * record.  If we're setting it to false, first write the WAL record and
	 * then set the global flag.
	 */
	if (fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = true;
		WALInsertLockRelease();
	}

	/*
	 * Write an XLOG_FPW_CHANGE record.  This allows us to keep track of
	 * full_page_writes during archive recovery, if required.
	 */
	if (XLogStandbyInfoActive() && !recoveryInProgress)
	{
		XLogBeginInsert();
		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));

		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
	}

	/* See ordering comment above: the "false" case is updated last. */
	if (!fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = false;
		WALInsertLockRelease();
	}
	END_CRIT_SECTION();
}
10186
10187 /*
10188 * Check that it's OK to switch to new timeline during recovery.
10189 *
10190 * 'lsn' is the address of the shutdown checkpoint record we're about to
10191 * replay. (Currently, timeline can only change at a shutdown checkpoint).
10192 */
static void
checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
{
	/* Check that the record agrees on what the current (old) timeline is */
	if (prevTLI != ThisTimeLineID)
		ereport(PANIC,
				(errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
						prevTLI, ThisTimeLineID)));

	/*
	 * The new timeline better be in the list of timelines we expect to see,
	 * according to the timeline history.  It should also not decrease.
	 */
	if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
		ereport(PANIC,
				(errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
						newTLI, ThisTimeLineID)));

	/*
	 * If we have not yet reached min recovery point, and we're about to
	 * switch to a timeline greater than the timeline of the min recovery
	 * point: trouble.  After switching to the new timeline, we could not
	 * possibly visit the min recovery point on the correct timeline anymore.
	 * This can happen if there is a newer timeline in the archive that
	 * branched before the timeline the min recovery point is on, and you
	 * attempt to do PITR to the new timeline.
	 */
	if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
		lsn < minRecoveryPoint &&
		newTLI > minRecoveryPointTLI)
		ereport(PANIC,
				(errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
						newTLI,
						LSN_FORMAT_ARGS(minRecoveryPoint),
						minRecoveryPointTLI)));

	/* Looks good */
}
10231
10232 /*
10233 * XLOG resource manager's routines
10234 *
10235 * Definitions of info values are in include/catalog/pg_control.h, though
10236 * not all record types are related to control file updates.
10237 */
void
xlog_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	XLogRecPtr	lsn = record->EndRecPtr;

	/* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
	Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
		   !XLogRecHasAnyBlockRefs(record));

	if (info == XLOG_NEXTOID)
	{
		Oid			nextOid;

		/*
		 * We used to try to take the maximum of ShmemVariableCache->nextOid
		 * and the recorded nextOid, but that fails if the OID counter wraps
		 * around.  Since no OID allocation should be happening during replay
		 * anyway, better to just believe the record exactly.  We still take
		 * OidGenLock while setting the variable, just in case.
		 */
		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
		ShmemVariableCache->nextOid = nextOid;
		ShmemVariableCache->oidCount = 0;
		LWLockRelease(OidGenLock);
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
		ShmemVariableCache->nextXid = checkPoint.nextXid;
		LWLockRelease(XidGenLock);
		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
		LWLockRelease(OidGenLock);
		MultiXactSetNextMXact(checkPoint.nextMulti,
							  checkPoint.nextMultiOffset);

		MultiXactAdvanceOldest(checkPoint.oldestMulti,
							   checkPoint.oldestMultiDB);

		/*
		 * No need to set oldestClogXid here as well; it'll be set when we
		 * redo an xl_clog_truncate if it changed since initialization.
		 */
		SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);

		/*
		 * If we see a shutdown checkpoint while waiting for an end-of-backup
		 * record, the backup was canceled and the end-of-backup record will
		 * never arrive.
		 */
		if (ArchiveRecoveryRequested &&
			!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
			XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
			ereport(PANIC,
					(errmsg("online backup was canceled, recovery cannot continue")));

		/*
		 * If we see a shutdown checkpoint, we know that nothing was running
		 * on the primary at this point.  So fake-up an empty running-xacts
		 * record and use that here and now.  Recover additional standby state
		 * for prepared transactions.
		 */
		if (standbyState >= STANDBY_INITIALIZED)
		{
			TransactionId *xids;
			int			nxids;
			TransactionId oldestActiveXID;
			TransactionId latestCompletedXid;
			RunningTransactionsData running;

			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);

			/*
			 * Construct a RunningTransactions snapshot representing a shut
			 * down server, with only prepared transactions still alive. We're
			 * never overflowed at this point because all subxids are listed
			 * with their parent prepared transactions.
			 */
			running.xcnt = nxids;
			running.subxcnt = 0;
			running.subxid_overflow = false;
			running.nextXid = XidFromFullTransactionId(checkPoint.nextXid);
			running.oldestRunningXid = oldestActiveXID;
			latestCompletedXid = XidFromFullTransactionId(checkPoint.nextXid);
			TransactionIdRetreat(latestCompletedXid);
			Assert(TransactionIdIsNormal(latestCompletedXid));
			running.latestCompletedXid = latestCompletedXid;
			running.xids = xids;

			ProcArrayApplyRecoveryInfo(&running);

			StandbyRecoverPreparedTransactions();
		}

		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
		LWLockRelease(ControlFileLock);

		/* Update shared-memory copy of checkpoint XID/epoch */
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->ckptFullXid = checkPoint.nextXid;
		SpinLockRelease(&XLogCtl->info_lck);

		/*
		 * We should've already switched to the new TLI before replaying this
		 * record.
		 */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
			ereport(PANIC,
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							checkPoint.ThisTimeLineID, ThisTimeLineID)));

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In an ONLINE checkpoint, treat the XID counter as a minimum */
		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
		if (FullTransactionIdPrecedes(ShmemVariableCache->nextXid,
									  checkPoint.nextXid))
			ShmemVariableCache->nextXid = checkPoint.nextXid;
		LWLockRelease(XidGenLock);

		/*
		 * We ignore the nextOid counter in an ONLINE checkpoint, preferring
		 * to track OID assignment through XLOG_NEXTOID records.  The nextOid
		 * counter is from the start of the checkpoint and might well be stale
		 * compared to later XLOG_NEXTOID records.  We could try to take the
		 * maximum of the nextOid counter and our latest value, but since
		 * there's no particular guarantee about the speed with which the OID
		 * counter wraps around, that's a risky thing to do.  In any case,
		 * users of the nextOid counter are required to avoid assignment of
		 * duplicates, so that a somewhat out-of-date value should be safe.
		 */

		/* Handle multixact */
		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
								  checkPoint.nextMultiOffset);

		/*
		 * NB: This may perform multixact truncation when replaying WAL
		 * generated by an older primary.
		 */
		MultiXactAdvanceOldest(checkPoint.oldestMulti,
							   checkPoint.oldestMultiDB);
		if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
								  checkPoint.oldestXid))
			SetTransactionIdLimit(checkPoint.oldestXid,
								  checkPoint.oldestXidDB);
		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
		LWLockRelease(ControlFileLock);

		/* Update shared-memory copy of checkpoint XID/epoch */
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->ckptFullXid = checkPoint.nextXid;
		SpinLockRelease(&XLogCtl->info_lck);

		/* TLI should not change in an on-line checkpoint */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
			ereport(PANIC,
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							checkPoint.ThisTimeLineID, ThisTimeLineID)));

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_OVERWRITE_CONTRECORD)
	{
		xl_overwrite_contrecord xlrec;

		/* Cross-check the overwritten-contrecord bookkeeping. */
		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
		VerifyOverwriteContrecord(&xlrec, record);
	}
	else if (info == XLOG_END_OF_RECOVERY)
	{
		xl_end_of_recovery xlrec;

		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));

		/*
		 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
		 * but this case is rarer and harder to test, so the benefit doesn't
		 * outweigh the potential extra cost of maintenance.
		 */

		/*
		 * We should've already switched to the new TLI before replaying this
		 * record.
		 */
		if (xlrec.ThisTimeLineID != ThisTimeLineID)
			ereport(PANIC,
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							xlrec.ThisTimeLineID, ThisTimeLineID)));
	}
	else if (info == XLOG_NOOP)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_SWITCH)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_RESTORE_POINT)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
	{
		/*
		 * Full-page image (FPI) records contain nothing else but a backup
		 * block (or multiple backup blocks).  Every block reference must
		 * include a full-page image - otherwise there would be no point in
		 * this record.
		 *
		 * No recovery conflicts are generated by these generic records - if a
		 * resource manager needs to generate conflicts, it has to define a
		 * separate WAL record type and redo routine.
		 *
		 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
		 * WAL- logged because of a hint bit update.  They are only generated
		 * when checksums are enabled.  There is no difference in handling
		 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
		 * code just to distinguish them for statistics purposes.
		 */
		for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
		{
			Buffer		buffer;

			if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
				elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
			UnlockReleaseBuffer(buffer);
		}
	}
	else if (info == XLOG_BACKUP_END)
	{
		XLogRecPtr	startpoint;

		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

		if (ControlFile->backupStartPoint == startpoint)
		{
			/*
			 * We have reached the end of base backup, the point where
			 * pg_stop_backup() was done.  The data on disk is now consistent.
			 * Reset backupStartPoint, and update minRecoveryPoint to make
			 * sure we don't allow starting up at an earlier point even if
			 * recovery is stopped and restarted soon after this.
			 */
			elog(DEBUG1, "end of backup reached");

			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

			if (ControlFile->minRecoveryPoint < lsn)
			{
				ControlFile->minRecoveryPoint = lsn;
				ControlFile->minRecoveryPointTLI = ThisTimeLineID;
			}
			ControlFile->backupStartPoint = InvalidXLogRecPtr;
			ControlFile->backupEndRequired = false;
			UpdateControlFile();

			LWLockRelease(ControlFileLock);
		}
	}
	else if (info == XLOG_PARAMETER_CHANGE)
	{
		xl_parameter_change xlrec;

		/* Update our copy of the parameters in pg_control */
		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));

		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->MaxConnections = xlrec.MaxConnections;
		ControlFile->max_worker_processes = xlrec.max_worker_processes;
		ControlFile->max_wal_senders = xlrec.max_wal_senders;
		ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
		ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
		ControlFile->wal_level = xlrec.wal_level;
		ControlFile->wal_log_hints = xlrec.wal_log_hints;

		/*
		 * Update minRecoveryPoint to ensure that if recovery is aborted, we
		 * recover back up to this point before allowing hot standby again.
		 * This is important if the max_* settings are decreased, to ensure
		 * you don't run queries against the WAL preceding the change.  The
		 * local copies cannot be updated as long as crash recovery is
		 * happening and we expect all the WAL to be replayed.
		 */
		if (InArchiveRecovery)
		{
			minRecoveryPoint = ControlFile->minRecoveryPoint;
			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		}
		if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
		{
			ControlFile->minRecoveryPoint = lsn;
			ControlFile->minRecoveryPointTLI = ThisTimeLineID;
		}

		CommitTsParameterChange(xlrec.track_commit_timestamp,
								ControlFile->track_commit_timestamp);
		ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;

		UpdateControlFile();
		LWLockRelease(ControlFileLock);

		/* Check to see if any parameter change gives a problem on recovery */
		CheckRequiredParameterValues();
	}
	else if (info == XLOG_FPW_CHANGE)
	{
		bool		fpw;

		memcpy(&fpw, XLogRecGetData(record), sizeof(bool));

		/*
		 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
		 * do_pg_start_backup() and do_pg_stop_backup() can check whether
		 * full_page_writes has been disabled during online backup.
		 */
		if (!fpw)
		{
			SpinLockAcquire(&XLogCtl->info_lck);
			if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
				XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
			SpinLockRelease(&XLogCtl->info_lck);
		}

		/* Keep track of full_page_writes */
		lastFullPageWrites = fpw;
	}
}
10582
10583 /*
10584 * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record.
10585 */
static void
VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
{
	/* The record must name exactly the LSN we expected to be overwritten. */
	if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
		elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
			 LSN_FORMAT_ARGS(xlrec->overwritten_lsn),
			 LSN_FORMAT_ARGS(state->overwrittenRecPtr));

	ereport(LOG,
			(errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
					LSN_FORMAT_ARGS(xlrec->overwritten_lsn),
					timestamptz_to_str(xlrec->overwrite_time))));

	/* Verifying the record should only happen once */
	state->overwrittenRecPtr = InvalidXLogRecPtr;
}
10602
10603 #ifdef WAL_DEBUG
10604
/* Append a debug description (prev LSN, xid, data length, block refs) of
 * 'record' to 'buf'.  Only built under WAL_DEBUG. */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	appendStringInfo(buf, "prev %X/%X; xid %u",
					 LSN_FORMAT_ARGS(XLogRecGetPrev(record)),
					 XLogRecGetXid(record));

	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	xlog_block_info(buf, record);
}
10617 #endif /* WAL_DEBUG */
10618
10619 /*
10620 * Returns a string giving information about all the blocks in an
10621 * XLogRecord.
10622 */
10623 static void
xlog_block_info(StringInfo buf,XLogReaderState * record)10624 xlog_block_info(StringInfo buf, XLogReaderState *record)
10625 {
10626 int block_id;
10627
10628 /* decode block references */
10629 for (block_id = 0; block_id <= record->max_block_id; block_id++)
10630 {
10631 RelFileNode rnode;
10632 ForkNumber forknum;
10633 BlockNumber blk;
10634
10635 if (!XLogRecHasBlockRef(record, block_id))
10636 continue;
10637
10638 XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
10639 if (forknum != MAIN_FORKNUM)
10640 appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
10641 block_id,
10642 rnode.spcNode, rnode.dbNode, rnode.relNode,
10643 forknum,
10644 blk);
10645 else
10646 appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
10647 block_id,
10648 rnode.spcNode, rnode.dbNode, rnode.relNode,
10649 blk);
10650 if (XLogRecHasBlockImage(record, block_id))
10651 appendStringInfoString(buf, " FPW");
10652 }
10653 }
10654
10655 /*
10656 * Returns a string describing an XLogRecord, consisting of its identity
10657 * optionally followed by a colon, a space, and a further description.
10658 */
10659 static void
xlog_outdesc(StringInfo buf,XLogReaderState * record)10660 xlog_outdesc(StringInfo buf, XLogReaderState *record)
10661 {
10662 RmgrId rmid = XLogRecGetRmid(record);
10663 uint8 info = XLogRecGetInfo(record);
10664 const char *id;
10665
10666 appendStringInfoString(buf, RmgrTable[rmid].rm_name);
10667 appendStringInfoChar(buf, '/');
10668
10669 id = RmgrTable[rmid].rm_identify(info);
10670 if (id == NULL)
10671 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
10672 else
10673 appendStringInfo(buf, "%s: ", id);
10674
10675 RmgrTable[rmid].rm_desc(buf, record);
10676 }
10677
10678
10679 /*
10680 * Return the (possible) sync flag used for opening a file, depending on the
10681 * value of the GUC wal_sync_method.
10682 */
static int
get_sync_bit(int method)
{
	int			o_direct_flag = 0;

	/* If fsync is disabled, never open in sync mode */
	if (!enableFsync)
		return 0;

	/*
	 * Optimize writes by bypassing kernel cache with O_DIRECT when using
	 * O_SYNC/O_FSYNC and O_DSYNC.  But only if archiving and streaming are
	 * disabled, otherwise the archive command or walsender process will read
	 * the WAL soon after writing it, which is guaranteed to cause a physical
	 * read if we bypassed the kernel cache.  We also skip the
	 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
	 * reason.
	 *
	 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
	 * written by walreceiver is normally read by the startup process soon
	 * after it's written.  Also, walreceiver performs unaligned writes, which
	 * don't work with O_DIRECT, so it is required for correctness too.
	 */
	if (!XLogIsNeeded() && !AmWalReceiverProcess())
		o_direct_flag = PG_O_DIRECT;

	switch (method)
	{
			/*
			 * enum values for all sync options are defined even if they are
			 * not supported on the current platform.  But if not, they are
			 * not included in the enum option array, and therefore will never
			 * be seen here.
			 */
		case SYNC_METHOD_FSYNC:
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
		case SYNC_METHOD_FDATASYNC:
			/* These sync after the write, so no open flag is needed. */
			return 0;
#ifdef OPEN_SYNC_FLAG
		case SYNC_METHOD_OPEN:
			return OPEN_SYNC_FLAG | o_direct_flag;
#endif
#ifdef OPEN_DATASYNC_FLAG
		case SYNC_METHOD_OPEN_DSYNC:
			return OPEN_DATASYNC_FLAG | o_direct_flag;
#endif
		default:
			/* can't happen (unless we are out of sync with option array) */
			elog(ERROR, "unrecognized wal_sync_method: %d", method);
			return 0;			/* silence warning */
	}
}
10735
10736 /*
10737 * GUC support
10738 */
void
assign_xlog_sync_method(int new_sync_method, void *extra)
{
	if (sync_method != new_sync_method)
	{
		/*
		 * To ensure that no blocks escape unsynced, force an fsync on the
		 * currently open log segment (if any).  Also, if the open flag is
		 * changing, close the log file so it will be reopened (with new flag
		 * bit) at next use.
		 */
		if (openLogFile >= 0)
		{
			pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
			if (pg_fsync(openLogFile) != 0)
			{
				char		xlogfname[MAXFNAMELEN];
				int			save_errno;

				/* Preserve errno across XLogFileName, which may clobber it. */
				save_errno = errno;
				XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
							 wal_segment_size);
				errno = save_errno;
				ereport(PANIC,
						(errcode_for_file_access(),
						 errmsg("could not fsync file \"%s\": %m", xlogfname)));
			}

			pgstat_report_wait_end();
			/* Reopen only if the required open(2) flags actually changed. */
			if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
				XLogFileClose();
		}
	}
}
10773
10774
10775 /*
10776 * Issue appropriate kind of fsync (if any) for an XLOG output file.
10777 *
10778 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10779 * 'segno' is for error reporting purposes.
10780 */
void
issue_xlog_fsync(int fd, XLogSegNo segno)
{
	/* Non-NULL after the switch means the sync failed; msg is a translated
	 * format string with one %s (file name) and %m. */
	char	   *msg = NULL;
	instr_time	start;

	/*
	 * Quick exit if fsync is disabled or write() has already synced the WAL
	 * file.
	 */
	if (!enableFsync ||
		sync_method == SYNC_METHOD_OPEN ||
		sync_method == SYNC_METHOD_OPEN_DSYNC)
		return;

	/* Measure I/O timing to sync the WAL file */
	if (track_wal_io_timing)
		INSTR_TIME_SET_CURRENT(start);

	pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
	switch (sync_method)
	{
		case SYNC_METHOD_FSYNC:
			if (pg_fsync_no_writethrough(fd) != 0)
				msg = _("could not fsync file \"%s\": %m");
			break;
#ifdef HAVE_FSYNC_WRITETHROUGH
		case SYNC_METHOD_FSYNC_WRITETHROUGH:
			if (pg_fsync_writethrough(fd) != 0)
				msg = _("could not fsync write-through file \"%s\": %m");
			break;
#endif
#ifdef HAVE_FDATASYNC
		case SYNC_METHOD_FDATASYNC:
			if (pg_fdatasync(fd) != 0)
				msg = _("could not fdatasync file \"%s\": %m");
			break;
#endif
		case SYNC_METHOD_OPEN:
		case SYNC_METHOD_OPEN_DSYNC:
			/* not reachable */
			Assert(false);
			break;
		default:
			elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
			break;
	}

	/* PANIC if failed to fsync */
	if (msg)
	{
		char		xlogfname[MAXFNAMELEN];
		int			save_errno = errno;

		/* Preserve errno for %m across XLogFileName. */
		XLogFileName(xlogfname, ThisTimeLineID, segno,
					 wal_segment_size);
		errno = save_errno;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg(msg, xlogfname)));
	}

	pgstat_report_wait_end();

	/*
	 * Increment the I/O timing and the number of times WAL files were synced.
	 */
	if (track_wal_io_timing)
	{
		instr_time	duration;

		INSTR_TIME_SET_CURRENT(duration);
		INSTR_TIME_SUBTRACT(duration, start);
		WalStats.m_wal_sync_time += INSTR_TIME_GET_MICROSEC(duration);
	}

	WalStats.m_wal_sync++;
}
10859
10860 /*
10861 * do_pg_start_backup
10862 *
10863 * Utility function called at the start of an online backup. It creates the
10864 * necessary starting checkpoint and constructs the backup label file.
10865 *
10866 * There are two kind of backups: exclusive and non-exclusive. An exclusive
10867 * backup is started with pg_start_backup(), and there can be only one active
10868 * at a time. The backup and tablespace map files of an exclusive backup are
10869 * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10870 * removed by pg_stop_backup().
10871 *
10872 * A non-exclusive backup is used for the streaming base backups (see
10873 * src/backend/replication/basebackup.c). The difference to exclusive backups
10874 * is that the backup label and tablespace map files are not written to disk.
10875 * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10876 * and the caller is responsible for including them in the backup archive as
10877 * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10878 * active at the same time, and they don't conflict with an exclusive backup
10879 * either.
10880 *
10881 * labelfile and tblspcmapfile must be passed as NULL when starting an
10882 * exclusive backup, and as initially-empty StringInfos for a non-exclusive
10883 * backup.
10884 *
10885 * If "tablespaces" isn't NULL, it receives a list of tablespaceinfo structs
10886 * describing the cluster's tablespaces.
10887 *
10888 * tblspcmapfile is required mainly for tar format in windows as native windows
10889 * utilities are not able to create symlinks while extracting files from tar.
10890 * However for consistency, the same is used for all platforms.
10891 *
10892 * Returns the minimum WAL location that must be present to restore from this
10893 * backup, and the corresponding timeline ID in *starttli_p.
10894 *
10895 * Every successfully started non-exclusive backup must be stopped by calling
10896 * do_pg_stop_backup() or do_pg_abort_backup().
10897 *
10898 * It is the responsibility of the caller of this function to verify the
10899 * permissions of the calling user!
10900 */
XLogRecPtr
do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
				   StringInfo labelfile, List **tablespaces,
				   StringInfo tblspcmapfile)
{
	bool		exclusive = (labelfile == NULL);	/* NULL labelfile marks an exclusive backup */
	bool		backup_started_in_recovery = false;
	XLogRecPtr	checkpointloc;	/* location of the starting checkpoint record */
	XLogRecPtr	startpoint;		/* REDO pointer: minimum WAL needed for restore */
	TimeLineID	starttli;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		xlogfilename[MAXFNAMELEN];
	XLogSegNo	_logSegNo;
	struct stat stat_buf;
	FILE	   *fp;

	backup_started_in_recovery = RecoveryInProgress();

	/*
	 * Currently only non-exclusive backup can be taken during recovery.
	 */
	if (backup_started_in_recovery && exclusive)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

	/*
	 * During recovery, we don't need to check WAL level. Because, if WAL
	 * level is not sufficient, it's impossible to get here during recovery.
	 */
	if (!backup_started_in_recovery && !XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));

	/* The label is written verbatim into backup_label, so bound its size. */
	if (strlen(backupidstr) > MAXPGPATH)
		ereport(ERROR,
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
				 errmsg("backup label too long (max %d bytes)",
						MAXPGPATH)));

	/*
	 * Mark backup active in shared memory.  We must do full-page WAL writes
	 * during an on-line backup even if not doing so at other times, because
	 * it's quite possible for the backup dump to obtain a "torn" (partially
	 * written) copy of a database page if it reads the page concurrently with
	 * our write to the same page.  This can be fixed as long as the first
	 * write to the page in the WAL sequence is a full-page write.  Hence, we
	 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
	 * are no dirty pages in shared memory that might get dumped while the
	 * backup is in progress without having a corresponding WAL record.  (Once
	 * the backup is complete, we need not force full-page writes anymore,
	 * since we expect that any pages not modified during the backup interval
	 * must have been correctly captured by the backup.)
	 *
	 * Note that forcePageWrites has no effect during an online backup from
	 * the standby.
	 *
	 * We must hold all the insertion locks to change the value of
	 * forcePageWrites, to ensure adequate interlocking against
	 * XLogInsertRecord().
	 */
	WALInsertLockAcquireExclusive();
	if (exclusive)
	{
		/*
		 * At first, mark that we're now starting an exclusive backup, to
		 * ensure that there are no other sessions currently running
		 * pg_start_backup() or pg_stop_backup().
		 */
		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
		{
			WALInsertLockRelease();
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("a backup is already in progress"),
					 errhint("Run pg_stop_backup() and try again.")));
		}
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
	}
	else
		XLogCtl->Insert.nonExclusiveBackups++;
	XLogCtl->Insert.forcePageWrites = true;
	WALInsertLockRelease();

	/* Ensure we release forcePageWrites if fail below */
	PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
	{
		bool		gotUniqueStartpoint = false;
		DIR		   *tblspcdir;
		struct dirent *de;
		tablespaceinfo *ti;
		int			datadirpathlen;

		/*
		 * Force an XLOG file switch before the checkpoint, to ensure that the
		 * WAL segment the checkpoint is written to doesn't contain pages with
		 * old timeline IDs.  That would otherwise happen if you called
		 * pg_start_backup() right after restoring from a PITR archive: the
		 * first WAL segment containing the startup checkpoint has pages in
		 * the beginning with the old timeline ID.  That can cause trouble at
		 * recovery: we won't have a history file covering the old timeline if
		 * pg_wal directory was not included in the base backup and the WAL
		 * archive was cleared too before starting the backup.
		 *
		 * This also ensures that we have emitted a WAL page header that has
		 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
		 * Therefore, if a WAL archiver (such as pglesslog) is trying to
		 * compress out removable backup blocks, it won't remove any that
		 * occur after this point.
		 *
		 * During recovery, we skip forcing XLOG file switch, which means that
		 * the backup taken during recovery is not available for the special
		 * recovery case described above.
		 */
		if (!backup_started_in_recovery)
			RequestXLogSwitch(false);

		/*
		 * Loop until we get a checkpoint whose REDO pointer is unique to this
		 * backup (concurrent backups may observe the same checkpoint and must
		 * retry with a fresh one).
		 */
		do
		{
			bool		checkpointfpw;

			/*
			 * Force a CHECKPOINT.  Aside from being necessary to prevent torn
			 * page problems, this guarantees that two successive backup runs
			 * will have different checkpoint positions and hence different
			 * history file names, even if nothing happened in between.
			 *
			 * During recovery, establish a restartpoint if possible. We use
			 * the last restartpoint as the backup starting checkpoint. This
			 * means that two successive backup runs can have same checkpoint
			 * positions.
			 *
			 * Since the fact that we are executing do_pg_start_backup()
			 * during recovery means that checkpointer is running, we can use
			 * RequestCheckpoint() to establish a restartpoint.
			 *
			 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
			 * passing fast = true).  Otherwise this can take awhile.
			 */
			RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
							  (fast ? CHECKPOINT_IMMEDIATE : 0));

			/*
			 * Now we need to fetch the checkpoint record location, and also
			 * its REDO pointer.  The oldest point in WAL that would be needed
			 * to restore starting from the checkpoint is precisely the REDO
			 * pointer.
			 */
			LWLockAcquire(ControlFileLock, LW_SHARED);
			checkpointloc = ControlFile->checkPoint;
			startpoint = ControlFile->checkPointCopy.redo;
			starttli = ControlFile->checkPointCopy.ThisTimeLineID;
			checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
			LWLockRelease(ControlFileLock);

			if (backup_started_in_recovery)
			{
				XLogRecPtr	recptr;

				/*
				 * Check to see if all WAL replayed during online backup
				 * (i.e., since last restartpoint used as backup starting
				 * checkpoint) contain full-page writes.
				 */
				SpinLockAcquire(&XLogCtl->info_lck);
				recptr = XLogCtl->lastFpwDisableRecPtr;
				SpinLockRelease(&XLogCtl->info_lck);

				if (!checkpointfpw || startpoint <= recptr)
					ereport(ERROR,
							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							 errmsg("WAL generated with full_page_writes=off was replayed "
									"since last restartpoint"),
							 errhint("This means that the backup being taken on the standby "
									 "is corrupt and should not be used. "
									 "Enable full_page_writes and run CHECKPOINT on the primary, "
									 "and then try an online backup again.")));

				/*
				 * During recovery, since we don't use the end-of-backup WAL
				 * record and don't write the backup history file, the
				 * starting WAL location doesn't need to be unique. This means
				 * that two base backups started at the same time might use
				 * the same checkpoint as starting locations.
				 */
				gotUniqueStartpoint = true;
			}

			/*
			 * If two base backups are started at the same time (in WAL sender
			 * processes), we need to make sure that they use different
			 * checkpoints as starting locations, because we use the starting
			 * WAL location as a unique identifier for the base backup in the
			 * end-of-backup WAL record and when we write the backup history
			 * file.  Perhaps it would be better generate a separate unique ID
			 * for each backup instead of forcing another checkpoint, but
			 * taking a checkpoint right after another is not that expensive
			 * either because only few buffers have been dirtied yet.
			 */
			WALInsertLockAcquireExclusive();
			if (XLogCtl->Insert.lastBackupStart < startpoint)
			{
				XLogCtl->Insert.lastBackupStart = startpoint;
				gotUniqueStartpoint = true;
			}
			WALInsertLockRelease();
		} while (!gotUniqueStartpoint);

		XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
		XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);

		/*
		 * Construct tablespace_map file.  If caller isn't interested in this,
		 * we make a local StringInfo.
		 */
		if (tblspcmapfile == NULL)
			tblspcmapfile = makeStringInfo();

		datadirpathlen = strlen(DataDir);

		/* Collect information about all tablespaces */
		tblspcdir = AllocateDir("pg_tblspc");
		while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
		{
			char		fullpath[MAXPGPATH + 10];	/* +10 leaves room for "pg_tblspc/" */
			char		linkpath[MAXPGPATH];
			char	   *relpath = NULL;
			int			rllen;
			StringInfoData escapedpath;
			char	   *s;

			/* Skip anything that doesn't look like a tablespace */
			if (strspn(de->d_name, "0123456789") != strlen(de->d_name))
				continue;

			snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);

#if defined(HAVE_READLINK) || defined(WIN32)
			rllen = readlink(fullpath, linkpath, sizeof(linkpath));
			if (rllen < 0)
			{
				ereport(WARNING,
						(errmsg("could not read symbolic link \"%s\": %m",
								fullpath)));
				continue;
			}
			else if (rllen >= sizeof(linkpath))
			{
				ereport(WARNING,
						(errmsg("symbolic link \"%s\" target is too long",
								fullpath)));
				continue;
			}
			/* readlink does not NUL-terminate; do it ourselves */
			linkpath[rllen] = '\0';

			/*
			 * Build a backslash-escaped version of the link path to include
			 * in the tablespace map file.
			 */
			initStringInfo(&escapedpath);
			for (s = linkpath; *s; s++)
			{
				if (*s == '\n' || *s == '\r' || *s == '\\')
					appendStringInfoChar(&escapedpath, '\\');
				appendStringInfoChar(&escapedpath, *s);
			}

			/*
			 * Relpath holds the relative path of the tablespace directory
			 * when it's located within PGDATA, or NULL if it's located
			 * elsewhere.
			 */
			if (rllen > datadirpathlen &&
				strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
				IS_DIR_SEP(linkpath[datadirpathlen]))
				relpath = linkpath + datadirpathlen + 1;

			ti = palloc(sizeof(tablespaceinfo));
			ti->oid = pstrdup(de->d_name);
			ti->path = pstrdup(linkpath);
			ti->rpath = relpath ? pstrdup(relpath) : NULL;
			ti->size = -1;		/* size is computed later, if at all */

			if (tablespaces)
				*tablespaces = lappend(*tablespaces, ti);

			appendStringInfo(tblspcmapfile, "%s %s\n",
							 ti->oid, escapedpath.data);

			pfree(escapedpath.data);
#else

			/*
			 * If the platform does not have symbolic links, it should not be
			 * possible to have tablespaces - clearly somebody else created
			 * them. Warn about it and ignore.
			 */
			ereport(WARNING,
					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
					 errmsg("tablespaces are not supported on this platform")));
#endif
		}
		FreeDir(tblspcdir);

		/*
		 * Construct backup label file.  If caller isn't interested in this,
		 * we make a local StringInfo.
		 */
		if (labelfile == NULL)
			labelfile = makeStringInfo();

		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));
		appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
						 LSN_FORMAT_ARGS(startpoint), xlogfilename);
		appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
						 LSN_FORMAT_ARGS(checkpointloc));
		appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
						 exclusive ? "pg_start_backup" : "streamed");
		appendStringInfo(labelfile, "BACKUP FROM: %s\n",
						 backup_started_in_recovery ? "standby" : "primary");
		appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
		appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
		appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);

		/*
		 * Okay, write the file, or return its contents to caller.
		 */
		if (exclusive)
		{
			/*
			 * Check for existing backup label --- implies a backup is already
			 * running.  (XXX given that we checked exclusiveBackupState
			 * above, maybe it would be OK to just unlink any such label
			 * file?)
			 */
			if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
			{
				/* ENOENT is the expected "no stale label" case */
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat file \"%s\": %m",
									BACKUP_LABEL_FILE)));
			}
			else
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("a backup is already in progress"),
						 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
								 BACKUP_LABEL_FILE)));

			fp = AllocateFile(BACKUP_LABEL_FILE, "w");

			if (!fp)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not create file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			/* Write, flush, and fsync so the label survives a crash */
			if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
				fflush(fp) != 0 ||
				pg_fsync(fileno(fp)) != 0 ||
				ferror(fp) ||
				FreeFile(fp))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not write file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			/* Allocated locally for exclusive backups, so free separately */
			pfree(labelfile->data);
			pfree(labelfile);

			/* Write backup tablespace_map file. */
			if (tblspcmapfile->len > 0)
			{
				if (stat(TABLESPACE_MAP, &stat_buf) != 0)
				{
					if (errno != ENOENT)
						ereport(ERROR,
								(errcode_for_file_access(),
								 errmsg("could not stat file \"%s\": %m",
										TABLESPACE_MAP)));
				}
				else
					ereport(ERROR,
							(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
							 errmsg("a backup is already in progress"),
							 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
									 TABLESPACE_MAP)));

				fp = AllocateFile(TABLESPACE_MAP, "w");

				if (!fp)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not create file \"%s\": %m",
									TABLESPACE_MAP)));
				if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
					fflush(fp) != 0 ||
					pg_fsync(fileno(fp)) != 0 ||
					ferror(fp) ||
					FreeFile(fp))
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not write file \"%s\": %m",
									TABLESPACE_MAP)));
			}

			/* Allocated locally for exclusive backups, so free separately */
			pfree(tblspcmapfile->data);
			pfree(tblspcmapfile);
		}
	}
	PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));

	/*
	 * Mark that start phase has correctly finished for an exclusive backup.
	 * Session-level locks are updated as well to reflect that state.
	 *
	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
	 * counters and session-level lock. Otherwise they can be updated
	 * inconsistently, and which might cause do_pg_abort_backup() to fail.
	 */
	if (exclusive)
	{
		WALInsertLockAcquireExclusive();
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;

		/* Set session-level lock */
		sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
		WALInsertLockRelease();
	}
	else
		sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;

	/*
	 * We're done.  As a convenience, return the starting WAL location.
	 */
	if (starttli_p)
		*starttli_p = starttli;
	return startpoint;
}
11349
11350 /* Error cleanup callback for pg_start_backup */
11351 static void
pg_start_backup_callback(int code,Datum arg)11352 pg_start_backup_callback(int code, Datum arg)
11353 {
11354 bool exclusive = DatumGetBool(arg);
11355
11356 /* Update backup counters and forcePageWrites on failure */
11357 WALInsertLockAcquireExclusive();
11358 if (exclusive)
11359 {
11360 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
11361 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
11362 }
11363 else
11364 {
11365 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11366 XLogCtl->Insert.nonExclusiveBackups--;
11367 }
11368
11369 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11370 XLogCtl->Insert.nonExclusiveBackups == 0)
11371 {
11372 XLogCtl->Insert.forcePageWrites = false;
11373 }
11374 WALInsertLockRelease();
11375 }
11376
11377 /*
11378 * Error cleanup callback for pg_stop_backup
11379 */
11380 static void
pg_stop_backup_callback(int code,Datum arg)11381 pg_stop_backup_callback(int code, Datum arg)
11382 {
11383 bool exclusive = DatumGetBool(arg);
11384
11385 /* Update backup status on failure */
11386 WALInsertLockAcquireExclusive();
11387 if (exclusive)
11388 {
11389 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
11390 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
11391 }
11392 WALInsertLockRelease();
11393 }
11394
11395 /*
11396 * Utility routine to fetch the session-level status of a backup running.
11397 */
11398 SessionBackupState
get_backup_status(void)11399 get_backup_status(void)
11400 {
11401 return sessionBackupState;
11402 }
11403
11404 /*
11405 * do_pg_stop_backup
11406 *
11407 * Utility function called at the end of an online backup. It cleans up the
11408 * backup state and can optionally wait for WAL segments to be archived.
11409 *
11410 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
11411 * the non-exclusive backup specified by 'labelfile'.
11412 *
11413 * Returns the last WAL location that must be present to restore from this
11414 * backup, and the corresponding timeline ID in *stoptli_p.
11415 *
11416 * It is the responsibility of the caller of this function to verify the
11417 * permissions of the calling user!
11418 */
11419 XLogRecPtr
do_pg_stop_backup(char * labelfile,bool waitforarchive,TimeLineID * stoptli_p)11420 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
11421 {
11422 bool exclusive = (labelfile == NULL);
11423 bool backup_started_in_recovery = false;
11424 XLogRecPtr startpoint;
11425 XLogRecPtr stoppoint;
11426 TimeLineID stoptli;
11427 pg_time_t stamp_time;
11428 char strfbuf[128];
11429 char histfilepath[MAXPGPATH];
11430 char startxlogfilename[MAXFNAMELEN];
11431 char stopxlogfilename[MAXFNAMELEN];
11432 char lastxlogfilename[MAXFNAMELEN];
11433 char histfilename[MAXFNAMELEN];
11434 char backupfrom[20];
11435 XLogSegNo _logSegNo;
11436 FILE *lfp;
11437 FILE *fp;
11438 char ch;
11439 int seconds_before_warning;
11440 int waits = 0;
11441 bool reported_waiting = false;
11442 char *remaining;
11443 char *ptr;
11444 uint32 hi,
11445 lo;
11446
11447 backup_started_in_recovery = RecoveryInProgress();
11448
11449 /*
11450 * Currently only non-exclusive backup can be taken during recovery.
11451 */
11452 if (backup_started_in_recovery && exclusive)
11453 ereport(ERROR,
11454 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11455 errmsg("recovery is in progress"),
11456 errhint("WAL control functions cannot be executed during recovery.")));
11457
11458 /*
11459 * During recovery, we don't need to check WAL level. Because, if WAL
11460 * level is not sufficient, it's impossible to get here during recovery.
11461 */
11462 if (!backup_started_in_recovery && !XLogIsNeeded())
11463 ereport(ERROR,
11464 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11465 errmsg("WAL level not sufficient for making an online backup"),
11466 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
11467
11468 if (exclusive)
11469 {
11470 /*
11471 * At first, mark that we're now stopping an exclusive backup, to
11472 * ensure that there are no other sessions currently running
11473 * pg_start_backup() or pg_stop_backup().
11474 */
11475 WALInsertLockAcquireExclusive();
11476 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
11477 {
11478 WALInsertLockRelease();
11479 ereport(ERROR,
11480 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11481 errmsg("exclusive backup not in progress")));
11482 }
11483 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
11484 WALInsertLockRelease();
11485
11486 /*
11487 * Remove backup_label. In case of failure, the state for an exclusive
11488 * backup is switched back to in-progress.
11489 */
11490 PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
11491 {
11492 /*
11493 * Read the existing label file into memory.
11494 */
11495 struct stat statbuf;
11496 int r;
11497
11498 if (stat(BACKUP_LABEL_FILE, &statbuf))
11499 {
11500 /* should not happen per the upper checks */
11501 if (errno != ENOENT)
11502 ereport(ERROR,
11503 (errcode_for_file_access(),
11504 errmsg("could not stat file \"%s\": %m",
11505 BACKUP_LABEL_FILE)));
11506 ereport(ERROR,
11507 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11508 errmsg("a backup is not in progress")));
11509 }
11510
11511 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11512 if (!lfp)
11513 {
11514 ereport(ERROR,
11515 (errcode_for_file_access(),
11516 errmsg("could not read file \"%s\": %m",
11517 BACKUP_LABEL_FILE)));
11518 }
11519 labelfile = palloc(statbuf.st_size + 1);
11520 r = fread(labelfile, statbuf.st_size, 1, lfp);
11521 labelfile[statbuf.st_size] = '\0';
11522
11523 /*
11524 * Close and remove the backup label file
11525 */
11526 if (r != 1 || ferror(lfp) || FreeFile(lfp))
11527 ereport(ERROR,
11528 (errcode_for_file_access(),
11529 errmsg("could not read file \"%s\": %m",
11530 BACKUP_LABEL_FILE)));
11531 durable_unlink(BACKUP_LABEL_FILE, ERROR);
11532
11533 /*
11534 * Remove tablespace_map file if present, it is created only if
11535 * there are tablespaces.
11536 */
11537 durable_unlink(TABLESPACE_MAP, DEBUG1);
11538 }
11539 PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
11540 }
11541
11542 /*
11543 * OK to update backup counters, forcePageWrites and session-level lock.
11544 *
11545 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
11546 * Otherwise they can be updated inconsistently, and which might cause
11547 * do_pg_abort_backup() to fail.
11548 */
11549 WALInsertLockAcquireExclusive();
11550 if (exclusive)
11551 {
11552 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
11553 }
11554 else
11555 {
11556 /*
11557 * The user-visible pg_start/stop_backup() functions that operate on
11558 * exclusive backups can be called at any time, but for non-exclusive
11559 * backups, it is expected that each do_pg_start_backup() call is
11560 * matched by exactly one do_pg_stop_backup() call.
11561 */
11562 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11563 XLogCtl->Insert.nonExclusiveBackups--;
11564 }
11565
11566 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11567 XLogCtl->Insert.nonExclusiveBackups == 0)
11568 {
11569 XLogCtl->Insert.forcePageWrites = false;
11570 }
11571
11572 /*
11573 * Clean up session-level lock.
11574 *
11575 * You might think that WALInsertLockRelease() can be called before
11576 * cleaning up session-level lock because session-level lock doesn't need
11577 * to be protected with WAL insertion lock. But since
11578 * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
11579 * cleaned up before it.
11580 */
11581 sessionBackupState = SESSION_BACKUP_NONE;
11582
11583 WALInsertLockRelease();
11584
11585 /*
11586 * Read and parse the START WAL LOCATION line (this code is pretty crude,
11587 * but we are not expecting any variability in the file format).
11588 */
11589 if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
11590 &hi, &lo, startxlogfilename,
11591 &ch) != 4 || ch != '\n')
11592 ereport(ERROR,
11593 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11594 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11595 startpoint = ((uint64) hi) << 32 | lo;
11596 remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
11597
11598 /*
11599 * Parse the BACKUP FROM line. If we are taking an online backup from the
11600 * standby, we confirm that the standby has not been promoted during the
11601 * backup.
11602 */
11603 ptr = strstr(remaining, "BACKUP FROM:");
11604 if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
11605 ereport(ERROR,
11606 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11607 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11608 if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
11609 ereport(ERROR,
11610 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11611 errmsg("the standby was promoted during online backup"),
11612 errhint("This means that the backup being taken is corrupt "
11613 "and should not be used. "
11614 "Try taking another online backup.")));
11615
11616 /*
11617 * During recovery, we don't write an end-of-backup record. We assume that
11618 * pg_control was backed up last and its minimum recovery point can be
11619 * available as the backup end location. Since we don't have an
11620 * end-of-backup record, we use the pg_control value to check whether
11621 * we've reached the end of backup when starting recovery from this
11622 * backup. We have no way of checking if pg_control wasn't backed up last
11623 * however.
11624 *
11625 * We don't force a switch to new WAL file but it is still possible to
11626 * wait for all the required files to be archived if waitforarchive is
11627 * true. This is okay if we use the backup to start a standby and fetch
11628 * the missing WAL using streaming replication. But in the case of an
11629 * archive recovery, a user should set waitforarchive to true and wait for
11630 * them to be archived to ensure that all the required files are
11631 * available.
11632 *
11633 * We return the current minimum recovery point as the backup end
11634 * location. Note that it can be greater than the exact backup end
11635 * location if the minimum recovery point is updated after the backup of
11636 * pg_control. This is harmless for current uses.
11637 *
11638 * XXX currently a backup history file is for informational and debug
11639 * purposes only. It's not essential for an online backup. Furthermore,
11640 * even if it's created, it will not be archived during recovery because
11641 * an archiver is not invoked. So it doesn't seem worthwhile to write a
11642 * backup history file during recovery.
11643 */
11644 if (backup_started_in_recovery)
11645 {
11646 XLogRecPtr recptr;
11647
11648 /*
11649 * Check to see if all WAL replayed during online backup contain
11650 * full-page writes.
11651 */
11652 SpinLockAcquire(&XLogCtl->info_lck);
11653 recptr = XLogCtl->lastFpwDisableRecPtr;
11654 SpinLockRelease(&XLogCtl->info_lck);
11655
11656 if (startpoint <= recptr)
11657 ereport(ERROR,
11658 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11659 errmsg("WAL generated with full_page_writes=off was replayed "
11660 "during online backup"),
11661 errhint("This means that the backup being taken on the standby "
11662 "is corrupt and should not be used. "
11663 "Enable full_page_writes and run CHECKPOINT on the primary, "
11664 "and then try an online backup again.")));
11665
11666
11667 LWLockAcquire(ControlFileLock, LW_SHARED);
11668 stoppoint = ControlFile->minRecoveryPoint;
11669 stoptli = ControlFile->minRecoveryPointTLI;
11670 LWLockRelease(ControlFileLock);
11671 }
11672 else
11673 {
11674 /*
11675 * Write the backup-end xlog record
11676 */
11677 XLogBeginInsert();
11678 XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
11679 stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
11680 stoptli = ThisTimeLineID;
11681
11682 /*
11683 * Force a switch to a new xlog segment file, so that the backup is
11684 * valid as soon as archiver moves out the current segment file.
11685 */
11686 RequestXLogSwitch(false);
11687
11688 XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11689 XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
11690
11691 /* Use the log timezone here, not the session timezone */
11692 stamp_time = (pg_time_t) time(NULL);
11693 pg_strftime(strfbuf, sizeof(strfbuf),
11694 "%Y-%m-%d %H:%M:%S %Z",
11695 pg_localtime(&stamp_time, log_timezone));
11696
11697 /*
11698 * Write the backup history file
11699 */
11700 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11701 BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
11702 startpoint, wal_segment_size);
11703 fp = AllocateFile(histfilepath, "w");
11704 if (!fp)
11705 ereport(ERROR,
11706 (errcode_for_file_access(),
11707 errmsg("could not create file \"%s\": %m",
11708 histfilepath)));
11709 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
11710 LSN_FORMAT_ARGS(startpoint), startxlogfilename);
11711 fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
11712 LSN_FORMAT_ARGS(stoppoint), stopxlogfilename);
11713
11714 /*
11715 * Transfer remaining lines including label and start timeline to
11716 * history file.
11717 */
11718 fprintf(fp, "%s", remaining);
11719 fprintf(fp, "STOP TIME: %s\n", strfbuf);
11720 fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
11721 if (fflush(fp) || ferror(fp) || FreeFile(fp))
11722 ereport(ERROR,
11723 (errcode_for_file_access(),
11724 errmsg("could not write file \"%s\": %m",
11725 histfilepath)));
11726
11727 /*
11728 * Clean out any no-longer-needed history files. As a side effect,
11729 * this will post a .ready file for the newly created history file,
11730 * notifying the archiver that history file may be archived
11731 * immediately.
11732 */
11733 CleanupBackupHistory();
11734 }
11735
11736 /*
11737 * If archiving is enabled, wait for all the required WAL files to be
11738 * archived before returning. If archiving isn't enabled, the required WAL
11739 * needs to be transported via streaming replication (hopefully with
11740 * wal_keep_size set high enough), or some more exotic mechanism like
11741 * polling and copying files from pg_wal with script. We have no knowledge
11742 * of those mechanisms, so it's up to the user to ensure that he gets all
11743 * the required WAL.
11744 *
11745 * We wait until both the last WAL file filled during backup and the
11746 * history file have been archived, and assume that the alphabetic sorting
11747 * property of the WAL files ensures any earlier WAL files are safely
11748 * archived as well.
11749 *
11750 * We wait forever, since archive_command is supposed to work and we
11751 * assume the admin wanted his backup to work completely. If you don't
11752 * wish to wait, then either waitforarchive should be passed in as false,
11753 * or you can set statement_timeout. Also, some notices are issued to
11754 * clue in anyone who might be doing this interactively.
11755 */
11756
11757 if (waitforarchive &&
11758 ((!backup_started_in_recovery && XLogArchivingActive()) ||
11759 (backup_started_in_recovery && XLogArchivingAlways())))
11760 {
11761 XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11762 XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
11763
11764 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11765 BackupHistoryFileName(histfilename, stoptli, _logSegNo,
11766 startpoint, wal_segment_size);
11767
11768 seconds_before_warning = 60;
11769 waits = 0;
11770
11771 while (XLogArchiveIsBusy(lastxlogfilename) ||
11772 XLogArchiveIsBusy(histfilename))
11773 {
11774 CHECK_FOR_INTERRUPTS();
11775
11776 if (!reported_waiting && waits > 5)
11777 {
11778 ereport(NOTICE,
11779 (errmsg("base backup done, waiting for required WAL segments to be archived")));
11780 reported_waiting = true;
11781 }
11782
11783 pgstat_report_wait_start(WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
11784 pg_usleep(1000000L);
11785 pgstat_report_wait_end();
11786
11787 if (++waits >= seconds_before_warning)
11788 {
11789 seconds_before_warning *= 2; /* This wraps in >10 years... */
11790 ereport(WARNING,
11791 (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
11792 waits),
11793 errhint("Check that your archive_command is executing properly. "
11794 "You can safely cancel this backup, "
11795 "but the database backup will not be usable without all the WAL segments.")));
11796 }
11797 }
11798
11799 ereport(NOTICE,
11800 (errmsg("all required WAL segments have been archived")));
11801 }
11802 else if (waitforarchive)
11803 ereport(NOTICE,
11804 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11805
11806 /*
11807 * We're done. As a convenience, return the ending WAL location.
11808 */
11809 if (stoptli_p)
11810 *stoptli_p = stoptli;
11811 return stoppoint;
11812 }
11813
11814
11815 /*
11816 * do_pg_abort_backup: abort a running backup
11817 *
11818 * This does just the most basic steps of do_pg_stop_backup(), by taking the
11819 * system out of backup mode, thus making it a lot more safe to call from
11820 * an error handler.
11821 *
11822 * The caller can pass 'arg' as 'true' or 'false' to control whether a warning
11823 * is emitted.
11824 *
11825 * NB: This is only for aborting a non-exclusive backup that doesn't write
11826 * backup_label. A backup started with pg_start_backup() needs to be finished
11827 * with pg_stop_backup().
11828 *
11829 * NB: This gets used as a before_shmem_exit handler, hence the odd-looking
11830 * signature.
11831 */
void
do_pg_abort_backup(int code, Datum arg)
{
	/* 'arg' carries the caller's choice of whether to emit a warning */
	bool		emit_warning = DatumGetBool(arg);

	/*
	 * Quick exit if session is not keeping around a non-exclusive backup
	 * already started.
	 */
	if (sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE)
		return;

	/* Drop this session's claim on non-exclusive backup mode */
	WALInsertLockAcquireExclusive();
	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
	XLogCtl->Insert.nonExclusiveBackups--;

	/*
	 * Only turn off forcePageWrites when no backup of any kind (exclusive or
	 * non-exclusive) remains in progress.
	 */
	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
	WALInsertLockRelease();

	/* Warn only if requested; silent when aborting deliberately */
	if (emit_warning)
		ereport(WARNING,
				(errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
}
11859
11860 /*
11861 * Register a handler that will warn about unterminated backups at end of
11862 * session, unless this has already been done.
11863 */
11864 void
register_persistent_abort_backup_handler(void)11865 register_persistent_abort_backup_handler(void)
11866 {
11867 static bool already_done = false;
11868
11869 if (already_done)
11870 return;
11871 before_shmem_exit(do_pg_abort_backup, DatumGetBool(true));
11872 already_done = true;
11873 }
11874
11875 /*
11876 * Get latest redo apply position.
11877 *
11878 * Exported to allow WALReceiver to read the pointer directly.
11879 */
11880 XLogRecPtr
GetXLogReplayRecPtr(TimeLineID * replayTLI)11881 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11882 {
11883 XLogRecPtr recptr;
11884 TimeLineID tli;
11885
11886 SpinLockAcquire(&XLogCtl->info_lck);
11887 recptr = XLogCtl->lastReplayedEndRecPtr;
11888 tli = XLogCtl->lastReplayedTLI;
11889 SpinLockRelease(&XLogCtl->info_lck);
11890
11891 if (replayTLI)
11892 *replayTLI = tli;
11893 return recptr;
11894 }
11895
11896 /*
11897 * Get latest WAL insert pointer
11898 */
11899 XLogRecPtr
GetXLogInsertRecPtr(void)11900 GetXLogInsertRecPtr(void)
11901 {
11902 XLogCtlInsert *Insert = &XLogCtl->Insert;
11903 uint64 current_bytepos;
11904
11905 SpinLockAcquire(&Insert->insertpos_lck);
11906 current_bytepos = Insert->CurrBytePos;
11907 SpinLockRelease(&Insert->insertpos_lck);
11908
11909 return XLogBytePosToRecPtr(current_bytepos);
11910 }
11911
11912 /*
11913 * Get latest WAL write pointer
11914 */
XLogRecPtr
GetXLogWriteRecPtr(void)
{
	/*
	 * Refresh this process's local LogwrtResult cache from shared memory
	 * under the info spinlock.  Note the assignment to the file-level
	 * LogwrtResult variable is an intentional side effect: subsequent
	 * callers in this backend see the updated cache.
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Write;
}
11924
11925 /*
11926 * Returns the redo pointer of the last checkpoint or restartpoint. This is
11927 * the oldest point in WAL that we still need, if we have to restart recovery.
11928 */
void
GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
{
	/*
	 * Read the checkpoint copy from the control file under shared lock so
	 * the redo pointer and timeline are a consistent pair.
	 */
	LWLockAcquire(ControlFileLock, LW_SHARED);
	*oldrecptr = ControlFile->checkPointCopy.redo;
	*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
	LWLockRelease(ControlFileLock);
}
11937
11938 /*
11939 * read_backup_label: check to see if a backup_label file is present
11940 *
11941 * If we see a backup_label during recovery, we assume that we are recovering
11942 * from a backup dump file, and we therefore roll forward from the checkpoint
11943 * identified by the label file, NOT what pg_control says. This avoids the
11944 * problem that pg_control might have been archived one or more checkpoints
11945 * later than the start of the dump, and so if we rely on it as the start
11946 * point, we will fail to restore a consistent database state.
11947 *
11948 * Returns true if a backup_label was found (and fills the checkpoint
11949 * location and its REDO location into *checkPointLoc and RedoStartLSN,
11950 * respectively); returns false if not. If this backup_label came from a
11951 * streamed backup, *backupEndRequired is set to true. If this backup_label
11952 * was created during recovery, *backupFromStandby is set to true.
11953 */
static bool
read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
				  bool *backupFromStandby)
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli_from_walseg,
				tli_from_file;
	FILE	   *lfp;
	char		ch;
	char		backuptype[20];
	char		backupfrom[20];
	char		backuplabel[MAXPGPATH];
	char		backuptime[128];
	uint32		hi,
				lo;

	/* Default both flags to false; set below only if the file says so */
	*backupEndRequired = false;
	*backupFromStandby = false;

	/*
	 * See if label file is present
	 */
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
	if (!lfp)
	{
		/* ENOENT simply means no backup_label; anything else is fatal */
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
	 * is pretty crude, but we are not expecting any variability in the file
	 * format).
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
			   &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	/* Reassemble the 64-bit LSN from its printed high/low halves */
	RedoStartLSN = ((uint64) hi) << 32 | lo;
	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &hi, &lo, &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	*checkPointLoc = ((uint64) hi) << 32 | lo;

	/*
	 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
	 * from an older backup anyway, but since the information on it is not
	 * strictly required, don't error out if it's missing for some reason.
	 */
	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
	{
		/* "streamed" backups need an explicit backup-end record to be valid */
		if (strcmp(backuptype, "streamed") == 0)
			*backupEndRequired = true;
	}

	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
	{
		if (strcmp(backupfrom, "standby") == 0)
			*backupFromStandby = true;
	}

	/*
	 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
	 * but checking for their presence is useful for debugging and the next
	 * sanity checks. Cope also with the fact that the result buffers have a
	 * pre-allocated size, hence if the backup_label file has been generated
	 * with strings longer than the maximum assumed here an incorrect parsing
	 * happens. That's fine as only minor consistency checks are done
	 * afterwards.
	 */
	if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
		ereport(DEBUG1,
				(errmsg_internal("backup time %s in file \"%s\"",
								 backuptime, BACKUP_LABEL_FILE)));

	if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
		ereport(DEBUG1,
				(errmsg_internal("backup label %s in file \"%s\"",
								 backuplabel, BACKUP_LABEL_FILE)));

	/*
	 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
	 * it as a sanity check if present.
	 */
	if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
	{
		/* Cross-check against the TLI embedded in the WAL segment name */
		if (tli_from_walseg != tli_from_file)
			ereport(FATAL,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
					 errdetail("Timeline ID parsed is %u, but expected %u.",
							   tli_from_file, tli_from_walseg)));

		ereport(DEBUG1,
				(errmsg_internal("backup timeline %u in file \"%s\"",
								 tli_from_file, BACKUP_LABEL_FILE)));
	}

	/* Check for read errors before trusting what we parsed */
	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						BACKUP_LABEL_FILE)));

	return true;
}
12067
12068 /*
12069 * read_tablespace_map: check to see if a tablespace_map file is present
12070 *
12071 * If we see a tablespace_map file during recovery, we assume that we are
12072 * recovering from a backup dump file, and we therefore need to create symlinks
12073 * as per the information present in tablespace_map file.
12074 *
12075 * Returns true if a tablespace_map file was found (and fills *tablespaces
12076 * with a tablespaceinfo struct for each tablespace listed in the file);
12077 * returns false if not.
12078 */
static bool
read_tablespace_map(List **tablespaces)
{
	tablespaceinfo *ti;
	FILE	   *lfp;
	char		str[MAXPGPATH];
	int			ch,
				i,
				n;
	bool		was_backslash;

	/*
	 * See if tablespace_map file is present
	 */
	lfp = AllocateFile(TABLESPACE_MAP, "r");
	if (!lfp)
	{
		/* ENOENT just means there's no map file; anything else is fatal */
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							TABLESPACE_MAP)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the link name and path lines from tablespace_map file
	 * (this code is pretty crude, but we are not expecting any variability in
	 * the file format). De-escape any backslashes that were inserted.
	 *
	 * i = number of de-escaped bytes accumulated in str[] for the current
	 * line; was_backslash = previous character was an unconsumed backslash.
	 */
	i = 0;
	was_backslash = false;
	while ((ch = fgetc(lfp)) != EOF)
	{
		/* An unescaped newline/CR terminates one "OID path" entry */
		if (!was_backslash && (ch == '\n' || ch == '\r'))
		{
			if (i == 0)
				continue;		/* \r immediately followed by \n */

			/*
			 * The de-escaped line should contain an OID followed by exactly
			 * one space followed by a path. The path might start with
			 * spaces, so don't be too liberal about parsing.
			 */
			str[i] = '\0';
			n = 0;
			while (str[n] && str[n] != ' ')
				n++;
			/* Reject empty OID, missing separator, or empty path */
			if (n < 1 || n >= i - 1)
				ereport(FATAL,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
			/* Split OID and path in place */
			str[n++] = '\0';

			ti = palloc0(sizeof(tablespaceinfo));
			ti->oid = pstrdup(str);
			ti->path = pstrdup(str + n);
			*tablespaces = lappend(*tablespaces, ti);

			i = 0;
			continue;
		}
		else if (!was_backslash && ch == '\\')
			was_backslash = true;	/* consume escape; emit next char as-is */
		else
		{
			/* Accumulate the (possibly de-escaped) character, bounded */
			if (i < sizeof(str) - 1)
				str[i++] = ch;
			was_backslash = false;
		}
	}

	/* A trailing partial line or dangling backslash means corrupt data */
	if (i != 0 || was_backslash)	/* last line not terminated? */
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));

	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						TABLESPACE_MAP)));

	return true;
}
12164
12165 /*
12166 * Error context callback for errors occurring during rm_redo().
12167 */
static void
rm_redo_error_callback(void *arg)
{
	XLogReaderState *record = (XLogReaderState *) arg;
	StringInfoData buf;

	/* Build a textual description of the record currently being replayed */
	initStringInfo(&buf);
	xlog_outdesc(&buf, record);
	xlog_block_info(&buf, record);

	/* translator: %s is a WAL record description */
	errcontext("WAL redo at %X/%X for %s",
			   LSN_FORMAT_ARGS(record->ReadRecPtr),
			   buf.data);

	pfree(buf.data);
}
12185
12186 /*
12187 * BackupInProgress: check if online backup mode is active
12188 *
12189 * This is done by checking for existence of the "backup_label" file.
12190 */
12191 bool
BackupInProgress(void)12192 BackupInProgress(void)
12193 {
12194 struct stat stat_buf;
12195
12196 return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
12197 }
12198
12199 /*
12200 * CancelBackup: rename the "backup_label" and "tablespace_map"
12201 * files to cancel backup mode
12202 *
12203 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
12204 * Similarly, if the "tablespace_map" file exists, it will be renamed to
12205 * "tablespace_map.old".
12206 *
12207 * Note that this will render an online backup in progress
12208 * useless. To correctly finish an online backup, pg_stop_backup must be
12209 * called.
12210 */
void
CancelBackup(void)
{
	struct stat stat_buf;

	/* if the backup_label file is not there, return */
	if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
		return;

	/* remove leftover file from previously canceled backup if it exists */
	unlink(BACKUP_LABEL_OLD);

	/* Rename durably so the cancellation survives a crash */
	if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode was not canceled"),
				 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		return;
	}

	/* if the tablespace_map file is not there, return */
	if (stat(TABLESPACE_MAP, &stat_buf) < 0)
	{
		/* backup_label alone was renamed: report that and finish */
		ereport(LOG,
				(errmsg("online backup mode canceled"),
				 errdetail("File \"%s\" was renamed to \"%s\".",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		return;
	}

	/* remove leftover file from previously canceled backup if it exists */
	unlink(TABLESPACE_MAP_OLD);

	if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
	{
		/* Both files renamed successfully */
		ereport(LOG,
				(errmsg("online backup mode canceled"),
				 errdetail("Files \"%s\" and \"%s\" were renamed to "
						   "\"%s\" and \"%s\", respectively.",
						   BACKUP_LABEL_FILE, TABLESPACE_MAP,
						   BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
	}
	else
	{
		/*
		 * backup_label was renamed (so backup mode IS canceled), but the
		 * tablespace_map rename failed; warn about the partial state.
		 */
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode canceled"),
				 errdetail("File \"%s\" was renamed to \"%s\", but "
						   "file \"%s\" could not be renamed to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
						   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
	}
}
12266
12267 /*
12268 * Read the XLOG page containing RecPtr into readBuf (if not read already).
12269 * Returns number of bytes read, if the page is read successfully, or -1
12270 * in case of errors. When errors occur, they are ereport'ed, but only
12271 * if they have not been previously reported.
12272 *
12273 * This is responsible for restoring files from archive as needed, as well
12274 * as for waiting for the requested WAL record to arrive in standby mode.
12275 *
12276 * 'emode' specifies the log level used for reporting "file not found" or
12277 * "end of WAL" situations in archive recovery, or in standby mode when a
12278 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
12279 * false in those situations, on higher log levels the ereport() won't
12280 * return.
12281 *
12282 * In standby mode, if after a successful return of XLogPageRead() the
12283 * caller finds the record it's interested in to be broken, it should
12284 * ereport the error with the level determined by
12285 * emode_for_corrupt_record(), and then set lastSourceFailed
12286 * and call XLogPageRead() again with the same arguments. This lets
12287 * XLogPageRead() to try fetching the record from another source, or to
12288 * sleep and retry.
12289 */
static int
XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
			 XLogRecPtr targetRecPtr, char *readBuf)
{
	XLogPageReadPrivate *private =
	(XLogPageReadPrivate *) xlogreader->private_data;
	int			emode = private->emode;
	uint32		targetPageOff;
	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;
	int			r;

	XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
	targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 &&
		!XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
	{
		/*
		 * Request a restartpoint if we've replayed too much xlog since the
		 * last one.
		 */
		if (bgwriterLaunched)
		{
			if (XLogCheckpointNeeded(readSegNo))
			{
				/*
				 * Refresh the cached redo pointer and re-check, to avoid
				 * requesting a checkpoint based on stale information.
				 */
				(void) GetRedoRecPtr();
				if (XLogCheckpointNeeded(readSegNo))
					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
			}
		}

		/* Close the old segment; the source must be re-determined */
		close(readFile);
		readFile = -1;
		readSource = XLOG_FROM_ANY;
	}

	XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM &&
		 flushedUpto < targetPagePtr + reqLen))
	{
		if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
										 private->randAccess,
										 private->fetching_ckpt,
										 targetRecPtr))
		{
			/* Recovery was ended (or failed); clean up and bail out */
			if (readFile >= 0)
				close(readFile);
			readFile = -1;
			readLen = 0;
			readSource = XLOG_FROM_ANY;

			return -1;
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from the primary, calculate
	 * how much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
			readLen = XLOG_BLCKSZ;	/* flush point is past this page */
		else
			readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
				targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	/* Read the requested page */
	readOff = targetPageOff;

	pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
	r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
	if (r != XLOG_BLCKSZ)
	{
		char		fname[MAXFNAMELEN];
		int			save_errno = errno;

		/* Preserve errno across the wait-event and filename calls */
		pgstat_report_wait_end();
		XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
		if (r < 0)
		{
			errno = save_errno;
			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
					(errcode_for_file_access(),
					 errmsg("could not read from log segment %s, offset %u: %m",
							fname, readOff)));
		}
		else
			/* Short read: report how much was actually obtained */
			ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
					(errcode(ERRCODE_DATA_CORRUPTED),
					 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
							fname, readOff, r, (Size) XLOG_BLCKSZ)));
		goto next_record_is_invalid;
	}
	pgstat_report_wait_end();

	Assert(targetSegNo == readSegNo);
	Assert(targetPageOff == readOff);
	Assert(reqLen <= readLen);

	xlogreader->seg.ws_tli = curFileTLI;

	/*
	 * Check the page header immediately, so that we can retry immediately if
	 * it's not valid. This may seem unnecessary, because XLogReadRecord()
	 * validates the page header anyway, and would propagate the failure up to
	 * ReadRecord(), which would retry. However, there's a corner case with
	 * continuation records, if a record is split across two pages such that
	 * we would need to read the two pages from different sources. For
	 * example, imagine a scenario where a streaming replica is started up,
	 * and replay reaches a record that's split across two WAL segments. The
	 * first page is only available locally, in pg_wal, because it's already
	 * been recycled on the primary. The second page, however, is not present
	 * in pg_wal, and we should stream it from the primary. There is a
	 * recycled WAL segment present in pg_wal, with garbage contents, however.
	 * We would read the first page from the local WAL segment, but when
	 * reading the second page, we would read the bogus, recycled, WAL
	 * segment. If we didn't catch that case here, we would never recover,
	 * because ReadRecord() would retry reading the whole record from the
	 * beginning.
	 *
	 * Of course, this only catches errors in the page header, which is what
	 * happens in the case of a recycled WAL segment. Other kinds of errors or
	 * corruption still has the same problem. But this at least fixes the
	 * common case, which can happen as part of normal operation.
	 *
	 * Validating the page header is cheap enough that doing it twice
	 * shouldn't be a big deal from a performance point of view.
	 */
	if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
	{
		/* reset any error XLogReaderValidatePageHeader() might have set */
		xlogreader->errormsg_buf[0] = '\0';
		goto next_record_is_invalid;
	}

	return readLen;

next_record_is_invalid:
	/* Mark the current source as failed so the state machine advances */
	lastSourceFailed = true;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = XLOG_FROM_ANY;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return -1;
}
12462
12463 /*
12464 * Open the WAL segment containing WAL location 'RecPtr'.
12465 *
12466 * The segment can be fetched via restore_command, or via walreceiver having
12467 * streamed the record, or it can already be present in pg_wal. Checking
12468 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
12469 * too, in case someone copies a new segment directly to pg_wal. That is not
12470 * documented or recommended, though.
12471 *
12472 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
12473 * prepare to read WAL starting from RedoStartLSN after this.
12474 *
12475 * 'RecPtr' might not point to the beginning of the record we're interested
12476 * in, it might also point to the page or segment header. In that case,
12477 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
12478 * used to decide which timeline to stream the requested WAL from.
12479 *
12480 * If the record is not immediately available, the function returns false
12481 * if we're not in standby mode. In standby mode, waits for it to become
12482 * available.
12483 *
12484 * When the requested record becomes available, the function opens the file
12485 * containing it (if not open already), and returns true. When end of standby
12486 * mode is triggered by the user, and there is no more WAL available, returns
12487 * false.
12488 */
12489 static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,bool randAccess,bool fetching_ckpt,XLogRecPtr tliRecPtr)12490 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
12491 bool fetching_ckpt, XLogRecPtr tliRecPtr)
12492 {
12493 static TimestampTz last_fail_time = 0;
12494 TimestampTz now;
12495 bool streaming_reply_sent = false;
12496
12497 /*-------
12498 * Standby mode is implemented by a state machine:
12499 *
12500 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
12501 * pg_wal (XLOG_FROM_PG_WAL)
12502 * 2. Check trigger file
12503 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
12504 * 4. Rescan timelines
12505 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
12506 *
12507 * Failure to read from the current source advances the state machine to
12508 * the next state.
12509 *
12510 * 'currentSource' indicates the current state. There are no currentSource
12511 * values for "check trigger", "rescan timelines", and "sleep" states,
12512 * those actions are taken when reading from the previous source fails, as
12513 * part of advancing to the next state.
12514 *
12515 * If standby mode is turned off while reading WAL from stream, we move
12516 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
12517 * the files (which would be required at end of recovery, e.g., timeline
12518 * history file) from archive or pg_wal. We don't need to kill WAL receiver
12519 * here because it's already stopped when standby mode is turned off at
12520 * the end of recovery.
12521 *-------
12522 */
12523 if (!InArchiveRecovery)
12524 currentSource = XLOG_FROM_PG_WAL;
12525 else if (currentSource == XLOG_FROM_ANY ||
12526 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
12527 {
12528 lastSourceFailed = false;
12529 currentSource = XLOG_FROM_ARCHIVE;
12530 }
12531
12532 for (;;)
12533 {
12534 XLogSource oldSource = currentSource;
12535 bool startWalReceiver = false;
12536
12537 /*
12538 * First check if we failed to read from the current source, and
12539 * advance the state machine if so. The failure to read might've
12540 * happened outside this function, e.g when a CRC check fails on a
12541 * record, or within this loop.
12542 */
12543 if (lastSourceFailed)
12544 {
12545 switch (currentSource)
12546 {
12547 case XLOG_FROM_ARCHIVE:
12548 case XLOG_FROM_PG_WAL:
12549
12550 /*
12551 * Check to see if the trigger file exists. Note that we
12552 * do this only after failure, so when you create the
12553 * trigger file, we still finish replaying as much as we
12554 * can from archive and pg_wal before failover.
12555 */
12556 if (StandbyMode && CheckForStandbyTrigger())
12557 {
12558 ShutdownWalRcv();
12559 return false;
12560 }
12561
12562 /*
12563 * Not in standby mode, and we've now tried the archive
12564 * and pg_wal.
12565 */
12566 if (!StandbyMode)
12567 return false;
12568
12569 /*
12570 * Move to XLOG_FROM_STREAM state, and set to start a
12571 * walreceiver if necessary.
12572 */
12573 currentSource = XLOG_FROM_STREAM;
12574 startWalReceiver = true;
12575 break;
12576
12577 case XLOG_FROM_STREAM:
12578
12579 /*
12580 * Failure while streaming. Most likely, we got here
12581 * because streaming replication was terminated, or
12582 * promotion was triggered. But we also get here if we
12583 * find an invalid record in the WAL streamed from the
12584 * primary, in which case something is seriously wrong.
12585 * There's little chance that the problem will just go
12586 * away, but PANIC is not good for availability either,
12587 * especially in hot standby mode. So, we treat that the
12588 * same as disconnection, and retry from archive/pg_wal
12589 * again. The WAL in the archive should be identical to
12590 * what was streamed, so it's unlikely that it helps, but
12591 * one can hope...
12592 */
12593
12594 /*
12595 * We should be able to move to XLOG_FROM_STREAM only in
12596 * standby mode.
12597 */
12598 Assert(StandbyMode);
12599
12600 /*
12601 * Before we leave XLOG_FROM_STREAM state, make sure that
12602 * walreceiver is not active, so that it won't overwrite
12603 * WAL that we restore from archive.
12604 */
12605 if (WalRcvStreaming())
12606 ShutdownWalRcv();
12607
12608 /*
12609 * Before we sleep, re-scan for possible new timelines if
12610 * we were requested to recover to the latest timeline.
12611 */
12612 if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
12613 {
12614 if (rescanLatestTimeLine())
12615 {
12616 currentSource = XLOG_FROM_ARCHIVE;
12617 break;
12618 }
12619 }
12620
12621 /*
12622 * XLOG_FROM_STREAM is the last state in our state
12623 * machine, so we've exhausted all the options for
12624 * obtaining the requested WAL. We're going to loop back
12625 * and retry from the archive, but if it hasn't been long
12626 * since last attempt, sleep wal_retrieve_retry_interval
12627 * milliseconds to avoid busy-waiting.
12628 */
12629 now = GetCurrentTimestamp();
12630 if (!TimestampDifferenceExceeds(last_fail_time, now,
12631 wal_retrieve_retry_interval))
12632 {
12633 long wait_time;
12634
12635 wait_time = wal_retrieve_retry_interval -
12636 TimestampDifferenceMilliseconds(last_fail_time, now);
12637
12638 (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
12639 WL_LATCH_SET | WL_TIMEOUT |
12640 WL_EXIT_ON_PM_DEATH,
12641 wait_time,
12642 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
12643 ResetLatch(&XLogCtl->recoveryWakeupLatch);
12644 now = GetCurrentTimestamp();
12645
12646 /* Handle interrupt signals of startup process */
12647 HandleStartupProcInterrupts();
12648 }
12649 last_fail_time = now;
12650 currentSource = XLOG_FROM_ARCHIVE;
12651 break;
12652
12653 default:
12654 elog(ERROR, "unexpected WAL source %d", currentSource);
12655 }
12656 }
12657 else if (currentSource == XLOG_FROM_PG_WAL)
12658 {
12659 /*
12660 * We just successfully read a file in pg_wal. We prefer files in
12661 * the archive over ones in pg_wal, so try the next file again
12662 * from the archive first.
12663 */
12664 if (InArchiveRecovery)
12665 currentSource = XLOG_FROM_ARCHIVE;
12666 }
12667
12668 if (currentSource != oldSource)
12669 elog(DEBUG2, "switched WAL source from %s to %s after %s",
12670 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
12671 lastSourceFailed ? "failure" : "success");
12672
12673 /*
12674 * We've now handled possible failure. Try to read from the chosen
12675 * source.
12676 */
12677 lastSourceFailed = false;
12678
12679 switch (currentSource)
12680 {
12681 case XLOG_FROM_ARCHIVE:
12682 case XLOG_FROM_PG_WAL:
12683
12684 /*
12685 * WAL receiver must not be running when reading WAL from
12686 * archive or pg_wal.
12687 */
12688 Assert(!WalRcvStreaming());
12689
12690 /* Close any old file we might have open. */
12691 if (readFile >= 0)
12692 {
12693 close(readFile);
12694 readFile = -1;
12695 }
12696 /* Reset curFileTLI if random fetch. */
12697 if (randAccess)
12698 curFileTLI = 0;
12699
12700 /*
12701 * Try to restore the file from archive, or read an existing
12702 * file from pg_wal.
12703 */
12704 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
12705 currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
12706 currentSource);
12707 if (readFile >= 0)
12708 return true; /* success! */
12709
12710 /*
12711 * Nope, not found in archive or pg_wal.
12712 */
12713 lastSourceFailed = true;
12714 break;
12715
12716 case XLOG_FROM_STREAM:
12717 {
12718 bool havedata;
12719
12720 /*
12721 * We should be able to move to XLOG_FROM_STREAM only in
12722 * standby mode.
12723 */
12724 Assert(StandbyMode);
12725
12726 /*
12727 * First, shutdown walreceiver if its restart has been
12728 * requested -- but no point if we're already slated for
12729 * starting it.
12730 */
12731 if (pendingWalRcvRestart && !startWalReceiver)
12732 {
12733 ShutdownWalRcv();
12734
12735 /*
12736 * Re-scan for possible new timelines if we were
12737 * requested to recover to the latest timeline.
12738 */
12739 if (recoveryTargetTimeLineGoal ==
12740 RECOVERY_TARGET_TIMELINE_LATEST)
12741 rescanLatestTimeLine();
12742
12743 startWalReceiver = true;
12744 }
12745 pendingWalRcvRestart = false;
12746
12747 /*
12748 * Launch walreceiver if needed.
12749 *
12750 * If fetching_ckpt is true, RecPtr points to the initial
12751 * checkpoint location. In that case, we use RedoStartLSN
12752 * as the streaming start position instead of RecPtr, so
12753 * that when we later jump backwards to start redo at
12754 * RedoStartLSN, we will have the logs streamed already.
12755 */
12756 if (startWalReceiver &&
12757 PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
12758 {
12759 XLogRecPtr ptr;
12760 TimeLineID tli;
12761
12762 if (fetching_ckpt)
12763 {
12764 ptr = RedoStartLSN;
12765 tli = ControlFile->checkPointCopy.ThisTimeLineID;
12766 }
12767 else
12768 {
12769 ptr = RecPtr;
12770
12771 /*
12772 * Use the record begin position to determine the
12773 * TLI, rather than the position we're reading.
12774 */
12775 tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
12776
12777 if (curFileTLI > 0 && tli < curFileTLI)
12778 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
12779 LSN_FORMAT_ARGS(tliRecPtr),
12780 tli, curFileTLI);
12781 }
12782 curFileTLI = tli;
12783 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
12784 PrimarySlotName,
12785 wal_receiver_create_temp_slot);
12786 flushedUpto = 0;
12787 }
12788
12789 /*
12790 * Check if WAL receiver is active or wait to start up.
12791 */
12792 if (!WalRcvStreaming())
12793 {
12794 lastSourceFailed = true;
12795 break;
12796 }
12797
12798 /*
12799 * Walreceiver is active, so see if new data has arrived.
12800 *
12801 * We only advance XLogReceiptTime when we obtain fresh
12802 * WAL from walreceiver and observe that we had already
12803 * processed everything before the most recent "chunk"
12804 * that it flushed to disk. In steady state where we are
12805 * keeping up with the incoming data, XLogReceiptTime will
12806 * be updated on each cycle. When we are behind,
12807 * XLogReceiptTime will not advance, so the grace time
12808 * allotted to conflicting queries will decrease.
12809 */
12810 if (RecPtr < flushedUpto)
12811 havedata = true;
12812 else
12813 {
12814 XLogRecPtr latestChunkStart;
12815
12816 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
12817 if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
12818 {
12819 havedata = true;
12820 if (latestChunkStart <= RecPtr)
12821 {
12822 XLogReceiptTime = GetCurrentTimestamp();
12823 SetCurrentChunkStartTime(XLogReceiptTime);
12824 }
12825 }
12826 else
12827 havedata = false;
12828 }
12829 if (havedata)
12830 {
12831 /*
12832 * Great, streamed far enough. Open the file if it's
12833 * not open already. Also read the timeline history
12834 * file if we haven't initialized timeline history
12835 * yet; it should be streamed over and present in
12836 * pg_wal by now. Use XLOG_FROM_STREAM so that source
12837 * info is set correctly and XLogReceiptTime isn't
12838 * changed.
12839 *
12840 * NB: We must set readTimeLineHistory based on
12841 * recoveryTargetTLI, not receiveTLI. Normally they'll
12842 * be the same, but if recovery_target_timeline is
12843 * 'latest' and archiving is configured, then it's
12844 * possible that we managed to retrieve one or more
12845 * new timeline history files from the archive,
12846 * updating recoveryTargetTLI.
12847 */
12848 if (readFile < 0)
12849 {
12850 if (!expectedTLEs)
12851 expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
12852 readFile = XLogFileRead(readSegNo, PANIC,
12853 receiveTLI,
12854 XLOG_FROM_STREAM, false);
12855 Assert(readFile >= 0);
12856 }
12857 else
12858 {
12859 /* just make sure source info is correct... */
12860 readSource = XLOG_FROM_STREAM;
12861 XLogReceiptSource = XLOG_FROM_STREAM;
12862 return true;
12863 }
12864 break;
12865 }
12866
12867 /*
12868 * Data not here yet. Check for trigger, then wait for
12869 * walreceiver to wake us up when new WAL arrives.
12870 */
12871 if (CheckForStandbyTrigger())
12872 {
12873 /*
12874 * Note that we don't "return false" immediately here.
12875 * After being triggered, we still want to replay all
12876 * the WAL that was already streamed. It's in pg_wal
12877 * now, so we just treat this as a failure, and the
12878 * state machine will move on to replay the streamed
12879 * WAL from pg_wal, and then recheck the trigger and
12880 * exit replay.
12881 */
12882 lastSourceFailed = true;
12883 break;
12884 }
12885
12886 /*
12887 * Since we have replayed everything we have received so
12888 * far and are about to start waiting for more WAL, let's
12889 * tell the upstream server our replay location now so
12890 * that pg_stat_replication doesn't show stale
12891 * information.
12892 */
12893 if (!streaming_reply_sent)
12894 {
12895 WalRcvForceReply();
12896 streaming_reply_sent = true;
12897 }
12898
12899 /*
12900 * Wait for more WAL to arrive. Time out after 5 seconds
12901 * to react to a trigger file promptly and to check if the
12902 * WAL receiver is still active.
12903 */
12904 (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
12905 WL_LATCH_SET | WL_TIMEOUT |
12906 WL_EXIT_ON_PM_DEATH,
12907 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM);
12908 ResetLatch(&XLogCtl->recoveryWakeupLatch);
12909 break;
12910 }
12911
12912 default:
12913 elog(ERROR, "unexpected WAL source %d", currentSource);
12914 }
12915
12916 /*
12917 * Check for recovery pause here so that we can confirm more quickly
12918 * that a requested pause has actually taken effect.
12919 */
12920 if (((volatile XLogCtlData *) XLogCtl)->recoveryPauseState !=
12921 RECOVERY_NOT_PAUSED)
12922 recoveryPausesHere(false);
12923
12924 /*
12925 * This possibly-long loop needs to handle interrupts of startup
12926 * process.
12927 */
12928 HandleStartupProcInterrupts();
12929 }
12930
12931 return false; /* not reached */
12932 }
12933
12934 /*
12935 * Set flag to signal the walreceiver to restart. (The startup process calls
12936 * this on noticing a relevant configuration change.)
12937 */
12938 void
StartupRequestWalReceiverRestart(void)12939 StartupRequestWalReceiverRestart(void)
12940 {
12941 if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
12942 {
12943 ereport(LOG,
12944 (errmsg("WAL receiver process shutdown requested")));
12945
12946 pendingWalRcvRestart = true;
12947 }
12948 }
12949
12950 /*
12951 * Determine what log level should be used to report a corrupt WAL record
12952 * in the current WAL page, previously read by XLogPageRead().
12953 *
12954 * 'emode' is the error mode that would be used to report a file-not-found
12955 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
12956 * we're retrying the exact same record that we've tried previously, only
 * complain the first time to keep the noise down.  However, we only do that
12958 * reading from pg_wal, because we don't expect any invalid records in archive
12959 * or in records streamed from the primary. Files in the archive should be complete,
12960 * and we should never hit the end of WAL because we stop and wait for more WAL
12961 * to arrive before replaying it.
12962 *
12963 * NOTE: This function remembers the RecPtr value it was last called with,
12964 * to suppress repeated messages about the same record. Only call this when
12965 * you are about to ereport(), or you might cause a later message to be
12966 * erroneously suppressed.
12967 */
12968 static int
emode_for_corrupt_record(int emode,XLogRecPtr RecPtr)12969 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12970 {
12971 static XLogRecPtr lastComplaint = 0;
12972
12973 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12974 {
12975 if (RecPtr == lastComplaint)
12976 emode = DEBUG1;
12977 else
12978 lastComplaint = RecPtr;
12979 }
12980 return emode;
12981 }
12982
12983 /*
12984 * Has a standby promotion already been triggered?
12985 *
12986 * Unlike CheckForStandbyTrigger(), this works in any process
12987 * that's connected to shared memory.
12988 */
12989 bool
PromoteIsTriggered(void)12990 PromoteIsTriggered(void)
12991 {
12992 /*
12993 * We check shared state each time only until a standby promotion is
12994 * triggered. We can't trigger a promotion again, so there's no need to
12995 * keep checking after the shared variable has once been seen true.
12996 */
12997 if (LocalPromoteIsTriggered)
12998 return true;
12999
13000 SpinLockAcquire(&XLogCtl->info_lck);
13001 LocalPromoteIsTriggered = XLogCtl->SharedPromoteIsTriggered;
13002 SpinLockRelease(&XLogCtl->info_lck);
13003
13004 return LocalPromoteIsTriggered;
13005 }
13006
/*
 * Record that a standby promotion has been triggered, both in shared memory
 * (visible to all backends via PromoteIsTriggered) and in this process's
 * local cache of the flag.
 */
static void
SetPromoteIsTriggered(void)
{
	/* info_lck protects SharedPromoteIsTriggered against concurrent readers. */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->SharedPromoteIsTriggered = true;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Mark the recovery pause state as 'not paused' because the paused state
	 * ends and promotion continues if a promotion is triggered while recovery
	 * is paused. Otherwise pg_get_wal_replay_pause_state() can mistakenly
	 * return 'paused' while a promotion is ongoing.
	 */
	SetRecoveryPause(false);

	LocalPromoteIsTriggered = true;
}
13024
13025 /*
13026 * Check to see whether the user-specified trigger file exists and whether a
13027 * promote request has arrived. If either condition holds, return true.
13028 */
13029 static bool
CheckForStandbyTrigger(void)13030 CheckForStandbyTrigger(void)
13031 {
13032 struct stat stat_buf;
13033
13034 if (LocalPromoteIsTriggered)
13035 return true;
13036
13037 if (IsPromoteSignaled() && CheckPromoteSignal())
13038 {
13039 ereport(LOG, (errmsg("received promote request")));
13040 RemovePromoteSignalFiles();
13041 ResetPromoteSignaled();
13042 SetPromoteIsTriggered();
13043 return true;
13044 }
13045
13046 if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
13047 return false;
13048
13049 if (stat(PromoteTriggerFile, &stat_buf) == 0)
13050 {
13051 ereport(LOG,
13052 (errmsg("promote trigger file found: %s", PromoteTriggerFile)));
13053 unlink(PromoteTriggerFile);
13054 SetPromoteIsTriggered();
13055 return true;
13056 }
13057 else if (errno != ENOENT)
13058 ereport(ERROR,
13059 (errcode_for_file_access(),
13060 errmsg("could not stat promote trigger file \"%s\": %m",
13061 PromoteTriggerFile)));
13062
13063 return false;
13064 }
13065
13066 /*
13067 * Remove the files signaling a standby promotion request.
13068 */
void
RemovePromoteSignalFiles(void)
{
	/* Best effort: if the file doesn't exist, there's nothing to remove. */
	unlink(PROMOTE_SIGNAL_FILE);
}
13074
13075 /*
13076 * Check to see if a promote request has arrived.
13077 */
13078 bool
CheckPromoteSignal(void)13079 CheckPromoteSignal(void)
13080 {
13081 struct stat stat_buf;
13082
13083 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
13084 return true;
13085
13086 return false;
13087 }
13088
13089 /*
13090 * Wake up startup process to replay newly arrived WAL, or to notice that
13091 * failover has been requested.
13092 */
void
WakeupRecovery(void)
{
	/*
	 * The startup process waits on this latch (see the WaitLatch calls in the
	 * recovery loop above) when it has nothing to do; setting it wakes it up.
	 */
	SetLatch(&XLogCtl->recoveryWakeupLatch);
}
13098
13099 /*
13100 * Update the WalWriterSleeping flag.
13101 */
void
SetWalWriterSleeping(bool sleeping)
{
	/* info_lck protects WalWriterSleeping against concurrent access. */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->WalWriterSleeping = sleeping;
	SpinLockRelease(&XLogCtl->info_lck);
}
13109
13110 /*
13111 * Schedule a walreceiver wakeup in the main recovery loop.
13112 */
void
XLogRequestWalReceiverReply(void)
{
	/*
	 * Just set a flag; presumably the main recovery loop notices it and asks
	 * the walreceiver to send a reply -- the consumer is outside this
	 * excerpt, so confirm against the redo loop.
	 */
	doRequestWalReceiverReply = true;
}
13118