1 /*-------------------------------------------------------------------------
2 *
3 * xlog.c
4 * PostgreSQL write-ahead log manager
5 *
6 *
7 * Portions Copyright (c) 1996-2020, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * src/backend/access/transam/xlog.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <math.h>
19 #include <time.h>
20 #include <fcntl.h>
21 #include <sys/stat.h>
22 #include <sys/time.h>
23 #include <unistd.h>
24
25 #include "access/clog.h"
26 #include "access/commit_ts.h"
27 #include "access/heaptoast.h"
28 #include "access/multixact.h"
29 #include "access/rewriteheap.h"
30 #include "access/subtrans.h"
31 #include "access/timeline.h"
32 #include "access/transam.h"
33 #include "access/twophase.h"
34 #include "access/xact.h"
35 #include "access/xlog_internal.h"
36 #include "access/xlogarchive.h"
37 #include "access/xloginsert.h"
38 #include "access/xlogreader.h"
39 #include "access/xlogutils.h"
40 #include "catalog/catversion.h"
41 #include "catalog/pg_control.h"
42 #include "catalog/pg_database.h"
43 #include "commands/progress.h"
44 #include "commands/tablespace.h"
45 #include "common/controldata_utils.h"
46 #include "executor/instrument.h"
47 #include "miscadmin.h"
48 #include "pg_trace.h"
49 #include "pgstat.h"
50 #include "port/atomics.h"
51 #include "postmaster/bgwriter.h"
52 #include "postmaster/startup.h"
53 #include "postmaster/walwriter.h"
54 #include "replication/basebackup.h"
55 #include "replication/logical.h"
56 #include "replication/origin.h"
57 #include "replication/slot.h"
58 #include "replication/snapbuild.h"
59 #include "replication/walreceiver.h"
60 #include "replication/walsender.h"
61 #include "storage/bufmgr.h"
62 #include "storage/fd.h"
63 #include "storage/ipc.h"
64 #include "storage/large_object.h"
65 #include "storage/latch.h"
66 #include "storage/pmsignal.h"
67 #include "storage/predicate.h"
68 #include "storage/proc.h"
69 #include "storage/procarray.h"
70 #include "storage/reinit.h"
71 #include "storage/smgr.h"
72 #include "storage/spin.h"
73 #include "storage/sync.h"
74 #include "utils/builtins.h"
75 #include "utils/guc.h"
76 #include "utils/memutils.h"
77 #include "utils/ps_status.h"
78 #include "utils/relmapper.h"
79 #include "utils/snapmgr.h"
80 #include "utils/timestamp.h"
81
/* Declared here only; the definition is not in this part of the file. */
extern uint32 bootstrap_data_checksum_version;

/* Unsupported old recovery command file names (relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE "recovery.conf"
#define RECOVERY_COMMAND_DONE "recovery.done"

/*
 * User-settable parameters
 *
 * The initializers below are only the compile-time starting values.
 * Variables with an "_mb" suffix hold sizes in megabytes (see the "1 GB" and
 * "80 MB" notes).
 * NOTE(review): presumably each of these backs a configuration setting whose
 * effective value is assigned elsewhere -- confirm against the GUC tables.
 */
int max_wal_size_mb = 1024; /* 1 GB */
int min_wal_size_mb = 80; /* 80 MB */
int wal_keep_size_mb = 0;
int XLOGbuffers = -1;
int XLogArchiveTimeout = 0;
int XLogArchiveMode = ARCHIVE_MODE_OFF; /* see archive_mode_options[] */
char *XLogArchiveCommand = NULL;
bool EnableHotStandby = false;
bool fullPageWrites = true; /* process-local; XLogCtlInsert holds the master copy */
bool wal_log_hints = false;
bool wal_compression = false;
char *wal_consistency_checking_string = NULL;
bool *wal_consistency_checking = NULL;
bool wal_init_zero = true;
bool wal_recycle = true;
bool log_checkpoints = false;
int sync_method = DEFAULT_SYNC_METHOD; /* see sync_method_options[] */
int wal_level = WAL_LEVEL_MINIMAL;
int CommitDelay = 0; /* precommit delay in microseconds */
int CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int wal_retrieve_retry_interval = 5000;
int max_slot_wal_keep_size_mb = -1;

/* Only compiled in when the build defines WAL_DEBUG. */
#ifdef WAL_DEBUG
bool XLOG_DEBUG = false;
#endif

/* Starts out as DEFAULT_XLOG_SEG_SIZE. */
int wal_segment_size = DEFAULT_XLOG_SEG_SIZE;
117
/*
 * Number of WAL insertion locks to use. A higher value allows more insertions
 * to happen concurrently, but adds some CPU overhead to flushing the WAL,
 * which needs to iterate all the locks.
 */
#define NUM_XLOGINSERT_LOCKS 8

/*
 * Max distance from last checkpoint, before triggering a new xlog-based
 * checkpoint.
 *
 * No initializer is given here.
 * NOTE(review): the name suggests the unit is WAL segments -- confirm where
 * the value is computed.
 */
int CheckPointSegments;

/*
 * Estimated distance between checkpoints, in bytes.
 * NOTE(review): PrevCheckPointDistance is presumably the distance measured
 * for the previous checkpoint cycle -- confirm where it is updated.
 */
static double CheckPointDistanceEstimate = 0;
static double PrevCheckPointDistance = 0;
134
/*
 * GUC support
 *
 * Names accepted for the sync method setting and the SYNC_METHOD_* value each
 * one maps to. An entry wrapped in #ifdef exists only when the corresponding
 * macro is defined at build time, so the set of accepted names depends on
 * the build. The list ends with a {NULL, 0, false} sentinel.
 *
 * NOTE(review): the third member is false for every entry; presumably it is
 * the "hidden" flag of struct config_enum_entry -- confirm in the header.
 */
const struct config_enum_entry sync_method_options[] = {
    {"fsync", SYNC_METHOD_FSYNC, false},
#ifdef HAVE_FSYNC_WRITETHROUGH
    {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
#endif
#ifdef HAVE_FDATASYNC
    {"fdatasync", SYNC_METHOD_FDATASYNC, false},
#endif
#ifdef OPEN_SYNC_FLAG
    {"open_sync", SYNC_METHOD_OPEN, false},
#endif
#ifdef OPEN_DATASYNC_FLAG
    {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
#endif
    {NULL, 0, false}
};
154
155
/*
 * Names accepted for the archive mode setting.
 *
 * Although only "on", "off", and "always" are documented,
 * we accept all the likely variants of "on" and "off".
 *
 * The three documented spellings carry false in the third member; the extra
 * variants ("true", "false", "yes", "no", "1", "0") carry true and map onto
 * ARCHIVE_MODE_ON or ARCHIVE_MODE_OFF. The list ends with a {NULL, 0, false}
 * sentinel.
 * NOTE(review): the third member is presumably the "hidden" flag of struct
 * config_enum_entry -- confirm in the header.
 */
const struct config_enum_entry archive_mode_options[] = {
    {"always", ARCHIVE_MODE_ALWAYS, false},
    {"on", ARCHIVE_MODE_ON, false},
    {"off", ARCHIVE_MODE_OFF, false},
    {"true", ARCHIVE_MODE_ON, true},
    {"false", ARCHIVE_MODE_OFF, true},
    {"yes", ARCHIVE_MODE_ON, true},
    {"no", ARCHIVE_MODE_OFF, true},
    {"1", ARCHIVE_MODE_ON, true},
    {"0", ARCHIVE_MODE_OFF, true},
    {NULL, 0, false}
};
172
/*
 * Names accepted for the recovery target action setting and the
 * RECOVERY_TARGET_ACTION_* value each one maps to. The list ends with a
 * {NULL, 0, false} sentinel.
 */
const struct config_enum_entry recovery_target_action_options[] = {
    {"pause", RECOVERY_TARGET_ACTION_PAUSE, false},
    {"promote", RECOVERY_TARGET_ACTION_PROMOTE, false},
    {"shutdown", RECOVERY_TARGET_ACTION_SHUTDOWN, false},
    {NULL, 0, false}
};
179
/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the checkpointer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

/*
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system. It starts out as 0.
 */
TimeLineID ThisTimeLineID = 0;

/*
 * Are we doing recovery from XLOG?
 *
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode". It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging). To check whether the system is in recovery regardless of which
 * process you're running in, use RecoveryInProgress(), but only after shared
 * memory startup and lock initialization.
 */
bool InRecovery = false;

/*
 * Are we in Hot Standby mode? Only valid in startup process, see xlog.h.
 * Starts out as STANDBY_DISABLED.
 */
HotStandbyState standbyState = STANDBY_DISABLED;
208
/*
 * NOTE(review): undocumented in this part of the file; presumably the
 * location of the most recent record handled during recovery -- confirm at
 * its use sites.
 */
static XLogRecPtr LastRec;

/*
 * Local copy of WalRcv->flushedUpto.
 * NOTE(review): receiveTLI is presumably the timeline that flushedUpto
 * belongs to -- confirm where the two are assigned.
 */
static XLogRecPtr flushedUpto = 0;
static TimeLineID receiveTLI = 0;

/*
 * abortedRecPtr is the start pointer of a broken record at end of WAL when
 * recovery completes; missingContrecPtr is the location of the first
 * contrecord that went missing. See CreateOverwriteContrecordRecord for
 * details.
 */
static XLogRecPtr abortedRecPtr;
static XLogRecPtr missingContrecPtr;

/*
 * During recovery, lastFullPageWrites keeps track of the full_page_writes
 * setting that the replayed WAL records indicate. It's initialized with the
 * full_page_writes value that the recovery starting checkpoint record
 * indicates, and then updated each time an XLOG_FPW_CHANGE record is
 * replayed.
 */
static bool lastFullPageWrites;
231
/*
 * Local copy of the state tracked by SharedRecoveryState in shared memory.
 * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually
 * means "not known, need to check the shared state".
 */
static bool LocalRecoveryInProgress = true;

/*
 * Local copy of SharedHotStandbyActive variable. False actually means "not
 * known, need to check the shared state".
 */
static bool LocalHotStandbyActive = false;

/*
 * Local copy of SharedPromoteIsTriggered variable. False actually means "not
 * known, need to check the shared state".
 */
static bool LocalPromoteIsTriggered = false;

/*
 * Local state for XLogInsertAllowed():
 *      1: unconditionally allowed to insert XLOG
 *      0: unconditionally not allowed to insert XLOG
 *     -1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress. But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false, which is why this is an
 * int rather than an enum.
 */
static int LocalXLogInsertAllowed = -1;
262
/*
 * When ArchiveRecoveryRequested is set, archive recovery was requested,
 * i.e. signal files were present. When InArchiveRecovery is set, we are
 * currently recovering using offline XLOG archives. These variables are only
 * valid in the startup process.
 *
 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 * currently performing crash recovery using only XLOG files in pg_wal, but
 * will switch to using offline XLOG archives as soon as we reach the end of
 * WAL in pg_wal.
 */
bool ArchiveRecoveryRequested = false;
bool InArchiveRecovery = false;

/*
 * NOTE(review): presumably these record which of the signal files mentioned
 * above was found at startup -- confirm where they are set.
 */
static bool standby_signal_file_found = false;
static bool recovery_signal_file_found = false;

/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;

/*
 * Buffers dedicated to consistency checks of size BLCKSZ.
 * Both start out NULL; they are not allocated in this part of the file.
 */
static char *replay_image_masked = NULL;
static char *master_image_masked = NULL;
286
/*
 * options formerly taken from recovery.conf for archive recovery
 *
 * The initializers shown are compile-time starting values; variables without
 * an initializer start out zeroed, as any file-scope object does.
 */
char *recoveryRestoreCommand = NULL;
char *recoveryEndCommand = NULL;
char *archiveCleanupCommand = NULL;
RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
bool recoveryTargetInclusive = true;
int recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE; /* see recovery_target_action_options[] */
TransactionId recoveryTargetXid;
char *recovery_target_time_string;
static TimestampTz recoveryTargetTime; /* private to this file, unlike its neighbours */
const char *recoveryTargetName;
XLogRecPtr recoveryTargetLSN;
int recovery_min_apply_delay = 0;

/* options formerly taken from recovery.conf for XLOG streaming */
bool StandbyModeRequested = false;
char *PrimaryConnInfo = NULL;
char *PrimarySlotName = NULL;
char *PromoteTriggerFile = NULL;
bool wal_receiver_create_temp_slot = false;

/* are we currently in standby mode? */
bool StandbyMode = false;

/* whether request for fast promotion has been made yet */
static bool fast_promote = false;

/*
 * If recoveryStopsBefore/After returns true, it saves information about the
 * stop point here.
 */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static XLogRecPtr recoveryStopLSN;
static char recoveryStopName[MAXFNAMELEN];
static bool recoveryStopAfter;
323
/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated. To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated). We also need these
 * timeline values:
 *
 * recoveryTargetTimeLineGoal: what the user requested, if any
 *
 * recoveryTargetTLIRequested: numeric value of requested timeline, if constant
 *
 * recoveryTargetTLI: the currently understood target timeline; changes
 *
 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
 * the timelines of its known parents, newest first (so recoveryTargetTLI is
 * always the first list member). Only these TLIs are expected to be seen in
 * the WAL segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could
 * be scanning data that was copied from an ancestor timeline when the current
 * file was created.) During a sequential scan we do not allow this value
 * to decrease.
 */
RecoveryTargetTimeLineGoal recoveryTargetTimeLineGoal = RECOVERY_TARGET_TIMELINE_LATEST;
TimeLineID recoveryTargetTLIRequested = 0;
TimeLineID recoveryTargetTLI = 0;
static List *expectedTLEs;
static TimeLineID curFileTLI;
355
356 /*
357 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
358 * current backend. It is updated for all inserts. XactLastRecEnd points to
359 * end+1 of the last record, and is reset when we end a top-level transaction,
360 * or start a new one; so it can be used to tell if the current transaction has
361 * created any XLOG records.
362 *
363 * While in parallel mode, this may not be fully up to date. When committing,
364 * a transaction can assume this covers all xlog records written either by the
365 * user backend or by any parallel worker which was present at any point during
366 * the transaction. But when aborting, or when still in parallel mode, other
367 * parallel backends may have written WAL records at later LSNs than the value
368 * stored here. The parallel leader advances its own copy, when necessary,
369 * in WaitForParallelWorkersToFinish.
370 */
XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr; /* start of last record inserted by this backend */
XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr; /* end+1 of last record; reset at top-level xact end or start */
XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr; /* NOTE(review): presumably end of the last commit record -- confirm */
374
/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
 * CHECKPOINT record). We update this from the shared-memory copy,
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (i.e., when we
 * hold an insertion lock). See XLogInsertRecord for details. We are also
 * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 * see GetRedoRecPtr. A freshly spawned backend obtains the value during
 * InitXLOGAccess.
 */
static XLogRecPtr RedoRecPtr;

/*
 * doPageWrites is this backend's local copy of (forcePageWrites ||
 * fullPageWrites). It is used together with RedoRecPtr to decide whether
 * a full-page image of a page needs to be taken.
 */
static bool doPageWrites;

/* Has the recovery code requested a walreceiver wakeup? */
static bool doRequestWalReceiverReply;

/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file. In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found. But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming. Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
408
409 /*----------
410 * Shared-memory data structures for XLOG control
411 *
412 * LogwrtRqst indicates a byte position that we need to write and/or fsync
413 * the log up to (all records before that point must be written or fsynced).
414 * LogwrtResult indicates the byte positions we have already written/fsynced.
415 * These structs are identical but are declared separately to indicate their
416 * slightly different functions.
417 *
418 * To read XLogCtl->LogwrtResult, you must hold either info_lck or
419 * WALWriteLock. To update it, you need to hold both locks. The point of
420 * this arrangement is that the value can be examined by code that already
421 * holds WALWriteLock without needing to grab info_lck as well. In addition
422 * to the shared variable, each backend has a private copy of LogwrtResult,
423 * which is updated when convenient.
424 *
425 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
426 * (protected by info_lck), but we don't need to cache any copies of it.
427 *
428 * info_lck is only held long enough to read/update the protected variables,
429 * so it's a plain spinlock. The other locks are held longer (potentially
430 * over I/O operations), so we use LWLocks for them. These locks are:
431 *
432 * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
433 * It is only held while initializing and changing the mapping. If the
434 * contents of the buffer being replaced haven't been written yet, the mapping
435 * lock is released while the write is done, and reacquired afterwards.
436 *
437 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
438 * XLogFlush).
439 *
440 * ControlFileLock: must be held to read/update control file or create
441 * new log file.
442 *
443 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
444 * only one checkpointer at a time; currently, with all checkpoints done by
445 * the checkpointer, this is just pro forma).
446 *
447 *----------
448 */
449
/*
 * A request: byte positions up to which the log needs to be written and/or
 * flushed. Same layout as XLogwrtResult; kept as a separate type to make the
 * different role explicit.
 */
typedef struct XLogwrtRqst
{
    XLogRecPtr  Write;          /* last byte + 1 to write out */
    XLogRecPtr  Flush;          /* last byte + 1 to flush */
} XLogwrtRqst;
455
/*
 * A result: byte positions up to which the log has already been written and
 * flushed. Same layout as XLogwrtRqst; kept as a separate type to make the
 * different role explicit.
 */
typedef struct XLogwrtResult
{
    XLogRecPtr  Write;          /* last byte + 1 written out */
    XLogRecPtr  Flush;          /* last byte + 1 flushed */
} XLogwrtResult;
461
462 /*
463 * Inserting to WAL is protected by a small fixed number of WAL insertion
464 * locks. To insert to the WAL, you must hold one of the locks - it doesn't
465 * matter which one. To lock out other concurrent insertions, you must hold
 * all of them. Each WAL insertion lock consists of a lightweight lock, plus an
467 * indicator of how far the insertion has progressed (insertingAt).
468 *
469 * The insertingAt values are read when a process wants to flush WAL from
470 * the in-memory buffers to disk, to check that all the insertions to the
471 * region the process is about to write out have finished. You could simply
472 * wait for all currently in-progress insertions to finish, but the
473 * insertingAt indicator allows you to ignore insertions to later in the WAL,
474 * so that you only wait for the insertions that are modifying the buffers
475 * you're about to write out.
476 *
477 * This isn't just an optimization. If all the WAL buffers are dirty, an
478 * inserter that's holding a WAL insert lock might need to evict an old WAL
479 * buffer, which requires flushing the WAL. If it's possible for an inserter
480 * to block on another inserter unnecessarily, deadlock can arise when two
481 * inserters holding a WAL insert lock wait for each other to finish their
482 * insertion.
483 *
484 * Small WAL records that don't cross a page boundary never update the value,
485 * the WAL record is just copied to the page and the lock is released. But
486 * to avoid the deadlock-scenario explained above, the indicator is always
487 * updated before sleeping while holding an insertion lock.
488 *
489 * lastImportantAt contains the LSN of the last important WAL record inserted
490 * using a given lock. This value is used to detect if there has been
491 * important WAL activity since the last time some action, like a checkpoint,
 * was performed, so the action can be skipped if there was none. The LSN is
493 * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
494 * set. lastImportantAt is never cleared, only overwritten by the LSN of newer
495 * records. Tracking the WAL activity directly in WALInsertLock has the
496 * advantage of not needing any additional locks to update the value.
497 */
/* One WAL insertion lock together with its progress bookkeeping. */
typedef struct
{
    LWLock      lock;           /* the lightweight lock itself */
    XLogRecPtr  insertingAt;    /* how far the insertion has progressed */
    XLogRecPtr  lastImportantAt;    /* LSN of last important record inserted
                                     * using this lock; never cleared, only
                                     * overwritten by newer records */
} WALInsertLock;
504
505 /*
506 * All the WAL insertion locks are allocated as an array in shared memory. We
507 * force the array stride to be a power of 2, which saves a few cycles in
508 * indexing, but more importantly also ensures that individual slots don't
509 * cross cache line boundaries. (Of course, we have to also ensure that the
510 * array start address is suitably aligned.)
511 */
/*
 * A WALInsertLock overlaid with a PG_CACHE_LINE_SIZE-byte pad, so that each
 * array element is at least that large.
 */
typedef union WALInsertLockPadded
{
    WALInsertLock l;            /* the actual lock data */
    char        pad[PG_CACHE_LINE_SIZE];    /* forces the element size */
} WALInsertLockPadded;
517
518 /*
519 * State of an exclusive backup, necessary to control concurrent activities
520 * across sessions when working on exclusive backups.
521 *
522 * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
523 * running, to be more precise pg_start_backup() is not being executed for
524 * an exclusive backup and there is no exclusive backup in progress.
525 * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
526 * exclusive backup.
527 * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
528 * running and an exclusive backup is in progress. pg_stop_backup() is
529 * needed to finish it.
530 * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
531 * exclusive backup.
532 */
/* Life cycle of an exclusive backup; EXCLUSIVE_BACKUP_NONE is explicitly 0. */
typedef enum ExclusiveBackupState
{
    EXCLUSIVE_BACKUP_NONE = 0,  /* no exclusive backup running or starting */
    EXCLUSIVE_BACKUP_STARTING,  /* pg_start_backup() is starting one */
    EXCLUSIVE_BACKUP_IN_PROGRESS,   /* started; needs pg_stop_backup() */
    EXCLUSIVE_BACKUP_STOPPING   /* pg_stop_backup() is stopping it */
} ExclusiveBackupState;
540
/*
 * Session status of running backup, used for sanity checks in SQL-callable
 * functions to start and stop backups. Starts out as SESSION_BACKUP_NONE.
 */
static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
546
/*
 * Shared state data for WAL insertion.
 *
 * Two locking regimes apply: CurrBytePos and PrevBytePos are guarded by the
 * insertpos_lck spinlock, while the fields after the padding are guarded by
 * the WAL insertion locks (hold one to read, all of them to modify).
 */
typedef struct XLogCtlInsert
{
    slock_t     insertpos_lck;  /* protects CurrBytePos and PrevBytePos */

    /*
     * CurrBytePos is the end of reserved WAL. The next record will be
     * inserted at that position. PrevBytePos is the start position of the
     * previously inserted (or rather, reserved) record - it is copied to the
     * prev-link of the next record. These are stored as "usable byte
     * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
     */
    uint64      CurrBytePos;
    uint64      PrevBytePos;

    /*
     * Make sure the above heavily-contended spinlock and byte positions are
     * on their own cache line. In particular, the RedoRecPtr and full page
     * write variables below should be on a different cache line. They are
     * read on every WAL insertion, but updated rarely, and we don't want
     * those reads to steal the cache line containing Curr/PrevBytePos.
     */
    char        pad[PG_CACHE_LINE_SIZE];

    /*
     * fullPageWrites is the master copy used by all backends to determine
     * whether to write full-page images to WAL, instead of using the
     * process-local one. This is required because, when full_page_writes is
     * changed by SIGHUP, we must WAL-log it before it actually affects
     * WAL-logging by backends. Checkpointer sets at startup or after SIGHUP.
     *
     * To read these fields, you must hold an insertion lock. To modify them,
     * you must hold ALL the locks.
     */
    XLogRecPtr  RedoRecPtr;     /* current redo point for insertions */
    bool        forcePageWrites;    /* forcing full-page writes for PITR? */
    bool        fullPageWrites;

    /*
     * exclusiveBackupState indicates the state of an exclusive backup (see
     * comments of ExclusiveBackupState for more details). nonExclusiveBackups
     * is a counter indicating the number of streaming base backups currently
     * in progress. forcePageWrites is set to true when either of these is
     * non-zero. lastBackupStart is the latest checkpoint redo location used
     * as a starting point for an online backup.
     */
    ExclusiveBackupState exclusiveBackupState;
    int         nonExclusiveBackups;
    XLogRecPtr  lastBackupStart;

    /*
     * WAL insertion locks.
     */
    WALInsertLockPadded *WALInsertLocks;
} XLogCtlInsert;
604
/*
 * Total shared-memory state for XLOG.
 *
 * The info_lck spinlock at the end of the struct is documented as locking the
 * shared variables declared before it; fields governed by a different lock
 * (ulsn_lck, WALWriteLock, WALBufMappingLock) say so in their own comments.
 */
typedef struct XLogCtlData
{
    XLogCtlInsert Insert;       /* insertion state, with its own locking */

    /* Protected by info_lck: */
    XLogwrtRqst LogwrtRqst;
    XLogRecPtr  RedoRecPtr;     /* a recent copy of Insert->RedoRecPtr */
    FullTransactionId ckptFullXid;  /* nextFullXid of latest checkpoint */
    XLogRecPtr  asyncXactLSN;   /* LSN of newest async commit/abort */
    XLogRecPtr  replicationSlotMinLSN;  /* oldest LSN needed by any slot */

    XLogSegNo   lastRemovedSegNo;   /* latest removed/recycled XLOG segment */

    /* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
    XLogRecPtr  unloggedLSN;
    slock_t     ulsn_lck;

    /* Time and LSN of last xlog segment switch. Protected by WALWriteLock. */
    pg_time_t   lastSegSwitchTime;
    XLogRecPtr  lastSegSwitchLSN;

    /*
     * Protected by info_lck and WALWriteLock (you must hold either lock to
     * read it, but both to update)
     */
    XLogwrtResult LogwrtResult;

    /*
     * Latest initialized page in the cache (last byte position + 1).
     *
     * To change the identity of a buffer (and InitializedUpTo), you need to
     * hold WALBufMappingLock. To change the identity of a buffer that's
     * still dirty, the old page needs to be written out first, and for that
     * you need WALWriteLock, and you need to ensure that there are no
     * in-progress insertions to the page by calling
     * WaitXLogInsertionsToFinish().
     */
    XLogRecPtr  InitializedUpTo;

    /*
     * These values do not change after startup, although the pointed-to pages
     * and xlblocks values certainly do. xlblocks values are protected by
     * WALBufMappingLock.
     */
    char       *pages;          /* buffers for unwritten XLOG pages */
    XLogRecPtr *xlblocks;       /* 1st byte ptr-s + XLOG_BLCKSZ */
    int         XLogCacheBlck;  /* highest allocated xlog buffer index */

    /*
     * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
     * If we created a new timeline when the system was started up,
     * PrevTimeLineID is the old timeline's ID that we forked off from.
     * Otherwise it's equal to ThisTimeLineID.
     */
    TimeLineID  ThisTimeLineID;
    TimeLineID  PrevTimeLineID;

    /*
     * SharedRecoveryState indicates if we're still in crash or archive
     * recovery. Protected by info_lck.
     */
    RecoveryState SharedRecoveryState;

    /*
     * SharedHotStandbyActive indicates if we allow hot standby queries to be
     * run. Protected by info_lck.
     */
    bool        SharedHotStandbyActive;

    /*
     * SharedPromoteIsTriggered indicates if a standby promotion has been
     * triggered. Protected by info_lck.
     */
    bool        SharedPromoteIsTriggered;

    /*
     * WalWriterSleeping indicates whether the WAL writer is currently in
     * low-power mode (and hence should be nudged if an async commit occurs).
     * Protected by info_lck.
     */
    bool        WalWriterSleeping;

    /*
     * recoveryWakeupLatch is used to wake up the startup process to continue
     * WAL replay, if it is waiting for WAL to arrive or failover trigger file
     * to appear.
     */
    Latch       recoveryWakeupLatch;

    /*
     * During recovery, we keep a copy of the latest checkpoint record here.
     * lastCheckPointRecPtr points to start of checkpoint record and
     * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
     * checkpointer when it wants to create a restartpoint.
     *
     * Protected by info_lck.
     */
    XLogRecPtr  lastCheckPointRecPtr;
    XLogRecPtr  lastCheckPointEndPtr;
    CheckPoint  lastCheckPoint;

    /*
     * lastReplayedEndRecPtr points to end+1 of the last record successfully
     * replayed. When we're currently replaying a record, i.e. in a redo
     * function, replayEndRecPtr points to the end+1 of the record being
     * replayed, otherwise it's equal to lastReplayedEndRecPtr.
     *
     * NOTE(review): lastReplayedTLI and replayEndTLI are presumably the
     * timelines belonging to those two positions -- confirm where they are
     * assigned.
     */
    XLogRecPtr  lastReplayedEndRecPtr;
    TimeLineID  lastReplayedTLI;
    XLogRecPtr  replayEndRecPtr;
    TimeLineID  replayEndTLI;
    /* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
    TimestampTz recoveryLastXTime;

    /*
     * timestamp of when we started replaying the current chunk of WAL data,
     * only relevant for replication or archive recovery
     */
    TimestampTz currentChunkStartTime;
    /* Are we requested to pause recovery? */
    bool        recoveryPause;

    /*
     * lastFpwDisableRecPtr points to the start of the last replayed
     * XLOG_FPW_CHANGE record indicating that full_page_writes was disabled.
     */
    XLogRecPtr  lastFpwDisableRecPtr;

    slock_t     info_lck;       /* locks shared variables shown above */
} XLogCtlData;
738
/*
 * Pointer to the XLOG shared-memory state. NULL until it is set up, which
 * does not happen in this part of the file.
 */
static XLogCtlData *XLogCtl = NULL;

/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
static WALInsertLockPadded *WALInsertLocks = NULL;

/*
 * We maintain an image of pg_control in shared memory. This pointer starts
 * out NULL.
 */
static ControlFileData *ControlFile = NULL;
748
/*
 * Calculate the amount of space left on the page after 'endptr'.
 *
 * Yields 0 when 'endptr' sits exactly on a page boundary, otherwise the
 * number of bytes between 'endptr' and the end of its XLOG_BLCKSZ-sized page.
 *
 * The outer modulo folds the "exactly on a boundary" case (where the inner
 * subtraction gives a full XLOG_BLCKSZ) down to 0. Written this way the
 * argument is expanded only once, so an expression with side effects is
 * evaluated a single time.
 */
#define INSERT_FREESPACE(endptr)    \
    ((XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ) % XLOG_BLCKSZ)
755
/*
 * Step a WAL buffer index forward by one slot, wrapping back to slot 0 once
 * the highest allocated index (XLogCtl->XLogCacheBlck) has been reached.
 * 'idx' appears twice in the expansion, so avoid arguments with side effects.
 */
#define NextBufIdx(idx)     \
    (((idx) != XLogCtl->XLogCacheBlck) ? ((idx) + 1) : 0)
759
/*
 * Map 'recptr' to the index of the WAL buffer that holds the page containing
 * it, or that would hold that page if it were in the cache. The page number
 * (recptr / XLOG_BLCKSZ) is reduced modulo the number of buffers, which is
 * one more than the highest buffer index.
 */
#define XLogRecPtrToBufIdx(recptr)  \
    ((recptr) / XLOG_BLCKSZ % (1 + XLogCtl->XLogCacheBlck))
766
/*
 * The number of bytes in a WAL page usable for WAL data: the page size
 * (XLOG_BLCKSZ) less SizeOfXLogShortPHD.
 * NOTE(review): SizeOfXLogShortPHD is presumably the size of the short page
 * header -- confirm in xlog_internal.h.
 */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)

/*
 * Convert values of GUCs measured in megabytes to equiv. segment count.
 * Rounds down.
 *
 * This is a plain alias that forwards both arguments to XLogMBVarToSegs().
 */
#define ConvertToXSegs(x, segsize) XLogMBVarToSegs((x), (segsize))

/*
 * The number of bytes in a WAL segment usable for WAL data.
 * Not initialized here; the value is computed elsewhere.
 */
static int UsableBytesInSegment;

/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above. Both positions start at 0.
 */
static XLogwrtResult LogwrtResult = {0, 0};
786
/*
 * Where a WAL file came from during recovery, or where the next attempt to
 * obtain one should look. The values double as indexes into
 * xlogSourceNames[], so they are spelled out explicitly.
 */
typedef enum
{
    XLOG_FROM_ANY = 0,          /* request to read WAL from any source */
    XLOG_FROM_ARCHIVE = 1,      /* restored using restore_command */
    XLOG_FROM_PG_WAL = 2,       /* existing file in pg_wal */
    XLOG_FROM_STREAM = 3        /* streamed from master */
} XLogSource;

/*
 * Human-readable names for XLogSources, for debugging output. Each entry is
 * keyed by its enumerator so the table cannot drift out of step with the
 * enum above.
 */
static const char *const xlogSourceNames[] = {
    [XLOG_FROM_ANY] = "any",
    [XLOG_FROM_ARCHIVE] = "archive",
    [XLOG_FROM_PG_WAL] = "pg_wal",
    [XLOG_FROM_STREAM] = "stream"
};
801
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * openLogSegNo identifies the segment. These variables are only used to
 * write the XLOG, and so will normally refer to the active segment.
 * Note: call Reserve/ReleaseExternalFD to track consumption of this FD.
 */
static int openLogFile = -1;        /* -1 means no segment is open */
static XLogSegNo openLogSegNo = 0;
810
/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG. readOff is the offset of the page just read, readLen
 * indicates how much of it has been read into readBuf, and readSource
 * indicates where we got the currently open file from.
 * Note: we could use Reserve/ReleaseExternalFD to track consumption of
 * this FD too; but it doesn't currently seem worthwhile, since the XLOG is
 * not read by general-purpose sessions.
 */
static int readFile = -1;           /* -1 until a file is opened for reading */
static XLogSegNo readSegNo = 0;
static uint32 readOff = 0;
static uint32 readLen = 0;
static XLogSource readSource = XLOG_FROM_ANY;
825
826 /*
827 * Keeps track of which source we're currently reading from. This is
828 * different from readSource in that this is always set, even when we don't
829 * currently have a WAL file open. If lastSourceFailed is set, our last
830 * attempt to read from currentSource failed, and we should try another source
831 * next.
832 *
833 * pendingWalRcvRestart is set when a config change occurs that requires a
834 * walreceiver restart. This is only valid in XLOG_FROM_STREAM state.
835 */
836 static XLogSource currentSource = XLOG_FROM_ANY;
837 static bool lastSourceFailed = false;
838 static bool pendingWalRcvRestart = false;
839
/*
 * Settings bundled together for a WAL page read.
 * NOTE(review): presumably handed to XLogPageRead() through the reader's
 * private-data pointer -- confirm against the callers.
 */
typedef struct XLogPageReadPrivate
{
	int			emode;			/* presumably the elog level used to report
								 * read problems -- TODO confirm */
	bool		fetching_ckpt;	/* are we fetching a checkpoint record? */
	bool		randAccess;		/* meaning not visible here; see
								 * WaitForWALToBecomeAvailable() */
} XLogPageReadPrivate;
846
847 /*
848 * These variables track when we last obtained some WAL data to process,
849 * and where we got it from. (XLogReceiptSource is initially the same as
850 * readSource, but readSource gets reset to zero when we don't have data
851 * to process right now. It is also different from currentSource, which
852 * also changes when we try to read from a source and fail, while
853 * XLogReceiptSource tracks where we last successfully read some WAL.)
854 */
855 static TimestampTz XLogReceiptTime = 0;
856 static XLogSource XLogReceiptSource = XLOG_FROM_ANY;
857
858 /* State information for XLOG reading */
859 static XLogRecPtr ReadRecPtr; /* start of last record read */
860 static XLogRecPtr EndRecPtr; /* end+1 of last record read */
861
862 /*
863 * Local copies of equivalent fields in the control file. When running
864 * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
865 * expect to replay all the WAL available, and updateMinRecoveryPoint is
866 * switched to false to prevent any updates while replaying records.
867 * Those values are kept consistent as long as crash recovery runs.
868 */
869 static XLogRecPtr minRecoveryPoint;
870 static TimeLineID minRecoveryPointTLI;
871 static bool updateMinRecoveryPoint = true;
872
873 /*
874 * Have we reached a consistent database state? In crash recovery, we have
875 * to replay all the WAL, so reachedConsistency is never set. During archive
876 * recovery, the database is consistent once minRecoveryPoint is reached.
877 */
878 bool reachedConsistency = false;
879
880 static bool InRedo = false;
881
882 /* Have we launched bgwriter during recovery? */
883 static bool bgwriterLaunched = false;
884
885 /* For WALInsertLockAcquire/Release functions */
886 static int MyLockNo = 0;
887 static bool holdingAllLocks = false;
888
889 #ifdef WAL_DEBUG
890 static MemoryContext walDebugCxt = NULL;
891 #endif
892
893 static void readRecoverySignalFile(void);
894 static void validateRecoveryParameters(void);
895 static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
896 static bool recoveryStopsBefore(XLogReaderState *record);
897 static bool recoveryStopsAfter(XLogReaderState *record);
898 static void recoveryPausesHere(bool endOfRecovery);
899 static bool recoveryApplyDelay(XLogReaderState *record);
900 static void SetLatestXTime(TimestampTz xtime);
901 static void SetCurrentChunkStartTime(TimestampTz xtime);
902 static void CheckRequiredParameterValues(void);
903 static void XLogReportParameters(void);
904 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
905 TimeLineID prevTLI);
906 static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
907 XLogReaderState *state);
908 static void LocalSetXLogInsertAllowed(void);
909 static void CreateEndOfRecoveryRecord(void);
910 static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
911 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
912 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
913 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
914
915 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
916 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
917 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
918 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
919 bool find_free, XLogSegNo max_segno,
920 bool use_lock);
921 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
922 XLogSource source, bool notfoundOk);
923 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source);
924 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
925 int reqLen, XLogRecPtr targetRecPtr, char *readBuf);
926 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
927 bool fetching_ckpt, XLogRecPtr tliRecPtr);
928 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
929 static void XLogFileClose(void);
930 static void PreallocXlogFiles(XLogRecPtr endptr);
931 static void RemoveTempXlogFiles(void);
932 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr);
933 static void RemoveXlogFile(const char *segname, XLogRecPtr lastredoptr, XLogRecPtr endptr);
934 static void UpdateLastRemovedPtr(char *filename);
935 static void ValidateXLOGDirectoryStructure(void);
936 static void CleanupBackupHistory(void);
937 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
938 static XLogRecord *ReadRecord(XLogReaderState *xlogreader,
939 int emode, bool fetching_ckpt);
940 static void CheckRecoveryConsistency(void);
941 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
942 XLogRecPtr RecPtr, int whichChkpt, bool report);
943 static bool rescanLatestTimeLine(void);
944 static void InitControlFile(uint64 sysidentifier);
945 static void WriteControlFile(void);
946 static void ReadControlFile(void);
947 static char *str_time(pg_time_t tnow);
948 static void SetPromoteIsTriggered(void);
949 static bool CheckForStandbyTrigger(void);
950
951 #ifdef WAL_DEBUG
952 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
953 #endif
954 static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
955 static void pg_start_backup_callback(int code, Datum arg);
956 static void pg_stop_backup_callback(int code, Datum arg);
957 static bool read_backup_label(XLogRecPtr *checkPointLoc,
958 bool *backupEndRequired, bool *backupFromStandby);
959 static bool read_tablespace_map(List **tablespaces);
960
961 static void rm_redo_error_callback(void *arg);
962 static int get_sync_bit(int method);
963
964 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
965 XLogRecData *rdata,
966 XLogRecPtr StartPos, XLogRecPtr EndPos);
967 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
968 XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
969 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
970 XLogRecPtr *PrevPtr);
971 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
972 static char *GetXLogBuffer(XLogRecPtr ptr);
973 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
974 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
975 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
976 static void checkXLogConsistency(XLogReaderState *record);
977
978 static void WALInsertLockAcquire(void);
979 static void WALInsertLockAcquireExclusive(void);
980 static void WALInsertLockRelease(void);
981 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
982
983 /*
984 * Insert an XLOG record represented by an already-constructed chain of data
985 * chunks. This is a low-level routine; to construct the WAL record header
986 * and data, use the higher-level routines in xloginsert.c.
987 *
988 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
989 * WAL record applies to, that were not included in the record as full page
990 * images. If fpw_lsn <= RedoRecPtr, the function does not perform the
991 * insertion and returns InvalidXLogRecPtr. The caller can then recalculate
992 * which pages need a full-page image, and retry. If fpw_lsn is invalid, the
993 * record is always inserted.
994 *
995 * 'flags' gives more in-depth control on the record being inserted. See
996 * XLogSetRecordFlags() for details.
997 *
998 * The first XLogRecData in the chain must be for the record header, and its
999 * data must be MAXALIGNed. XLogInsertRecord fills in the xl_prev and
1000 * xl_crc fields in the header, the rest of the header must already be filled
1001 * by the caller.
1002 *
1003 * Returns XLOG pointer to end of record (beginning of next record).
1004 * This can be used as LSN for data pages affected by the logged action.
1005 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
1006 * before the data page can be written out. This implements the basic
1007 * WAL rule "write the log before the data".)
1008 */
1009 XLogRecPtr
XLogInsertRecord(XLogRecData * rdata,XLogRecPtr fpw_lsn,uint8 flags,int num_fpi)1010 XLogInsertRecord(XLogRecData *rdata,
1011 XLogRecPtr fpw_lsn,
1012 uint8 flags,
1013 int num_fpi)
1014 {
1015 XLogCtlInsert *Insert = &XLogCtl->Insert;
1016 pg_crc32c rdata_crc;
1017 bool inserted;
1018 XLogRecord *rechdr = (XLogRecord *) rdata->data;
1019 uint8 info = rechdr->xl_info & ~XLR_INFO_MASK;
1020 bool isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
1021 info == XLOG_SWITCH);
1022 XLogRecPtr StartPos;
1023 XLogRecPtr EndPos;
1024 bool prevDoPageWrites = doPageWrites;
1025
1026 /* we assume that all of the record header is in the first chunk */
1027 Assert(rdata->len >= SizeOfXLogRecord);
1028
1029 /* cross-check on whether we should be here or not */
1030 if (!XLogInsertAllowed())
1031 elog(ERROR, "cannot make new WAL entries during recovery");
1032
1033 /*----------
1034 *
1035 * We have now done all the preparatory work we can without holding a
1036 * lock or modifying shared state. From here on, inserting the new WAL
1037 * record to the shared WAL buffer cache is a two-step process:
1038 *
1039 * 1. Reserve the right amount of space from the WAL. The current head of
1040 * reserved space is kept in Insert->CurrBytePos, and is protected by
1041 * insertpos_lck.
1042 *
1043 * 2. Copy the record to the reserved WAL space. This involves finding the
1044 * correct WAL buffer containing the reserved space, and copying the
1045 * record in place. This can be done concurrently in multiple processes.
1046 *
1047 * To keep track of which insertions are still in-progress, each concurrent
1048 * inserter acquires an insertion lock. In addition to just indicating that
1049 * an insertion is in progress, the lock tells others how far the inserter
1050 * has progressed. There is a small fixed number of insertion locks,
1051 * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
1052 * boundary, it updates the value stored in the lock to the how far it has
1053 * inserted, to allow the previous buffer to be flushed.
1054 *
1055 * Holding onto an insertion lock also protects RedoRecPtr and
1056 * fullPageWrites from changing until the insertion is finished.
1057 *
1058 * Step 2 can usually be done completely in parallel. If the required WAL
1059 * page is not initialized yet, you have to grab WALBufMappingLock to
1060 * initialize it, but the WAL writer tries to do that ahead of insertions
1061 * to avoid that from happening in the critical path.
1062 *
1063 *----------
1064 */
1065 START_CRIT_SECTION();
1066 if (isLogSwitch)
1067 WALInsertLockAcquireExclusive();
1068 else
1069 WALInsertLockAcquire();
1070
1071 /*
1072 * Check to see if my copy of RedoRecPtr is out of date. If so, may have
1073 * to go back and have the caller recompute everything. This can only
1074 * happen just after a checkpoint, so it's better to be slow in this case
1075 * and fast otherwise.
1076 *
1077 * Also check to see if fullPageWrites or forcePageWrites was just turned
1078 * on; if we weren't already doing full-page writes then go back and
1079 * recompute.
1080 *
1081 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
1082 * affect the contents of the XLOG record, so we'll update our local copy
1083 * but not force a recomputation. (If doPageWrites was just turned off,
1084 * we could recompute the record without full pages, but we choose not to
1085 * bother.)
1086 */
1087 if (RedoRecPtr != Insert->RedoRecPtr)
1088 {
1089 Assert(RedoRecPtr < Insert->RedoRecPtr);
1090 RedoRecPtr = Insert->RedoRecPtr;
1091 }
1092 doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
1093
1094 if (doPageWrites &&
1095 (!prevDoPageWrites ||
1096 (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
1097 {
1098 /*
1099 * Oops, some buffer now needs to be backed up that the caller didn't
1100 * back up. Start over.
1101 */
1102 WALInsertLockRelease();
1103 END_CRIT_SECTION();
1104 return InvalidXLogRecPtr;
1105 }
1106
1107 /*
1108 * Reserve space for the record in the WAL. This also sets the xl_prev
1109 * pointer.
1110 */
1111 if (isLogSwitch)
1112 inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
1113 else
1114 {
1115 ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
1116 &rechdr->xl_prev);
1117 inserted = true;
1118 }
1119
1120 if (inserted)
1121 {
1122 /*
1123 * Now that xl_prev has been filled in, calculate CRC of the record
1124 * header.
1125 */
1126 rdata_crc = rechdr->xl_crc;
1127 COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
1128 FIN_CRC32C(rdata_crc);
1129 rechdr->xl_crc = rdata_crc;
1130
1131 /*
1132 * All the record data, including the header, is now ready to be
1133 * inserted. Copy the record in the space reserved.
1134 */
1135 CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
1136 StartPos, EndPos);
1137
1138 /*
1139 * Unless record is flagged as not important, update LSN of last
1140 * important record in the current slot. When holding all locks, just
1141 * update the first one.
1142 */
1143 if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
1144 {
1145 int lockno = holdingAllLocks ? 0 : MyLockNo;
1146
1147 WALInsertLocks[lockno].l.lastImportantAt = StartPos;
1148 }
1149 }
1150 else
1151 {
1152 /*
1153 * This was an xlog-switch record, but the current insert location was
1154 * already exactly at the beginning of a segment, so there was no need
1155 * to do anything.
1156 */
1157 }
1158
1159 /*
1160 * Done! Let others know that we're finished.
1161 */
1162 WALInsertLockRelease();
1163
1164 MarkCurrentTransactionIdLoggedIfAny();
1165
1166 END_CRIT_SECTION();
1167
1168 /*
1169 * Update shared LogwrtRqst.Write, if we crossed page boundary.
1170 */
1171 if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1172 {
1173 SpinLockAcquire(&XLogCtl->info_lck);
1174 /* advance global request to include new block(s) */
1175 if (XLogCtl->LogwrtRqst.Write < EndPos)
1176 XLogCtl->LogwrtRqst.Write = EndPos;
1177 /* update local result copy while I have the chance */
1178 LogwrtResult = XLogCtl->LogwrtResult;
1179 SpinLockRelease(&XLogCtl->info_lck);
1180 }
1181
1182 /*
1183 * If this was an XLOG_SWITCH record, flush the record and the empty
1184 * padding space that fills the rest of the segment, and perform
1185 * end-of-segment actions (eg, notifying archiver).
1186 */
1187 if (isLogSwitch)
1188 {
1189 TRACE_POSTGRESQL_WAL_SWITCH();
1190 XLogFlush(EndPos);
1191
1192 /*
1193 * Even though we reserved the rest of the segment for us, which is
1194 * reflected in EndPos, we return a pointer to just the end of the
1195 * xlog-switch record.
1196 */
1197 if (inserted)
1198 {
1199 EndPos = StartPos + SizeOfXLogRecord;
1200 if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
1201 {
1202 uint64 offset = XLogSegmentOffset(EndPos, wal_segment_size);
1203
1204 if (offset == EndPos % XLOG_BLCKSZ)
1205 EndPos += SizeOfXLogLongPHD;
1206 else
1207 EndPos += SizeOfXLogShortPHD;
1208 }
1209 }
1210 }
1211
1212 #ifdef WAL_DEBUG
1213 if (XLOG_DEBUG)
1214 {
1215 static XLogReaderState *debug_reader = NULL;
1216 StringInfoData buf;
1217 StringInfoData recordBuf;
1218 char *errormsg = NULL;
1219 MemoryContext oldCxt;
1220
1221 oldCxt = MemoryContextSwitchTo(walDebugCxt);
1222
1223 initStringInfo(&buf);
1224 appendStringInfo(&buf, "INSERT @ %X/%X: ",
1225 (uint32) (EndPos >> 32), (uint32) EndPos);
1226
1227 /*
1228 * We have to piece together the WAL record data from the XLogRecData
1229 * entries, so that we can pass it to the rm_desc function as one
1230 * contiguous chunk.
1231 */
1232 initStringInfo(&recordBuf);
1233 for (; rdata != NULL; rdata = rdata->next)
1234 appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);
1235
1236 if (!debug_reader)
1237 debug_reader = XLogReaderAllocate(wal_segment_size, NULL,
1238 XL_ROUTINE(), NULL);
1239
1240 if (!debug_reader)
1241 {
1242 appendStringInfoString(&buf, "error decoding record: out of memory");
1243 }
1244 else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
1245 &errormsg))
1246 {
1247 appendStringInfo(&buf, "error decoding record: %s",
1248 errormsg ? errormsg : "no error message");
1249 }
1250 else
1251 {
1252 appendStringInfoString(&buf, " - ");
1253 xlog_outdesc(&buf, debug_reader);
1254 }
1255 elog(LOG, "%s", buf.data);
1256
1257 pfree(buf.data);
1258 pfree(recordBuf.data);
1259 MemoryContextSwitchTo(oldCxt);
1260 }
1261 #endif
1262
1263 /*
1264 * Update our global variables
1265 */
1266 ProcLastRecPtr = StartPos;
1267 XactLastRecEnd = EndPos;
1268
1269 /* Report WAL traffic to the instrumentation. */
1270 if (inserted)
1271 {
1272 pgWalUsage.wal_bytes += rechdr->xl_tot_len;
1273 pgWalUsage.wal_records++;
1274 pgWalUsage.wal_fpi += num_fpi;
1275 }
1276
1277 return EndPos;
1278 }
1279
1280 /*
1281 * Reserves the right amount of space for a record of given size from the WAL.
1282 * *StartPos is set to the beginning of the reserved section, *EndPos to
1283 * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1284 * used to set the xl_prev of this record.
1285 *
1286 * This is the performance critical part of XLogInsert that must be serialized
1287 * across backends. The rest can happen mostly in parallel. Try to keep this
1288 * section as short as possible, insertpos_lck can be heavily contended on a
1289 * busy system.
1290 *
1291 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1292 * where we actually copy the record to the reserved space.
1293 */
1294 static void
ReserveXLogInsertLocation(int size,XLogRecPtr * StartPos,XLogRecPtr * EndPos,XLogRecPtr * PrevPtr)1295 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1296 XLogRecPtr *PrevPtr)
1297 {
1298 XLogCtlInsert *Insert = &XLogCtl->Insert;
1299 uint64 startbytepos;
1300 uint64 endbytepos;
1301 uint64 prevbytepos;
1302
1303 size = MAXALIGN(size);
1304
1305 /* All (non xlog-switch) records should contain data. */
1306 Assert(size > SizeOfXLogRecord);
1307
1308 /*
1309 * The duration the spinlock needs to be held is minimized by minimizing
1310 * the calculations that have to be done while holding the lock. The
1311 * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1312 * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1313 * page headers. The mapping between "usable" byte positions and physical
1314 * positions (XLogRecPtrs) can be done outside the locked region, and
1315 * because the usable byte position doesn't include any headers, reserving
1316 * X bytes from WAL is almost as simple as "CurrBytePos += X".
1317 */
1318 SpinLockAcquire(&Insert->insertpos_lck);
1319
1320 startbytepos = Insert->CurrBytePos;
1321 endbytepos = startbytepos + size;
1322 prevbytepos = Insert->PrevBytePos;
1323 Insert->CurrBytePos = endbytepos;
1324 Insert->PrevBytePos = startbytepos;
1325
1326 SpinLockRelease(&Insert->insertpos_lck);
1327
1328 *StartPos = XLogBytePosToRecPtr(startbytepos);
1329 *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1330 *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1331
1332 /*
1333 * Check that the conversions between "usable byte positions" and
1334 * XLogRecPtrs work consistently in both directions.
1335 */
1336 Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1337 Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1338 Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1339 }
1340
1341 /*
1342 * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
1343 *
1344 * A log-switch record is handled slightly differently. The rest of the
1345 * segment will be reserved for this insertion, as indicated by the returned
1346 * *EndPos value. However, if we are already at the beginning of the current
1347 * segment, *StartPos and *EndPos are set to the current location without
1348 * reserving any space, and the function returns false.
1349 */
1350 static bool
ReserveXLogSwitch(XLogRecPtr * StartPos,XLogRecPtr * EndPos,XLogRecPtr * PrevPtr)1351 ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
1352 {
1353 XLogCtlInsert *Insert = &XLogCtl->Insert;
1354 uint64 startbytepos;
1355 uint64 endbytepos;
1356 uint64 prevbytepos;
1357 uint32 size = MAXALIGN(SizeOfXLogRecord);
1358 XLogRecPtr ptr;
1359 uint32 segleft;
1360
1361 /*
1362 * These calculations are a bit heavy-weight to be done while holding a
1363 * spinlock, but since we're holding all the WAL insertion locks, there
1364 * are no other inserters competing for it. GetXLogInsertRecPtr() does
1365 * compete for it, but that's not called very frequently.
1366 */
1367 SpinLockAcquire(&Insert->insertpos_lck);
1368
1369 startbytepos = Insert->CurrBytePos;
1370
1371 ptr = XLogBytePosToEndRecPtr(startbytepos);
1372 if (XLogSegmentOffset(ptr, wal_segment_size) == 0)
1373 {
1374 SpinLockRelease(&Insert->insertpos_lck);
1375 *EndPos = *StartPos = ptr;
1376 return false;
1377 }
1378
1379 endbytepos = startbytepos + size;
1380 prevbytepos = Insert->PrevBytePos;
1381
1382 *StartPos = XLogBytePosToRecPtr(startbytepos);
1383 *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1384
1385 segleft = wal_segment_size - XLogSegmentOffset(*EndPos, wal_segment_size);
1386 if (segleft != wal_segment_size)
1387 {
1388 /* consume the rest of the segment */
1389 *EndPos += segleft;
1390 endbytepos = XLogRecPtrToBytePos(*EndPos);
1391 }
1392 Insert->CurrBytePos = endbytepos;
1393 Insert->PrevBytePos = startbytepos;
1394
1395 SpinLockRelease(&Insert->insertpos_lck);
1396
1397 *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1398
1399 Assert(XLogSegmentOffset(*EndPos, wal_segment_size) == 0);
1400 Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1401 Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1402 Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1403
1404 return true;
1405 }
1406
1407 /*
1408 * Checks whether the current buffer page and backup page stored in the
1409 * WAL record are consistent or not. Before comparing the two pages, a
1410 * masking can be applied to the pages to ignore certain areas like hint bits,
1411 * unused space between pd_lower and pd_upper among other things. This
1412 * function should be called once WAL replay has been completed for a
1413 * given record.
1414 */
1415 static void
checkXLogConsistency(XLogReaderState * record)1416 checkXLogConsistency(XLogReaderState *record)
1417 {
1418 RmgrId rmid = XLogRecGetRmid(record);
1419 RelFileNode rnode;
1420 ForkNumber forknum;
1421 BlockNumber blkno;
1422 int block_id;
1423
1424 /* Records with no backup blocks have no need for consistency checks. */
1425 if (!XLogRecHasAnyBlockRefs(record))
1426 return;
1427
1428 Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);
1429
1430 for (block_id = 0; block_id <= record->max_block_id; block_id++)
1431 {
1432 Buffer buf;
1433 Page page;
1434
1435 if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
1436 {
1437 /*
1438 * WAL record doesn't contain a block reference with the given id.
1439 * Do nothing.
1440 */
1441 continue;
1442 }
1443
1444 Assert(XLogRecHasBlockImage(record, block_id));
1445
1446 if (XLogRecBlockImageApply(record, block_id))
1447 {
1448 /*
1449 * WAL record has already applied the page, so bypass the
1450 * consistency check as that would result in comparing the full
1451 * page stored in the record with itself.
1452 */
1453 continue;
1454 }
1455
1456 /*
1457 * Read the contents from the current buffer and store it in a
1458 * temporary page.
1459 */
1460 buf = XLogReadBufferExtended(rnode, forknum, blkno,
1461 RBM_NORMAL_NO_LOG);
1462 if (!BufferIsValid(buf))
1463 continue;
1464
1465 LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
1466 page = BufferGetPage(buf);
1467
1468 /*
1469 * Take a copy of the local page where WAL has been applied to have a
1470 * comparison base before masking it...
1471 */
1472 memcpy(replay_image_masked, page, BLCKSZ);
1473
1474 /* No need for this page anymore now that a copy is in. */
1475 UnlockReleaseBuffer(buf);
1476
1477 /*
1478 * If the block LSN is already ahead of this WAL record, we can't
1479 * expect contents to match. This can happen if recovery is
1480 * restarted.
1481 */
1482 if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
1483 continue;
1484
1485 /*
1486 * Read the contents from the backup copy, stored in WAL record and
1487 * store it in a temporary page. There is no need to allocate a new
1488 * page here, a local buffer is fine to hold its contents and a mask
1489 * can be directly applied on it.
1490 */
1491 if (!RestoreBlockImage(record, block_id, master_image_masked))
1492 elog(ERROR, "failed to restore block image");
1493
1494 /*
1495 * If masking function is defined, mask both the master and replay
1496 * images
1497 */
1498 if (RmgrTable[rmid].rm_mask != NULL)
1499 {
1500 RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
1501 RmgrTable[rmid].rm_mask(master_image_masked, blkno);
1502 }
1503
1504 /* Time to compare the master and replay images. */
1505 if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
1506 {
1507 elog(FATAL,
1508 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
1509 rnode.spcNode, rnode.dbNode, rnode.relNode,
1510 forknum, blkno);
1511 }
1512 }
1513 }
1514
1515 /*
1516 * Subroutine of XLogInsertRecord. Copies a WAL record to an already-reserved
1517 * area in the WAL.
1518 */
1519 static void
CopyXLogRecordToWAL(int write_len,bool isLogSwitch,XLogRecData * rdata,XLogRecPtr StartPos,XLogRecPtr EndPos)1520 CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
1521 XLogRecPtr StartPos, XLogRecPtr EndPos)
1522 {
1523 char *currpos;
1524 int freespace;
1525 int written;
1526 XLogRecPtr CurrPos;
1527 XLogPageHeader pagehdr;
1528
1529 /*
1530 * Get a pointer to the right place in the right WAL buffer to start
1531 * inserting to.
1532 */
1533 CurrPos = StartPos;
1534 currpos = GetXLogBuffer(CurrPos);
1535 freespace = INSERT_FREESPACE(CurrPos);
1536
1537 /*
1538 * there should be enough space for at least the first field (xl_tot_len)
1539 * on this page.
1540 */
1541 Assert(freespace >= sizeof(uint32));
1542
1543 /* Copy record data */
1544 written = 0;
1545 while (rdata != NULL)
1546 {
1547 char *rdata_data = rdata->data;
1548 int rdata_len = rdata->len;
1549
1550 while (rdata_len > freespace)
1551 {
1552 /*
1553 * Write what fits on this page, and continue on the next page.
1554 */
1555 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
1556 memcpy(currpos, rdata_data, freespace);
1557 rdata_data += freespace;
1558 rdata_len -= freespace;
1559 written += freespace;
1560 CurrPos += freespace;
1561
1562 /*
1563 * Get pointer to beginning of next page, and set the xlp_rem_len
1564 * in the page header. Set XLP_FIRST_IS_CONTRECORD.
1565 *
1566 * It's safe to set the contrecord flag and xlp_rem_len without a
1567 * lock on the page. All the other flags were already set when the
1568 * page was initialized, in AdvanceXLInsertBuffer, and we're the
1569 * only backend that needs to set the contrecord flag.
1570 */
1571 currpos = GetXLogBuffer(CurrPos);
1572 pagehdr = (XLogPageHeader) currpos;
1573 pagehdr->xlp_rem_len = write_len - written;
1574 pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;
1575
1576 /* skip over the page header */
1577 if (XLogSegmentOffset(CurrPos, wal_segment_size) == 0)
1578 {
1579 CurrPos += SizeOfXLogLongPHD;
1580 currpos += SizeOfXLogLongPHD;
1581 }
1582 else
1583 {
1584 CurrPos += SizeOfXLogShortPHD;
1585 currpos += SizeOfXLogShortPHD;
1586 }
1587 freespace = INSERT_FREESPACE(CurrPos);
1588 }
1589
1590 Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
1591 memcpy(currpos, rdata_data, rdata_len);
1592 currpos += rdata_len;
1593 CurrPos += rdata_len;
1594 freespace -= rdata_len;
1595 written += rdata_len;
1596
1597 rdata = rdata->next;
1598 }
1599 Assert(written == write_len);
1600
1601 /*
1602 * If this was an xlog-switch, it's not enough to write the switch record,
1603 * we also have to consume all the remaining space in the WAL segment. We
1604 * have already reserved that space, but we need to actually fill it.
1605 */
1606 if (isLogSwitch && XLogSegmentOffset(CurrPos, wal_segment_size) != 0)
1607 {
1608 /* An xlog-switch record doesn't contain any data besides the header */
1609 Assert(write_len == SizeOfXLogRecord);
1610
1611 /* Assert that we did reserve the right amount of space */
1612 Assert(XLogSegmentOffset(EndPos, wal_segment_size) == 0);
1613
1614 /* Use up all the remaining space on the current page */
1615 CurrPos += freespace;
1616
1617 /*
1618 * Cause all remaining pages in the segment to be flushed, leaving the
1619 * XLog position where it should be, at the start of the next segment.
1620 * We do this one page at a time, to make sure we don't deadlock
1621 * against ourselves if wal_buffers < wal_segment_size.
1622 */
1623 while (CurrPos < EndPos)
1624 {
1625 /*
1626 * The minimal action to flush the page would be to call
1627 * WALInsertLockUpdateInsertingAt(CurrPos) followed by
1628 * AdvanceXLInsertBuffer(...). The page would be left initialized
1629 * mostly to zeros, except for the page header (always the short
1630 * variant, as this is never a segment's first page).
1631 *
1632 * The large vistas of zeros are good for compressibility, but the
1633 * headers interrupting them every XLOG_BLCKSZ (with values that
1634 * differ from page to page) are not. The effect varies with
1635 * compression tool, but bzip2 for instance compresses about an
1636 * order of magnitude worse if those headers are left in place.
1637 *
1638 * Rather than complicating AdvanceXLInsertBuffer itself (which is
1639 * called in heavily-loaded circumstances as well as this lightly-
1640 * loaded one) with variant behavior, we just use GetXLogBuffer
1641 * (which itself calls the two methods we need) to get the pointer
1642 * and zero most of the page. Then we just zero the page header.
1643 */
1644 currpos = GetXLogBuffer(CurrPos);
1645 MemSet(currpos, 0, SizeOfXLogShortPHD);
1646
1647 CurrPos += XLOG_BLCKSZ;
1648 }
1649 }
1650 else
1651 {
1652 /* Align the end position, so that the next record starts aligned */
1653 CurrPos = MAXALIGN64(CurrPos);
1654 }
1655
1656 if (CurrPos != EndPos)
1657 elog(PANIC, "space reserved for WAL record does not match what was written");
1658 }
1659
1660 /*
1661 * Acquire a WAL insertion lock, for inserting to WAL.
1662 */
1663 static void
WALInsertLockAcquire(void)1664 WALInsertLockAcquire(void)
1665 {
1666 bool immed;
1667
1668 /*
1669 * It doesn't matter which of the WAL insertion locks we acquire, so try
1670 * the one we used last time. If the system isn't particularly busy, it's
1671 * a good bet that it's still available, and it's good to have some
1672 * affinity to a particular lock so that you don't unnecessarily bounce
1673 * cache lines between processes when there's no contention.
1674 *
1675 * If this is the first time through in this backend, pick a lock
1676 * (semi-)randomly. This allows the locks to be used evenly if you have a
1677 * lot of very short connections.
1678 */
1679 static int lockToTry = -1;
1680
1681 if (lockToTry == -1)
1682 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1683 MyLockNo = lockToTry;
1684
1685 /*
1686 * The insertingAt value is initially set to 0, as we don't know our
1687 * insert location yet.
1688 */
1689 immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1690 if (!immed)
1691 {
1692 /*
1693 * If we couldn't get the lock immediately, try another lock next
1694 * time. On a system with more insertion locks than concurrent
1695 * inserters, this causes all the inserters to eventually migrate to a
1696 * lock that no-one else is using. On a system with more inserters
1697 * than locks, it still helps to distribute the inserters evenly
1698 * across the locks.
1699 */
1700 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1701 }
1702 }
1703
1704 /*
1705 * Acquire all WAL insertion locks, to prevent other backends from inserting
1706 * to WAL.
1707 */
1708 static void
WALInsertLockAcquireExclusive(void)1709 WALInsertLockAcquireExclusive(void)
1710 {
1711 int i;
1712
1713 /*
1714 * When holding all the locks, all but the last lock's insertingAt
1715 * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1716 * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1717 */
1718 for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1719 {
1720 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1721 LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1722 &WALInsertLocks[i].l.insertingAt,
1723 PG_UINT64_MAX);
1724 }
1725 /* Variable value reset to 0 at release */
1726 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1727
1728 holdingAllLocks = true;
1729 }
1730
1731 /*
1732 * Release our insertion lock (or locks, if we're holding them all).
1733 *
1734 * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1735 * next time the lock is acquired.
1736 */
1737 static void
WALInsertLockRelease(void)1738 WALInsertLockRelease(void)
1739 {
1740 if (holdingAllLocks)
1741 {
1742 int i;
1743
1744 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1745 LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1746 &WALInsertLocks[i].l.insertingAt,
1747 0);
1748
1749 holdingAllLocks = false;
1750 }
1751 else
1752 {
1753 LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1754 &WALInsertLocks[MyLockNo].l.insertingAt,
1755 0);
1756 }
1757 }
1758
1759 /*
1760 * Update our insertingAt value, to let others know that we've finished
1761 * inserting up to that point.
1762 */
1763 static void
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)1764 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1765 {
1766 if (holdingAllLocks)
1767 {
1768 /*
1769 * We use the last lock to mark our actual position, see comments in
1770 * WALInsertLockAcquireExclusive.
1771 */
1772 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1773 &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1774 insertingAt);
1775 }
1776 else
1777 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1778 &WALInsertLocks[MyLockNo].l.insertingAt,
1779 insertingAt);
1780 }
1781
1782 /*
1783 * Wait for any WAL insertions < upto to finish.
1784 *
1785 * Returns the location of the oldest insertion that is still in-progress.
1786 * Any WAL prior to that point has been fully copied into WAL buffers, and
1787 * can be flushed out to disk. Because this waits for any insertions older
1788 * than 'upto' to finish, the return value is always >= 'upto'.
1789 *
1790 * Note: When you are about to write out WAL, you must call this function
1791 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
1792 * need to wait for an insertion to finish (or at least advance to next
1793 * uninitialized page), and the inserter might need to evict an old WAL buffer
1794 * to make room for a new one, which in turn requires WALWriteLock.
1795 */
1796 static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)1797 WaitXLogInsertionsToFinish(XLogRecPtr upto)
1798 {
1799 uint64 bytepos;
1800 XLogRecPtr reservedUpto;
1801 XLogRecPtr finishedUpto;
1802 XLogCtlInsert *Insert = &XLogCtl->Insert;
1803 int i;
1804
1805 if (MyProc == NULL)
1806 elog(PANIC, "cannot wait without a PGPROC structure");
1807
1808 /* Read the current insert position */
1809 SpinLockAcquire(&Insert->insertpos_lck);
1810 bytepos = Insert->CurrBytePos;
1811 SpinLockRelease(&Insert->insertpos_lck);
1812 reservedUpto = XLogBytePosToEndRecPtr(bytepos);
1813
1814 /*
1815 * No-one should request to flush a piece of WAL that hasn't even been
1816 * reserved yet. However, it can happen if there is a block with a bogus
1817 * LSN on disk, for example. XLogFlush checks for that situation and
1818 * complains, but only after the flush. Here we just assume that to mean
1819 * that all WAL that has been reserved needs to be finished. In this
1820 * corner-case, the return value can be smaller than 'upto' argument.
1821 */
1822 if (upto > reservedUpto)
1823 {
1824 elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
1825 (uint32) (upto >> 32), (uint32) upto,
1826 (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
1827 upto = reservedUpto;
1828 }
1829
1830 /*
1831 * Loop through all the locks, sleeping on any in-progress insert older
1832 * than 'upto'.
1833 *
1834 * finishedUpto is our return value, indicating the point upto which all
1835 * the WAL insertions have been finished. Initialize it to the head of
1836 * reserved WAL, and as we iterate through the insertion locks, back it
1837 * out for any insertion that's still in progress.
1838 */
1839 finishedUpto = reservedUpto;
1840 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1841 {
1842 XLogRecPtr insertingat = InvalidXLogRecPtr;
1843
1844 do
1845 {
1846 /*
1847 * See if this insertion is in progress. LWLockWaitForVar will
1848 * wait for the lock to be released, or for the 'value' to be set
1849 * by a LWLockUpdateVar call. When a lock is initially acquired,
1850 * its value is 0 (InvalidXLogRecPtr), which means that we don't
1851 * know where it's inserting yet. We will have to wait for it. If
1852 * it's a small insertion, the record will most likely fit on the
1853 * same page and the inserter will release the lock without ever
1854 * calling LWLockUpdateVar. But if it has to sleep, it will
1855 * advertise the insertion point with LWLockUpdateVar before
1856 * sleeping.
1857 */
1858 if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
1859 &WALInsertLocks[i].l.insertingAt,
1860 insertingat, &insertingat))
1861 {
1862 /* the lock was free, so no insertion in progress */
1863 insertingat = InvalidXLogRecPtr;
1864 break;
1865 }
1866
1867 /*
1868 * This insertion is still in progress. Have to wait, unless the
1869 * inserter has proceeded past 'upto'.
1870 */
1871 } while (insertingat < upto);
1872
1873 if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
1874 finishedUpto = insertingat;
1875 }
1876 return finishedUpto;
1877 }
1878
1879 /*
1880 * Get a pointer to the right location in the WAL buffer containing the
1881 * given XLogRecPtr.
1882 *
1883 * If the page is not initialized yet, it is initialized. That might require
1884 * evicting an old dirty buffer from the buffer cache, which means I/O.
1885 *
1886 * The caller must ensure that the page containing the requested location
1887 * isn't evicted yet, and won't be evicted. The way to ensure that is to
1888 * hold onto a WAL insertion lock with the insertingAt position set to
1889 * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1890 * to evict an old page from the buffer. (This means that once you call
1891 * GetXLogBuffer() with a given 'ptr', you must not access anything before
1892 * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1893 * later, because older buffers might be recycled already)
1894 */
1895 static char *
GetXLogBuffer(XLogRecPtr ptr)1896 GetXLogBuffer(XLogRecPtr ptr)
1897 {
1898 int idx;
1899 XLogRecPtr endptr;
1900 static uint64 cachedPage = 0;
1901 static char *cachedPos = NULL;
1902 XLogRecPtr expectedEndPtr;
1903
1904 /*
1905 * Fast path for the common case that we need to access again the same
1906 * page as last time.
1907 */
1908 if (ptr / XLOG_BLCKSZ == cachedPage)
1909 {
1910 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1911 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1912 return cachedPos + ptr % XLOG_BLCKSZ;
1913 }
1914
1915 /*
1916 * The XLog buffer cache is organized so that a page is always loaded to a
1917 * particular buffer. That way we can easily calculate the buffer a given
1918 * page must be loaded into, from the XLogRecPtr alone.
1919 */
1920 idx = XLogRecPtrToBufIdx(ptr);
1921
1922 /*
1923 * See what page is loaded in the buffer at the moment. It could be the
1924 * page we're looking for, or something older. It can't be anything newer
1925 * - that would imply the page we're looking for has already been written
1926 * out to disk and evicted, and the caller is responsible for making sure
1927 * that doesn't happen.
1928 *
1929 * However, we don't hold a lock while we read the value. If someone has
1930 * just initialized the page, it's possible that we get a "torn read" of
1931 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
1932 * that case we will see a bogus value. That's ok, we'll grab the mapping
1933 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
1934 * the page we're looking for. But it means that when we do this unlocked
1935 * read, we might see a value that appears to be ahead of the page we're
1936 * looking for. Don't PANIC on that, until we've verified the value while
1937 * holding the lock.
1938 */
1939 expectedEndPtr = ptr;
1940 expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;
1941
1942 endptr = XLogCtl->xlblocks[idx];
1943 if (expectedEndPtr != endptr)
1944 {
1945 XLogRecPtr initializedUpto;
1946
1947 /*
1948 * Before calling AdvanceXLInsertBuffer(), which can block, let others
1949 * know how far we're finished with inserting the record.
1950 *
1951 * NB: If 'ptr' points to just after the page header, advertise a
1952 * position at the beginning of the page rather than 'ptr' itself. If
1953 * there are no other insertions running, someone might try to flush
1954 * up to our advertised location. If we advertised a position after
1955 * the page header, someone might try to flush the page header, even
1956 * though page might actually not be initialized yet. As the first
1957 * inserter on the page, we are effectively responsible for making
1958 * sure that it's initialized, before we let insertingAt to move past
1959 * the page header.
1960 */
1961 if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
1962 XLogSegmentOffset(ptr, wal_segment_size) > XLOG_BLCKSZ)
1963 initializedUpto = ptr - SizeOfXLogShortPHD;
1964 else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
1965 XLogSegmentOffset(ptr, wal_segment_size) < XLOG_BLCKSZ)
1966 initializedUpto = ptr - SizeOfXLogLongPHD;
1967 else
1968 initializedUpto = ptr;
1969
1970 WALInsertLockUpdateInsertingAt(initializedUpto);
1971
1972 AdvanceXLInsertBuffer(ptr, false);
1973 endptr = XLogCtl->xlblocks[idx];
1974
1975 if (expectedEndPtr != endptr)
1976 elog(PANIC, "could not find WAL buffer for %X/%X",
1977 (uint32) (ptr >> 32), (uint32) ptr);
1978 }
1979 else
1980 {
1981 /*
1982 * Make sure the initialization of the page is visible to us, and
1983 * won't arrive later to overwrite the WAL data we write on the page.
1984 */
1985 pg_memory_barrier();
1986 }
1987
1988 /*
1989 * Found the buffer holding this page. Return a pointer to the right
1990 * offset within the page.
1991 */
1992 cachedPage = ptr / XLOG_BLCKSZ;
1993 cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;
1994
1995 Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
1996 Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
1997
1998 return cachedPos + ptr % XLOG_BLCKSZ;
1999 }
2000
2001 /*
2002 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
2003 * is the position starting from the beginning of WAL, excluding all WAL
2004 * page headers.
2005 */
2006 static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)2007 XLogBytePosToRecPtr(uint64 bytepos)
2008 {
2009 uint64 fullsegs;
2010 uint64 fullpages;
2011 uint64 bytesleft;
2012 uint32 seg_offset;
2013 XLogRecPtr result;
2014
2015 fullsegs = bytepos / UsableBytesInSegment;
2016 bytesleft = bytepos % UsableBytesInSegment;
2017
2018 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2019 {
2020 /* fits on first page of segment */
2021 seg_offset = bytesleft + SizeOfXLogLongPHD;
2022 }
2023 else
2024 {
2025 /* account for the first page on segment with long header */
2026 seg_offset = XLOG_BLCKSZ;
2027 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2028
2029 fullpages = bytesleft / UsableBytesInPage;
2030 bytesleft = bytesleft % UsableBytesInPage;
2031
2032 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2033 }
2034
2035 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2036
2037 return result;
2038 }
2039
2040 /*
2041 * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
2042 * returns a pointer to the beginning of the page (ie. before page header),
2043 * not to where the first xlog record on that page would go to. This is used
2044 * when converting a pointer to the end of a record.
2045 */
2046 static XLogRecPtr
XLogBytePosToEndRecPtr(uint64 bytepos)2047 XLogBytePosToEndRecPtr(uint64 bytepos)
2048 {
2049 uint64 fullsegs;
2050 uint64 fullpages;
2051 uint64 bytesleft;
2052 uint32 seg_offset;
2053 XLogRecPtr result;
2054
2055 fullsegs = bytepos / UsableBytesInSegment;
2056 bytesleft = bytepos % UsableBytesInSegment;
2057
2058 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
2059 {
2060 /* fits on first page of segment */
2061 if (bytesleft == 0)
2062 seg_offset = 0;
2063 else
2064 seg_offset = bytesleft + SizeOfXLogLongPHD;
2065 }
2066 else
2067 {
2068 /* account for the first page on segment with long header */
2069 seg_offset = XLOG_BLCKSZ;
2070 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2071
2072 fullpages = bytesleft / UsableBytesInPage;
2073 bytesleft = bytesleft % UsableBytesInPage;
2074
2075 if (bytesleft == 0)
2076 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2077 else
2078 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2079 }
2080
2081 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, wal_segment_size, result);
2082
2083 return result;
2084 }
2085
2086 /*
2087 * Convert an XLogRecPtr to a "usable byte position".
2088 */
2089 static uint64
XLogRecPtrToBytePos(XLogRecPtr ptr)2090 XLogRecPtrToBytePos(XLogRecPtr ptr)
2091 {
2092 uint64 fullsegs;
2093 uint32 fullpages;
2094 uint32 offset;
2095 uint64 result;
2096
2097 XLByteToSeg(ptr, fullsegs, wal_segment_size);
2098
2099 fullpages = (XLogSegmentOffset(ptr, wal_segment_size)) / XLOG_BLCKSZ;
2100 offset = ptr % XLOG_BLCKSZ;
2101
2102 if (fullpages == 0)
2103 {
2104 result = fullsegs * UsableBytesInSegment;
2105 if (offset > 0)
2106 {
2107 Assert(offset >= SizeOfXLogLongPHD);
2108 result += offset - SizeOfXLogLongPHD;
2109 }
2110 }
2111 else
2112 {
2113 result = fullsegs * UsableBytesInSegment +
2114 (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2115 (fullpages - 1) * UsableBytesInPage; /* full pages */
2116 if (offset > 0)
2117 {
2118 Assert(offset >= SizeOfXLogShortPHD);
2119 result += offset - SizeOfXLogShortPHD;
2120 }
2121 }
2122
2123 return result;
2124 }
2125
2126 /*
2127 * Initialize XLOG buffers, writing out old buffers if they still contain
2128 * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
2129 * true, initialize as many pages as we can without having to write out
2130 * unwritten data. Any new pages are initialized to zeros, with pages headers
2131 * initialized properly.
2132 */
2133 static void
AdvanceXLInsertBuffer(XLogRecPtr upto,bool opportunistic)2134 AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
2135 {
2136 XLogCtlInsert *Insert = &XLogCtl->Insert;
2137 int nextidx;
2138 XLogRecPtr OldPageRqstPtr;
2139 XLogwrtRqst WriteRqst;
2140 XLogRecPtr NewPageEndPtr = InvalidXLogRecPtr;
2141 XLogRecPtr NewPageBeginPtr;
2142 XLogPageHeader NewPage;
2143 int npages = 0;
2144
2145 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2146
2147 /*
2148 * Now that we have the lock, check if someone initialized the page
2149 * already.
2150 */
2151 while (upto >= XLogCtl->InitializedUpTo || opportunistic)
2152 {
2153 nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);
2154
2155 /*
2156 * Get ending-offset of the buffer page we need to replace (this may
2157 * be zero if the buffer hasn't been used yet). Fall through if it's
2158 * already written out.
2159 */
2160 OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
2161 if (LogwrtResult.Write < OldPageRqstPtr)
2162 {
2163 /*
2164 * Nope, got work to do. If we just want to pre-initialize as much
2165 * as we can without flushing, give up now.
2166 */
2167 if (opportunistic)
2168 break;
2169
2170 /* Before waiting, get info_lck and update LogwrtResult */
2171 SpinLockAcquire(&XLogCtl->info_lck);
2172 if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
2173 XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
2174 LogwrtResult = XLogCtl->LogwrtResult;
2175 SpinLockRelease(&XLogCtl->info_lck);
2176
2177 /*
2178 * Now that we have an up-to-date LogwrtResult value, see if we
2179 * still need to write it or if someone else already did.
2180 */
2181 if (LogwrtResult.Write < OldPageRqstPtr)
2182 {
2183 /*
2184 * Must acquire write lock. Release WALBufMappingLock first,
2185 * to make sure that all insertions that we need to wait for
2186 * can finish (up to this same position). Otherwise we risk
2187 * deadlock.
2188 */
2189 LWLockRelease(WALBufMappingLock);
2190
2191 WaitXLogInsertionsToFinish(OldPageRqstPtr);
2192
2193 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2194
2195 LogwrtResult = XLogCtl->LogwrtResult;
2196 if (LogwrtResult.Write >= OldPageRqstPtr)
2197 {
2198 /* OK, someone wrote it already */
2199 LWLockRelease(WALWriteLock);
2200 }
2201 else
2202 {
2203 /* Have to write it ourselves */
2204 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
2205 WriteRqst.Write = OldPageRqstPtr;
2206 WriteRqst.Flush = 0;
2207 XLogWrite(WriteRqst, false);
2208 LWLockRelease(WALWriteLock);
2209 TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
2210 }
2211 /* Re-acquire WALBufMappingLock and retry */
2212 LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
2213 continue;
2214 }
2215 }
2216
2217 /*
2218 * Now the next buffer slot is free and we can set it up to be the
2219 * next output page.
2220 */
2221 NewPageBeginPtr = XLogCtl->InitializedUpTo;
2222 NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;
2223
2224 Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);
2225
2226 NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);
2227
2228 /*
2229 * Be sure to re-zero the buffer so that bytes beyond what we've
2230 * written will look like zeroes and not valid XLOG records...
2231 */
2232 MemSet((char *) NewPage, 0, XLOG_BLCKSZ);
2233
2234 /*
2235 * Fill the new page's header
2236 */
2237 NewPage->xlp_magic = XLOG_PAGE_MAGIC;
2238
2239 /* NewPage->xlp_info = 0; */ /* done by memset */
2240 NewPage->xlp_tli = ThisTimeLineID;
2241 NewPage->xlp_pageaddr = NewPageBeginPtr;
2242
2243 /* NewPage->xlp_rem_len = 0; */ /* done by memset */
2244
2245 /*
2246 * If online backup is not in progress, mark the header to indicate
2247 * that WAL records beginning in this page have removable backup
2248 * blocks. This allows the WAL archiver to know whether it is safe to
2249 * compress archived WAL data by transforming full-block records into
2250 * the non-full-block format. It is sufficient to record this at the
2251 * page level because we force a page switch (in fact a segment
2252 * switch) when starting a backup, so the flag will be off before any
2253 * records can be written during the backup. At the end of a backup,
2254 * the last page will be marked as all unsafe when perhaps only part
2255 * is unsafe, but at worst the archiver would miss the opportunity to
2256 * compress a few records.
2257 */
2258 if (!Insert->forcePageWrites)
2259 NewPage->xlp_info |= XLP_BKP_REMOVABLE;
2260
2261 /*
2262 * If a record was found to be broken at the end of recovery, and
2263 * we're going to write on the page where its first contrecord was
2264 * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page
2265 * header. See CreateOverwriteContrecordRecord().
2266 */
2267 if (missingContrecPtr == NewPageBeginPtr)
2268 {
2269 NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
2270 missingContrecPtr = InvalidXLogRecPtr;
2271 }
2272
2273 /*
2274 * If first page of an XLOG segment file, make it a long header.
2275 */
2276 if ((XLogSegmentOffset(NewPage->xlp_pageaddr, wal_segment_size)) == 0)
2277 {
2278 XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;
2279
2280 NewLongPage->xlp_sysid = ControlFile->system_identifier;
2281 NewLongPage->xlp_seg_size = wal_segment_size;
2282 NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
2283 NewPage->xlp_info |= XLP_LONG_HEADER;
2284 }
2285
2286 /*
2287 * Make sure the initialization of the page becomes visible to others
2288 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
2289 * holding a lock.
2290 */
2291 pg_write_barrier();
2292
2293 *((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;
2294
2295 XLogCtl->InitializedUpTo = NewPageEndPtr;
2296
2297 npages++;
2298 }
2299 LWLockRelease(WALBufMappingLock);
2300
2301 #ifdef WAL_DEBUG
2302 if (XLOG_DEBUG && npages > 0)
2303 {
2304 elog(DEBUG1, "initialized %d pages, up to %X/%X",
2305 npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
2306 }
2307 #endif
2308 }
2309
2310 /*
2311 * Calculate CheckPointSegments based on max_wal_size_mb and
2312 * checkpoint_completion_target.
2313 */
2314 static void
CalculateCheckpointSegments(void)2315 CalculateCheckpointSegments(void)
2316 {
2317 double target;
2318
2319 /*-------
2320 * Calculate the distance at which to trigger a checkpoint, to avoid
2321 * exceeding max_wal_size_mb. This is based on two assumptions:
2322 *
2323 * a) we keep WAL for only one checkpoint cycle (prior to PG11 we kept
2324 * WAL for two checkpoint cycles to allow us to recover from the
2325 * secondary checkpoint if the first checkpoint failed, though we
2326 * only did this on the master anyway, not on standby. Keeping just
2327 * one checkpoint simplifies processing and reduces disk space in
2328 * many smaller databases.)
2329 * b) during checkpoint, we consume checkpoint_completion_target *
2330 * number of segments consumed between checkpoints.
2331 *-------
2332 */
2333 target = (double) ConvertToXSegs(max_wal_size_mb, wal_segment_size) /
2334 (1.0 + CheckPointCompletionTarget);
2335
2336 /* round down */
2337 CheckPointSegments = (int) target;
2338
2339 if (CheckPointSegments < 1)
2340 CheckPointSegments = 1;
2341 }
2342
2343 void
assign_max_wal_size(int newval,void * extra)2344 assign_max_wal_size(int newval, void *extra)
2345 {
2346 max_wal_size_mb = newval;
2347 CalculateCheckpointSegments();
2348 }
2349
2350 void
assign_checkpoint_completion_target(double newval,void * extra)2351 assign_checkpoint_completion_target(double newval, void *extra)
2352 {
2353 CheckPointCompletionTarget = newval;
2354 CalculateCheckpointSegments();
2355 }
2356
2357 /*
2358 * At a checkpoint, how many WAL segments to recycle as preallocated future
2359 * XLOG segments? Returns the highest segment that should be preallocated.
2360 */
2361 static XLogSegNo
XLOGfileslop(XLogRecPtr lastredoptr)2362 XLOGfileslop(XLogRecPtr lastredoptr)
2363 {
2364 XLogSegNo minSegNo;
2365 XLogSegNo maxSegNo;
2366 double distance;
2367 XLogSegNo recycleSegNo;
2368
2369 /*
2370 * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2371 * correspond to. Always recycle enough segments to meet the minimum, and
2372 * remove enough segments to stay below the maximum.
2373 */
2374 minSegNo = lastredoptr / wal_segment_size +
2375 ConvertToXSegs(min_wal_size_mb, wal_segment_size) - 1;
2376 maxSegNo = lastredoptr / wal_segment_size +
2377 ConvertToXSegs(max_wal_size_mb, wal_segment_size) - 1;
2378
2379 /*
2380 * Between those limits, recycle enough segments to get us through to the
2381 * estimated end of next checkpoint.
2382 *
2383 * To estimate where the next checkpoint will finish, assume that the
2384 * system runs steadily consuming CheckPointDistanceEstimate bytes between
2385 * every checkpoint.
2386 */
2387 distance = (1.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2388 /* add 10% for good measure. */
2389 distance *= 1.10;
2390
2391 recycleSegNo = (XLogSegNo) ceil(((double) lastredoptr + distance) /
2392 wal_segment_size);
2393
2394 if (recycleSegNo < minSegNo)
2395 recycleSegNo = minSegNo;
2396 if (recycleSegNo > maxSegNo)
2397 recycleSegNo = maxSegNo;
2398
2399 return recycleSegNo;
2400 }
2401
2402 /*
2403 * Check whether we've consumed enough xlog space that a checkpoint is needed.
2404 *
2405 * new_segno indicates a log file that has just been filled up (or read
2406 * during recovery). We measure the distance from RedoRecPtr to new_segno
2407 * and see if that exceeds CheckPointSegments.
2408 *
2409 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2410 */
2411 static bool
XLogCheckpointNeeded(XLogSegNo new_segno)2412 XLogCheckpointNeeded(XLogSegNo new_segno)
2413 {
2414 XLogSegNo old_segno;
2415
2416 XLByteToSeg(RedoRecPtr, old_segno, wal_segment_size);
2417
2418 if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2419 return true;
2420 return false;
2421 }
2422
2423 /*
2424 * Write and/or fsync the log at least as far as WriteRqst indicates.
2425 *
2426 * If flexible == true, we don't have to write as far as WriteRqst, but
2427 * may stop at any convenient boundary (such as a cache or logfile boundary).
2428 * This option allows us to avoid uselessly issuing multiple writes when a
2429 * single one would do.
2430 *
2431 * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2432 * must be called before grabbing the lock, to make sure the data is ready to
2433 * write.
2434 */
2435 static void
XLogWrite(XLogwrtRqst WriteRqst,bool flexible)2436 XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
2437 {
2438 bool ispartialpage;
2439 bool last_iteration;
2440 bool finishing_seg;
2441 bool use_existent;
2442 int curridx;
2443 int npages;
2444 int startidx;
2445 uint32 startoffset;
2446
2447 /* We should always be inside a critical section here */
2448 Assert(CritSectionCount > 0);
2449
2450 /*
2451 * Update local LogwrtResult (caller probably did this already, but...)
2452 */
2453 LogwrtResult = XLogCtl->LogwrtResult;
2454
2455 /*
2456 * Since successive pages in the xlog cache are consecutively allocated,
2457 * we can usually gather multiple pages together and issue just one
2458 * write() call. npages is the number of pages we have determined can be
2459 * written together; startidx is the cache block index of the first one,
2460 * and startoffset is the file offset at which it should go. The latter
2461 * two variables are only valid when npages > 0, but we must initialize
2462 * all of them to keep the compiler quiet.
2463 */
2464 npages = 0;
2465 startidx = 0;
2466 startoffset = 0;
2467
2468 /*
2469 * Within the loop, curridx is the cache block index of the page to
2470 * consider writing. Begin at the buffer containing the next unwritten
2471 * page, or last partially written page.
2472 */
2473 curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);
2474
2475 while (LogwrtResult.Write < WriteRqst.Write)
2476 {
2477 /*
2478 * Make sure we're not ahead of the insert process. This could happen
2479 * if we're passed a bogus WriteRqst.Write that is past the end of the
2480 * last page that's been initialized by AdvanceXLInsertBuffer.
2481 */
2482 XLogRecPtr EndPtr = XLogCtl->xlblocks[curridx];
2483
2484 if (LogwrtResult.Write >= EndPtr)
2485 elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
2486 (uint32) (LogwrtResult.Write >> 32),
2487 (uint32) LogwrtResult.Write,
2488 (uint32) (EndPtr >> 32), (uint32) EndPtr);
2489
2490 /* Advance LogwrtResult.Write to end of current buffer page */
2491 LogwrtResult.Write = EndPtr;
2492 ispartialpage = WriteRqst.Write < LogwrtResult.Write;
2493
2494 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2495 wal_segment_size))
2496 {
2497 /*
2498 * Switch to new logfile segment. We cannot have any pending
2499 * pages here (since we dump what we have at segment end).
2500 */
2501 Assert(npages == 0);
2502 if (openLogFile >= 0)
2503 XLogFileClose();
2504 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2505 wal_segment_size);
2506
2507 /* create/use new log file */
2508 use_existent = true;
2509 openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
2510 ReserveExternalFD();
2511 }
2512
2513 /* Make sure we have the current logfile open */
2514 if (openLogFile < 0)
2515 {
2516 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2517 wal_segment_size);
2518 openLogFile = XLogFileOpen(openLogSegNo);
2519 ReserveExternalFD();
2520 }
2521
2522 /* Add current page to the set of pending pages-to-dump */
2523 if (npages == 0)
2524 {
2525 /* first of group */
2526 startidx = curridx;
2527 startoffset = XLogSegmentOffset(LogwrtResult.Write - XLOG_BLCKSZ,
2528 wal_segment_size);
2529 }
2530 npages++;
2531
2532 /*
2533 * Dump the set if this will be the last loop iteration, or if we are
2534 * at the last page of the cache area (since the next page won't be
2535 * contiguous in memory), or if we are at the end of the logfile
2536 * segment.
2537 */
2538 last_iteration = WriteRqst.Write <= LogwrtResult.Write;
2539
2540 finishing_seg = !ispartialpage &&
2541 (startoffset + npages * XLOG_BLCKSZ) >= wal_segment_size;
2542
2543 if (last_iteration ||
2544 curridx == XLogCtl->XLogCacheBlck ||
2545 finishing_seg)
2546 {
2547 char *from;
2548 Size nbytes;
2549 Size nleft;
2550 int written;
2551
2552 /* OK to write the page(s) */
2553 from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
2554 nbytes = npages * (Size) XLOG_BLCKSZ;
2555 nleft = nbytes;
2556 do
2557 {
2558 errno = 0;
2559 pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
2560 written = pg_pwrite(openLogFile, from, nleft, startoffset);
2561 pgstat_report_wait_end();
2562 if (written <= 0)
2563 {
2564 char xlogfname[MAXFNAMELEN];
2565 int save_errno;
2566
2567 if (errno == EINTR)
2568 continue;
2569
2570 save_errno = errno;
2571 XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
2572 wal_segment_size);
2573 errno = save_errno;
2574 ereport(PANIC,
2575 (errcode_for_file_access(),
2576 errmsg("could not write to log file %s "
2577 "at offset %u, length %zu: %m",
2578 xlogfname, startoffset, nleft)));
2579 }
2580 nleft -= written;
2581 from += written;
2582 startoffset += written;
2583 } while (nleft > 0);
2584
2585 npages = 0;
2586
2587 /*
2588 * If we just wrote the whole last page of a logfile segment,
2589 * fsync the segment immediately. This avoids having to go back
2590 * and re-open prior segments when an fsync request comes along
2591 * later. Doing it here ensures that one and only one backend will
2592 * perform this fsync.
2593 *
2594 * This is also the right place to notify the Archiver that the
2595 * segment is ready to copy to archival storage, and to update the
2596 * timer for archive_timeout, and to signal for a checkpoint if
2597 * too many logfile segments have been used since the last
2598 * checkpoint.
2599 */
2600 if (finishing_seg)
2601 {
2602 issue_xlog_fsync(openLogFile, openLogSegNo);
2603
2604 /* signal that we need to wakeup walsenders later */
2605 WalSndWakeupRequest();
2606
2607 LogwrtResult.Flush = LogwrtResult.Write; /* end of page */
2608
2609 if (XLogArchivingActive())
2610 XLogArchiveNotifySeg(openLogSegNo);
2611
2612 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
2613 XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;
2614
2615 /*
2616 * Request a checkpoint if we've consumed too much xlog since
2617 * the last one. For speed, we first check using the local
2618 * copy of RedoRecPtr, which might be out of date; if it looks
2619 * like a checkpoint is needed, forcibly update RedoRecPtr and
2620 * recheck.
2621 */
2622 if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
2623 {
2624 (void) GetRedoRecPtr();
2625 if (XLogCheckpointNeeded(openLogSegNo))
2626 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
2627 }
2628 }
2629 }
2630
2631 if (ispartialpage)
2632 {
2633 /* Only asked to write a partial page */
2634 LogwrtResult.Write = WriteRqst.Write;
2635 break;
2636 }
2637 curridx = NextBufIdx(curridx);
2638
2639 /* If flexible, break out of loop as soon as we wrote something */
2640 if (flexible && npages == 0)
2641 break;
2642 }
2643
2644 Assert(npages == 0);
2645
2646 /*
2647 * If asked to flush, do so
2648 */
2649 if (LogwrtResult.Flush < WriteRqst.Flush &&
2650 LogwrtResult.Flush < LogwrtResult.Write)
2651
2652 {
2653 /*
2654 * Could get here without iterating above loop, in which case we might
2655 * have no open file or the wrong one. However, we do not need to
2656 * fsync more than one file.
2657 */
2658 if (sync_method != SYNC_METHOD_OPEN &&
2659 sync_method != SYNC_METHOD_OPEN_DSYNC)
2660 {
2661 if (openLogFile >= 0 &&
2662 !XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
2663 wal_segment_size))
2664 XLogFileClose();
2665 if (openLogFile < 0)
2666 {
2667 XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo,
2668 wal_segment_size);
2669 openLogFile = XLogFileOpen(openLogSegNo);
2670 ReserveExternalFD();
2671 }
2672
2673 issue_xlog_fsync(openLogFile, openLogSegNo);
2674 }
2675
2676 /* signal that we need to wakeup walsenders later */
2677 WalSndWakeupRequest();
2678
2679 LogwrtResult.Flush = LogwrtResult.Write;
2680 }
2681
2682 /*
2683 * Update shared-memory status
2684 *
2685 * We make sure that the shared 'request' values do not fall behind the
2686 * 'result' values. This is not absolutely essential, but it saves some
2687 * code in a couple of places.
2688 */
2689 {
2690 SpinLockAcquire(&XLogCtl->info_lck);
2691 XLogCtl->LogwrtResult = LogwrtResult;
2692 if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
2693 XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
2694 if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
2695 XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
2696 SpinLockRelease(&XLogCtl->info_lck);
2697 }
2698 }
2699
2700 /*
2701 * Record the LSN for an asynchronous transaction commit/abort
2702 * and nudge the WALWriter if there is work for it to do.
2703 * (This should not be called for synchronous commits.)
2704 */
2705 void
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)2706 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2707 {
2708 XLogRecPtr WriteRqstPtr = asyncXactLSN;
2709 bool sleeping;
2710
2711 SpinLockAcquire(&XLogCtl->info_lck);
2712 LogwrtResult = XLogCtl->LogwrtResult;
2713 sleeping = XLogCtl->WalWriterSleeping;
2714 if (XLogCtl->asyncXactLSN < asyncXactLSN)
2715 XLogCtl->asyncXactLSN = asyncXactLSN;
2716 SpinLockRelease(&XLogCtl->info_lck);
2717
2718 /*
2719 * If the WALWriter is sleeping, we should kick it to make it come out of
2720 * low-power mode. Otherwise, determine whether there's a full page of
2721 * WAL available to write.
2722 */
2723 if (!sleeping)
2724 {
2725 /* back off to last completed page boundary */
2726 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2727
2728 /* if we have already flushed that far, we're done */
2729 if (WriteRqstPtr <= LogwrtResult.Flush)
2730 return;
2731 }
2732
2733 /*
2734 * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2735 * to come out of low-power mode so that this async commit will reach disk
2736 * within the expected amount of time.
2737 */
2738 if (ProcGlobal->walwriterLatch)
2739 SetLatch(ProcGlobal->walwriterLatch);
2740 }
2741
/*
 * Record the LSN up to which we can remove WAL because it's not required by
 * any replication slot.
 *
 * lsn: value to store in XLogCtl->replicationSlotMinLSN.
 *
 * The store is done while holding the info_lck spinlock.  The passed value
 * is written unconditionally; it is not compared with the previous value
 * here.
 */
void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->replicationSlotMinLSN = lsn;
	SpinLockRelease(&XLogCtl->info_lck);
}
2753
2754
2755 /*
2756 * Return the oldest LSN we must retain to satisfy the needs of some
2757 * replication slot.
2758 */
2759 static XLogRecPtr
XLogGetReplicationSlotMinimumLSN(void)2760 XLogGetReplicationSlotMinimumLSN(void)
2761 {
2762 XLogRecPtr retval;
2763
2764 SpinLockAcquire(&XLogCtl->info_lck);
2765 retval = XLogCtl->replicationSlotMinLSN;
2766 SpinLockRelease(&XLogCtl->info_lck);
2767
2768 return retval;
2769 }
2770
2771 /*
2772 * Advance minRecoveryPoint in control file.
2773 *
2774 * If we crash during recovery, we must reach this point again before the
2775 * database is consistent.
2776 *
2777 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2778 * is only updated if it's not already greater than or equal to 'lsn'.
2779 */
2780 static void
UpdateMinRecoveryPoint(XLogRecPtr lsn,bool force)2781 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2782 {
2783 /* Quick check using our local copy of the variable */
2784 if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2785 return;
2786
2787 /*
2788 * An invalid minRecoveryPoint means that we need to recover all the WAL,
2789 * i.e., we're doing crash recovery. We never modify the control file's
2790 * value in that case, so we can short-circuit future checks here too. The
2791 * local values of minRecoveryPoint and minRecoveryPointTLI should not be
2792 * updated until crash recovery finishes. We only do this for the startup
2793 * process as it should not update its own reference of minRecoveryPoint
2794 * until it has finished crash recovery to make sure that all WAL
2795 * available is replayed in this case. This also saves from extra locks
2796 * taken on the control file from the startup process.
2797 */
2798 if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
2799 {
2800 updateMinRecoveryPoint = false;
2801 return;
2802 }
2803
2804 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2805
2806 /* update local copy */
2807 minRecoveryPoint = ControlFile->minRecoveryPoint;
2808 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2809
2810 if (XLogRecPtrIsInvalid(minRecoveryPoint))
2811 updateMinRecoveryPoint = false;
2812 else if (force || minRecoveryPoint < lsn)
2813 {
2814 XLogRecPtr newMinRecoveryPoint;
2815 TimeLineID newMinRecoveryPointTLI;
2816
2817 /*
2818 * To avoid having to update the control file too often, we update it
2819 * all the way to the last record being replayed, even though 'lsn'
2820 * would suffice for correctness. This also allows the 'force' case
2821 * to not need a valid 'lsn' value.
2822 *
2823 * Another important reason for doing it this way is that the passed
2824 * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2825 * the caller got it from a corrupted heap page. Accepting such a
2826 * value as the min recovery point would prevent us from coming up at
2827 * all. Instead, we just log a warning and continue with recovery.
2828 * (See also the comments about corrupt LSNs in XLogFlush.)
2829 */
2830 SpinLockAcquire(&XLogCtl->info_lck);
2831 newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2832 newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2833 SpinLockRelease(&XLogCtl->info_lck);
2834
2835 if (!force && newMinRecoveryPoint < lsn)
2836 elog(WARNING,
2837 "xlog min recovery request %X/%X is past current point %X/%X",
2838 (uint32) (lsn >> 32), (uint32) lsn,
2839 (uint32) (newMinRecoveryPoint >> 32),
2840 (uint32) newMinRecoveryPoint);
2841
2842 /* update control file */
2843 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2844 {
2845 ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2846 ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2847 UpdateControlFile();
2848 minRecoveryPoint = newMinRecoveryPoint;
2849 minRecoveryPointTLI = newMinRecoveryPointTLI;
2850
2851 ereport(DEBUG2,
2852 (errmsg("updated min recovery point to %X/%X on timeline %u",
2853 (uint32) (minRecoveryPoint >> 32),
2854 (uint32) minRecoveryPoint,
2855 newMinRecoveryPointTLI)));
2856 }
2857 }
2858 LWLockRelease(ControlFileLock);
2859 }
2860
/*
 * Ensure that all XLOG data through the given position is flushed to disk.
 *
 * record: WAL position that the flush pointer must have reached when we
 * return normally.
 *
 * When WAL insertion is not allowed (first test below), nothing is flushed;
 * the request is handed to UpdateMinRecoveryPoint() instead.
 *
 * If the flush pointer still has not reached 'record' after the attempt, an
 * ERROR is raised; see the comment at the bottom of the function for why
 * this is not a PANIC.
 *
 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
 * already held, and we try to avoid acquiring it if possible.
 */
void
XLogFlush(XLogRecPtr record)
{
	XLogRecPtr	WriteRqstPtr;
	XLogwrtRqst WriteRqst;

	/*
	 * During REDO, we are reading not writing WAL.  Therefore, instead of
	 * trying to flush the WAL, we should update minRecoveryPoint instead. We
	 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
	 * to act this way too, and because when it tries to write the
	 * end-of-recovery checkpoint, it should indeed flush.
	 */
	if (!XLogInsertAllowed())
	{
		UpdateMinRecoveryPoint(record, false);
		return;
	}

	/*
	 * Quick exit if already known flushed.  This looks only at our local
	 * copy of LogwrtResult and takes no lock.
	 */
	if (record <= LogwrtResult.Flush)
		return;

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
		elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
			 (uint32) (record >> 32), (uint32) record,
			 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
			 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
#endif

	START_CRIT_SECTION();

	/*
	 * Since fsync is usually a horribly expensive operation, we try to
	 * piggyback as much data as we can on each fsync: if we see any more data
	 * entered into the xlog buffer, we'll write and fsync that too, so that
	 * the final value of LogwrtResult.Flush is as large as possible. This
	 * gives us some chance of avoiding another fsync immediately after.
	 */

	/* initialize to given target; may increase below */
	WriteRqstPtr = record;

	/*
	 * Now wait until we get the write lock, or someone else does the flush
	 * for us.
	 *
	 * Every path through the loop body ends in either 'break' (finished,
	 * whether by our own write or someone else's) or 'continue' (lock was
	 * not obtained; re-evaluate from the top).
	 */
	for (;;)
	{
		XLogRecPtr	insertpos;

		/*
		 * read LogwrtResult and update local state; also raise our target to
		 * the shared write request if that is further ahead
		 */
		SpinLockAcquire(&XLogCtl->info_lck);
		if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
			WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);

		/* done already? */
		if (record <= LogwrtResult.Flush)
			break;

		/*
		 * Before actually performing the write, wait for all in-flight
		 * insertions to the pages we're about to write to finish.
		 */
		insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);

		/*
		 * Try to get the write lock. If we can't get it immediately, wait
		 * until it's released, and recheck if we still need to do the flush
		 * or if the backend that held the lock did it for us already. This
		 * helps to maintain a good rate of group committing when the system
		 * is bottlenecked by the speed of fsyncing.
		 */
		if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
		{
			/*
			 * The lock is now free, but we didn't acquire it yet. Before we
			 * do, loop back to check if someone else flushed the record for
			 * us already.
			 */
			continue;
		}

		/*
		 * Got the lock; recheck whether request is satisfied.
		 *
		 * NOTE(review): this read is done without info_lck, while holding
		 * WALWriteLock; presumably safe because the shared value is only
		 * changed by the holder of WALWriteLock -- confirm against XLogWrite.
		 */
		LogwrtResult = XLogCtl->LogwrtResult;
		if (record <= LogwrtResult.Flush)
		{
			LWLockRelease(WALWriteLock);
			break;
		}

		/*
		 * Sleep before flush! By adding a delay here, we may give further
		 * backends the opportunity to join the backlog of group commit
		 * followers; this can significantly improve transaction throughput,
		 * at the risk of increasing transaction latency.
		 *
		 * We do not sleep if enableFsync is not turned on, nor if there are
		 * fewer than CommitSiblings other backends with active transactions.
		 */
		if (CommitDelay > 0 && enableFsync &&
			MinimumActiveBackends(CommitSiblings))
		{
			pg_usleep(CommitDelay);

			/*
			 * Re-check how far we can now flush the WAL. It's generally not
			 * safe to call WaitXLogInsertionsToFinish while holding
			 * WALWriteLock, because an in-progress insertion might need to
			 * also grab WALWriteLock to make progress. But we know that all
			 * the insertions up to insertpos have already finished, because
			 * that's what the earlier WaitXLogInsertionsToFinish() returned.
			 * We're only calling it again to allow insertpos to be moved
			 * further forward, not to actually wait for anyone.
			 */
			insertpos = WaitXLogInsertionsToFinish(insertpos);
		}

		/*
		 * try to write/flush later additions to XLOG as well: request
		 * everything up to insertpos, not merely up to 'record'
		 */
		WriteRqst.Write = insertpos;
		WriteRqst.Flush = insertpos;

		/* 'false': not a flexible write, i.e. do not stop early */
		XLogWrite(WriteRqst, false);

		LWLockRelease(WALWriteLock);
		/* done */
		break;
	}

	END_CRIT_SECTION();

	/* wake up walsenders now that we've released heavily contended locks */
	WalSndWakeupProcessRequests();

	/*
	 * If we still haven't flushed to the request point then we have a
	 * problem; most likely, the requested flush point is past end of XLOG.
	 * This has been seen to occur when a disk page has a corrupted LSN.
	 *
	 * Formerly we treated this as a PANIC condition, but that hurts the
	 * system's robustness rather than helping it: we do not want to take down
	 * the whole system due to corruption on one data page.  In particular, if
	 * the bad page is encountered again during recovery then we would be
	 * unable to restart the database at all!  (This scenario actually
	 * happened in the field several times with 7.1 releases.)	As of 8.4, bad
	 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
	 * the only time we can reach here during recovery is while flushing the
	 * end-of-recovery checkpoint record, and we don't expect that to have a
	 * bad LSN.
	 *
	 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
	 * since xact.c calls this routine inside a critical section.  However,
	 * calls from bufmgr.c are not within critical sections and so we will not
	 * force a restart for a bad LSN on a data page.
	 */
	if (LogwrtResult.Flush < record)
		elog(ERROR,
			 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
			 (uint32) (record >> 32), (uint32) record,
			 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
}
3031
3032 /*
3033 * Write & flush xlog, but without specifying exactly where to.
3034 *
3035 * We normally write only completed blocks; but if there is nothing to do on
3036 * that basis, we check for unwritten async commits in the current incomplete
3037 * block, and write through the latest one of those. Thus, if async commits
3038 * are not being used, we will write complete blocks only.
3039 *
3040 * If, based on the above, there's anything to write we do so immediately. But
3041 * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
3042 * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
3043 * more than wal_writer_flush_after unflushed blocks.
3044 *
3045 * We can guarantee that async commits reach disk after at most three
3046 * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
3047 * to write "flexibly", meaning it can stop at the end of the buffer ring;
3048 * this makes a difference only with very high load or long wal_writer_delay,
3049 * but imposes one extra cycle for the worst case for async commits.)
3050 *
3051 * This routine is invoked periodically by the background walwriter process.
3052 *
3053 * Returns true if there was any work to do, even if we skipped flushing due
3054 * to wal_writer_delay/wal_writer_flush_after.
3055 */
3056 bool
XLogBackgroundFlush(void)3057 XLogBackgroundFlush(void)
3058 {
3059 XLogwrtRqst WriteRqst;
3060 bool flexible = true;
3061 static TimestampTz lastflush;
3062 TimestampTz now;
3063 int flushbytes;
3064
3065 /* XLOG doesn't need flushing during recovery */
3066 if (RecoveryInProgress())
3067 return false;
3068
3069 /* read LogwrtResult and update local state */
3070 SpinLockAcquire(&XLogCtl->info_lck);
3071 LogwrtResult = XLogCtl->LogwrtResult;
3072 WriteRqst = XLogCtl->LogwrtRqst;
3073 SpinLockRelease(&XLogCtl->info_lck);
3074
3075 /* back off to last completed page boundary */
3076 WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
3077
3078 /* if we have already flushed that far, consider async commit records */
3079 if (WriteRqst.Write <= LogwrtResult.Flush)
3080 {
3081 SpinLockAcquire(&XLogCtl->info_lck);
3082 WriteRqst.Write = XLogCtl->asyncXactLSN;
3083 SpinLockRelease(&XLogCtl->info_lck);
3084 flexible = false; /* ensure it all gets written */
3085 }
3086
3087 /*
3088 * If already known flushed, we're done. Just need to check if we are
3089 * holding an open file handle to a logfile that's no longer in use,
3090 * preventing the file from being deleted.
3091 */
3092 if (WriteRqst.Write <= LogwrtResult.Flush)
3093 {
3094 if (openLogFile >= 0)
3095 {
3096 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo,
3097 wal_segment_size))
3098 {
3099 XLogFileClose();
3100 }
3101 }
3102 return false;
3103 }
3104
3105 /*
3106 * Determine how far to flush WAL, based on the wal_writer_delay and
3107 * wal_writer_flush_after GUCs.
3108 */
3109 now = GetCurrentTimestamp();
3110 flushbytes =
3111 WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
3112
3113 if (WalWriterFlushAfter == 0 || lastflush == 0)
3114 {
3115 /* first call, or block based limits disabled */
3116 WriteRqst.Flush = WriteRqst.Write;
3117 lastflush = now;
3118 }
3119 else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
3120 {
3121 /*
3122 * Flush the writes at least every WalWriterDelay ms. This is
3123 * important to bound the amount of time it takes for an asynchronous
3124 * commit to hit disk.
3125 */
3126 WriteRqst.Flush = WriteRqst.Write;
3127 lastflush = now;
3128 }
3129 else if (flushbytes >= WalWriterFlushAfter)
3130 {
3131 /* exceeded wal_writer_flush_after blocks, flush */
3132 WriteRqst.Flush = WriteRqst.Write;
3133 lastflush = now;
3134 }
3135 else
3136 {
3137 /* no flushing, this time round */
3138 WriteRqst.Flush = 0;
3139 }
3140
3141 #ifdef WAL_DEBUG
3142 if (XLOG_DEBUG)
3143 elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3144 (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
3145 (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
3146 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3147 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3148 #endif
3149
3150 START_CRIT_SECTION();
3151
3152 /* now wait for any in-progress insertions to finish and get write lock */
3153 WaitXLogInsertionsToFinish(WriteRqst.Write);
3154 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3155 LogwrtResult = XLogCtl->LogwrtResult;
3156 if (WriteRqst.Write > LogwrtResult.Write ||
3157 WriteRqst.Flush > LogwrtResult.Flush)
3158 {
3159 XLogWrite(WriteRqst, flexible);
3160 }
3161 LWLockRelease(WALWriteLock);
3162
3163 END_CRIT_SECTION();
3164
3165 /* wake up walsenders now that we've released heavily contended locks */
3166 WalSndWakeupProcessRequests();
3167
3168 /*
3169 * Great, done. To take some work off the critical path, try to initialize
3170 * as many of the no-longer-needed WAL buffers for future use as we can.
3171 */
3172 AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3173
3174 /*
3175 * If we determined that we need to write data, but somebody else
3176 * wrote/flushed already, it should be considered as being active, to
3177 * avoid hibernating too early.
3178 */
3179 return true;
3180 }
3181
3182 /*
3183 * Test whether XLOG data has been flushed up to (at least) the given position.
3184 *
3185 * Returns true if a flush is still needed. (It may be that someone else
3186 * is already in process of flushing that far, however.)
3187 */
3188 bool
XLogNeedsFlush(XLogRecPtr record)3189 XLogNeedsFlush(XLogRecPtr record)
3190 {
3191 /*
3192 * During recovery, we don't flush WAL but update minRecoveryPoint
3193 * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3194 * would need to be updated.
3195 */
3196 if (RecoveryInProgress())
3197 {
3198 /*
3199 * An invalid minRecoveryPoint means that we need to recover all the
3200 * WAL, i.e., we're doing crash recovery. We never modify the control
3201 * file's value in that case, so we can short-circuit future checks
3202 * here too. This triggers a quick exit path for the startup process,
3203 * which cannot update its local copy of minRecoveryPoint as long as
3204 * it has not replayed all WAL available when doing crash recovery.
3205 */
3206 if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
3207 updateMinRecoveryPoint = false;
3208
3209 /* Quick exit if already known to be updated or cannot be updated */
3210 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3211 return false;
3212
3213 /*
3214 * Update local copy of minRecoveryPoint. But if the lock is busy,
3215 * just return a conservative guess.
3216 */
3217 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3218 return true;
3219 minRecoveryPoint = ControlFile->minRecoveryPoint;
3220 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3221 LWLockRelease(ControlFileLock);
3222
3223 /*
3224 * Check minRecoveryPoint for any other process than the startup
3225 * process doing crash recovery, which should not update the control
3226 * file value if crash recovery is still running.
3227 */
3228 if (XLogRecPtrIsInvalid(minRecoveryPoint))
3229 updateMinRecoveryPoint = false;
3230
3231 /* check again */
3232 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3233 return false;
3234 else
3235 return true;
3236 }
3237
3238 /* Quick exit if already known flushed */
3239 if (record <= LogwrtResult.Flush)
3240 return false;
3241
3242 /* read LogwrtResult and update local state */
3243 SpinLockAcquire(&XLogCtl->info_lck);
3244 LogwrtResult = XLogCtl->LogwrtResult;
3245 SpinLockRelease(&XLogCtl->info_lck);
3246
3247 /* check again */
3248 if (record <= LogwrtResult.Flush)
3249 return false;
3250
3251 return true;
3252 }
3253
3254 /*
3255 * Create a new XLOG file segment, or open a pre-existing one.
3256 *
3257 * logsegno: identify segment to be created/opened.
3258 *
3259 * *use_existent: if true, OK to use a pre-existing file (else, any
3260 * pre-existing file will be deleted). On return, true if a pre-existing
3261 * file was used.
3262 *
3263 * use_lock: if true, acquire ControlFileLock while moving file into
3264 * place. This should be true except during bootstrap log creation. The
3265 * caller must *not* hold the lock at call.
3266 *
3267 * Returns FD of opened file.
3268 *
3269 * Note: errors here are ERROR not PANIC because we might or might not be
3270 * inside a critical section (eg, during checkpoint there is no reason to
3271 * take down the system on failure). They will promote to PANIC if we are
3272 * in a critical section.
3273 */
3274 int
XLogFileInit(XLogSegNo logsegno,bool * use_existent,bool use_lock)3275 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3276 {
3277 char path[MAXPGPATH];
3278 char tmppath[MAXPGPATH];
3279 PGAlignedXLogBlock zbuffer;
3280 XLogSegNo installed_segno;
3281 XLogSegNo max_segno;
3282 int fd;
3283 int nbytes;
3284 int save_errno;
3285
3286 XLogFilePath(path, ThisTimeLineID, logsegno, wal_segment_size);
3287
3288 /*
3289 * Try to use existent file (checkpoint maker may have created it already)
3290 */
3291 if (*use_existent)
3292 {
3293 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3294 if (fd < 0)
3295 {
3296 if (errno != ENOENT)
3297 ereport(ERROR,
3298 (errcode_for_file_access(),
3299 errmsg("could not open file \"%s\": %m", path)));
3300 }
3301 else
3302 return fd;
3303 }
3304
3305 /*
3306 * Initialize an empty (all zeroes) segment. NOTE: it is possible that
3307 * another process is doing the same thing. If so, we will end up
3308 * pre-creating an extra log segment. That seems OK, and better than
3309 * holding the lock throughout this lengthy process.
3310 */
3311 elog(DEBUG2, "creating and filling new WAL file");
3312
3313 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3314
3315 unlink(tmppath);
3316
3317 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3318 fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3319 if (fd < 0)
3320 ereport(ERROR,
3321 (errcode_for_file_access(),
3322 errmsg("could not create file \"%s\": %m", tmppath)));
3323
3324 memset(zbuffer.data, 0, XLOG_BLCKSZ);
3325
3326 pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3327 save_errno = 0;
3328 if (wal_init_zero)
3329 {
3330 /*
3331 * Zero-fill the file. With this setting, we do this the hard way to
3332 * ensure that all the file space has really been allocated. On
3333 * platforms that allow "holes" in files, just seeking to the end
3334 * doesn't allocate intermediate space. This way, we know that we
3335 * have all the space and (after the fsync below) that all the
3336 * indirect blocks are down on disk. Therefore, fdatasync(2) or
3337 * O_DSYNC will be sufficient to sync future writes to the log file.
3338 */
3339 for (nbytes = 0; nbytes < wal_segment_size; nbytes += XLOG_BLCKSZ)
3340 {
3341 errno = 0;
3342 if (write(fd, zbuffer.data, XLOG_BLCKSZ) != XLOG_BLCKSZ)
3343 {
3344 /* if write didn't set errno, assume no disk space */
3345 save_errno = errno ? errno : ENOSPC;
3346 break;
3347 }
3348 }
3349 }
3350 else
3351 {
3352 /*
3353 * Otherwise, seeking to the end and writing a solitary byte is
3354 * enough.
3355 */
3356 errno = 0;
3357 if (pg_pwrite(fd, zbuffer.data, 1, wal_segment_size - 1) != 1)
3358 {
3359 /* if write didn't set errno, assume no disk space */
3360 save_errno = errno ? errno : ENOSPC;
3361 }
3362 }
3363 pgstat_report_wait_end();
3364
3365 if (save_errno)
3366 {
3367 /*
3368 * If we fail to make the file, delete it to release disk space
3369 */
3370 unlink(tmppath);
3371
3372 close(fd);
3373
3374 errno = save_errno;
3375
3376 ereport(ERROR,
3377 (errcode_for_file_access(),
3378 errmsg("could not write to file \"%s\": %m", tmppath)));
3379 }
3380
3381 pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3382 if (pg_fsync(fd) != 0)
3383 {
3384 int save_errno = errno;
3385
3386 close(fd);
3387 errno = save_errno;
3388 ereport(ERROR,
3389 (errcode_for_file_access(),
3390 errmsg("could not fsync file \"%s\": %m", tmppath)));
3391 }
3392 pgstat_report_wait_end();
3393
3394 if (close(fd) != 0)
3395 ereport(ERROR,
3396 (errcode_for_file_access(),
3397 errmsg("could not close file \"%s\": %m", tmppath)));
3398
3399 /*
3400 * Now move the segment into place with its final name.
3401 *
3402 * If caller didn't want to use a pre-existing file, get rid of any
3403 * pre-existing file. Otherwise, cope with possibility that someone else
3404 * has created the file while we were filling ours: if so, use ours to
3405 * pre-create a future log segment.
3406 */
3407 installed_segno = logsegno;
3408
3409 /*
3410 * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3411 * that was a constant, but that was always a bit dubious: normally, at a
3412 * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3413 * here, it was the offset from the insert location. We can't do the
3414 * normal XLOGfileslop calculation here because we don't have access to
3415 * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3416 * CheckPointSegments.
3417 */
3418 max_segno = logsegno + CheckPointSegments;
3419 if (!InstallXLogFileSegment(&installed_segno, tmppath,
3420 *use_existent, max_segno,
3421 use_lock))
3422 {
3423 /*
3424 * No need for any more future segments, or InstallXLogFileSegment()
3425 * failed to rename the file into place. If the rename failed, opening
3426 * the file below will fail.
3427 */
3428 unlink(tmppath);
3429 }
3430
3431 /* Set flag to tell caller there was no existent file */
3432 *use_existent = false;
3433
3434 /* Now open original target segment (might not be file I just made) */
3435 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3436 if (fd < 0)
3437 ereport(ERROR,
3438 (errcode_for_file_access(),
3439 errmsg("could not open file \"%s\": %m", path)));
3440
3441 elog(DEBUG2, "done creating and filling new WAL file");
3442
3443 return fd;
3444 }
3445
3446 /*
3447 * Create a new XLOG file segment by copying a pre-existing one.
3448 *
3449 * destsegno: identify segment to be created.
3450 *
3451 * srcTLI, srcsegno: identify segment to be copied (could be from
3452 * a different timeline)
3453 *
3454 * upto: how much of the source file to copy (the rest is filled with
3455 * zeros)
3456 *
3457 * Currently this is only used during recovery, and so there are no locking
3458 * considerations. But we should be just as tense as XLogFileInit to avoid
3459 * emplacing a bogus file.
3460 */
3461 static void
XLogFileCopy(XLogSegNo destsegno,TimeLineID srcTLI,XLogSegNo srcsegno,int upto)3462 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3463 int upto)
3464 {
3465 char path[MAXPGPATH];
3466 char tmppath[MAXPGPATH];
3467 PGAlignedXLogBlock buffer;
3468 int srcfd;
3469 int fd;
3470 int nbytes;
3471
3472 /*
3473 * Open the source file
3474 */
3475 XLogFilePath(path, srcTLI, srcsegno, wal_segment_size);
3476 srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY);
3477 if (srcfd < 0)
3478 ereport(ERROR,
3479 (errcode_for_file_access(),
3480 errmsg("could not open file \"%s\": %m", path)));
3481
3482 /*
3483 * Copy into a temp file name.
3484 */
3485 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3486
3487 unlink(tmppath);
3488
3489 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3490 fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
3491 if (fd < 0)
3492 ereport(ERROR,
3493 (errcode_for_file_access(),
3494 errmsg("could not create file \"%s\": %m", tmppath)));
3495
3496 /*
3497 * Do the data copying.
3498 */
3499 for (nbytes = 0; nbytes < wal_segment_size; nbytes += sizeof(buffer))
3500 {
3501 int nread;
3502
3503 nread = upto - nbytes;
3504
3505 /*
3506 * The part that is not read from the source file is filled with
3507 * zeros.
3508 */
3509 if (nread < sizeof(buffer))
3510 memset(buffer.data, 0, sizeof(buffer));
3511
3512 if (nread > 0)
3513 {
3514 int r;
3515
3516 if (nread > sizeof(buffer))
3517 nread = sizeof(buffer);
3518 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3519 r = read(srcfd, buffer.data, nread);
3520 if (r != nread)
3521 {
3522 if (r < 0)
3523 ereport(ERROR,
3524 (errcode_for_file_access(),
3525 errmsg("could not read file \"%s\": %m",
3526 path)));
3527 else
3528 ereport(ERROR,
3529 (errcode(ERRCODE_DATA_CORRUPTED),
3530 errmsg("could not read file \"%s\": read %d of %zu",
3531 path, r, (Size) nread)));
3532 }
3533 pgstat_report_wait_end();
3534 }
3535 errno = 0;
3536 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3537 if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3538 {
3539 int save_errno = errno;
3540
3541 /*
3542 * If we fail to make the file, delete it to release disk space
3543 */
3544 unlink(tmppath);
3545 /* if write didn't set errno, assume problem is no disk space */
3546 errno = save_errno ? save_errno : ENOSPC;
3547
3548 ereport(ERROR,
3549 (errcode_for_file_access(),
3550 errmsg("could not write to file \"%s\": %m", tmppath)));
3551 }
3552 pgstat_report_wait_end();
3553 }
3554
3555 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3556 if (pg_fsync(fd) != 0)
3557 ereport(data_sync_elevel(ERROR),
3558 (errcode_for_file_access(),
3559 errmsg("could not fsync file \"%s\": %m", tmppath)));
3560 pgstat_report_wait_end();
3561
3562 if (CloseTransientFile(fd) != 0)
3563 ereport(ERROR,
3564 (errcode_for_file_access(),
3565 errmsg("could not close file \"%s\": %m", tmppath)));
3566
3567 if (CloseTransientFile(srcfd) != 0)
3568 ereport(ERROR,
3569 (errcode_for_file_access(),
3570 errmsg("could not close file \"%s\": %m", path)));
3571
3572 /*
3573 * Now move the segment into place with its final name.
3574 */
3575 if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3576 elog(ERROR, "InstallXLogFileSegment should not have failed");
3577 }
3578
3579 /*
3580 * Install a new XLOG segment file as a current or future log segment.
3581 *
3582 * This is used both to install a newly-created segment (which has a temp
3583 * filename while it's being created) and to recycle an old segment.
3584 *
3585 * *segno: identify segment to install as (or first possible target).
3586 * When find_free is true, this is modified on return to indicate the
3587 * actual installation location or last segment searched.
3588 *
3589 * tmppath: initial name of file to install. It will be renamed into place.
3590 *
3591 * find_free: if true, install the new segment at the first empty segno
3592 * number at or after the passed numbers. If false, install the new segment
3593 * exactly where specified, deleting any existing segment file there.
3594 *
3595 * max_segno: maximum segment number to install the new file as. Fail if no
3596 * free slot is found between *segno and max_segno. (Ignored when find_free
3597 * is false.)
3598 *
3599 * use_lock: if true, acquire ControlFileLock while moving file into
3600 * place. This should be true except during bootstrap log creation. The
3601 * caller must *not* hold the lock at call.
3602 *
3603 * Returns true if the file was installed successfully. false indicates that
3604 * max_segno limit was exceeded, or an error occurred while renaming the
3605 * file into place.
3606 */
3607 static bool
InstallXLogFileSegment(XLogSegNo * segno,char * tmppath,bool find_free,XLogSegNo max_segno,bool use_lock)3608 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3609 bool find_free, XLogSegNo max_segno,
3610 bool use_lock)
3611 {
3612 char path[MAXPGPATH];
3613 struct stat stat_buf;
3614
3615 XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3616
3617 /*
3618 * We want to be sure that only one process does this at a time.
3619 */
3620 if (use_lock)
3621 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3622
3623 if (!find_free)
3624 {
3625 /* Force installation: get rid of any pre-existing segment file */
3626 durable_unlink(path, DEBUG1);
3627 }
3628 else
3629 {
3630 /* Find a free slot to put it in */
3631 while (stat(path, &stat_buf) == 0)
3632 {
3633 if ((*segno) >= max_segno)
3634 {
3635 /* Failed to find a free slot within specified range */
3636 if (use_lock)
3637 LWLockRelease(ControlFileLock);
3638 return false;
3639 }
3640 (*segno)++;
3641 XLogFilePath(path, ThisTimeLineID, *segno, wal_segment_size);
3642 }
3643 }
3644
3645 /*
3646 * Perform the rename using link if available, paranoidly trying to avoid
3647 * overwriting an existing file (there shouldn't be one).
3648 */
3649 if (durable_rename_excl(tmppath, path, LOG) != 0)
3650 {
3651 if (use_lock)
3652 LWLockRelease(ControlFileLock);
3653 /* durable_rename_excl already emitted log message */
3654 return false;
3655 }
3656
3657 if (use_lock)
3658 LWLockRelease(ControlFileLock);
3659
3660 return true;
3661 }
3662
3663 /*
3664 * Open a pre-existing logfile segment for writing.
3665 */
3666 int
XLogFileOpen(XLogSegNo segno)3667 XLogFileOpen(XLogSegNo segno)
3668 {
3669 char path[MAXPGPATH];
3670 int fd;
3671
3672 XLogFilePath(path, ThisTimeLineID, segno, wal_segment_size);
3673
3674 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method));
3675 if (fd < 0)
3676 ereport(PANIC,
3677 (errcode_for_file_access(),
3678 errmsg("could not open file \"%s\": %m", path)));
3679
3680 return fd;
3681 }
3682
3683 /*
3684 * Open a logfile segment for reading (during recovery).
3685 *
3686 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3687 * Otherwise, it's assumed to be already available in pg_wal.
3688 */
3689 static int
XLogFileRead(XLogSegNo segno,int emode,TimeLineID tli,XLogSource source,bool notfoundOk)3690 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3691 XLogSource source, bool notfoundOk)
3692 {
3693 char xlogfname[MAXFNAMELEN];
3694 char activitymsg[MAXFNAMELEN + 16];
3695 char path[MAXPGPATH];
3696 int fd;
3697
3698 XLogFileName(xlogfname, tli, segno, wal_segment_size);
3699
3700 switch (source)
3701 {
3702 case XLOG_FROM_ARCHIVE:
3703 /* Report recovery progress in PS display */
3704 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3705 xlogfname);
3706 set_ps_display(activitymsg);
3707
3708 restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3709 "RECOVERYXLOG",
3710 wal_segment_size,
3711 InRedo);
3712 if (!restoredFromArchive)
3713 return -1;
3714 break;
3715
3716 case XLOG_FROM_PG_WAL:
3717 case XLOG_FROM_STREAM:
3718 XLogFilePath(path, tli, segno, wal_segment_size);
3719 restoredFromArchive = false;
3720 break;
3721
3722 default:
3723 elog(ERROR, "invalid XLogFileRead source %d", source);
3724 }
3725
3726 /*
3727 * If the segment was fetched from archival storage, replace the existing
3728 * xlog segment (if any) with the archival version.
3729 */
3730 if (source == XLOG_FROM_ARCHIVE)
3731 {
3732 KeepFileRestoredFromArchive(path, xlogfname);
3733
3734 /*
3735 * Set path to point at the new file in pg_wal.
3736 */
3737 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3738 }
3739
3740 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY);
3741 if (fd >= 0)
3742 {
3743 /* Success! */
3744 curFileTLI = tli;
3745
3746 /* Report recovery progress in PS display */
3747 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3748 xlogfname);
3749 set_ps_display(activitymsg);
3750
3751 /* Track source of data in assorted state variables */
3752 readSource = source;
3753 XLogReceiptSource = source;
3754 /* In FROM_STREAM case, caller tracks receipt time, not me */
3755 if (source != XLOG_FROM_STREAM)
3756 XLogReceiptTime = GetCurrentTimestamp();
3757
3758 return fd;
3759 }
3760 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3761 ereport(PANIC,
3762 (errcode_for_file_access(),
3763 errmsg("could not open file \"%s\": %m", path)));
3764 return -1;
3765 }
3766
3767 /*
3768 * Open a logfile segment for reading (during recovery).
3769 *
3770 * This version searches for the segment with any TLI listed in expectedTLEs.
3771 */
3772 static int
XLogFileReadAnyTLI(XLogSegNo segno,int emode,XLogSource source)3773 XLogFileReadAnyTLI(XLogSegNo segno, int emode, XLogSource source)
3774 {
3775 char path[MAXPGPATH];
3776 ListCell *cell;
3777 int fd;
3778 List *tles;
3779
3780 /*
3781 * Loop looking for a suitable timeline ID: we might need to read any of
3782 * the timelines listed in expectedTLEs.
3783 *
3784 * We expect curFileTLI on entry to be the TLI of the preceding file in
3785 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
3786 * to go backwards; this prevents us from picking up the wrong file when a
3787 * parent timeline extends to higher segment numbers than the child we
3788 * want to read.
3789 *
3790 * If we haven't read the timeline history file yet, read it now, so that
3791 * we know which TLIs to scan. We don't save the list in expectedTLEs,
3792 * however, unless we actually find a valid segment. That way if there is
3793 * neither a timeline history file nor a WAL segment in the archive, and
3794 * streaming replication is set up, we'll read the timeline history file
3795 * streamed from the master when we start streaming, instead of recovering
3796 * with a dummy history generated here.
3797 */
3798 if (expectedTLEs)
3799 tles = expectedTLEs;
3800 else
3801 tles = readTimeLineHistory(recoveryTargetTLI);
3802
3803 foreach(cell, tles)
3804 {
3805 TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
3806 TimeLineID tli = hent->tli;
3807
3808 if (tli < curFileTLI)
3809 break; /* don't bother looking at too-old TLIs */
3810
3811 /*
3812 * Skip scanning the timeline ID that the logfile segment to read
3813 * doesn't belong to
3814 */
3815 if (hent->begin != InvalidXLogRecPtr)
3816 {
3817 XLogSegNo beginseg = 0;
3818
3819 XLByteToSeg(hent->begin, beginseg, wal_segment_size);
3820
3821 /*
3822 * The logfile segment that doesn't belong to the timeline is
3823 * older or newer than the segment that the timeline started or
3824 * ended at, respectively. It's sufficient to check only the
3825 * starting segment of the timeline here. Since the timelines are
3826 * scanned in descending order in this loop, any segments newer
3827 * than the ending segment should belong to newer timeline and
3828 * have already been read before. So it's not necessary to check
3829 * the ending segment of the timeline here.
3830 */
3831 if (segno < beginseg)
3832 continue;
3833 }
3834
3835 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3836 {
3837 fd = XLogFileRead(segno, emode, tli,
3838 XLOG_FROM_ARCHIVE, true);
3839 if (fd != -1)
3840 {
3841 elog(DEBUG1, "got WAL segment from archive");
3842 if (!expectedTLEs)
3843 expectedTLEs = tles;
3844 return fd;
3845 }
3846 }
3847
3848 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
3849 {
3850 fd = XLogFileRead(segno, emode, tli,
3851 XLOG_FROM_PG_WAL, true);
3852 if (fd != -1)
3853 {
3854 if (!expectedTLEs)
3855 expectedTLEs = tles;
3856 return fd;
3857 }
3858 }
3859 }
3860
3861 /* Couldn't find it. For simplicity, complain about front timeline */
3862 XLogFilePath(path, recoveryTargetTLI, segno, wal_segment_size);
3863 errno = ENOENT;
3864 ereport(emode,
3865 (errcode_for_file_access(),
3866 errmsg("could not open file \"%s\": %m", path)));
3867 return -1;
3868 }
3869
3870 /*
3871 * Close the current logfile segment for writing.
3872 */
3873 static void
XLogFileClose(void)3874 XLogFileClose(void)
3875 {
3876 Assert(openLogFile >= 0);
3877
3878 /*
3879 * WAL segment files will not be re-read in normal operation, so we advise
3880 * the OS to release any cached pages. But do not do so if WAL archiving
3881 * or streaming is active, because archiver and walsender process could
3882 * use the cache to read the WAL segment.
3883 */
3884 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3885 if (!XLogIsNeeded())
3886 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3887 #endif
3888
3889 if (close(openLogFile) != 0)
3890 {
3891 char xlogfname[MAXFNAMELEN];
3892 int save_errno = errno;
3893
3894 XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo, wal_segment_size);
3895 errno = save_errno;
3896 ereport(PANIC,
3897 (errcode_for_file_access(),
3898 errmsg("could not close file \"%s\": %m", xlogfname)));
3899 }
3900
3901 openLogFile = -1;
3902 ReleaseExternalFD();
3903 }
3904
3905 /*
3906 * Preallocate log files beyond the specified log endpoint.
3907 *
3908 * XXX this is currently extremely conservative, since it forces only one
3909 * future log segment to exist, and even that only if we are 75% done with
3910 * the current one. This is only appropriate for very low-WAL-volume systems.
3911 * High-volume systems will be OK once they've built up a sufficient set of
3912 * recycled log segments, but the startup transient is likely to include
3913 * a lot of segment creations by foreground processes, which is not so good.
3914 */
3915 static void
PreallocXlogFiles(XLogRecPtr endptr)3916 PreallocXlogFiles(XLogRecPtr endptr)
3917 {
3918 XLogSegNo _logSegNo;
3919 int lf;
3920 bool use_existent;
3921 uint64 offset;
3922
3923 XLByteToPrevSeg(endptr, _logSegNo, wal_segment_size);
3924 offset = XLogSegmentOffset(endptr - 1, wal_segment_size);
3925 if (offset >= (uint32) (0.75 * wal_segment_size))
3926 {
3927 _logSegNo++;
3928 use_existent = true;
3929 lf = XLogFileInit(_logSegNo, &use_existent, true);
3930 close(lf);
3931 if (!use_existent)
3932 CheckpointStats.ckpt_segs_added++;
3933 }
3934 }
3935
3936 /*
3937 * Throws an error if the given log segment has already been removed or
3938 * recycled. The caller should only pass a segment that it knows to have
3939 * existed while the server has been running, as this function always
3940 * succeeds if no WAL segments have been removed since startup.
3941 * 'tli' is only used in the error message.
3942 *
3943 * Note: this function guarantees to keep errno unchanged on return.
3944 * This supports callers that use this to possibly deliver a better
3945 * error message about a missing file, while still being able to throw
3946 * a normal file-access error afterwards, if this does return.
3947 */
3948 void
CheckXLogRemoved(XLogSegNo segno,TimeLineID tli)3949 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3950 {
3951 int save_errno = errno;
3952 XLogSegNo lastRemovedSegNo;
3953
3954 SpinLockAcquire(&XLogCtl->info_lck);
3955 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3956 SpinLockRelease(&XLogCtl->info_lck);
3957
3958 if (segno <= lastRemovedSegNo)
3959 {
3960 char filename[MAXFNAMELEN];
3961
3962 XLogFileName(filename, tli, segno, wal_segment_size);
3963 errno = save_errno;
3964 ereport(ERROR,
3965 (errcode_for_file_access(),
3966 errmsg("requested WAL segment %s has already been removed",
3967 filename)));
3968 }
3969 errno = save_errno;
3970 }
3971
3972 /*
3973 * Return the last WAL segment removed, or 0 if no segment has been removed
3974 * since startup.
3975 *
3976 * NB: the result can be out of date arbitrarily fast, the caller has to deal
3977 * with that.
3978 */
3979 XLogSegNo
XLogGetLastRemovedSegno(void)3980 XLogGetLastRemovedSegno(void)
3981 {
3982 XLogSegNo lastRemovedSegNo;
3983
3984 SpinLockAcquire(&XLogCtl->info_lck);
3985 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3986 SpinLockRelease(&XLogCtl->info_lck);
3987
3988 return lastRemovedSegNo;
3989 }
3990
3991
3992 /*
3993 * Update the last removed segno pointer in shared memory, to reflect that the
3994 * given XLOG file has been removed.
3995 */
3996 static void
UpdateLastRemovedPtr(char * filename)3997 UpdateLastRemovedPtr(char *filename)
3998 {
3999 uint32 tli;
4000 XLogSegNo segno;
4001
4002 XLogFromFileName(filename, &tli, &segno, wal_segment_size);
4003
4004 SpinLockAcquire(&XLogCtl->info_lck);
4005 if (segno > XLogCtl->lastRemovedSegNo)
4006 XLogCtl->lastRemovedSegNo = segno;
4007 SpinLockRelease(&XLogCtl->info_lck);
4008 }
4009
4010 /*
4011 * Remove all temporary log files in pg_wal
4012 *
4013 * This is called at the beginning of recovery after a previous crash,
4014 * at a point where no other processes write fresh WAL data.
4015 */
4016 static void
RemoveTempXlogFiles(void)4017 RemoveTempXlogFiles(void)
4018 {
4019 DIR *xldir;
4020 struct dirent *xlde;
4021
4022 elog(DEBUG2, "removing all temporary WAL segments");
4023
4024 xldir = AllocateDir(XLOGDIR);
4025 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4026 {
4027 char path[MAXPGPATH];
4028
4029 if (strncmp(xlde->d_name, "xlogtemp.", 9) != 0)
4030 continue;
4031
4032 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlde->d_name);
4033 unlink(path);
4034 elog(DEBUG2, "removed temporary WAL segment \"%s\"", path);
4035 }
4036 FreeDir(xldir);
4037 }
4038
4039 /*
4040 * Recycle or remove all log files older or equal to passed segno.
4041 *
4042 * endptr is current (or recent) end of xlog, and lastredoptr is the
4043 * redo pointer of the last checkpoint. These are used to determine
4044 * whether we want to recycle rather than delete no-longer-wanted log files.
4045 */
4046 static void
RemoveOldXlogFiles(XLogSegNo segno,XLogRecPtr lastredoptr,XLogRecPtr endptr)4047 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr lastredoptr, XLogRecPtr endptr)
4048 {
4049 DIR *xldir;
4050 struct dirent *xlde;
4051 char lastoff[MAXFNAMELEN];
4052
4053 /*
4054 * Construct a filename of the last segment to be kept. The timeline ID
4055 * doesn't matter, we ignore that in the comparison. (During recovery,
4056 * ThisTimeLineID isn't set, so we can't use that.)
4057 */
4058 XLogFileName(lastoff, 0, segno, wal_segment_size);
4059
4060 elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
4061 lastoff);
4062
4063 xldir = AllocateDir(XLOGDIR);
4064
4065 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4066 {
4067 /* Ignore files that are not XLOG segments */
4068 if (!IsXLogFileName(xlde->d_name) &&
4069 !IsPartialXLogFileName(xlde->d_name))
4070 continue;
4071
4072 /*
4073 * We ignore the timeline part of the XLOG segment identifiers in
4074 * deciding whether a segment is still needed. This ensures that we
4075 * won't prematurely remove a segment from a parent timeline. We could
4076 * probably be a little more proactive about removing segments of
4077 * non-parent timelines, but that would be a whole lot more
4078 * complicated.
4079 *
4080 * We use the alphanumeric sorting property of the filenames to decide
4081 * which ones are earlier than the lastoff segment.
4082 */
4083 if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
4084 {
4085 if (XLogArchiveCheckDone(xlde->d_name))
4086 {
4087 /* Update the last removed location in shared memory first */
4088 UpdateLastRemovedPtr(xlde->d_name);
4089
4090 RemoveXlogFile(xlde->d_name, lastredoptr, endptr);
4091 }
4092 }
4093 }
4094
4095 FreeDir(xldir);
4096 }
4097
4098 /*
4099 * Remove WAL files that are not part of the given timeline's history.
4100 *
4101 * This is called during recovery, whenever we switch to follow a new
4102 * timeline, and at the end of recovery when we create a new timeline. We
4103 * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
4104 * might be leftover pre-allocated or recycled WAL segments on the old timeline
4105 * that we haven't used yet, and contain garbage. If we just leave them in
4106 * pg_wal, they will eventually be archived, and we can't let that happen.
4107 * Files that belong to our timeline history are valid, because we have
4108 * successfully replayed them, but from others we can't be sure.
4109 *
4110 * 'switchpoint' is the current point in WAL where we switch to new timeline,
4111 * and 'newTLI' is the new timeline we switch to.
4112 */
4113 static void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint,TimeLineID newTLI)4114 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
4115 {
4116 DIR *xldir;
4117 struct dirent *xlde;
4118 char switchseg[MAXFNAMELEN];
4119 XLogSegNo endLogSegNo;
4120
4121 XLByteToPrevSeg(switchpoint, endLogSegNo, wal_segment_size);
4122
4123 /*
4124 * Construct a filename of the last segment to be kept.
4125 */
4126 XLogFileName(switchseg, newTLI, endLogSegNo, wal_segment_size);
4127
4128 elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
4129 switchseg);
4130
4131 xldir = AllocateDir(XLOGDIR);
4132
4133 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4134 {
4135 /* Ignore files that are not XLOG segments */
4136 if (!IsXLogFileName(xlde->d_name))
4137 continue;
4138
4139 /*
4140 * Remove files that are on a timeline older than the new one we're
4141 * switching to, but with a segment number >= the first segment on the
4142 * new timeline.
4143 */
4144 if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
4145 strcmp(xlde->d_name + 8, switchseg + 8) > 0)
4146 {
4147 /*
4148 * If the file has already been marked as .ready, however, don't
4149 * remove it yet. It should be OK to remove it - files that are
4150 * not part of our timeline history are not required for recovery
4151 * - but seems safer to let them be archived and removed later.
4152 */
4153 if (!XLogArchiveIsReady(xlde->d_name))
4154 RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
4155 }
4156 }
4157
4158 FreeDir(xldir);
4159 }
4160
4161 /*
4162 * Recycle or remove a log file that's no longer needed.
4163 *
4164 * endptr is current (or recent) end of xlog, and lastredoptr is the
4165 * redo pointer of the last checkpoint. These are used to determine
4166 * whether we want to recycle rather than delete no-longer-wanted log files.
4167 * If lastredoptr is not known, pass invalid, and the function will recycle,
4168 * somewhat arbitrarily, 10 future segments.
4169 */
4170 static void
RemoveXlogFile(const char * segname,XLogRecPtr lastredoptr,XLogRecPtr endptr)4171 RemoveXlogFile(const char *segname, XLogRecPtr lastredoptr, XLogRecPtr endptr)
4172 {
4173 char path[MAXPGPATH];
4174 #ifdef WIN32
4175 char newpath[MAXPGPATH];
4176 #endif
4177 struct stat statbuf;
4178 XLogSegNo endlogSegNo;
4179 XLogSegNo recycleSegNo;
4180
4181 if (wal_recycle)
4182 {
4183 /*
4184 * Initialize info about where to try to recycle to.
4185 */
4186 XLByteToSeg(endptr, endlogSegNo, wal_segment_size);
4187 if (lastredoptr == InvalidXLogRecPtr)
4188 recycleSegNo = endlogSegNo + 10;
4189 else
4190 recycleSegNo = XLOGfileslop(lastredoptr);
4191 }
4192 else
4193 recycleSegNo = 0; /* keep compiler quiet */
4194
4195 snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4196
4197 /*
4198 * Before deleting the file, see if it can be recycled as a future log
4199 * segment. Only recycle normal files, pg_standby for example can create
4200 * symbolic links pointing to a separate archive directory.
4201 */
4202 if (wal_recycle &&
4203 endlogSegNo <= recycleSegNo &&
4204 lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4205 InstallXLogFileSegment(&endlogSegNo, path,
4206 true, recycleSegNo, true))
4207 {
4208 ereport(DEBUG2,
4209 (errmsg("recycled write-ahead log file \"%s\"",
4210 segname)));
4211 CheckpointStats.ckpt_segs_recycled++;
4212 /* Needn't recheck that slot on future iterations */
4213 endlogSegNo++;
4214 }
4215 else
4216 {
4217 /* No need for any more future segments... */
4218 int rc;
4219
4220 ereport(DEBUG2,
4221 (errmsg("removing write-ahead log file \"%s\"",
4222 segname)));
4223
4224 #ifdef WIN32
4225
4226 /*
4227 * On Windows, if another process (e.g another backend) holds the file
4228 * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4229 * will still show up in directory listing until the last handle is
4230 * closed. To avoid confusing the lingering deleted file for a live
4231 * WAL file that needs to be archived, rename it before deleting it.
4232 *
4233 * If another process holds the file open without FILE_SHARE_DELETE
4234 * flag, rename will fail. We'll try again at the next checkpoint.
4235 */
4236 snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4237 if (rename(path, newpath) != 0)
4238 {
4239 ereport(LOG,
4240 (errcode_for_file_access(),
4241 errmsg("could not rename file \"%s\": %m",
4242 path)));
4243 return;
4244 }
4245 rc = durable_unlink(newpath, LOG);
4246 #else
4247 rc = durable_unlink(path, LOG);
4248 #endif
4249 if (rc != 0)
4250 {
4251 /* Message already logged by durable_unlink() */
4252 return;
4253 }
4254 CheckpointStats.ckpt_segs_removed++;
4255 }
4256
4257 XLogArchiveCleanup(segname);
4258 }
4259
4260 /*
4261 * Verify whether pg_wal and pg_wal/archive_status exist.
4262 * If the latter does not exist, recreate it.
4263 *
4264 * It is not the goal of this function to verify the contents of these
4265 * directories, but to help in cases where someone has performed a cluster
4266 * copy for PITR purposes but omitted pg_wal from the copy.
4267 *
4268 * We could also recreate pg_wal if it doesn't exist, but a deliberate
4269 * policy decision was made not to. It is fairly common for pg_wal to be
4270 * a symlink, and if that was the DBA's intent then automatically making a
4271 * plain directory would result in degraded performance with no notice.
4272 */
4273 static void
ValidateXLOGDirectoryStructure(void)4274 ValidateXLOGDirectoryStructure(void)
4275 {
4276 char path[MAXPGPATH];
4277 struct stat stat_buf;
4278
4279 /* Check for pg_wal; if it doesn't exist, error out */
4280 if (stat(XLOGDIR, &stat_buf) != 0 ||
4281 !S_ISDIR(stat_buf.st_mode))
4282 ereport(FATAL,
4283 (errmsg("required WAL directory \"%s\" does not exist",
4284 XLOGDIR)));
4285
4286 /* Check for archive_status */
4287 snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4288 if (stat(path, &stat_buf) == 0)
4289 {
4290 /* Check for weird cases where it exists but isn't a directory */
4291 if (!S_ISDIR(stat_buf.st_mode))
4292 ereport(FATAL,
4293 (errmsg("required WAL directory \"%s\" does not exist",
4294 path)));
4295 }
4296 else
4297 {
4298 ereport(LOG,
4299 (errmsg("creating missing WAL directory \"%s\"", path)));
4300 if (MakePGDirectory(path) < 0)
4301 ereport(FATAL,
4302 (errmsg("could not create missing directory \"%s\": %m",
4303 path)));
4304 }
4305 }
4306
4307 /*
4308 * Remove previous backup history files. This also retries creation of
4309 * .ready files for any backup history files for which XLogArchiveNotify
4310 * failed earlier.
4311 */
4312 static void
CleanupBackupHistory(void)4313 CleanupBackupHistory(void)
4314 {
4315 DIR *xldir;
4316 struct dirent *xlde;
4317 char path[MAXPGPATH + sizeof(XLOGDIR)];
4318
4319 xldir = AllocateDir(XLOGDIR);
4320
4321 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4322 {
4323 if (IsBackupHistoryFileName(xlde->d_name))
4324 {
4325 if (XLogArchiveCheckDone(xlde->d_name))
4326 {
4327 elog(DEBUG2, "removing WAL backup history file \"%s\"",
4328 xlde->d_name);
4329 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4330 unlink(path);
4331 XLogArchiveCleanup(xlde->d_name);
4332 }
4333 }
4334 }
4335
4336 FreeDir(xldir);
4337 }
4338
4339 /*
4340 * Attempt to read the next XLOG record.
4341 *
4342 * Before first call, the reader needs to be positioned to the first record
4343 * by calling XLogBeginRead().
4344 *
4345 * If no valid record is available, returns NULL, or fails if emode is PANIC.
4346 * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4347 * record is available.
4348 */
4349 static XLogRecord *
ReadRecord(XLogReaderState * xlogreader,int emode,bool fetching_ckpt)4350 ReadRecord(XLogReaderState *xlogreader, int emode,
4351 bool fetching_ckpt)
4352 {
4353 XLogRecord *record;
4354 XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4355
4356 /* Pass through parameters to XLogPageRead */
4357 private->fetching_ckpt = fetching_ckpt;
4358 private->emode = emode;
4359 private->randAccess = (xlogreader->ReadRecPtr == InvalidXLogRecPtr);
4360
4361 /* This is the first attempt to read this page. */
4362 lastSourceFailed = false;
4363
4364 for (;;)
4365 {
4366 char *errormsg;
4367
4368 record = XLogReadRecord(xlogreader, &errormsg);
4369 ReadRecPtr = xlogreader->ReadRecPtr;
4370 EndRecPtr = xlogreader->EndRecPtr;
4371 if (record == NULL)
4372 {
4373 /*
4374 * When not in standby mode we find that WAL ends in an incomplete
4375 * record, keep track of that record. After recovery is done,
4376 * we'll write a record to indicate downstream WAL readers that
4377 * that portion is to be ignored.
4378 */
4379 if (!StandbyMode &&
4380 !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
4381 {
4382 abortedRecPtr = xlogreader->abortedRecPtr;
4383 missingContrecPtr = xlogreader->missingContrecPtr;
4384 }
4385
4386 if (readFile >= 0)
4387 {
4388 close(readFile);
4389 readFile = -1;
4390 }
4391
4392 /*
4393 * We only end up here without a message when XLogPageRead()
4394 * failed - in that case we already logged something. In
4395 * StandbyMode that only happens if we have been triggered, so we
4396 * shouldn't loop anymore in that case.
4397 */
4398 if (errormsg)
4399 ereport(emode_for_corrupt_record(emode, EndRecPtr),
4400 (errmsg_internal("%s", errormsg) /* already translated */ ));
4401 }
4402
4403 /*
4404 * Check page TLI is one of the expected values.
4405 */
4406 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4407 {
4408 char fname[MAXFNAMELEN];
4409 XLogSegNo segno;
4410 int32 offset;
4411
4412 XLByteToSeg(xlogreader->latestPagePtr, segno, wal_segment_size);
4413 offset = XLogSegmentOffset(xlogreader->latestPagePtr,
4414 wal_segment_size);
4415 XLogFileName(fname, xlogreader->seg.ws_tli, segno,
4416 wal_segment_size);
4417 ereport(emode_for_corrupt_record(emode, EndRecPtr),
4418 (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4419 xlogreader->latestPageTLI,
4420 fname,
4421 offset)));
4422 record = NULL;
4423 }
4424
4425 if (record)
4426 {
4427 /* Great, got a record */
4428 return record;
4429 }
4430 else
4431 {
4432 /* No valid record available from this source */
4433 lastSourceFailed = true;
4434
4435 /*
4436 * If archive recovery was requested, but we were still doing
4437 * crash recovery, switch to archive recovery and retry using the
4438 * offline archive. We have now replayed all the valid WAL in
4439 * pg_wal, so we are presumably now consistent.
4440 *
4441 * We require that there's at least some valid WAL present in
4442 * pg_wal, however (!fetching_ckpt). We could recover using the
4443 * WAL from the archive, even if pg_wal is completely empty, but
4444 * we'd have no idea how far we'd have to replay to reach
4445 * consistency. So err on the safe side and give up.
4446 */
4447 if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4448 !fetching_ckpt)
4449 {
4450 ereport(DEBUG1,
4451 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
4452 InArchiveRecovery = true;
4453 if (StandbyModeRequested)
4454 StandbyMode = true;
4455
4456 /* initialize minRecoveryPoint to this record */
4457 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4458 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4459 if (ControlFile->minRecoveryPoint < EndRecPtr)
4460 {
4461 ControlFile->minRecoveryPoint = EndRecPtr;
4462 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4463 }
4464 /* update local copy */
4465 minRecoveryPoint = ControlFile->minRecoveryPoint;
4466 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4467
4468 /*
4469 * The startup process can update its local copy of
4470 * minRecoveryPoint from this point.
4471 */
4472 updateMinRecoveryPoint = true;
4473
4474 UpdateControlFile();
4475
4476 /*
4477 * We update SharedRecoveryState while holding the lock on
4478 * ControlFileLock so both states are consistent in shared
4479 * memory.
4480 */
4481 SpinLockAcquire(&XLogCtl->info_lck);
4482 XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
4483 SpinLockRelease(&XLogCtl->info_lck);
4484
4485 LWLockRelease(ControlFileLock);
4486
4487 CheckRecoveryConsistency();
4488
4489 /*
4490 * Before we retry, reset lastSourceFailed and currentSource
4491 * so that we will check the archive next.
4492 */
4493 lastSourceFailed = false;
4494 currentSource = XLOG_FROM_ANY;
4495
4496 continue;
4497 }
4498
4499 /* In standby mode, loop back to retry. Otherwise, give up. */
4500 if (StandbyMode && !CheckForStandbyTrigger())
4501 continue;
4502 else
4503 return NULL;
4504 }
4505 }
4506 }
4507
4508 /*
4509 * Scan for new timelines that might have appeared in the archive since we
4510 * started recovery.
4511 *
4512 * If there are any, the function changes recovery target TLI to the latest
4513 * one and returns 'true'.
4514 */
4515 static bool
rescanLatestTimeLine(void)4516 rescanLatestTimeLine(void)
4517 {
4518 List *newExpectedTLEs;
4519 bool found;
4520 ListCell *cell;
4521 TimeLineID newtarget;
4522 TimeLineID oldtarget = recoveryTargetTLI;
4523 TimeLineHistoryEntry *currentTle = NULL;
4524
4525 newtarget = findNewestTimeLine(recoveryTargetTLI);
4526 if (newtarget == recoveryTargetTLI)
4527 {
4528 /* No new timelines found */
4529 return false;
4530 }
4531
4532 /*
4533 * Determine the list of expected TLIs for the new TLI
4534 */
4535
4536 newExpectedTLEs = readTimeLineHistory(newtarget);
4537
4538 /*
4539 * If the current timeline is not part of the history of the new timeline,
4540 * we cannot proceed to it.
4541 */
4542 found = false;
4543 foreach(cell, newExpectedTLEs)
4544 {
4545 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4546
4547 if (currentTle->tli == recoveryTargetTLI)
4548 {
4549 found = true;
4550 break;
4551 }
4552 }
4553 if (!found)
4554 {
4555 ereport(LOG,
4556 (errmsg("new timeline %u is not a child of database system timeline %u",
4557 newtarget,
4558 ThisTimeLineID)));
4559 return false;
4560 }
4561
4562 /*
4563 * The current timeline was found in the history file, but check that the
4564 * next timeline was forked off from it *after* the current recovery
4565 * location.
4566 */
4567 if (currentTle->end < EndRecPtr)
4568 {
4569 ereport(LOG,
4570 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4571 newtarget,
4572 ThisTimeLineID,
4573 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4574 return false;
4575 }
4576
4577 /* The new timeline history seems valid. Switch target */
4578 recoveryTargetTLI = newtarget;
4579 list_free_deep(expectedTLEs);
4580 expectedTLEs = newExpectedTLEs;
4581
4582 /*
4583 * As in StartupXLOG(), try to ensure we have all the history files
4584 * between the old target and new target in pg_wal.
4585 */
4586 restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4587
4588 ereport(LOG,
4589 (errmsg("new target timeline is %u",
4590 recoveryTargetTLI)));
4591
4592 return true;
4593 }
4594
4595 /*
4596 * I/O routines for pg_control
4597 *
4598 * *ControlFile is a buffer in shared memory that holds an image of the
4599 * contents of pg_control. WriteControlFile() initializes pg_control
4600 * given a preloaded buffer, ReadControlFile() loads the buffer from
4601 * the pg_control file (during postmaster or standalone-backend startup),
4602 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4603 * InitControlFile() fills the buffer with initial values.
4604 *
4605 * For simplicity, WriteControlFile() initializes the fields of pg_control
4606 * that are related to checking backend/database compatibility, and
4607 * ReadControlFile() verifies they are correct. We could split out the
4608 * I/O and compatibility-check functions, but there seems no need currently.
4609 */
4610
4611 static void
InitControlFile(uint64 sysidentifier)4612 InitControlFile(uint64 sysidentifier)
4613 {
4614 char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
4615
4616 /*
4617 * Generate a random nonce. This is used for authentication requests that
4618 * will fail because the user does not exist. The nonce is used to create
4619 * a genuine-looking password challenge for the non-existent user, in lieu
4620 * of an actual stored password.
4621 */
4622 if (!pg_strong_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
4623 ereport(PANIC,
4624 (errcode(ERRCODE_INTERNAL_ERROR),
4625 errmsg("could not generate secret authorization token")));
4626
4627 memset(ControlFile, 0, sizeof(ControlFileData));
4628 /* Initialize pg_control status fields */
4629 ControlFile->system_identifier = sysidentifier;
4630 memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
4631 ControlFile->state = DB_SHUTDOWNED;
4632 ControlFile->unloggedLSN = FirstNormalUnloggedLSN;
4633
4634 /* Set important parameter values for use when replaying WAL */
4635 ControlFile->MaxConnections = MaxConnections;
4636 ControlFile->max_worker_processes = max_worker_processes;
4637 ControlFile->max_wal_senders = max_wal_senders;
4638 ControlFile->max_prepared_xacts = max_prepared_xacts;
4639 ControlFile->max_locks_per_xact = max_locks_per_xact;
4640 ControlFile->wal_level = wal_level;
4641 ControlFile->wal_log_hints = wal_log_hints;
4642 ControlFile->track_commit_timestamp = track_commit_timestamp;
4643 ControlFile->data_checksum_version = bootstrap_data_checksum_version;
4644 }
4645
4646 static void
WriteControlFile(void)4647 WriteControlFile(void)
4648 {
4649 int fd;
4650 char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */
4651
4652 /*
4653 * Ensure that the size of the pg_control data structure is sane. See the
4654 * comments for these symbols in pg_control.h.
4655 */
4656 StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
4657 "pg_control is too large for atomic disk writes");
4658 StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
4659 "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
4660
4661 /*
4662 * Initialize version and compatibility-check fields
4663 */
4664 ControlFile->pg_control_version = PG_CONTROL_VERSION;
4665 ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4666
4667 ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4668 ControlFile->floatFormat = FLOATFORMAT_VALUE;
4669
4670 ControlFile->blcksz = BLCKSZ;
4671 ControlFile->relseg_size = RELSEG_SIZE;
4672 ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4673 ControlFile->xlog_seg_size = wal_segment_size;
4674
4675 ControlFile->nameDataLen = NAMEDATALEN;
4676 ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4677
4678 ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4679 ControlFile->loblksize = LOBLKSIZE;
4680
4681 ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4682
4683 /* Contents are protected with a CRC */
4684 INIT_CRC32C(ControlFile->crc);
4685 COMP_CRC32C(ControlFile->crc,
4686 (char *) ControlFile,
4687 offsetof(ControlFileData, crc));
4688 FIN_CRC32C(ControlFile->crc);
4689
4690 /*
4691 * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
4692 * the excess over sizeof(ControlFileData). This reduces the odds of
4693 * premature-EOF errors when reading pg_control. We'll still fail when we
4694 * check the contents of the file, but hopefully with a more specific
4695 * error than "couldn't read pg_control".
4696 */
4697 memset(buffer, 0, PG_CONTROL_FILE_SIZE);
4698 memcpy(buffer, ControlFile, sizeof(ControlFileData));
4699
4700 fd = BasicOpenFile(XLOG_CONTROL_FILE,
4701 O_RDWR | O_CREAT | O_EXCL | PG_BINARY);
4702 if (fd < 0)
4703 ereport(PANIC,
4704 (errcode_for_file_access(),
4705 errmsg("could not create file \"%s\": %m",
4706 XLOG_CONTROL_FILE)));
4707
4708 errno = 0;
4709 pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4710 if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
4711 {
4712 /* if write didn't set errno, assume problem is no disk space */
4713 if (errno == 0)
4714 errno = ENOSPC;
4715 ereport(PANIC,
4716 (errcode_for_file_access(),
4717 errmsg("could not write to file \"%s\": %m",
4718 XLOG_CONTROL_FILE)));
4719 }
4720 pgstat_report_wait_end();
4721
4722 pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4723 if (pg_fsync(fd) != 0)
4724 ereport(PANIC,
4725 (errcode_for_file_access(),
4726 errmsg("could not fsync file \"%s\": %m",
4727 XLOG_CONTROL_FILE)));
4728 pgstat_report_wait_end();
4729
4730 if (close(fd) != 0)
4731 ereport(PANIC,
4732 (errcode_for_file_access(),
4733 errmsg("could not close file \"%s\": %m",
4734 XLOG_CONTROL_FILE)));
4735 }
4736
4737 static void
ReadControlFile(void)4738 ReadControlFile(void)
4739 {
4740 pg_crc32c crc;
4741 int fd;
4742 static char wal_segsz_str[20];
4743 int r;
4744
4745 /*
4746 * Read data...
4747 */
4748 fd = BasicOpenFile(XLOG_CONTROL_FILE,
4749 O_RDWR | PG_BINARY);
4750 if (fd < 0)
4751 ereport(PANIC,
4752 (errcode_for_file_access(),
4753 errmsg("could not open file \"%s\": %m",
4754 XLOG_CONTROL_FILE)));
4755
4756 pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
4757 r = read(fd, ControlFile, sizeof(ControlFileData));
4758 if (r != sizeof(ControlFileData))
4759 {
4760 if (r < 0)
4761 ereport(PANIC,
4762 (errcode_for_file_access(),
4763 errmsg("could not read file \"%s\": %m",
4764 XLOG_CONTROL_FILE)));
4765 else
4766 ereport(PANIC,
4767 (errcode(ERRCODE_DATA_CORRUPTED),
4768 errmsg("could not read file \"%s\": read %d of %zu",
4769 XLOG_CONTROL_FILE, r, sizeof(ControlFileData))));
4770 }
4771 pgstat_report_wait_end();
4772
4773 close(fd);
4774
4775 /*
4776 * Check for expected pg_control format version. If this is wrong, the
4777 * CRC check will likely fail because we'll be checking the wrong number
4778 * of bytes. Complaining about wrong version will probably be more
4779 * enlightening than complaining about wrong CRC.
4780 */
4781
4782 if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4783 ereport(FATAL,
4784 (errmsg("database files are incompatible with server"),
4785 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4786 " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4787 ControlFile->pg_control_version, ControlFile->pg_control_version,
4788 PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4789 errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb.")));
4790
4791 if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4792 ereport(FATAL,
4793 (errmsg("database files are incompatible with server"),
4794 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4795 " but the server was compiled with PG_CONTROL_VERSION %d.",
4796 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4797 errhint("It looks like you need to initdb.")));
4798
4799 /* Now check the CRC. */
4800 INIT_CRC32C(crc);
4801 COMP_CRC32C(crc,
4802 (char *) ControlFile,
4803 offsetof(ControlFileData, crc));
4804 FIN_CRC32C(crc);
4805
4806 if (!EQ_CRC32C(crc, ControlFile->crc))
4807 ereport(FATAL,
4808 (errmsg("incorrect checksum in control file")));
4809
4810 /*
4811 * Do compatibility checking immediately. If the database isn't
4812 * compatible with the backend executable, we want to abort before we can
4813 * possibly do any damage.
4814 */
4815 if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4816 ereport(FATAL,
4817 (errmsg("database files are incompatible with server"),
4818 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4819 " but the server was compiled with CATALOG_VERSION_NO %d.",
4820 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4821 errhint("It looks like you need to initdb.")));
4822 if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4823 ereport(FATAL,
4824 (errmsg("database files are incompatible with server"),
4825 errdetail("The database cluster was initialized with MAXALIGN %d,"
4826 " but the server was compiled with MAXALIGN %d.",
4827 ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4828 errhint("It looks like you need to initdb.")));
4829 if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4830 ereport(FATAL,
4831 (errmsg("database files are incompatible with server"),
4832 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4833 errhint("It looks like you need to initdb.")));
4834 if (ControlFile->blcksz != BLCKSZ)
4835 ereport(FATAL,
4836 (errmsg("database files are incompatible with server"),
4837 errdetail("The database cluster was initialized with BLCKSZ %d,"
4838 " but the server was compiled with BLCKSZ %d.",
4839 ControlFile->blcksz, BLCKSZ),
4840 errhint("It looks like you need to recompile or initdb.")));
4841 if (ControlFile->relseg_size != RELSEG_SIZE)
4842 ereport(FATAL,
4843 (errmsg("database files are incompatible with server"),
4844 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4845 " but the server was compiled with RELSEG_SIZE %d.",
4846 ControlFile->relseg_size, RELSEG_SIZE),
4847 errhint("It looks like you need to recompile or initdb.")));
4848 if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4849 ereport(FATAL,
4850 (errmsg("database files are incompatible with server"),
4851 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4852 " but the server was compiled with XLOG_BLCKSZ %d.",
4853 ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4854 errhint("It looks like you need to recompile or initdb.")));
4855 if (ControlFile->nameDataLen != NAMEDATALEN)
4856 ereport(FATAL,
4857 (errmsg("database files are incompatible with server"),
4858 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4859 " but the server was compiled with NAMEDATALEN %d.",
4860 ControlFile->nameDataLen, NAMEDATALEN),
4861 errhint("It looks like you need to recompile or initdb.")));
4862 if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4863 ereport(FATAL,
4864 (errmsg("database files are incompatible with server"),
4865 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4866 " but the server was compiled with INDEX_MAX_KEYS %d.",
4867 ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4868 errhint("It looks like you need to recompile or initdb.")));
4869 if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4870 ereport(FATAL,
4871 (errmsg("database files are incompatible with server"),
4872 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4873 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4874 ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4875 errhint("It looks like you need to recompile or initdb.")));
4876 if (ControlFile->loblksize != LOBLKSIZE)
4877 ereport(FATAL,
4878 (errmsg("database files are incompatible with server"),
4879 errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4880 " but the server was compiled with LOBLKSIZE %d.",
4881 ControlFile->loblksize, (int) LOBLKSIZE),
4882 errhint("It looks like you need to recompile or initdb.")));
4883
4884 #ifdef USE_FLOAT8_BYVAL
4885 if (ControlFile->float8ByVal != true)
4886 ereport(FATAL,
4887 (errmsg("database files are incompatible with server"),
4888 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4889 " but the server was compiled with USE_FLOAT8_BYVAL."),
4890 errhint("It looks like you need to recompile or initdb.")));
4891 #else
4892 if (ControlFile->float8ByVal != false)
4893 ereport(FATAL,
4894 (errmsg("database files are incompatible with server"),
4895 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4896 " but the server was compiled without USE_FLOAT8_BYVAL."),
4897 errhint("It looks like you need to recompile or initdb.")));
4898 #endif
4899
4900 wal_segment_size = ControlFile->xlog_seg_size;
4901
4902 if (!IsValidWalSegSize(wal_segment_size))
4903 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4904 errmsg_plural("WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d byte",
4905 "WAL segment size must be a power of two between 1 MB and 1 GB, but the control file specifies %d bytes",
4906 wal_segment_size,
4907 wal_segment_size)));
4908
4909 snprintf(wal_segsz_str, sizeof(wal_segsz_str), "%d", wal_segment_size);
4910 SetConfigOption("wal_segment_size", wal_segsz_str, PGC_INTERNAL,
4911 PGC_S_OVERRIDE);
4912
4913 /* check and update variables dependent on wal_segment_size */
4914 if (ConvertToXSegs(min_wal_size_mb, wal_segment_size) < 2)
4915 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4916 errmsg("\"min_wal_size\" must be at least twice \"wal_segment_size\"")));
4917
4918 if (ConvertToXSegs(max_wal_size_mb, wal_segment_size) < 2)
4919 ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
4920 errmsg("\"max_wal_size\" must be at least twice \"wal_segment_size\"")));
4921
4922 UsableBytesInSegment =
4923 (wal_segment_size / XLOG_BLCKSZ * UsableBytesInPage) -
4924 (SizeOfXLogLongPHD - SizeOfXLogShortPHD);
4925
4926 CalculateCheckpointSegments();
4927
4928 /* Make the initdb settings visible as GUC variables, too */
4929 SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4930 PGC_INTERNAL, PGC_S_OVERRIDE);
4931 }
4932
4933 /*
4934 * Utility wrapper to update the control file. Note that the control
4935 * file gets flushed.
4936 */
4937 void
UpdateControlFile(void)4938 UpdateControlFile(void)
4939 {
4940 update_controlfile(DataDir, ControlFile, true);
4941 }
4942
4943 /*
4944 * Returns the unique system identifier from control file.
4945 */
4946 uint64
GetSystemIdentifier(void)4947 GetSystemIdentifier(void)
4948 {
4949 Assert(ControlFile != NULL);
4950 return ControlFile->system_identifier;
4951 }
4952
4953 /*
4954 * Returns the random nonce from control file.
4955 */
4956 char *
GetMockAuthenticationNonce(void)4957 GetMockAuthenticationNonce(void)
4958 {
4959 Assert(ControlFile != NULL);
4960 return ControlFile->mock_authentication_nonce;
4961 }
4962
4963 /*
4964 * Are checksums enabled for data pages?
4965 */
4966 bool
DataChecksumsEnabled(void)4967 DataChecksumsEnabled(void)
4968 {
4969 Assert(ControlFile != NULL);
4970 return (ControlFile->data_checksum_version > 0);
4971 }
4972
4973 /*
4974 * Returns a fake LSN for unlogged relations.
4975 *
4976 * Each call generates an LSN that is greater than any previous value
4977 * returned. The current counter value is saved and restored across clean
4978 * shutdowns, but like unlogged relations, does not survive a crash. This can
4979 * be used in lieu of real LSN values returned by XLogInsert, if you need an
4980 * LSN-like increasing sequence of numbers without writing any WAL.
4981 */
4982 XLogRecPtr
GetFakeLSNForUnloggedRel(void)4983 GetFakeLSNForUnloggedRel(void)
4984 {
4985 XLogRecPtr nextUnloggedLSN;
4986
4987 /* increment the unloggedLSN counter, need SpinLock */
4988 SpinLockAcquire(&XLogCtl->ulsn_lck);
4989 nextUnloggedLSN = XLogCtl->unloggedLSN++;
4990 SpinLockRelease(&XLogCtl->ulsn_lck);
4991
4992 return nextUnloggedLSN;
4993 }
4994
4995 /*
4996 * Auto-tune the number of XLOG buffers.
4997 *
4998 * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4999 * a maximum of one XLOG segment (there is little reason to think that more
5000 * is helpful, at least so long as we force an fsync when switching log files)
5001 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
5002 * 9.1, when auto-tuning was added).
5003 *
5004 * This should not be called until NBuffers has received its final value.
5005 */
5006 static int
XLOGChooseNumBuffers(void)5007 XLOGChooseNumBuffers(void)
5008 {
5009 int xbuffers;
5010
5011 xbuffers = NBuffers / 32;
5012 if (xbuffers > (wal_segment_size / XLOG_BLCKSZ))
5013 xbuffers = (wal_segment_size / XLOG_BLCKSZ);
5014 if (xbuffers < 8)
5015 xbuffers = 8;
5016 return xbuffers;
5017 }
5018
5019 /*
5020 * GUC check_hook for wal_buffers
5021 */
5022 bool
check_wal_buffers(int * newval,void ** extra,GucSource source)5023 check_wal_buffers(int *newval, void **extra, GucSource source)
5024 {
5025 /*
5026 * -1 indicates a request for auto-tune.
5027 */
5028 if (*newval == -1)
5029 {
5030 /*
5031 * If we haven't yet changed the boot_val default of -1, just let it
5032 * be. We'll fix it when XLOGShmemSize is called.
5033 */
5034 if (XLOGbuffers == -1)
5035 return true;
5036
5037 /* Otherwise, substitute the auto-tune value */
5038 *newval = XLOGChooseNumBuffers();
5039 }
5040
5041 /*
5042 * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
5043 * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
5044 * the case, we just silently treat such values as a request for the
5045 * minimum. (We could throw an error instead, but that doesn't seem very
5046 * helpful.)
5047 */
5048 if (*newval < 4)
5049 *newval = 4;
5050
5051 return true;
5052 }
5053
5054 /*
5055 * Read the control file, set respective GUCs.
5056 *
5057 * This is to be called during startup, including a crash recovery cycle,
5058 * unless in bootstrap mode, where no control file yet exists. As there's no
5059 * usable shared memory yet (its sizing can depend on the contents of the
5060 * control file!), first store the contents in local memory. XLOGShmemInit()
5061 * will then copy it to shared memory later.
5062 *
5063 * reset just controls whether previous contents are to be expected (in the
5064 * reset case, there's a dangling pointer into old shared memory), or not.
5065 */
5066 void
LocalProcessControlFile(bool reset)5067 LocalProcessControlFile(bool reset)
5068 {
5069 Assert(reset || ControlFile == NULL);
5070 ControlFile = palloc(sizeof(ControlFileData));
5071 ReadControlFile();
5072 }
5073
5074 /*
5075 * Initialization of shared memory for XLOG
5076 */
5077 Size
XLOGShmemSize(void)5078 XLOGShmemSize(void)
5079 {
5080 Size size;
5081
5082 /*
5083 * If the value of wal_buffers is -1, use the preferred auto-tune value.
5084 * This isn't an amazingly clean place to do this, but we must wait till
5085 * NBuffers has received its final value, and must do it before using the
5086 * value of XLOGbuffers to do anything important.
5087 */
5088 if (XLOGbuffers == -1)
5089 {
5090 char buf[32];
5091
5092 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
5093 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
5094 }
5095 Assert(XLOGbuffers > 0);
5096
5097 /* XLogCtl */
5098 size = sizeof(XLogCtlData);
5099
5100 /* WAL insertion locks, plus alignment */
5101 size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
5102 /* xlblocks array */
5103 size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
5104 /* extra alignment padding for XLOG I/O buffers */
5105 size = add_size(size, XLOG_BLCKSZ);
5106 /* and the buffers themselves */
5107 size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
5108
5109 /*
5110 * Note: we don't count ControlFileData, it comes out of the "slop factor"
5111 * added by CreateSharedMemoryAndSemaphores. This lets us use this
5112 * routine again below to compute the actual allocation size.
5113 */
5114
5115 return size;
5116 }
5117
5118 void
XLOGShmemInit(void)5119 XLOGShmemInit(void)
5120 {
5121 bool foundCFile,
5122 foundXLog;
5123 char *allocptr;
5124 int i;
5125 ControlFileData *localControlFile;
5126
5127 #ifdef WAL_DEBUG
5128
5129 /*
5130 * Create a memory context for WAL debugging that's exempt from the normal
5131 * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
5132 * an allocation fails, but wal_debug is not for production use anyway.
5133 */
5134 if (walDebugCxt == NULL)
5135 {
5136 walDebugCxt = AllocSetContextCreate(TopMemoryContext,
5137 "WAL Debug",
5138 ALLOCSET_DEFAULT_SIZES);
5139 MemoryContextAllowInCriticalSection(walDebugCxt, true);
5140 }
5141 #endif
5142
5143
5144 XLogCtl = (XLogCtlData *)
5145 ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);
5146
5147 localControlFile = ControlFile;
5148 ControlFile = (ControlFileData *)
5149 ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
5150
5151 if (foundCFile || foundXLog)
5152 {
5153 /* both should be present or neither */
5154 Assert(foundCFile && foundXLog);
5155
5156 /* Initialize local copy of WALInsertLocks */
5157 WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
5158
5159 if (localControlFile)
5160 pfree(localControlFile);
5161 return;
5162 }
5163 memset(XLogCtl, 0, sizeof(XLogCtlData));
5164
5165 /*
5166 * Already have read control file locally, unless in bootstrap mode. Move
5167 * contents into shared memory.
5168 */
5169 if (localControlFile)
5170 {
5171 memcpy(ControlFile, localControlFile, sizeof(ControlFileData));
5172 pfree(localControlFile);
5173 }
5174
5175 /*
5176 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
5177 * multiple of the alignment for same, so no extra alignment padding is
5178 * needed here.
5179 */
5180 allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
5181 XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
5182 memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
5183 allocptr += sizeof(XLogRecPtr) * XLOGbuffers;
5184
5185
5186 /* WAL insertion locks. Ensure they're aligned to the full padded size */
5187 allocptr += sizeof(WALInsertLockPadded) -
5188 ((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
5189 WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
5190 (WALInsertLockPadded *) allocptr;
5191 allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;
5192
5193 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
5194 {
5195 LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
5196 WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
5197 WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
5198 }
5199
5200 /*
5201 * Align the start of the page buffers to a full xlog block size boundary.
5202 * This simplifies some calculations in XLOG insertion. It is also
5203 * required for O_DIRECT.
5204 */
5205 allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
5206 XLogCtl->pages = allocptr;
5207 memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);
5208
5209 /*
5210 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
5211 * in additional info.)
5212 */
5213 XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
5214 XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
5215 XLogCtl->SharedHotStandbyActive = false;
5216 XLogCtl->SharedPromoteIsTriggered = false;
5217 XLogCtl->WalWriterSleeping = false;
5218
5219 SpinLockInit(&XLogCtl->Insert.insertpos_lck);
5220 SpinLockInit(&XLogCtl->info_lck);
5221 SpinLockInit(&XLogCtl->ulsn_lck);
5222 InitSharedLatch(&XLogCtl->recoveryWakeupLatch);
5223 }
5224
5225 /*
5226 * This func must be called ONCE on system install. It creates pg_control
5227 * and the initial XLOG segment.
5228 */
5229 void
BootStrapXLOG(void)5230 BootStrapXLOG(void)
5231 {
5232 CheckPoint checkPoint;
5233 char *buffer;
5234 XLogPageHeader page;
5235 XLogLongPageHeader longpage;
5236 XLogRecord *record;
5237 char *recptr;
5238 bool use_existent;
5239 uint64 sysidentifier;
5240 struct timeval tv;
5241 pg_crc32c crc;
5242
5243 /*
5244 * Select a hopefully-unique system identifier code for this installation.
5245 * We use the result of gettimeofday(), including the fractional seconds
5246 * field, as being about as unique as we can easily get. (Think not to
5247 * use random(), since it hasn't been seeded and there's no portable way
5248 * to seed it other than the system clock value...) The upper half of the
5249 * uint64 value is just the tv_sec part, while the lower half contains the
5250 * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
5251 * PID for a little extra uniqueness. A person knowing this encoding can
5252 * determine the initialization time of the installation, which could
5253 * perhaps be useful sometimes.
5254 */
5255 gettimeofday(&tv, NULL);
5256 sysidentifier = ((uint64) tv.tv_sec) << 32;
5257 sysidentifier |= ((uint64) tv.tv_usec) << 12;
5258 sysidentifier |= getpid() & 0xFFF;
5259
5260 /* First timeline ID is always 1 */
5261 ThisTimeLineID = 1;
5262
5263 /* page buffer must be aligned suitably for O_DIRECT */
5264 buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5265 page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5266 memset(page, 0, XLOG_BLCKSZ);
5267
5268 /*
5269 * Set up information for the initial checkpoint record
5270 *
5271 * The initial checkpoint record is written to the beginning of the WAL
5272 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5273 * used, so that we can use 0/0 to mean "before any valid WAL segment".
5274 */
5275 checkPoint.redo = wal_segment_size + SizeOfXLogLongPHD;
5276 checkPoint.ThisTimeLineID = ThisTimeLineID;
5277 checkPoint.PrevTimeLineID = ThisTimeLineID;
5278 checkPoint.fullPageWrites = fullPageWrites;
5279 checkPoint.nextFullXid =
5280 FullTransactionIdFromEpochAndXid(0, FirstNormalTransactionId);
5281 checkPoint.nextOid = FirstBootstrapObjectId;
5282 checkPoint.nextMulti = FirstMultiXactId;
5283 checkPoint.nextMultiOffset = 0;
5284 checkPoint.oldestXid = FirstNormalTransactionId;
5285 checkPoint.oldestXidDB = TemplateDbOid;
5286 checkPoint.oldestMulti = FirstMultiXactId;
5287 checkPoint.oldestMultiDB = TemplateDbOid;
5288 checkPoint.oldestCommitTsXid = InvalidTransactionId;
5289 checkPoint.newestCommitTsXid = InvalidTransactionId;
5290 checkPoint.time = (pg_time_t) time(NULL);
5291 checkPoint.oldestActiveXid = InvalidTransactionId;
5292
5293 ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
5294 ShmemVariableCache->nextOid = checkPoint.nextOid;
5295 ShmemVariableCache->oidCount = 0;
5296 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5297 AdvanceOldestClogXid(checkPoint.oldestXid);
5298 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5299 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5300 SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5301
5302 /* Set up the XLOG page header */
5303 page->xlp_magic = XLOG_PAGE_MAGIC;
5304 page->xlp_info = XLP_LONG_HEADER;
5305 page->xlp_tli = ThisTimeLineID;
5306 page->xlp_pageaddr = wal_segment_size;
5307 longpage = (XLogLongPageHeader) page;
5308 longpage->xlp_sysid = sysidentifier;
5309 longpage->xlp_seg_size = wal_segment_size;
5310 longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5311
5312 /* Insert the initial checkpoint record */
5313 recptr = ((char *) page + SizeOfXLogLongPHD);
5314 record = (XLogRecord *) recptr;
5315 record->xl_prev = 0;
5316 record->xl_xid = InvalidTransactionId;
5317 record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5318 record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5319 record->xl_rmid = RM_XLOG_ID;
5320 recptr += SizeOfXLogRecord;
5321 /* fill the XLogRecordDataHeaderShort struct */
5322 *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
5323 *(recptr++) = sizeof(checkPoint);
5324 memcpy(recptr, &checkPoint, sizeof(checkPoint));
5325 recptr += sizeof(checkPoint);
5326 Assert(recptr - (char *) record == record->xl_tot_len);
5327
5328 INIT_CRC32C(crc);
5329 COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5330 COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5331 FIN_CRC32C(crc);
5332 record->xl_crc = crc;
5333
5334 /* Create first XLOG segment file */
5335 use_existent = false;
5336 openLogFile = XLogFileInit(1, &use_existent, false);
5337
5338 /*
5339 * We needn't bother with Reserve/ReleaseExternalFD here, since we'll
5340 * close the file again in a moment.
5341 */
5342
5343 /* Write the first page with the initial record */
5344 errno = 0;
5345 pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5346 if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5347 {
5348 /* if write didn't set errno, assume problem is no disk space */
5349 if (errno == 0)
5350 errno = ENOSPC;
5351 ereport(PANIC,
5352 (errcode_for_file_access(),
5353 errmsg("could not write bootstrap write-ahead log file: %m")));
5354 }
5355 pgstat_report_wait_end();
5356
5357 pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5358 if (pg_fsync(openLogFile) != 0)
5359 ereport(PANIC,
5360 (errcode_for_file_access(),
5361 errmsg("could not fsync bootstrap write-ahead log file: %m")));
5362 pgstat_report_wait_end();
5363
5364 if (close(openLogFile) != 0)
5365 ereport(PANIC,
5366 (errcode_for_file_access(),
5367 errmsg("could not close bootstrap write-ahead log file: %m")));
5368
5369 openLogFile = -1;
5370
5371 /* Now create pg_control */
5372 InitControlFile(sysidentifier);
5373 ControlFile->time = checkPoint.time;
5374 ControlFile->checkPoint = checkPoint.redo;
5375 ControlFile->checkPointCopy = checkPoint;
5376
5377 /* some additional ControlFile fields are set in WriteControlFile() */
5378 WriteControlFile();
5379
5380 /* Bootstrap the commit log, too */
5381 BootStrapCLOG();
5382 BootStrapCommitTs();
5383 BootStrapSUBTRANS();
5384 BootStrapMultiXact();
5385
5386 pfree(buffer);
5387
5388 /*
5389 * Force control file to be read - in contrast to normal processing we'd
5390 * otherwise never run the checks and GUC related initializations therein.
5391 */
5392 ReadControlFile();
5393 }
5394
5395 static char *
str_time(pg_time_t tnow)5396 str_time(pg_time_t tnow)
5397 {
5398 static char buf[128];
5399
5400 pg_strftime(buf, sizeof(buf),
5401 "%Y-%m-%d %H:%M:%S %Z",
5402 pg_localtime(&tnow, log_timezone));
5403
5404 return buf;
5405 }
5406
5407 /*
5408 * See if there are any recovery signal files and if so, set state for
5409 * recovery.
5410 *
5411 * See if there is a recovery command file (recovery.conf), and if so
5412 * throw an ERROR since as of PG12 we no longer recognize that.
5413 */
5414 static void
readRecoverySignalFile(void)5415 readRecoverySignalFile(void)
5416 {
5417 struct stat stat_buf;
5418
5419 if (IsBootstrapProcessingMode())
5420 return;
5421
5422 /*
5423 * Check for old recovery API file: recovery.conf
5424 */
5425 if (stat(RECOVERY_COMMAND_FILE, &stat_buf) == 0)
5426 ereport(FATAL,
5427 (errcode_for_file_access(),
5428 errmsg("using recovery command file \"%s\" is not supported",
5429 RECOVERY_COMMAND_FILE)));
5430
5431 /*
5432 * Remove unused .done file, if present. Ignore if absent.
5433 */
5434 unlink(RECOVERY_COMMAND_DONE);
5435
5436 /*
5437 * Check for recovery signal files and if found, fsync them since they
5438 * represent server state information. We don't sweat too much about the
5439 * possibility of fsync failure, however.
5440 *
5441 * If present, standby signal file takes precedence. If neither is present
5442 * then we won't enter archive recovery.
5443 */
5444 if (stat(STANDBY_SIGNAL_FILE, &stat_buf) == 0)
5445 {
5446 int fd;
5447
5448 fd = BasicOpenFilePerm(STANDBY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5449 S_IRUSR | S_IWUSR);
5450 if (fd >= 0)
5451 {
5452 (void) pg_fsync(fd);
5453 close(fd);
5454 }
5455 standby_signal_file_found = true;
5456 }
5457 else if (stat(RECOVERY_SIGNAL_FILE, &stat_buf) == 0)
5458 {
5459 int fd;
5460
5461 fd = BasicOpenFilePerm(RECOVERY_SIGNAL_FILE, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
5462 S_IRUSR | S_IWUSR);
5463 if (fd >= 0)
5464 {
5465 (void) pg_fsync(fd);
5466 close(fd);
5467 }
5468 recovery_signal_file_found = true;
5469 }
5470
5471 StandbyModeRequested = false;
5472 ArchiveRecoveryRequested = false;
5473 if (standby_signal_file_found)
5474 {
5475 StandbyModeRequested = true;
5476 ArchiveRecoveryRequested = true;
5477 }
5478 else if (recovery_signal_file_found)
5479 {
5480 StandbyModeRequested = false;
5481 ArchiveRecoveryRequested = true;
5482 }
5483 else
5484 return;
5485
5486 /*
5487 * We don't support standby mode in standalone backends; that requires
5488 * other processes such as the WAL receiver to be alive.
5489 */
5490 if (StandbyModeRequested && !IsUnderPostmaster)
5491 ereport(FATAL,
5492 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5493 errmsg("standby mode is not supported by single-user servers")));
5494 }
5495
5496 static void
validateRecoveryParameters(void)5497 validateRecoveryParameters(void)
5498 {
5499 if (!ArchiveRecoveryRequested)
5500 return;
5501
5502 /*
5503 * Check for compulsory parameters
5504 */
5505 if (StandbyModeRequested)
5506 {
5507 if ((PrimaryConnInfo == NULL || strcmp(PrimaryConnInfo, "") == 0) &&
5508 (recoveryRestoreCommand == NULL || strcmp(recoveryRestoreCommand, "") == 0))
5509 ereport(WARNING,
5510 (errmsg("specified neither primary_conninfo nor restore_command"),
5511 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
5512 }
5513 else
5514 {
5515 if (recoveryRestoreCommand == NULL ||
5516 strcmp(recoveryRestoreCommand, "") == 0)
5517 ereport(FATAL,
5518 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5519 errmsg("must specify restore_command when standby mode is not enabled")));
5520 }
5521
5522 /*
5523 * Override any inconsistent requests. Note that this is a change of
5524 * behaviour in 9.5; prior to this we simply ignored a request to pause if
5525 * hot_standby = off, which was surprising behaviour.
5526 */
5527 if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5528 !EnableHotStandby)
5529 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5530
5531 /*
5532 * Final parsing of recovery_target_time string; see also
5533 * check_recovery_target_time().
5534 */
5535 if (recoveryTarget == RECOVERY_TARGET_TIME)
5536 {
5537 recoveryTargetTime = DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5538 CStringGetDatum(recovery_target_time_string),
5539 ObjectIdGetDatum(InvalidOid),
5540 Int32GetDatum(-1)));
5541 }
5542
5543 /*
5544 * If user specified recovery_target_timeline, validate it or compute the
5545 * "latest" value. We can't do this until after we've gotten the restore
5546 * command and set InArchiveRecovery, because we need to fetch timeline
5547 * history files from the archive.
5548 */
5549 if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_NUMERIC)
5550 {
5551 TimeLineID rtli = recoveryTargetTLIRequested;
5552
5553 /* Timeline 1 does not have a history file, all else should */
5554 if (rtli != 1 && !existsTimeLineHistory(rtli))
5555 ereport(FATAL,
5556 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5557 errmsg("recovery target timeline %u does not exist",
5558 rtli)));
5559 recoveryTargetTLI = rtli;
5560 }
5561 else if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
5562 {
5563 /* We start the "latest" search from pg_control's timeline */
5564 recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5565 }
5566 else
5567 {
5568 /*
5569 * else we just use the recoveryTargetTLI as already read from
5570 * ControlFile
5571 */
5572 Assert(recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_CONTROLFILE);
5573 }
5574 }
5575
5576 /*
5577 * Exit archive-recovery state
5578 */
5579 static void
exitArchiveRecovery(TimeLineID endTLI,XLogRecPtr endOfLog)5580 exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
5581 {
5582 char xlogfname[MAXFNAMELEN];
5583 XLogSegNo endLogSegNo;
5584 XLogSegNo startLogSegNo;
5585
5586 /* we always switch to a new timeline after archive recovery */
5587 Assert(endTLI != ThisTimeLineID);
5588
5589 /*
5590 * We are no longer in archive recovery state.
5591 */
5592 InArchiveRecovery = false;
5593
5594 /*
5595 * Update min recovery point one last time.
5596 */
5597 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
5598
5599 /*
5600 * If the ending log segment is still open, close it (to avoid problems on
5601 * Windows with trying to rename or delete an open file).
5602 */
5603 if (readFile >= 0)
5604 {
5605 close(readFile);
5606 readFile = -1;
5607 }
5608
5609 /*
5610 * Calculate the last segment on the old timeline, and the first segment
5611 * on the new timeline. If the switch happens in the middle of a segment,
5612 * they are the same, but if the switch happens exactly at a segment
5613 * boundary, startLogSegNo will be endLogSegNo + 1.
5614 */
5615 XLByteToPrevSeg(endOfLog, endLogSegNo, wal_segment_size);
5616 XLByteToSeg(endOfLog, startLogSegNo, wal_segment_size);
5617
5618 /*
5619 * Initialize the starting WAL segment for the new timeline. If the switch
5620 * happens in the middle of a segment, copy data from the last WAL segment
5621 * of the old timeline up to the switch point, to the starting WAL segment
5622 * on the new timeline.
5623 */
5624 if (endLogSegNo == startLogSegNo)
5625 {
5626 /*
5627 * Make a copy of the file on the new timeline.
5628 *
5629 * Writing WAL isn't allowed yet, so there are no locking
5630 * considerations. But we should be just as tense as XLogFileInit to
5631 * avoid emplacing a bogus file.
5632 */
5633 XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
5634 XLogSegmentOffset(endOfLog, wal_segment_size));
5635 }
5636 else
5637 {
5638 /*
5639 * The switch happened at a segment boundary, so just create the next
5640 * segment on the new timeline.
5641 */
5642 bool use_existent = true;
5643 int fd;
5644
5645 fd = XLogFileInit(startLogSegNo, &use_existent, true);
5646
5647 if (close(fd) != 0)
5648 {
5649 char xlogfname[MAXFNAMELEN];
5650 int save_errno = errno;
5651
5652 XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo,
5653 wal_segment_size);
5654 errno = save_errno;
5655 ereport(ERROR,
5656 (errcode_for_file_access(),
5657 errmsg("could not close file \"%s\": %m", xlogfname)));
5658 }
5659 }
5660
5661 /*
5662 * Let's just make real sure there are not .ready or .done flags posted
5663 * for the new segment.
5664 */
5665 XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo, wal_segment_size);
5666 XLogArchiveCleanup(xlogfname);
5667
5668 /*
5669 * Remove the signal files out of the way, so that we don't accidentally
5670 * re-enter archive recovery mode in a subsequent crash.
5671 */
5672 if (standby_signal_file_found)
5673 durable_unlink(STANDBY_SIGNAL_FILE, FATAL);
5674
5675 if (recovery_signal_file_found)
5676 durable_unlink(RECOVERY_SIGNAL_FILE, FATAL);
5677
5678 ereport(LOG,
5679 (errmsg("archive recovery complete")));
5680 }
5681
5682 /*
5683 * Extract timestamp from WAL record.
5684 *
5685 * If the record contains a timestamp, returns true, and saves the timestamp
5686 * in *recordXtime. If the record type has no timestamp, returns false.
5687 * Currently, only transaction commit/abort records and restore points contain
5688 * timestamps.
5689 */
5690 static bool
getRecordTimestamp(XLogReaderState * record,TimestampTz * recordXtime)5691 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5692 {
5693 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5694 uint8 xact_info = info & XLOG_XACT_OPMASK;
5695 uint8 rmid = XLogRecGetRmid(record);
5696
5697 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5698 {
5699 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5700 return true;
5701 }
5702 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5703 xact_info == XLOG_XACT_COMMIT_PREPARED))
5704 {
5705 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5706 return true;
5707 }
5708 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5709 xact_info == XLOG_XACT_ABORT_PREPARED))
5710 {
5711 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5712 return true;
5713 }
5714 return false;
5715 }
5716
5717 /*
5718 * For point-in-time recovery, this function decides whether we want to
5719 * stop applying the XLOG before the current record.
5720 *
5721 * Returns true if we are stopping, false otherwise. If stopping, some
5722 * information is saved in recoveryStopXid et al for use in annotating the
5723 * new timeline's history file.
5724 */
5725 static bool
recoveryStopsBefore(XLogReaderState * record)5726 recoveryStopsBefore(XLogReaderState *record)
5727 {
5728 bool stopsHere = false;
5729 uint8 xact_info;
5730 bool isCommit;
5731 TimestampTz recordXtime = 0;
5732 TransactionId recordXid;
5733
5734 /*
5735 * Ignore recovery target settings when not in archive recovery (meaning
5736 * we are in crash recovery).
5737 */
5738 if (!ArchiveRecoveryRequested)
5739 return false;
5740
5741 /* Check if we should stop as soon as reaching consistency */
5742 if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5743 {
5744 ereport(LOG,
5745 (errmsg("recovery stopping after reaching consistency")));
5746
5747 recoveryStopAfter = false;
5748 recoveryStopXid = InvalidTransactionId;
5749 recoveryStopLSN = InvalidXLogRecPtr;
5750 recoveryStopTime = 0;
5751 recoveryStopName[0] = '\0';
5752 return true;
5753 }
5754
5755 /* Check if target LSN has been reached */
5756 if (recoveryTarget == RECOVERY_TARGET_LSN &&
5757 !recoveryTargetInclusive &&
5758 record->ReadRecPtr >= recoveryTargetLSN)
5759 {
5760 recoveryStopAfter = false;
5761 recoveryStopXid = InvalidTransactionId;
5762 recoveryStopLSN = record->ReadRecPtr;
5763 recoveryStopTime = 0;
5764 recoveryStopName[0] = '\0';
5765 ereport(LOG,
5766 (errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
5767 (uint32) (recoveryStopLSN >> 32),
5768 (uint32) recoveryStopLSN)));
5769 return true;
5770 }
5771
5772 /* Otherwise we only consider stopping before COMMIT or ABORT records. */
5773 if (XLogRecGetRmid(record) != RM_XACT_ID)
5774 return false;
5775
5776 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
5777
5778 if (xact_info == XLOG_XACT_COMMIT)
5779 {
5780 isCommit = true;
5781 recordXid = XLogRecGetXid(record);
5782 }
5783 else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5784 {
5785 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5786 xl_xact_parsed_commit parsed;
5787
5788 isCommit = true;
5789 ParseCommitRecord(XLogRecGetInfo(record),
5790 xlrec,
5791 &parsed);
5792 recordXid = parsed.twophase_xid;
5793 }
5794 else if (xact_info == XLOG_XACT_ABORT)
5795 {
5796 isCommit = false;
5797 recordXid = XLogRecGetXid(record);
5798 }
5799 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5800 {
5801 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5802 xl_xact_parsed_abort parsed;
5803
5804 isCommit = false;
5805 ParseAbortRecord(XLogRecGetInfo(record),
5806 xlrec,
5807 &parsed);
5808 recordXid = parsed.twophase_xid;
5809 }
5810 else
5811 return false;
5812
5813 if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
5814 {
5815 /*
5816 * There can be only one transaction end record with this exact
5817 * transactionid
5818 *
5819 * when testing for an xid, we MUST test for equality only, since
5820 * transactions are numbered in the order they start, not the order
5821 * they complete. A higher numbered xid will complete before you about
5822 * 50% of the time...
5823 */
5824 stopsHere = (recordXid == recoveryTargetXid);
5825 }
5826
5827 if (recoveryTarget == RECOVERY_TARGET_TIME &&
5828 getRecordTimestamp(record, &recordXtime))
5829 {
5830 /*
5831 * There can be many transactions that share the same commit time, so
5832 * we stop after the last one, if we are inclusive, or stop at the
5833 * first one if we are exclusive
5834 */
5835 if (recoveryTargetInclusive)
5836 stopsHere = (recordXtime > recoveryTargetTime);
5837 else
5838 stopsHere = (recordXtime >= recoveryTargetTime);
5839 }
5840
5841 if (stopsHere)
5842 {
5843 recoveryStopAfter = false;
5844 recoveryStopXid = recordXid;
5845 recoveryStopTime = recordXtime;
5846 recoveryStopLSN = InvalidXLogRecPtr;
5847 recoveryStopName[0] = '\0';
5848
5849 if (isCommit)
5850 {
5851 ereport(LOG,
5852 (errmsg("recovery stopping before commit of transaction %u, time %s",
5853 recoveryStopXid,
5854 timestamptz_to_str(recoveryStopTime))));
5855 }
5856 else
5857 {
5858 ereport(LOG,
5859 (errmsg("recovery stopping before abort of transaction %u, time %s",
5860 recoveryStopXid,
5861 timestamptz_to_str(recoveryStopTime))));
5862 }
5863 }
5864
5865 return stopsHere;
5866 }
5867
5868 /*
5869 * Same as recoveryStopsBefore, but called after applying the record.
5870 *
5871 * We also track the timestamp of the latest applied COMMIT/ABORT
5872 * record in XLogCtl->recoveryLastXTime.
5873 */
5874 static bool
recoveryStopsAfter(XLogReaderState * record)5875 recoveryStopsAfter(XLogReaderState *record)
5876 {
5877 uint8 info;
5878 uint8 xact_info;
5879 uint8 rmid;
5880 TimestampTz recordXtime;
5881
5882 /*
5883 * Ignore recovery target settings when not in archive recovery (meaning
5884 * we are in crash recovery).
5885 */
5886 if (!ArchiveRecoveryRequested)
5887 return false;
5888
5889 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5890 rmid = XLogRecGetRmid(record);
5891
5892 /*
5893 * There can be many restore points that share the same name; we stop at
5894 * the first one.
5895 */
5896 if (recoveryTarget == RECOVERY_TARGET_NAME &&
5897 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5898 {
5899 xl_restore_point *recordRestorePointData;
5900
5901 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5902
5903 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5904 {
5905 recoveryStopAfter = true;
5906 recoveryStopXid = InvalidTransactionId;
5907 recoveryStopLSN = InvalidXLogRecPtr;
5908 (void) getRecordTimestamp(record, &recoveryStopTime);
5909 strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5910
5911 ereport(LOG,
5912 (errmsg("recovery stopping at restore point \"%s\", time %s",
5913 recoveryStopName,
5914 timestamptz_to_str(recoveryStopTime))));
5915 return true;
5916 }
5917 }
5918
5919 /* Check if the target LSN has been reached */
5920 if (recoveryTarget == RECOVERY_TARGET_LSN &&
5921 recoveryTargetInclusive &&
5922 record->ReadRecPtr >= recoveryTargetLSN)
5923 {
5924 recoveryStopAfter = true;
5925 recoveryStopXid = InvalidTransactionId;
5926 recoveryStopLSN = record->ReadRecPtr;
5927 recoveryStopTime = 0;
5928 recoveryStopName[0] = '\0';
5929 ereport(LOG,
5930 (errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
5931 (uint32) (recoveryStopLSN >> 32),
5932 (uint32) recoveryStopLSN)));
5933 return true;
5934 }
5935
5936 if (rmid != RM_XACT_ID)
5937 return false;
5938
5939 xact_info = info & XLOG_XACT_OPMASK;
5940
5941 if (xact_info == XLOG_XACT_COMMIT ||
5942 xact_info == XLOG_XACT_COMMIT_PREPARED ||
5943 xact_info == XLOG_XACT_ABORT ||
5944 xact_info == XLOG_XACT_ABORT_PREPARED)
5945 {
5946 TransactionId recordXid;
5947
5948 /* Update the last applied transaction timestamp */
5949 if (getRecordTimestamp(record, &recordXtime))
5950 SetLatestXTime(recordXtime);
5951
5952 /* Extract the XID of the committed/aborted transaction */
5953 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5954 {
5955 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5956 xl_xact_parsed_commit parsed;
5957
5958 ParseCommitRecord(XLogRecGetInfo(record),
5959 xlrec,
5960 &parsed);
5961 recordXid = parsed.twophase_xid;
5962 }
5963 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5964 {
5965 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5966 xl_xact_parsed_abort parsed;
5967
5968 ParseAbortRecord(XLogRecGetInfo(record),
5969 xlrec,
5970 &parsed);
5971 recordXid = parsed.twophase_xid;
5972 }
5973 else
5974 recordXid = XLogRecGetXid(record);
5975
5976 /*
5977 * There can be only one transaction end record with this exact
5978 * transactionid
5979 *
5980 * when testing for an xid, we MUST test for equality only, since
5981 * transactions are numbered in the order they start, not the order
5982 * they complete. A higher numbered xid will complete before you about
5983 * 50% of the time...
5984 */
5985 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5986 recordXid == recoveryTargetXid)
5987 {
5988 recoveryStopAfter = true;
5989 recoveryStopXid = recordXid;
5990 recoveryStopTime = recordXtime;
5991 recoveryStopLSN = InvalidXLogRecPtr;
5992 recoveryStopName[0] = '\0';
5993
5994 if (xact_info == XLOG_XACT_COMMIT ||
5995 xact_info == XLOG_XACT_COMMIT_PREPARED)
5996 {
5997 ereport(LOG,
5998 (errmsg("recovery stopping after commit of transaction %u, time %s",
5999 recoveryStopXid,
6000 timestamptz_to_str(recoveryStopTime))));
6001 }
6002 else if (xact_info == XLOG_XACT_ABORT ||
6003 xact_info == XLOG_XACT_ABORT_PREPARED)
6004 {
6005 ereport(LOG,
6006 (errmsg("recovery stopping after abort of transaction %u, time %s",
6007 recoveryStopXid,
6008 timestamptz_to_str(recoveryStopTime))));
6009 }
6010 return true;
6011 }
6012 }
6013
6014 /* Check if we should stop as soon as reaching consistency */
6015 if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
6016 {
6017 ereport(LOG,
6018 (errmsg("recovery stopping after reaching consistency")));
6019
6020 recoveryStopAfter = true;
6021 recoveryStopXid = InvalidTransactionId;
6022 recoveryStopTime = 0;
6023 recoveryStopLSN = InvalidXLogRecPtr;
6024 recoveryStopName[0] = '\0';
6025 return true;
6026 }
6027
6028 return false;
6029 }
6030
6031 /*
6032 * Wait until shared recoveryPause flag is cleared.
6033 *
6034 * endOfRecovery is true if the recovery target is reached and
6035 * the paused state starts at the end of recovery because of
6036 * recovery_target_action=pause, and false otherwise.
6037 *
6038 * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
6039 * Probably not worth the trouble though. This state shouldn't be one that
6040 * anyone cares about server power consumption in.
6041 */
6042 static void
recoveryPausesHere(bool endOfRecovery)6043 recoveryPausesHere(bool endOfRecovery)
6044 {
6045 /* Don't pause unless users can connect! */
6046 if (!LocalHotStandbyActive)
6047 return;
6048
6049 /* Don't pause after standby promotion has been triggered */
6050 if (LocalPromoteIsTriggered)
6051 return;
6052
6053 if (endOfRecovery)
6054 ereport(LOG,
6055 (errmsg("pausing at the end of recovery"),
6056 errhint("Execute pg_wal_replay_resume() to promote.")));
6057 else
6058 ereport(LOG,
6059 (errmsg("recovery has paused"),
6060 errhint("Execute pg_wal_replay_resume() to continue.")));
6061
6062 while (RecoveryIsPaused())
6063 {
6064 HandleStartupProcInterrupts();
6065 if (CheckForStandbyTrigger())
6066 return;
6067 pgstat_report_wait_start(WAIT_EVENT_RECOVERY_PAUSE);
6068 pg_usleep(1000000L); /* 1000 ms */
6069 pgstat_report_wait_end();
6070 }
6071 }
6072
6073 bool
RecoveryIsPaused(void)6074 RecoveryIsPaused(void)
6075 {
6076 bool recoveryPause;
6077
6078 SpinLockAcquire(&XLogCtl->info_lck);
6079 recoveryPause = XLogCtl->recoveryPause;
6080 SpinLockRelease(&XLogCtl->info_lck);
6081
6082 return recoveryPause;
6083 }
6084
6085 void
SetRecoveryPause(bool recoveryPause)6086 SetRecoveryPause(bool recoveryPause)
6087 {
6088 SpinLockAcquire(&XLogCtl->info_lck);
6089 XLogCtl->recoveryPause = recoveryPause;
6090 SpinLockRelease(&XLogCtl->info_lck);
6091 }
6092
6093 /*
6094 * When recovery_min_apply_delay is set, we wait long enough to make sure
6095 * certain record types are applied at least that interval behind the master.
6096 *
6097 * Returns true if we waited.
6098 *
6099 * Note that the delay is calculated between the WAL record log time and
6100 * the current time on standby. We would prefer to keep track of when this
6101 * standby received each WAL record, which would allow a more consistent
6102 * approach and one not affected by time synchronisation issues, but that
6103 * is significantly more effort and complexity for little actual gain in
6104 * usability.
6105 */
6106 static bool
recoveryApplyDelay(XLogReaderState * record)6107 recoveryApplyDelay(XLogReaderState *record)
6108 {
6109 uint8 xact_info;
6110 TimestampTz xtime;
6111 TimestampTz delayUntil;
6112 long msecs;
6113
6114 /* nothing to do if no delay configured */
6115 if (recovery_min_apply_delay <= 0)
6116 return false;
6117
6118 /* no delay is applied on a database not yet consistent */
6119 if (!reachedConsistency)
6120 return false;
6121
6122 /* nothing to do if crash recovery is requested */
6123 if (!ArchiveRecoveryRequested)
6124 return false;
6125
6126 /*
6127 * Is it a COMMIT record?
6128 *
6129 * We deliberately choose not to delay aborts since they have no effect on
6130 * MVCC. We already allow replay of records that don't have a timestamp,
6131 * so there is already opportunity for issues caused by early conflicts on
6132 * standbys.
6133 */
6134 if (XLogRecGetRmid(record) != RM_XACT_ID)
6135 return false;
6136
6137 xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;
6138
6139 if (xact_info != XLOG_XACT_COMMIT &&
6140 xact_info != XLOG_XACT_COMMIT_PREPARED)
6141 return false;
6142
6143 if (!getRecordTimestamp(record, &xtime))
6144 return false;
6145
6146 delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
6147
6148 /*
6149 * Exit without arming the latch if it's already past time to apply this
6150 * record
6151 */
6152 msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(), delayUntil);
6153 if (msecs <= 0)
6154 return false;
6155
6156 while (true)
6157 {
6158 ResetLatch(&XLogCtl->recoveryWakeupLatch);
6159
6160 /*
6161 * This might change recovery_min_apply_delay or the trigger file's
6162 * location.
6163 */
6164 HandleStartupProcInterrupts();
6165
6166 if (CheckForStandbyTrigger())
6167 break;
6168
6169 /*
6170 * Recalculate delayUntil as recovery_min_apply_delay could have
6171 * changed while waiting in this loop.
6172 */
6173 delayUntil = TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);
6174
6175 /*
6176 * Wait for difference between GetCurrentTimestamp() and delayUntil.
6177 */
6178 msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
6179 delayUntil);
6180
6181 if (msecs <= 0)
6182 break;
6183
6184 elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);
6185
6186 (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
6187 WL_LATCH_SET | WL_TIMEOUT | WL_EXIT_ON_PM_DEATH,
6188 msecs,
6189 WAIT_EVENT_RECOVERY_APPLY_DELAY);
6190 }
6191 return true;
6192 }
6193
6194 /*
6195 * Save timestamp of latest processed commit/abort record.
6196 *
6197 * We keep this in XLogCtl, not a simple static variable, so that it can be
6198 * seen by processes other than the startup process. Note in particular
6199 * that CreateRestartPoint is executed in the checkpointer.
6200 */
6201 static void
SetLatestXTime(TimestampTz xtime)6202 SetLatestXTime(TimestampTz xtime)
6203 {
6204 SpinLockAcquire(&XLogCtl->info_lck);
6205 XLogCtl->recoveryLastXTime = xtime;
6206 SpinLockRelease(&XLogCtl->info_lck);
6207 }
6208
6209 /*
6210 * Fetch timestamp of latest processed commit/abort record.
6211 */
6212 TimestampTz
GetLatestXTime(void)6213 GetLatestXTime(void)
6214 {
6215 TimestampTz xtime;
6216
6217 SpinLockAcquire(&XLogCtl->info_lck);
6218 xtime = XLogCtl->recoveryLastXTime;
6219 SpinLockRelease(&XLogCtl->info_lck);
6220
6221 return xtime;
6222 }
6223
6224 /*
6225 * Save timestamp of the next chunk of WAL records to apply.
6226 *
6227 * We keep this in XLogCtl, not a simple static variable, so that it can be
6228 * seen by all backends.
6229 */
6230 static void
SetCurrentChunkStartTime(TimestampTz xtime)6231 SetCurrentChunkStartTime(TimestampTz xtime)
6232 {
6233 SpinLockAcquire(&XLogCtl->info_lck);
6234 XLogCtl->currentChunkStartTime = xtime;
6235 SpinLockRelease(&XLogCtl->info_lck);
6236 }
6237
6238 /*
6239 * Fetch timestamp of latest processed commit/abort record.
6240 * Startup process maintains an accurate local copy in XLogReceiptTime
6241 */
6242 TimestampTz
GetCurrentChunkReplayStartTime(void)6243 GetCurrentChunkReplayStartTime(void)
6244 {
6245 TimestampTz xtime;
6246
6247 SpinLockAcquire(&XLogCtl->info_lck);
6248 xtime = XLogCtl->currentChunkStartTime;
6249 SpinLockRelease(&XLogCtl->info_lck);
6250
6251 return xtime;
6252 }
6253
6254 /*
6255 * Returns time of receipt of current chunk of XLOG data, as well as
6256 * whether it was received from streaming replication or from archives.
6257 */
6258 void
GetXLogReceiptTime(TimestampTz * rtime,bool * fromStream)6259 GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
6260 {
6261 /*
6262 * This must be executed in the startup process, since we don't export the
6263 * relevant state to shared memory.
6264 */
6265 Assert(InRecovery);
6266
6267 *rtime = XLogReceiptTime;
6268 *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
6269 }
6270
/*
 * Raise an ERROR if an integer setting on this server (currValue) is lower
 * than the minimum required for it (minValue).
 *
 * Note that the text field supplied (param_name) is a parameter name and
 * does not require translation.
 *
 * Every argument is parenthesized wherever it is used, so callers may pass
 * arbitrary expressions.  currValue and minValue are evaluated a second time
 * when the check fails, so they should be free of side effects.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because %s = %d is a lower setting than on the master server (its value was %d)", \
						(param_name), \
						(currValue), \
						(minValue)))); \
} while(0)
6285
6286 /*
6287 * Check to see if required parameters are set high enough on this server
6288 * for various aspects of recovery operation.
6289 *
6290 * Note that all the parameters which this function tests need to be
6291 * listed in Administrator's Overview section in high-availability.sgml.
6292 * If you change them, don't forget to update the list.
6293 */
6294 static void
CheckRequiredParameterValues(void)6295 CheckRequiredParameterValues(void)
6296 {
6297 /*
6298 * For archive recovery, the WAL must be generated with at least 'replica'
6299 * wal_level.
6300 */
6301 if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
6302 {
6303 ereport(WARNING,
6304 (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
6305 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
6306 }
6307
6308 /*
6309 * For Hot Standby, the WAL must be generated with 'replica' mode, and we
6310 * must have at least as many backend slots as the primary.
6311 */
6312 if (ArchiveRecoveryRequested && EnableHotStandby)
6313 {
6314 if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
6315 ereport(ERROR,
6316 (errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
6317 errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));
6318
6319 /* We ignore autovacuum_max_workers when we make this test. */
6320 RecoveryRequiresIntParameter("max_connections",
6321 MaxConnections,
6322 ControlFile->MaxConnections);
6323 RecoveryRequiresIntParameter("max_worker_processes",
6324 max_worker_processes,
6325 ControlFile->max_worker_processes);
6326 RecoveryRequiresIntParameter("max_wal_senders",
6327 max_wal_senders,
6328 ControlFile->max_wal_senders);
6329 RecoveryRequiresIntParameter("max_prepared_transactions",
6330 max_prepared_xacts,
6331 ControlFile->max_prepared_xacts);
6332 RecoveryRequiresIntParameter("max_locks_per_transaction",
6333 max_locks_per_xact,
6334 ControlFile->max_locks_per_xact);
6335 }
6336 }
6337
6338 /*
6339 * This must be called ONCE during postmaster or standalone-backend startup
6340 */
6341 void
StartupXLOG(void)6342 StartupXLOG(void)
6343 {
6344 XLogCtlInsert *Insert;
6345 CheckPoint checkPoint;
6346 bool wasShutdown;
6347 bool reachedRecoveryTarget = false;
6348 bool haveBackupLabel = false;
6349 bool haveTblspcMap = false;
6350 XLogRecPtr RecPtr,
6351 checkPointLoc,
6352 EndOfLog;
6353 TimeLineID EndOfLogTLI;
6354 TimeLineID PrevTimeLineID;
6355 XLogRecord *record;
6356 TransactionId oldestActiveXID;
6357 bool backupEndRequired = false;
6358 bool backupFromStandby = false;
6359 DBState dbstate_at_startup;
6360 XLogReaderState *xlogreader;
6361 XLogPageReadPrivate private;
6362 bool fast_promoted = false;
6363 struct stat st;
6364
6365 /*
6366 * We should have an aux process resource owner to use, and we should not
6367 * be in a transaction that's installed some other resowner.
6368 */
6369 Assert(AuxProcessResourceOwner != NULL);
6370 Assert(CurrentResourceOwner == NULL ||
6371 CurrentResourceOwner == AuxProcessResourceOwner);
6372 CurrentResourceOwner = AuxProcessResourceOwner;
6373
6374 /*
6375 * Check that contents look valid.
6376 */
6377 if (!XRecOffIsValid(ControlFile->checkPoint))
6378 ereport(FATAL,
6379 (errmsg("control file contains invalid checkpoint location")));
6380
6381 switch (ControlFile->state)
6382 {
6383 case DB_SHUTDOWNED:
6384
6385 /*
6386 * This is the expected case, so don't be chatty in standalone
6387 * mode
6388 */
6389 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6390 (errmsg("database system was shut down at %s",
6391 str_time(ControlFile->time))));
6392 break;
6393
6394 case DB_SHUTDOWNED_IN_RECOVERY:
6395 ereport(LOG,
6396 (errmsg("database system was shut down in recovery at %s",
6397 str_time(ControlFile->time))));
6398 break;
6399
6400 case DB_SHUTDOWNING:
6401 ereport(LOG,
6402 (errmsg("database system shutdown was interrupted; last known up at %s",
6403 str_time(ControlFile->time))));
6404 break;
6405
6406 case DB_IN_CRASH_RECOVERY:
6407 ereport(LOG,
6408 (errmsg("database system was interrupted while in recovery at %s",
6409 str_time(ControlFile->time)),
6410 errhint("This probably means that some data is corrupted and"
6411 " you will have to use the last backup for recovery.")));
6412 break;
6413
6414 case DB_IN_ARCHIVE_RECOVERY:
6415 ereport(LOG,
6416 (errmsg("database system was interrupted while in recovery at log time %s",
6417 str_time(ControlFile->checkPointCopy.time)),
6418 errhint("If this has occurred more than once some data might be corrupted"
6419 " and you might need to choose an earlier recovery target.")));
6420 break;
6421
6422 case DB_IN_PRODUCTION:
6423 ereport(LOG,
6424 (errmsg("database system was interrupted; last known up at %s",
6425 str_time(ControlFile->time))));
6426 break;
6427
6428 default:
6429 ereport(FATAL,
6430 (errmsg("control file contains invalid database cluster state")));
6431 }
6432
6433 /* This is just to allow attaching to startup process with a debugger */
6434 #ifdef XLOG_REPLAY_DELAY
6435 if (ControlFile->state != DB_SHUTDOWNED)
6436 pg_usleep(60000000L);
6437 #endif
6438
6439 /*
6440 * Verify that pg_wal and pg_wal/archive_status exist. In cases where
6441 * someone has performed a copy for PITR, these directories may have been
6442 * excluded and need to be re-created.
6443 */
6444 ValidateXLOGDirectoryStructure();
6445
6446 /*----------
6447 * If we previously crashed, perform a couple of actions:
6448 *
6449 * - The pg_wal directory may still include some temporary WAL segments
6450 * used when creating a new segment, so perform some clean up to not
6451 * bloat this path. This is done first as there is no point to sync
6452 * this temporary data.
6453 *
6454 * - There might be data which we had written, intending to fsync it, but
6455 * which we had not actually fsync'd yet. Therefore, a power failure in
6456 * the near future might cause earlier unflushed writes to be lost, even
6457 * though more recent data written to disk from here on would be
6458 * persisted. To avoid that, fsync the entire data directory.
6459 */
6460 if (ControlFile->state != DB_SHUTDOWNED &&
6461 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6462 {
6463 RemoveTempXlogFiles();
6464 SyncDataDirectory();
6465 }
6466
6467 /*
6468 * Initialize on the assumption we want to recover to the latest timeline
6469 * that's active according to pg_control.
6470 */
6471 if (ControlFile->minRecoveryPointTLI >
6472 ControlFile->checkPointCopy.ThisTimeLineID)
6473 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6474 else
6475 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6476
6477 /*
6478 * Check for signal files, and if so set up state for offline recovery
6479 */
6480 readRecoverySignalFile();
6481 validateRecoveryParameters();
6482
6483 if (ArchiveRecoveryRequested)
6484 {
6485 if (StandbyModeRequested)
6486 ereport(LOG,
6487 (errmsg("entering standby mode")));
6488 else if (recoveryTarget == RECOVERY_TARGET_XID)
6489 ereport(LOG,
6490 (errmsg("starting point-in-time recovery to XID %u",
6491 recoveryTargetXid)));
6492 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6493 ereport(LOG,
6494 (errmsg("starting point-in-time recovery to %s",
6495 timestamptz_to_str(recoveryTargetTime))));
6496 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6497 ereport(LOG,
6498 (errmsg("starting point-in-time recovery to \"%s\"",
6499 recoveryTargetName)));
6500 else if (recoveryTarget == RECOVERY_TARGET_LSN)
6501 ereport(LOG,
6502 (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6503 (uint32) (recoveryTargetLSN >> 32),
6504 (uint32) recoveryTargetLSN)));
6505 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6506 ereport(LOG,
6507 (errmsg("starting point-in-time recovery to earliest consistent point")));
6508 else
6509 ereport(LOG,
6510 (errmsg("starting archive recovery")));
6511 }
6512
6513 /*
6514 * Take ownership of the wakeup latch if we're going to sleep during
6515 * recovery.
6516 */
6517 if (ArchiveRecoveryRequested)
6518 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6519
6520 /* Set up XLOG reader facility */
6521 MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6522 xlogreader =
6523 XLogReaderAllocate(wal_segment_size, NULL,
6524 XL_ROUTINE(.page_read = &XLogPageRead,
6525 .segment_open = NULL,
6526 .segment_close = wal_segment_close),
6527 &private);
6528 if (!xlogreader)
6529 ereport(ERROR,
6530 (errcode(ERRCODE_OUT_OF_MEMORY),
6531 errmsg("out of memory"),
6532 errdetail("Failed while allocating a WAL reading processor.")));
6533 xlogreader->system_identifier = ControlFile->system_identifier;
6534
6535 /*
6536 * Allocate two page buffers dedicated to WAL consistency checks. We do
6537 * it this way, rather than just making static arrays, for two reasons:
6538 * (1) no need to waste the storage in most instantiations of the backend;
6539 * (2) a static char array isn't guaranteed to have any particular
6540 * alignment, whereas palloc() will provide MAXALIGN'd storage.
6541 */
6542 replay_image_masked = (char *) palloc(BLCKSZ);
6543 master_image_masked = (char *) palloc(BLCKSZ);
6544
6545 if (read_backup_label(&checkPointLoc, &backupEndRequired,
6546 &backupFromStandby))
6547 {
6548 List *tablespaces = NIL;
6549
6550 /*
6551 * Archive recovery was requested, and thanks to the backup label
6552 * file, we know how far we need to replay to reach consistency. Enter
6553 * archive recovery directly.
6554 */
6555 InArchiveRecovery = true;
6556 if (StandbyModeRequested)
6557 StandbyMode = true;
6558
6559 /*
6560 * When a backup_label file is present, we want to roll forward from
6561 * the checkpoint it identifies, rather than using pg_control.
6562 */
6563 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6564 if (record != NULL)
6565 {
6566 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6567 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6568 ereport(DEBUG1,
6569 (errmsg("checkpoint record is at %X/%X",
6570 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6571 InRecovery = true; /* force recovery even if SHUTDOWNED */
6572
6573 /*
6574 * Make sure that REDO location exists. This may not be the case
6575 * if there was a crash during an online backup, which left a
6576 * backup_label around that references a WAL segment that's
6577 * already been archived.
6578 */
6579 if (checkPoint.redo < checkPointLoc)
6580 {
6581 XLogBeginRead(xlogreader, checkPoint.redo);
6582 if (!ReadRecord(xlogreader, LOG, false))
6583 ereport(FATAL,
6584 (errmsg("could not find redo location referenced by checkpoint record"),
6585 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6586 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6587 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6588 DataDir, DataDir, DataDir)));
6589 }
6590 }
6591 else
6592 {
6593 ereport(FATAL,
6594 (errmsg("could not locate required checkpoint record"),
6595 errhint("If you are restoring from a backup, touch \"%s/recovery.signal\" and add required recovery options.\n"
6596 "If you are not restoring from a backup, try removing the file \"%s/backup_label\".\n"
6597 "Be careful: removing \"%s/backup_label\" will result in a corrupt cluster if restoring from a backup.",
6598 DataDir, DataDir, DataDir)));
6599 wasShutdown = false; /* keep compiler quiet */
6600 }
6601
6602 /* read the tablespace_map file if present and create symlinks. */
6603 if (read_tablespace_map(&tablespaces))
6604 {
6605 ListCell *lc;
6606
6607 foreach(lc, tablespaces)
6608 {
6609 tablespaceinfo *ti = lfirst(lc);
6610 char *linkloc;
6611
6612 linkloc = psprintf("pg_tblspc/%s", ti->oid);
6613
6614 /*
6615 * Remove the existing symlink, if any, and create the symlink
6616 * under PGDATA.
6617 */
6618 remove_tablespace_symlink(linkloc);
6619
6620 if (symlink(ti->path, linkloc) < 0)
6621 ereport(ERROR,
6622 (errcode_for_file_access(),
6623 errmsg("could not create symbolic link \"%s\": %m",
6624 linkloc)));
6625
6626 pfree(ti->oid);
6627 pfree(ti->path);
6628 pfree(ti);
6629 }
6630
6631 /* set flag to delete it later */
6632 haveTblspcMap = true;
6633 }
6634
6635 /* set flag to delete it later */
6636 haveBackupLabel = true;
6637 }
6638 else
6639 {
6640 /*
6641 * If tablespace_map file is present without backup_label file, there
6642 * is no use of such file. There is no harm in retaining it, but it
6643 * is better to get rid of the map file so that we don't have any
6644 * redundant file in data directory and it will avoid any sort of
6645 * confusion. It seems prudent though to just rename the file out of
6646 * the way rather than delete it completely, also we ignore any error
6647 * that occurs in rename operation as even if map file is present
6648 * without backup_label file, it is harmless.
6649 */
6650 if (stat(TABLESPACE_MAP, &st) == 0)
6651 {
6652 unlink(TABLESPACE_MAP_OLD);
6653 if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6654 ereport(LOG,
6655 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6656 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6657 errdetail("File \"%s\" was renamed to \"%s\".",
6658 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6659 else
6660 ereport(LOG,
6661 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6662 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6663 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6664 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6665 }
6666
6667 /*
6668 * It's possible that archive recovery was requested, but we don't
6669 * know how far we need to replay the WAL before we reach consistency.
6670 * This can happen for example if a base backup is taken from a
6671 * running server using an atomic filesystem snapshot, without calling
6672 * pg_start/stop_backup. Or if you just kill a running master server
6673 * and put it into archive recovery by creating a recovery signal
6674 * file.
6675 *
6676 * Our strategy in that case is to perform crash recovery first,
6677 * replaying all the WAL present in pg_wal, and only enter archive
6678 * recovery after that.
6679 *
6680 * But usually we already know how far we need to replay the WAL (up
6681 * to minRecoveryPoint, up to backupEndPoint, or until we see an
6682 * end-of-backup record), and we can enter archive recovery directly.
6683 */
6684 if (ArchiveRecoveryRequested &&
6685 (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6686 ControlFile->backupEndRequired ||
6687 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6688 ControlFile->state == DB_SHUTDOWNED))
6689 {
6690 InArchiveRecovery = true;
6691 if (StandbyModeRequested)
6692 StandbyMode = true;
6693 }
6694
6695 /* Get the last valid checkpoint record. */
6696 checkPointLoc = ControlFile->checkPoint;
6697 RedoStartLSN = ControlFile->checkPointCopy.redo;
6698 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6699 if (record != NULL)
6700 {
6701 ereport(DEBUG1,
6702 (errmsg("checkpoint record is at %X/%X",
6703 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6704 }
6705 else
6706 {
6707 /*
6708 * We used to attempt to go back to a secondary checkpoint record
6709 * here, but only when not in standby mode. We now just fail if we
6710 * can't read the last checkpoint because this allows us to
6711 * simplify processing around checkpoints.
6712 */
6713 ereport(PANIC,
6714 (errmsg("could not locate a valid checkpoint record")));
6715 }
6716 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6717 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6718 }
6719
6720 /*
6721 * Clear out any old relcache cache files. This is *necessary* if we do
6722 * any WAL replay, since that would probably result in the cache files
6723 * being out of sync with database reality. In theory we could leave them
6724 * in place if the database had been cleanly shut down, but it seems
6725 * safest to just remove them always and let them be rebuilt during the
6726 * first backend startup. These files need to be removed from all
6727 * directories including pg_tblspc; however, the symlinks are created only
6728 * after reading the tablespace_map file in case of archive recovery from
6729 * backup, so we need to clear old relcache files here, after creating the
6730 * symlinks.
6731 */
6732 RelationCacheInitFileRemove();
6733
6734 /*
6735 * If the location of the checkpoint record is not on the expected
6736 * timeline in the history of the requested timeline, we cannot proceed:
6737 * the backup is not part of the history of the requested timeline.
6738 */
6739 Assert(expectedTLEs); /* was initialized by reading checkpoint
6740 * record */
6741 if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6742 checkPoint.ThisTimeLineID)
6743 {
6744 XLogRecPtr switchpoint;
6745
6746 /*
6747 * tliSwitchPoint will throw an error if the checkpoint's timeline is
6748 * not in expectedTLEs at all.
6749 */
6750 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6751 ereport(FATAL,
6752 (errmsg("requested timeline %u is not a child of this server's history",
6753 recoveryTargetTLI),
6754 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6755 (uint32) (ControlFile->checkPoint >> 32),
6756 (uint32) ControlFile->checkPoint,
6757 ControlFile->checkPointCopy.ThisTimeLineID,
6758 (uint32) (switchpoint >> 32),
6759 (uint32) switchpoint)));
6760 }
6761
6762 /*
6763 * The min recovery point should be part of the requested timeline's
6764 * history, too.
6765 */
6766 if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6767 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6768 ControlFile->minRecoveryPointTLI)
6769 ereport(FATAL,
6770 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6771 recoveryTargetTLI,
6772 (uint32) (ControlFile->minRecoveryPoint >> 32),
6773 (uint32) ControlFile->minRecoveryPoint,
6774 ControlFile->minRecoveryPointTLI)));
6775
6776 LastRec = RecPtr = checkPointLoc;
6777
6778 ereport(DEBUG1,
6779 (errmsg_internal("redo record is at %X/%X; shutdown %s",
6780 (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6781 wasShutdown ? "true" : "false")));
6782 ereport(DEBUG1,
6783 (errmsg_internal("next transaction ID: " UINT64_FORMAT "; next OID: %u",
6784 U64FromFullTransactionId(checkPoint.nextFullXid),
6785 checkPoint.nextOid)));
6786 ereport(DEBUG1,
6787 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6788 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6789 ereport(DEBUG1,
6790 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6791 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6792 ereport(DEBUG1,
6793 (errmsg_internal("oldest MultiXactId: %u, in database %u",
6794 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6795 ereport(DEBUG1,
6796 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6797 checkPoint.oldestCommitTsXid,
6798 checkPoint.newestCommitTsXid)));
6799 if (!TransactionIdIsNormal(XidFromFullTransactionId(checkPoint.nextFullXid)))
6800 ereport(PANIC,
6801 (errmsg("invalid next transaction ID")));
6802
6803 /* initialize shared memory variables from the checkpoint record */
6804 ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
6805 ShmemVariableCache->nextOid = checkPoint.nextOid;
6806 ShmemVariableCache->oidCount = 0;
6807 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6808 AdvanceOldestClogXid(checkPoint.oldestXid);
6809 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6810 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6811 SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6812 checkPoint.newestCommitTsXid);
6813 XLogCtl->ckptFullXid = checkPoint.nextFullXid;
6814
6815 /*
6816 * Initialize replication slots, before there's a chance to remove
6817 * required resources.
6818 */
6819 StartupReplicationSlots();
6820
6821 /*
6822 * Startup logical state, needs to be setup now so we have proper data
6823 * during crash recovery.
6824 */
6825 StartupReorderBuffer();
6826
6827 /*
6828 * Startup MultiXact. We need to do this early to be able to replay
6829 * truncations.
6830 */
6831 StartupMultiXact();
6832
6833 /*
6834 * Ditto for commit timestamps. Activate the facility if the setting is
6835 * enabled in the control file, as there should be no tracking of commit
6836 * timestamps done when the setting was disabled. This facility can be
6837 * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
6838 */
6839 if (ControlFile->track_commit_timestamp)
6840 StartupCommitTs();
6841
6842 /*
6843 * Recover knowledge about replay progress of known replication partners.
6844 */
6845 StartupReplicationOrigin();
6846
6847 /*
6848 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6849 * control file. On recovery, all unlogged relations are blown away, so
6850 * the unlogged LSN counter can be reset too.
6851 */
6852 if (ControlFile->state == DB_SHUTDOWNED)
6853 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6854 else
6855 XLogCtl->unloggedLSN = FirstNormalUnloggedLSN;
6856
6857 /*
6858 * We must replay WAL entries using the same TimeLineID they were created
6859 * under, so temporarily adopt the TLI indicated by the checkpoint (see
6860 * also xlog_redo()).
6861 */
6862 ThisTimeLineID = checkPoint.ThisTimeLineID;
6863
6864 /*
6865 * Copy any missing timeline history files between 'now' and the recovery
6866 * target timeline from archive to pg_wal. While we don't need those files
6867 * ourselves - the history file of the recovery target timeline covers all
6868 * the previous timelines in the history too - a cascading standby server
6869 * might be interested in them. Or, if you archive the WAL from this
6870 * server to a different archive than the master, it'd be good for all the
6871 * history files to get archived there after failover, so that you can use
6872 * one of the old timelines as a PITR target. Timeline history files are
6873 * small, so it's better to copy them unnecessarily than not copy them and
6874 * regret it later.
6875 */
6876 restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6877
6878 /*
6879 * Before running in recovery, scan pg_twophase and fill in its status to
6880 * be able to work on entries generated by redo. Doing a scan before
6881 * taking any recovery action has the merit of discarding any 2PC files that
6882 * are newer than the first record to replay, avoiding any conflicts at
6883 * replay. This also avoids any subsequent scans when doing recovery
6884 * of the on-disk two-phase data.
6885 */
6886 restoreTwoPhaseData();
6887
6888 lastFullPageWrites = checkPoint.fullPageWrites;
6889
6890 RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6891 doPageWrites = lastFullPageWrites;
6892
6893 if (RecPtr < checkPoint.redo)
6894 ereport(PANIC,
6895 (errmsg("invalid redo in checkpoint record")));
6896
6897 /*
6898 * Check whether we need to force recovery from WAL. If it appears to
6899 * have been a clean shutdown and we did not have a recovery signal file,
6900 * then assume no recovery needed.
6901 */
6902 if (checkPoint.redo < RecPtr)
6903 {
6904 if (wasShutdown)
6905 ereport(PANIC,
6906 (errmsg("invalid redo record in shutdown checkpoint")));
6907 InRecovery = true;
6908 }
6909 else if (ControlFile->state != DB_SHUTDOWNED)
6910 InRecovery = true;
6911 else if (ArchiveRecoveryRequested)
6912 {
6913 /* force recovery due to presence of recovery signal file */
6914 InRecovery = true;
6915 }
6916
6917 /*
6918 * Start recovery assuming that the final record isn't lost.
6919 */
6920 abortedRecPtr = InvalidXLogRecPtr;
6921 missingContrecPtr = InvalidXLogRecPtr;
6922
6923 /* REDO */
6924 if (InRecovery)
6925 {
6926 int rmid;
6927
6928 /*
6929 * Update pg_control to show that we are recovering and to show the
6930 * selected checkpoint as the place we are starting from. We also mark
6931 * pg_control with any minimum recovery stop point obtained from a
6932 * backup history file.
6933 */
6934 dbstate_at_startup = ControlFile->state;
6935 if (InArchiveRecovery)
6936 {
6937 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6938
6939 SpinLockAcquire(&XLogCtl->info_lck);
6940 XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
6941 SpinLockRelease(&XLogCtl->info_lck);
6942 }
6943 else
6944 {
6945 ereport(LOG,
6946 (errmsg("database system was not properly shut down; "
6947 "automatic recovery in progress")));
6948 if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6949 ereport(LOG,
6950 (errmsg("crash recovery starts in timeline %u "
6951 "and has target timeline %u",
6952 ControlFile->checkPointCopy.ThisTimeLineID,
6953 recoveryTargetTLI)));
6954 ControlFile->state = DB_IN_CRASH_RECOVERY;
6955
6956 SpinLockAcquire(&XLogCtl->info_lck);
6957 XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
6958 SpinLockRelease(&XLogCtl->info_lck);
6959 }
6960 ControlFile->checkPoint = checkPointLoc;
6961 ControlFile->checkPointCopy = checkPoint;
6962 if (InArchiveRecovery)
6963 {
6964 /* initialize minRecoveryPoint if not set yet */
6965 if (ControlFile->minRecoveryPoint < checkPoint.redo)
6966 {
6967 ControlFile->minRecoveryPoint = checkPoint.redo;
6968 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6969 }
6970 }
6971
6972 /*
6973 * Set backupStartPoint if we're starting recovery from a base backup.
6974 *
6975 * Also set backupEndPoint and use minRecoveryPoint as the backup end
6976 * location if we're starting recovery from a base backup which was
6977 * taken from a standby. In this case, the database system status in
6978 * pg_control must indicate that the database was already in recovery.
6979 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6980 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6981 * before reaching this point; e.g. because restore_command or
6982 * primary_conninfo were faulty.
6983 *
6984 * Any other state indicates that the backup somehow became corrupted
6985 * and we can't sensibly continue with recovery.
6986 */
6987 if (haveBackupLabel)
6988 {
6989 ControlFile->backupStartPoint = checkPoint.redo;
6990 ControlFile->backupEndRequired = backupEndRequired;
6991
6992 if (backupFromStandby)
6993 {
6994 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6995 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6996 ereport(FATAL,
6997 (errmsg("backup_label contains data inconsistent with control file"),
6998 errhint("This means that the backup is corrupted and you will "
6999 "have to use another backup for recovery.")));
7000 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
7001 }
7002 }
7003 ControlFile->time = (pg_time_t) time(NULL);
7004 /* No need to hold ControlFileLock yet, we aren't up far enough */
7005 UpdateControlFile();
7006
7007 /*
7008 * Initialize our local copy of minRecoveryPoint. When doing crash
7009 * recovery we want to replay up to the end of WAL. Particularly, in
7010 * the case of a promoted standby minRecoveryPoint value in the
7011 * control file is only updated after the first checkpoint. However,
7012 * if the instance crashes before the first post-recovery checkpoint
7013 * is completed then recovery will use a stale location causing the
7014 * startup process to think that there are still invalid page
7015 * references when checking for data consistency.
7016 */
7017 if (InArchiveRecovery)
7018 {
7019 minRecoveryPoint = ControlFile->minRecoveryPoint;
7020 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
7021 }
7022 else
7023 {
7024 minRecoveryPoint = InvalidXLogRecPtr;
7025 minRecoveryPointTLI = 0;
7026 }
7027
7028 /*
7029 * Reset pgstat data, because it may be invalid after recovery.
7030 */
7031 pgstat_reset_all();
7032
7033 /*
7034 * If there was a backup label file, it's done its job and the info
7035 * has now been propagated into pg_control. We must get rid of the
7036 * label file so that if we crash during recovery, we'll pick up at
7037 * the latest recovery restartpoint instead of going all the way back
7038 * to the backup start point. It seems prudent though to just rename
7039 * the file out of the way rather than delete it completely.
7040 */
7041 if (haveBackupLabel)
7042 {
7043 unlink(BACKUP_LABEL_OLD);
7044 durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
7045 }
7046
7047 /*
7048 * If there was a tablespace_map file, it's done its job and the
7049 * symlinks have been created. We must get rid of the map file so
7050 * that if we crash during recovery, we don't create symlinks again.
7051 * It seems prudent though to just rename the file out of the way
7052 * rather than delete it completely.
7053 */
7054 if (haveTblspcMap)
7055 {
7056 unlink(TABLESPACE_MAP_OLD);
7057 durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
7058 }
7059
7060 /* Check that the GUCs used to generate the WAL allow recovery */
7061 CheckRequiredParameterValues();
7062
7063 /*
7064 * We're in recovery, so unlogged relations may be trashed and must be
7065 * reset. This should be done BEFORE allowing Hot Standby
7066 * connections, so that read-only backends don't try to read whatever
7067 * garbage is left over from before.
7068 */
7069 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
7070
7071 /*
7072 * Likewise, delete any saved transaction snapshot files that got left
7073 * behind by crashed backends.
7074 */
7075 DeleteAllExportedSnapshotFiles();
7076
7077 /*
7078 * Initialize for Hot Standby, if enabled. We won't let backends in
7079 * yet, not until we've reached the min recovery point specified in
7080 * control file and we've established a recovery snapshot from a
7081 * running-xacts WAL record.
7082 */
7083 if (ArchiveRecoveryRequested && EnableHotStandby)
7084 {
7085 TransactionId *xids;
7086 int nxids;
7087
7088 ereport(DEBUG1,
7089 (errmsg("initializing for hot standby")));
7090
7091 InitRecoveryTransactionEnvironment();
7092
7093 if (wasShutdown)
7094 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7095 else
7096 oldestActiveXID = checkPoint.oldestActiveXid;
7097 Assert(TransactionIdIsValid(oldestActiveXID));
7098
7099 /* Tell procarray about the range of xids it has to deal with */
7100 ProcArrayInitRecovery(XidFromFullTransactionId(ShmemVariableCache->nextFullXid));
7101
7102 /*
7103 * Startup commit log and subtrans only. MultiXact and commit
7104 * timestamp have already been started up and other SLRUs are not
7105 * maintained during recovery and need not be started yet.
7106 */
7107 StartupCLOG();
7108 StartupSUBTRANS(oldestActiveXID);
7109
7110 /*
7111 * If we're beginning at a shutdown checkpoint, we know that
7112 * nothing was running on the master at this point. So fake-up an
7113 * empty running-xacts record and use that here and now. Recover
7114 * additional standby state for prepared transactions.
7115 */
7116 if (wasShutdown)
7117 {
7118 RunningTransactionsData running;
7119 TransactionId latestCompletedXid;
7120
7121 /*
7122 * Construct a RunningTransactions snapshot representing a
7123 * shut down server, with only prepared transactions still
7124 * alive. We're never overflowed at this point because all
7125 * subxids are listed with their parent prepared transactions.
7126 */
7127 running.xcnt = nxids;
7128 running.subxcnt = 0;
7129 running.subxid_overflow = false;
7130 running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
7131 running.oldestRunningXid = oldestActiveXID;
7132 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
7133 TransactionIdRetreat(latestCompletedXid);
7134 Assert(TransactionIdIsNormal(latestCompletedXid));
7135 running.latestCompletedXid = latestCompletedXid;
7136 running.xids = xids;
7137
7138 ProcArrayApplyRecoveryInfo(&running);
7139
7140 StandbyRecoverPreparedTransactions();
7141 }
7142 }
7143
7144 /* Initialize resource managers */
7145 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7146 {
7147 if (RmgrTable[rmid].rm_startup != NULL)
7148 RmgrTable[rmid].rm_startup();
7149 }
7150
7151 /*
7152 * Initialize shared variables for tracking progress of WAL replay, as
7153 * if we had just replayed the record before the REDO location (or the
7154 * checkpoint record itself, if it's a shutdown checkpoint).
7155 */
7156 SpinLockAcquire(&XLogCtl->info_lck);
7157 if (checkPoint.redo < RecPtr)
7158 XLogCtl->replayEndRecPtr = checkPoint.redo;
7159 else
7160 XLogCtl->replayEndRecPtr = EndRecPtr;
7161 XLogCtl->replayEndTLI = ThisTimeLineID;
7162 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
7163 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
7164 XLogCtl->recoveryLastXTime = 0;
7165 XLogCtl->currentChunkStartTime = 0;
7166 XLogCtl->recoveryPause = false;
7167 SpinLockRelease(&XLogCtl->info_lck);
7168
7169 /* Also ensure XLogReceiptTime has a sane value */
7170 XLogReceiptTime = GetCurrentTimestamp();
7171
7172 /*
7173 * Let postmaster know we've started redo now, so that it can launch
7174 * checkpointer to perform restartpoints. We don't bother during
7175 * crash recovery as restartpoints can only be performed during
7176 * archive recovery. And we'd like to keep crash recovery simple, to
7177 * avoid introducing bugs that could affect you when recovering after
7178 * crash.
7179 *
7180 * After this point, we can no longer assume that we're the only
7181 * process in addition to postmaster! Also, fsync requests are
7182 * subsequently to be handled by the checkpointer, not locally.
7183 */
7184 if (ArchiveRecoveryRequested && IsUnderPostmaster)
7185 {
7186 PublishStartupProcessInformation();
7187 EnableSyncRequestForwarding();
7188 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7189 bgwriterLaunched = true;
7190 }
7191
7192 /*
7193 * Allow read-only connections immediately if we're consistent
7194 * already.
7195 */
7196 CheckRecoveryConsistency();
7197
7198 /*
7199 * Find the first record that logically follows the checkpoint --- it
7200 * might physically precede it, though.
7201 */
7202 if (checkPoint.redo < RecPtr)
7203 {
7204 /* back up to find the record */
7205 XLogBeginRead(xlogreader, checkPoint.redo);
7206 record = ReadRecord(xlogreader, PANIC, false);
7207 }
7208 else
7209 {
7210 /* just have to read next record after CheckPoint */
7211 record = ReadRecord(xlogreader, LOG, false);
7212 }
7213
7214 if (record != NULL)
7215 {
7216 ErrorContextCallback errcallback;
7217 TimestampTz xtime;
7218
7219 InRedo = true;
7220
7221 ereport(LOG,
7222 (errmsg("redo starts at %X/%X",
7223 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7224
7225 /*
7226 * main redo apply loop
7227 */
7228 do
7229 {
7230 bool switchedTLI = false;
7231
7232 #ifdef WAL_DEBUG
7233 if (XLOG_DEBUG ||
7234 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7235 (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7236 {
7237 StringInfoData buf;
7238
7239 initStringInfo(&buf);
7240 appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7241 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
7242 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
7243 xlog_outrec(&buf, xlogreader);
7244 appendStringInfoString(&buf, " - ");
7245 xlog_outdesc(&buf, xlogreader);
7246 elog(LOG, "%s", buf.data);
7247 pfree(buf.data);
7248 }
7249 #endif
7250
7251 /* Handle interrupt signals of startup process */
7252 HandleStartupProcInterrupts();
7253
7254 /*
7255 * Pause WAL replay, if requested by a hot-standby session via
7256 * SetRecoveryPause().
7257 *
7258 * Note that we intentionally don't take the info_lck spinlock
7259 * here. We might therefore read a slightly stale value of
7260 * the recoveryPause flag, but it can't be very stale (no
7261 * worse than the last spinlock we did acquire). Since a
7262 * pause request is a pretty asynchronous thing anyway,
7263 * possibly responding to it one WAL record later than we
7264 * otherwise would is a minor issue, so it doesn't seem worth
7265 * adding another spinlock cycle to prevent that.
7266 */
7267 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7268 recoveryPausesHere(false);
7269
7270 /*
7271 * Have we reached our recovery target?
7272 */
7273 if (recoveryStopsBefore(xlogreader))
7274 {
7275 reachedRecoveryTarget = true;
7276 break;
7277 }
7278
7279 /*
7280 * If we've been asked to lag the master, wait on latch until
7281 * enough time has passed.
7282 */
7283 if (recoveryApplyDelay(xlogreader))
7284 {
7285 /*
7286 * We test for paused recovery again here. If user sets
7287 * delayed apply, it may be because they expect to pause
7288 * recovery in case of problems, so we must test again
7289 * here otherwise pausing during the delay-wait wouldn't
7290 * work.
7291 */
7292 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7293 recoveryPausesHere(false);
7294 }
7295
7296 /* Setup error traceback support for ereport() */
7297 errcallback.callback = rm_redo_error_callback;
7298 errcallback.arg = (void *) xlogreader;
7299 errcallback.previous = error_context_stack;
7300 error_context_stack = &errcallback;
7301
7302 /*
7303 * ShmemVariableCache->nextFullXid must be beyond record's
7304 * xid.
7305 */
7306 AdvanceNextFullTransactionIdPastXid(record->xl_xid);
7307
7308 /*
7309 * Before replaying this record, check if this record causes
7310 * the current timeline to change. The record is already
7311 * considered to be part of the new timeline, so we update
7312 * ThisTimeLineID before replaying it. That's important so
7313 * that replayEndTLI, which is recorded as the minimum
7314 * recovery point's TLI if recovery stops after this record,
7315 * is set correctly.
7316 */
7317 if (record->xl_rmid == RM_XLOG_ID)
7318 {
7319 TimeLineID newTLI = ThisTimeLineID;
7320 TimeLineID prevTLI = ThisTimeLineID;
7321 uint8 info = record->xl_info & ~XLR_INFO_MASK;
7322
7323 if (info == XLOG_CHECKPOINT_SHUTDOWN)
7324 {
7325 CheckPoint checkPoint;
7326
7327 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7328 newTLI = checkPoint.ThisTimeLineID;
7329 prevTLI = checkPoint.PrevTimeLineID;
7330 }
7331 else if (info == XLOG_END_OF_RECOVERY)
7332 {
7333 xl_end_of_recovery xlrec;
7334
7335 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7336 newTLI = xlrec.ThisTimeLineID;
7337 prevTLI = xlrec.PrevTimeLineID;
7338 }
7339
7340 if (newTLI != ThisTimeLineID)
7341 {
7342 /* Check that it's OK to switch to this TLI */
7343 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7344
7345 /* Following WAL records should be run with new TLI */
7346 ThisTimeLineID = newTLI;
7347 switchedTLI = true;
7348 }
7349 }
7350
7351 /*
7352 * Update shared replayEndRecPtr before replaying this record,
7353 * so that XLogFlush will update minRecoveryPoint correctly.
7354 */
7355 SpinLockAcquire(&XLogCtl->info_lck);
7356 XLogCtl->replayEndRecPtr = EndRecPtr;
7357 XLogCtl->replayEndTLI = ThisTimeLineID;
7358 SpinLockRelease(&XLogCtl->info_lck);
7359
7360 /*
7361 * If we are attempting to enter Hot Standby mode, process
7362 * XIDs we see
7363 */
7364 if (standbyState >= STANDBY_INITIALIZED &&
7365 TransactionIdIsValid(record->xl_xid))
7366 RecordKnownAssignedTransactionIds(record->xl_xid);
7367
7368 /* Now apply the WAL record itself */
7369 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7370
7371 /*
7372 * After redo, check whether the backup pages associated with
7373 * the WAL record are consistent with the existing pages. This
7374 * check is done only if consistency check is enabled for this
7375 * record.
7376 */
7377 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7378 checkXLogConsistency(xlogreader);
7379
7380 /* Pop the error context stack */
7381 error_context_stack = errcallback.previous;
7382
7383 /*
7384 * Update lastReplayedEndRecPtr after this record has been
7385 * successfully replayed.
7386 */
7387 SpinLockAcquire(&XLogCtl->info_lck);
7388 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7389 XLogCtl->lastReplayedTLI = ThisTimeLineID;
7390 SpinLockRelease(&XLogCtl->info_lck);
7391
7392 /*
7393 * If rm_redo called XLogRequestWalReceiverReply, then we wake
7394 * up the receiver so that it notices the updated
7395 * lastReplayedEndRecPtr and sends a reply to the master.
7396 */
7397 if (doRequestWalReceiverReply)
7398 {
7399 doRequestWalReceiverReply = false;
7400 WalRcvForceReply();
7401 }
7402
7403 /* Remember this record as the last-applied one */
7404 LastRec = ReadRecPtr;
7405
7406 /* Allow read-only connections if we're consistent now */
7407 CheckRecoveryConsistency();
7408
7409 /* Is this a timeline switch? */
7410 if (switchedTLI)
7411 {
7412 /*
7413 * Before we continue on the new timeline, clean up any
7414 * (possibly bogus) future WAL segments on the old
7415 * timeline.
7416 */
7417 RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7418
7419 /*
7420 * Wake up any walsenders to notice that we are on a new
7421 * timeline.
7422 */
7423 if (switchedTLI && AllowCascadeReplication())
7424 WalSndWakeup();
7425 }
7426
7427 /* Exit loop if we reached inclusive recovery target */
7428 if (recoveryStopsAfter(xlogreader))
7429 {
7430 reachedRecoveryTarget = true;
7431 break;
7432 }
7433
7434 /* Else, try to fetch the next WAL record */
7435 record = ReadRecord(xlogreader, LOG, false);
7436 } while (record != NULL);
7437
7438 /*
7439 * end of main redo apply loop
7440 */
7441
7442 if (reachedRecoveryTarget)
7443 {
7444 if (!reachedConsistency)
7445 ereport(FATAL,
7446 (errmsg("requested recovery stop point is before consistent recovery point")));
7447
7448 /*
7449 * This is the last point where we can restart recovery with a
7450 * new recovery target, if we shutdown and begin again. After
7451 * this, Resource Managers may choose to do permanent
7452 * corrective actions at end of recovery.
7453 */
7454 switch (recoveryTargetAction)
7455 {
7456 case RECOVERY_TARGET_ACTION_SHUTDOWN:
7457
7458 /*
7459 * exit with special return code to request shutdown
7460 * of postmaster. Log messages issued from
7461 * postmaster.
7462 */
7463 proc_exit(3);
7464
7465 case RECOVERY_TARGET_ACTION_PAUSE:
7466 SetRecoveryPause(true);
7467 recoveryPausesHere(true);
7468
7469 /* drop into promote */
7470
7471 case RECOVERY_TARGET_ACTION_PROMOTE:
7472 break;
7473 }
7474 }
7475
7476 /* Allow resource managers to do any required cleanup. */
7477 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7478 {
7479 if (RmgrTable[rmid].rm_cleanup != NULL)
7480 RmgrTable[rmid].rm_cleanup();
7481 }
7482
7483 ereport(LOG,
7484 (errmsg("redo done at %X/%X",
7485 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7486 xtime = GetLatestXTime();
7487 if (xtime)
7488 ereport(LOG,
7489 (errmsg("last completed transaction was at log time %s",
7490 timestamptz_to_str(xtime))));
7491
7492 InRedo = false;
7493 }
7494 else
7495 {
7496 /* there are no WAL records following the checkpoint */
7497 ereport(LOG,
7498 (errmsg("redo is not required")));
7499
7500 }
7501
7502 /*
7503 * This check is intentionally after the above log messages that
7504 * indicate how far recovery went.
7505 */
7506 if (ArchiveRecoveryRequested &&
7507 recoveryTarget != RECOVERY_TARGET_UNSET &&
7508 !reachedRecoveryTarget)
7509 ereport(FATAL,
7510 (errmsg("recovery ended before configured recovery target was reached")));
7511 }
7512
7513 /*
7514 * Kill WAL receiver, if it's still running, before we continue to write
7515 * the startup checkpoint and aborted-contrecord records. It will trump
7516 * over these records and subsequent ones if it's still alive when we
7517 * start writing WAL.
7518 */
7519 ShutdownWalRcv();
7520
7521 /*
7522 * Reset unlogged relations to the contents of their INIT fork. This is
7523 * done AFTER recovery is complete so as to include any unlogged relations
7524 * created during recovery, but BEFORE recovery is marked as having
7525 * completed successfully. Otherwise we'd not retry if any of the post
7526 * end-of-recovery steps fail.
7527 */
7528 if (InRecovery)
7529 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7530
7531 /*
7532 * We don't need the latch anymore. It's not strictly necessary to disown
7533 * it, but let's do it for the sake of tidiness.
7534 */
7535 if (ArchiveRecoveryRequested)
7536 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7537
7538 /*
7539 * We are now done reading the xlog from stream. Turn off streaming
7540 * recovery to force fetching the files (which would be required at end of
7541 * recovery, e.g., timeline history file) from archive or pg_wal.
7542 *
7543 * Note that standby mode must be turned off after killing WAL receiver,
7544 * i.e., calling ShutdownWalRcv().
7545 */
7546 Assert(!WalRcvStreaming());
7547 StandbyMode = false;
7548
7549 /*
7550 * Determine where to start writing WAL next.
7551 *
7552 * When recovery ended in an incomplete record, write a WAL record about
7553 * that and continue after it. In all other cases, re-fetch the last
7554 * valid or last applied record, so we can identify the exact endpoint of
7555 * what we consider the valid portion of WAL.
7556 */
7557 XLogBeginRead(xlogreader, LastRec);
7558 record = ReadRecord(xlogreader, PANIC, false);
7559 EndOfLog = EndRecPtr;
7560
7561 /*
7562 * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7563 * the end-of-log. It could be different from the timeline that EndOfLog
7564 * nominally belongs to, if there was a timeline switch in that segment,
7565 * and we were reading the old WAL from a segment belonging to a higher
7566 * timeline.
7567 */
7568 EndOfLogTLI = xlogreader->seg.ws_tli;
7569
7570 /*
7571 * Complain if we did not roll forward far enough to render the backup
7572 * dump consistent. Note: it is indeed okay to look at the local variable
7573 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7574 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7575 * advanced beyond the WAL we processed.
7576 */
7577 if (InRecovery &&
7578 (EndOfLog < minRecoveryPoint ||
7579 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7580 {
7581 /*
7582 * Ran off end of WAL before reaching end-of-backup WAL record, or
7583 * minRecoveryPoint. That's usually a bad sign, indicating that you
7584 * tried to recover from an online backup but never called
7585 * pg_stop_backup(), or you didn't archive all the WAL up to that
7586 * point. However, this also happens in crash recovery, if the system
7587 * crashes while an online backup is in progress. We must not treat
7588 * that as an error, or the database will refuse to start up.
7589 */
7590 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7591 {
7592 if (ControlFile->backupEndRequired)
7593 ereport(FATAL,
7594 (errmsg("WAL ends before end of online backup"),
7595 errhint("All WAL generated while online backup was taken must be available at recovery.")));
7596 else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7597 ereport(FATAL,
7598 (errmsg("WAL ends before end of online backup"),
7599 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7600 else
7601 ereport(FATAL,
7602 (errmsg("WAL ends before consistent recovery point")));
7603 }
7604 }
7605
7606 /*
7607 * Pre-scan prepared transactions to find out the range of XIDs present.
7608 * This information is not quite needed yet, but it is positioned here so
7609 * as potential problems are detected before any on-disk change is done.
7610 */
7611 oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7612
7613 /*
7614 * Consider whether we need to assign a new timeline ID.
7615 *
7616 * If we are doing an archive recovery, we always assign a new ID. This
7617 * handles a couple of issues. If we stopped short of the end of WAL
7618 * during recovery, then we are clearly generating a new timeline and must
7619 * assign it a unique new ID. Even if we ran to the end, modifying the
7620 * current last segment is problematic because it may result in trying to
7621 * overwrite an already-archived copy of that segment, and we encourage
7622 * DBAs to make their archive_commands reject that. We can dodge the
7623 * problem by making the new active segment have a new timeline ID.
7624 *
7625 * In a normal crash recovery, we can just extend the timeline we were in.
7626 */
7627 PrevTimeLineID = ThisTimeLineID;
7628 if (ArchiveRecoveryRequested)
7629 {
7630 char reason[200];
7631 char recoveryPath[MAXPGPATH];
7632
7633 Assert(InArchiveRecovery);
7634
7635 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7636 ereport(LOG,
7637 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7638
7639 /*
7640 * Create a comment for the history file to explain why and where
7641 * timeline changed.
7642 */
7643 if (recoveryTarget == RECOVERY_TARGET_XID)
7644 snprintf(reason, sizeof(reason),
7645 "%s transaction %u",
7646 recoveryStopAfter ? "after" : "before",
7647 recoveryStopXid);
7648 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7649 snprintf(reason, sizeof(reason),
7650 "%s %s\n",
7651 recoveryStopAfter ? "after" : "before",
7652 timestamptz_to_str(recoveryStopTime));
7653 else if (recoveryTarget == RECOVERY_TARGET_LSN)
7654 snprintf(reason, sizeof(reason),
7655 "%s LSN %X/%X\n",
7656 recoveryStopAfter ? "after" : "before",
7657 (uint32) (recoveryStopLSN >> 32),
7658 (uint32) recoveryStopLSN);
7659 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7660 snprintf(reason, sizeof(reason),
7661 "at restore point \"%s\"",
7662 recoveryStopName);
7663 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7664 snprintf(reason, sizeof(reason), "reached consistency");
7665 else
7666 snprintf(reason, sizeof(reason), "no recovery target specified");
7667
7668 /*
7669 * We are now done reading the old WAL. Turn off archive fetching if
7670 * it was active, and make a writable copy of the last WAL segment.
7671 * (Note that we also have a copy of the last block of the old WAL in
7672 * readBuf; we will use that below.)
7673 */
7674 exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7675
7676 /*
7677 * Write the timeline history file, and have it archived. After this
7678 * point (or rather, as soon as the file is archived), the timeline
7679 * will appear as "taken" in the WAL archive and to any standby
7680 * servers. If we crash before actually switching to the new
7681 * timeline, standby servers will nevertheless think that we switched
7682 * to the new timeline, and will try to connect to the new timeline.
7683 * To minimize the window for that, try to do as little as possible
7684 * between here and writing the end-of-recovery record.
7685 */
7686 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7687 EndRecPtr, reason);
7688
7689 /*
7690 * Since there might be a partial WAL segment named RECOVERYXLOG, get
7691 * rid of it.
7692 */
7693 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
7694 unlink(recoveryPath); /* ignore any error */
7695
7696 /* Get rid of any remaining recovered timeline-history file, too */
7697 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
7698 unlink(recoveryPath); /* ignore any error */
7699 }
7700
7701 /* Save the selected TimeLineID in shared memory, too */
7702 XLogCtl->ThisTimeLineID = ThisTimeLineID;
7703 XLogCtl->PrevTimeLineID = PrevTimeLineID;
7704
7705 /*
7706 * Actually, if WAL ended in an incomplete record, skip the parts that
7707 * made it through and start writing after the portion that persisted.
7708 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
7709 * we'll do as soon as we're open for writing new WAL.)
7710 */
7711 if (!XLogRecPtrIsInvalid(missingContrecPtr))
7712 {
7713 Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
7714 EndOfLog = missingContrecPtr;
7715 }
7716
7717 /*
7718 * Prepare to write WAL starting at EndOfLog location, and init xlog
7719 * buffer cache using the block containing the last record from the
7720 * previous incarnation.
7721 */
7722 Insert = &XLogCtl->Insert;
7723 Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7724 Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7725
7726 /*
7727 * Tricky point here: readBuf contains the *last* block that the LastRec
7728 * record spans, not the one it starts in. The last block is indeed the
7729 * one we want to use.
7730 */
7731 if (EndOfLog % XLOG_BLCKSZ != 0)
7732 {
7733 char *page;
7734 int len;
7735 int firstIdx;
7736 XLogRecPtr pageBeginPtr;
7737
7738 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7739 Assert(readOff == XLogSegmentOffset(pageBeginPtr, wal_segment_size));
7740
7741 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7742
7743 /* Copy the valid part of the last block, and zero the rest */
7744 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7745 len = EndOfLog % XLOG_BLCKSZ;
7746 memcpy(page, xlogreader->readBuf, len);
7747 memset(page + len, 0, XLOG_BLCKSZ - len);
7748
7749 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7750 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7751 }
7752 else
7753 {
7754 /*
7755 * There is no partial block to copy. Just set InitializedUpTo, and
7756 * let the first attempt to insert a log record to initialize the next
7757 * buffer.
7758 */
7759 XLogCtl->InitializedUpTo = EndOfLog;
7760 }
7761
7762 LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7763
7764 XLogCtl->LogwrtResult = LogwrtResult;
7765
7766 XLogCtl->LogwrtRqst.Write = EndOfLog;
7767 XLogCtl->LogwrtRqst.Flush = EndOfLog;
7768
7769 LocalSetXLogInsertAllowed();
7770
7771 /* If necessary, write overwrite-contrecord before doing anything else */
7772 if (!XLogRecPtrIsInvalid(abortedRecPtr))
7773 {
7774 Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
7775 CreateOverwriteContrecordRecord(abortedRecPtr);
7776 abortedRecPtr = InvalidXLogRecPtr;
7777 missingContrecPtr = InvalidXLogRecPtr;
7778 }
7779
7780 /*
7781 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7782 * record before resource manager writes cleanup WAL records or checkpoint
7783 * record is written.
7784 */
7785 Insert->fullPageWrites = lastFullPageWrites;
7786 UpdateFullPageWrites();
7787 LocalXLogInsertAllowed = -1;
7788
7789 if (InRecovery)
7790 {
7791 /*
7792 * Perform a checkpoint to update all our recovery activity to disk.
7793 *
7794 * Note that we write a shutdown checkpoint rather than an on-line
7795 * one. This is not particularly critical, but since we may be
7796 * assigning a new TLI, using a shutdown checkpoint allows us to have
7797 * the rule that TLI only changes in shutdown checkpoints, which
7798 * allows some extra error checking in xlog_redo.
7799 *
7800 * In fast promotion, only create a lightweight end-of-recovery record
7801 * instead of a full checkpoint. A checkpoint is requested later,
7802 * after we're fully out of recovery mode and already accepting
7803 * queries.
7804 */
7805 if (bgwriterLaunched)
7806 {
7807 if (fast_promote)
7808 {
7809 checkPointLoc = ControlFile->checkPoint;
7810
7811 /*
7812 * Confirm the last checkpoint is available for us to recover
7813 * from if we fail.
7814 */
7815 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7816 if (record != NULL)
7817 {
7818 fast_promoted = true;
7819
7820 /*
7821 * Insert a special WAL record to mark the end of
7822 * recovery, since we aren't doing a checkpoint. That
7823 * means that the checkpointer process may likely be in
7824 * the middle of a time-smoothed restartpoint and could
7825 * continue to be for minutes after this. That sounds
7826 * strange, but the effect is roughly the same and it
7827 * would be stranger to try to come out of the
7828 * restartpoint and then checkpoint. We request a
7829 * checkpoint later anyway, just for safety.
7830 */
7831 CreateEndOfRecoveryRecord();
7832 }
7833 }
7834
7835 if (!fast_promoted)
7836 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7837 CHECKPOINT_IMMEDIATE |
7838 CHECKPOINT_WAIT);
7839 }
7840 else
7841 CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7842 }
7843
7844 if (ArchiveRecoveryRequested)
7845 {
7846 /*
7847 * And finally, execute the recovery_end_command, if any.
7848 */
7849 if (recoveryEndCommand && strcmp(recoveryEndCommand, "") != 0)
7850 ExecuteRecoveryCommand(recoveryEndCommand,
7851 "recovery_end_command",
7852 true);
7853
7854 /*
7855 * We switched to a new timeline. Clean up segments on the old
7856 * timeline.
7857 *
7858 * If there are any higher-numbered segments on the old timeline,
7859 * remove them. They might contain valid WAL, but they might also be
7860 * pre-allocated files containing garbage. In any case, they are not
7861 * part of the new timeline's history so we don't need them.
7862 */
7863 RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7864
7865 /*
7866 * If the switch happened in the middle of a segment, what to do with
7867 * the last, partial segment on the old timeline? If we don't archive
7868 * it, and the server that created the WAL never archives it either
7869 * (e.g. because it was hit by a meteor), it will never make it to the
7870 * archive. That's OK from our point of view, because the new segment
7871 * that we created with the new TLI contains all the WAL from the old
7872 * timeline up to the switch point. But if you later try to do PITR to
7873 * the "missing" WAL on the old timeline, recovery won't find it in
7874 * the archive. It's physically present in the new file with new TLI,
7875 * but recovery won't look there when it's recovering to the older
7876 * timeline. On the other hand, if we archive the partial segment, and
7877 * the original server on that timeline is still running and archives
7878 * the completed version of the same segment later, it will fail. (We
7879 * used to do that in 9.4 and below, and it caused such problems).
7880 *
7881 * As a compromise, we rename the last segment with the .partial
7882 * suffix, and archive it. Archive recovery will never try to read
7883 * .partial segments, so they will normally go unused. But in the odd
7884 * PITR case, the administrator can copy them manually to the pg_wal
7885 * directory (removing the suffix). They can be useful in debugging,
7886 * too.
7887 *
7888 * If a .done or .ready file already exists for the old timeline,
7889 * however, we had already determined that the segment is complete, so
7890 * we can let it be archived normally. (In particular, if it was
7891 * restored from the archive to begin with, it's expected to have a
7892 * .done file).
7893 */
7894 if (XLogSegmentOffset(EndOfLog, wal_segment_size) != 0 &&
7895 XLogArchivingActive())
7896 {
7897 char origfname[MAXFNAMELEN];
7898 XLogSegNo endLogSegNo;
7899
7900 XLByteToPrevSeg(EndOfLog, endLogSegNo, wal_segment_size);
7901 XLogFileName(origfname, EndOfLogTLI, endLogSegNo, wal_segment_size);
7902
7903 if (!XLogArchiveIsReadyOrDone(origfname))
7904 {
7905 char origpath[MAXPGPATH];
7906 char partialfname[MAXFNAMELEN];
7907 char partialpath[MAXPGPATH];
7908
7909 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo, wal_segment_size);
7910 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7911 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7912
7913 /*
7914 * Make sure there's no .done or .ready file for the .partial
7915 * file.
7916 */
7917 XLogArchiveCleanup(partialfname);
7918
7919 durable_rename(origpath, partialpath, ERROR);
7920 XLogArchiveNotify(partialfname);
7921 }
7922 }
7923 }
7924
7925 /*
7926 * Preallocate additional log files, if wanted.
7927 */
7928 PreallocXlogFiles(EndOfLog);
7929
7930 /*
7931 * Okay, we're officially UP.
7932 */
7933 InRecovery = false;
7934
7935 /* start the archive_timeout timer and LSN running */
7936 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7937 XLogCtl->lastSegSwitchLSN = EndOfLog;
7938
7939 /* also initialize latestCompletedXid, to nextXid - 1 */
7940 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7941 ShmemVariableCache->latestCompletedXid = XidFromFullTransactionId(ShmemVariableCache->nextFullXid);
7942 TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7943 LWLockRelease(ProcArrayLock);
7944
7945 /*
7946 * Start up the commit log and subtrans, if not already done for hot
7947 * standby. (commit timestamps are started below, if necessary.)
7948 */
7949 if (standbyState == STANDBY_DISABLED)
7950 {
7951 StartupCLOG();
7952 StartupSUBTRANS(oldestActiveXID);
7953 }
7954
7955 /*
7956 * Perform end of recovery actions for any SLRUs that need it.
7957 */
7958 TrimCLOG();
7959 TrimMultiXact();
7960
7961 /* Reload shared-memory state for prepared transactions */
7962 RecoverPreparedTransactions();
7963
7964 /* Shut down xlogreader */
7965 if (readFile >= 0)
7966 {
7967 close(readFile);
7968 readFile = -1;
7969 }
7970 XLogReaderFree(xlogreader);
7971
7972 /*
7973 * If any of the critical GUCs have changed, log them before we allow
7974 * backends to write WAL.
7975 */
7976 LocalSetXLogInsertAllowed();
7977 XLogReportParameters();
7978
7979 /*
7980 * Local WAL inserts enabled, so it's time to finish initialization of
7981 * commit timestamp.
7982 */
7983 CompleteCommitTsInitialization();
7984
7985 /*
7986 * All done with end-of-recovery actions.
7987 *
7988 * Now allow backends to write WAL and update the control file status in
7989 * consequence. The boolean flag allowing backends to write WAL is
7990 * updated while holding ControlFileLock to prevent other backends from
7991 * looking at an inconsistent state of the control file in shared memory. There
7992 * is still a small window during which backends can write WAL and the
7993 * control file is still referring to a system not in DB_IN_PRODUCTION
7994 * state while looking at the on-disk control file.
7995 *
7996 * Also, although the boolean flag to allow WAL is probably atomic in
7997 * itself, we use the info_lck here to ensure that there are no race
7998 * conditions concerning visibility of other recent updates to shared
7999 * memory.
8000 */
8001 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8002 ControlFile->state = DB_IN_PRODUCTION;
8003 ControlFile->time = (pg_time_t) time(NULL);
8004
8005 SpinLockAcquire(&XLogCtl->info_lck);
8006 XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
8007 SpinLockRelease(&XLogCtl->info_lck);
8008
8009 UpdateControlFile();
8010 LWLockRelease(ControlFileLock);
8011
8012 /*
8013 * Shutdown the recovery environment. This must occur after
8014 * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
8015 * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
8016 * any session building a snapshot will not rely on KnownAssignedXids as
8017 * RecoveryInProgress() would return false at this stage. This is
8018 * particularly critical for prepared 2PC transactions, that would still
8019 * need to be included in snapshots once recovery has ended.
8020 */
8021 if (standbyState != STANDBY_DISABLED)
8022 ShutdownRecoveryTransactionEnvironment();
8023
8024 /*
8025 * If there were cascading standby servers connected to us, nudge any wal
8026 * sender processes to notice that we've been promoted.
8027 */
8028 WalSndWakeup();
8029
8030 /*
8031 * If this was a fast promotion, request an (online) checkpoint now. This
8032 * isn't required for consistency, but the last restartpoint might be far
8033 * back, and in case of a crash, recovering from it might take longer
8034 * than is appropriate now that we're not in standby mode anymore.
8035 */
8036 if (fast_promoted)
8037 RequestCheckpoint(CHECKPOINT_FORCE);
8038 }
8039
8040 /*
8041 * Checks if recovery has reached a consistent state. When consistency is
8042 * reached and we have a valid starting standby snapshot, tell postmaster
8043 * that it can start accepting read-only connections.
8044 */
8045 static void
CheckRecoveryConsistency(void)8046 CheckRecoveryConsistency(void)
8047 {
8048 XLogRecPtr lastReplayedEndRecPtr;
8049
8050 /*
8051 * During crash recovery, we don't reach a consistent state until we've
8052 * replayed all the WAL.
8053 */
8054 if (XLogRecPtrIsInvalid(minRecoveryPoint))
8055 return;
8056
8057 Assert(InArchiveRecovery);
8058
8059 /*
8060 * assume that we are called in the startup process, and hence don't need
8061 * a lock to read lastReplayedEndRecPtr
8062 */
8063 lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
8064
8065 /*
8066 * Have we reached the point where our base backup was completed?
8067 */
8068 if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
8069 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
8070 {
8071 /*
8072 * We have reached the end of base backup, as indicated by pg_control.
8073 * The data on disk is now consistent. Reset backupStartPoint and
8074 * backupEndPoint, and update minRecoveryPoint to make sure we don't
8075 * allow starting up at an earlier point even if recovery is stopped
8076 * and restarted soon after this.
8077 */
8078 elog(DEBUG1, "end of backup reached");
8079
8080 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8081
8082 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
8083 ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
8084
8085 ControlFile->backupStartPoint = InvalidXLogRecPtr;
8086 ControlFile->backupEndPoint = InvalidXLogRecPtr;
8087 ControlFile->backupEndRequired = false;
8088 UpdateControlFile();
8089
8090 LWLockRelease(ControlFileLock);
8091 }
8092
8093 /*
8094 * Have we passed our safe starting point? Note that minRecoveryPoint is
8095 * known to be incorrectly set if ControlFile->backupEndRequired, until
8096 * the XLOG_BACKUP_END arrives to advise us of the correct
8097 * minRecoveryPoint. All we know prior to that is that we're not
8098 * consistent yet.
8099 */
8100 if (!reachedConsistency && !ControlFile->backupEndRequired &&
8101 minRecoveryPoint <= lastReplayedEndRecPtr &&
8102 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
8103 {
8104 /*
8105 * Check to see if the XLOG sequence contained any unresolved
8106 * references to uninitialized pages.
8107 */
8108 XLogCheckInvalidPages();
8109
8110 reachedConsistency = true;
8111 ereport(LOG,
8112 (errmsg("consistent recovery state reached at %X/%X",
8113 (uint32) (lastReplayedEndRecPtr >> 32),
8114 (uint32) lastReplayedEndRecPtr)));
8115 }
8116
8117 /*
8118 * Have we got a valid starting snapshot that will allow queries to be
8119 * run? If so, we can tell postmaster that the database is consistent now,
8120 * enabling connections.
8121 */
8122 if (standbyState == STANDBY_SNAPSHOT_READY &&
8123 !LocalHotStandbyActive &&
8124 reachedConsistency &&
8125 IsUnderPostmaster)
8126 {
8127 SpinLockAcquire(&XLogCtl->info_lck);
8128 XLogCtl->SharedHotStandbyActive = true;
8129 SpinLockRelease(&XLogCtl->info_lck);
8130
8131 LocalHotStandbyActive = true;
8132
8133 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
8134 }
8135 }
8136
8137 /*
8138 * Is the system still in recovery?
8139 *
8140 * Unlike testing InRecovery, this works in any process that's connected to
8141 * shared memory.
8142 *
8143 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
8144 * variables the first time we see that recovery is finished.
8145 */
8146 bool
RecoveryInProgress(void)8147 RecoveryInProgress(void)
8148 {
8149 /*
8150 * We check shared state each time only until we leave recovery mode. We
8151 * can't re-enter recovery, so there's no need to keep checking after the
8152 * shared variable has once been seen false.
8153 */
8154 if (!LocalRecoveryInProgress)
8155 return false;
8156 else
8157 {
8158 /*
8159 * use volatile pointer to make sure we make a fresh read of the
8160 * shared variable.
8161 */
8162 volatile XLogCtlData *xlogctl = XLogCtl;
8163
8164 LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
8165
8166 /*
8167 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
8168 * is finished. InitPostgres() relies upon this behaviour to ensure
8169 * that InitXLOGAccess() is called at backend startup. (If you change
8170 * this, see also LocalSetXLogInsertAllowed.)
8171 */
8172 if (!LocalRecoveryInProgress)
8173 {
8174 /*
8175 * If we just exited recovery, make sure we read TimeLineID and
8176 * RedoRecPtr after SharedRecoveryState (for machines with weak
8177 * memory ordering).
8178 */
8179 pg_memory_barrier();
8180 InitXLOGAccess();
8181 }
8182
8183 /*
8184 * Note: We don't need a memory barrier when we're still in recovery.
8185 * We might exit recovery immediately after return, so the caller
8186 * can't rely on 'true' meaning that we're still in recovery anyway.
8187 */
8188
8189 return LocalRecoveryInProgress;
8190 }
8191 }
8192
8193 /*
8194 * Returns current recovery state from shared memory.
8195 *
8196 * This returned state is kept consistent with the contents of the control
8197 * file. See details about the possible values of RecoveryState in xlog.h.
8198 */
8199 RecoveryState
GetRecoveryState(void)8200 GetRecoveryState(void)
8201 {
8202 RecoveryState retval;
8203
8204 SpinLockAcquire(&XLogCtl->info_lck);
8205 retval = XLogCtl->SharedRecoveryState;
8206 SpinLockRelease(&XLogCtl->info_lck);
8207
8208 return retval;
8209 }
8210
8211 /*
8212 * Is HotStandby active yet? This is only important in special backends
8213 * since normal backends won't ever be able to connect until this returns
8214 * true. Postmaster knows this by way of signal, not via shared memory.
8215 *
8216 * Unlike testing standbyState, this works in any process that's connected to
8217 * shared memory. (And note that standbyState alone doesn't tell the truth
8218 * anyway.)
8219 */
8220 bool
HotStandbyActive(void)8221 HotStandbyActive(void)
8222 {
8223 /*
8224 * We check shared state each time only until Hot Standby is active. We
8225 * can't de-activate Hot Standby, so there's no need to keep checking
8226 * after the shared variable has once been seen true.
8227 */
8228 if (LocalHotStandbyActive)
8229 return true;
8230 else
8231 {
8232 /* spinlock is essential on machines with weak memory ordering! */
8233 SpinLockAcquire(&XLogCtl->info_lck);
8234 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
8235 SpinLockRelease(&XLogCtl->info_lck);
8236
8237 return LocalHotStandbyActive;
8238 }
8239 }
8240
8241 /*
8242 * Like HotStandbyActive(), but to be used only in WAL replay code,
8243 * where we don't need to ask any other process what the state is.
8244 */
8245 bool
HotStandbyActiveInReplay(void)8246 HotStandbyActiveInReplay(void)
8247 {
8248 Assert(AmStartupProcess() || !IsPostmasterEnvironment);
8249 return LocalHotStandbyActive;
8250 }
8251
8252 /*
8253 * Is this process allowed to insert new WAL records?
8254 *
8255 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
8256 * But we also have provisions for forcing the result "true" or "false"
8257 * within specific processes regardless of the global state.
8258 */
8259 bool
XLogInsertAllowed(void)8260 XLogInsertAllowed(void)
8261 {
8262 /*
8263 * If value is "unconditionally true" or "unconditionally false", just
8264 * return it. This provides the normal fast path once recovery is known
8265 * done.
8266 */
8267 if (LocalXLogInsertAllowed >= 0)
8268 return (bool) LocalXLogInsertAllowed;
8269
8270 /*
8271 * Else, must check to see if we're still in recovery.
8272 */
8273 if (RecoveryInProgress())
8274 return false;
8275
8276 /*
8277 * On exit from recovery, reset to "unconditionally true", since there is
8278 * no need to keep checking.
8279 */
8280 LocalXLogInsertAllowed = 1;
8281 return true;
8282 }
8283
8284 /*
8285 * Make XLogInsertAllowed() return true in the current process only.
8286 *
8287 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
8288 * and even call LocalSetXLogInsertAllowed() again after that.
8289 */
8290 static void
LocalSetXLogInsertAllowed(void)8291 LocalSetXLogInsertAllowed(void)
8292 {
8293 Assert(LocalXLogInsertAllowed == -1);
8294 LocalXLogInsertAllowed = 1;
8295
8296 /* Initialize as RecoveryInProgress() would do when switching state */
8297 InitXLOGAccess();
8298 }
8299
8300 /*
8301 * Subroutine to try to fetch and validate a prior checkpoint record.
8302 *
8303 * whichChkpt identifies the checkpoint (merely for reporting purposes).
8304 * 1 for "primary", 0 for "other" (backup_label)
8305 */
8306 static XLogRecord *
ReadCheckpointRecord(XLogReaderState * xlogreader,XLogRecPtr RecPtr,int whichChkpt,bool report)8307 ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
8308 int whichChkpt, bool report)
8309 {
8310 XLogRecord *record;
8311 uint8 info;
8312
8313 if (!XRecOffIsValid(RecPtr))
8314 {
8315 if (!report)
8316 return NULL;
8317
8318 switch (whichChkpt)
8319 {
8320 case 1:
8321 ereport(LOG,
8322 (errmsg("invalid primary checkpoint link in control file")));
8323 break;
8324 default:
8325 ereport(LOG,
8326 (errmsg("invalid checkpoint link in backup_label file")));
8327 break;
8328 }
8329 return NULL;
8330 }
8331
8332 XLogBeginRead(xlogreader, RecPtr);
8333 record = ReadRecord(xlogreader, LOG, true);
8334
8335 if (record == NULL)
8336 {
8337 if (!report)
8338 return NULL;
8339
8340 switch (whichChkpt)
8341 {
8342 case 1:
8343 ereport(LOG,
8344 (errmsg("invalid primary checkpoint record")));
8345 break;
8346 default:
8347 ereport(LOG,
8348 (errmsg("invalid checkpoint record")));
8349 break;
8350 }
8351 return NULL;
8352 }
8353 if (record->xl_rmid != RM_XLOG_ID)
8354 {
8355 switch (whichChkpt)
8356 {
8357 case 1:
8358 ereport(LOG,
8359 (errmsg("invalid resource manager ID in primary checkpoint record")));
8360 break;
8361 default:
8362 ereport(LOG,
8363 (errmsg("invalid resource manager ID in checkpoint record")));
8364 break;
8365 }
8366 return NULL;
8367 }
8368 info = record->xl_info & ~XLR_INFO_MASK;
8369 if (info != XLOG_CHECKPOINT_SHUTDOWN &&
8370 info != XLOG_CHECKPOINT_ONLINE)
8371 {
8372 switch (whichChkpt)
8373 {
8374 case 1:
8375 ereport(LOG,
8376 (errmsg("invalid xl_info in primary checkpoint record")));
8377 break;
8378 default:
8379 ereport(LOG,
8380 (errmsg("invalid xl_info in checkpoint record")));
8381 break;
8382 }
8383 return NULL;
8384 }
8385 if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
8386 {
8387 switch (whichChkpt)
8388 {
8389 case 1:
8390 ereport(LOG,
8391 (errmsg("invalid length of primary checkpoint record")));
8392 break;
8393 default:
8394 ereport(LOG,
8395 (errmsg("invalid length of checkpoint record")));
8396 break;
8397 }
8398 return NULL;
8399 }
8400 return record;
8401 }
8402
8403 /*
8404 * This must be called in a backend process before creating WAL records
8405 * (except in a standalone backend, which does StartupXLOG instead). We need
8406 * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8407 *
8408 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
8409 * process's copies of ThisTimeLineID and RedoRecPtr valid too. This was
8410 * unnecessary however, since the postmaster itself never touches XLOG anyway.
8411 */
8412 void
InitXLOGAccess(void)8413 InitXLOGAccess(void)
8414 {
8415 XLogCtlInsert *Insert = &XLogCtl->Insert;
8416
8417 /* ThisTimeLineID doesn't change so we need no lock to copy it */
8418 ThisTimeLineID = XLogCtl->ThisTimeLineID;
8419 Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
8420
8421 /* set wal_segment_size */
8422 wal_segment_size = ControlFile->xlog_seg_size;
8423
8424 /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
8425 (void) GetRedoRecPtr();
8426 /* Also update our copy of doPageWrites. */
8427 doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
8428
8429 /* Also initialize the working areas for constructing WAL records */
8430 InitXLogInsert();
8431 }
8432
8433 /*
8434 * Return the current Redo pointer from shared memory.
8435 *
8436 * As a side-effect, the local RedoRecPtr copy is updated.
8437 */
8438 XLogRecPtr
GetRedoRecPtr(void)8439 GetRedoRecPtr(void)
8440 {
8441 XLogRecPtr ptr;
8442
8443 /*
8444 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8445 * grabbed a WAL insertion lock to read the master copy, someone might
8446 * update it just after we've released the lock.
8447 */
8448 SpinLockAcquire(&XLogCtl->info_lck);
8449 ptr = XLogCtl->RedoRecPtr;
8450 SpinLockRelease(&XLogCtl->info_lck);
8451
8452 if (RedoRecPtr < ptr)
8453 RedoRecPtr = ptr;
8454
8455 return RedoRecPtr;
8456 }
8457
8458 /*
8459 * Return information needed to decide whether a modified block needs a
8460 * full-page image to be included in the WAL record.
8461 *
8462 * The returned values are cached copies from backend-private memory, and
8463 * possibly out-of-date. XLogInsertRecord will re-check them against
8464 * up-to-date values, while holding the WAL insert lock.
8465 */
8466 void
GetFullPageWriteInfo(XLogRecPtr * RedoRecPtr_p,bool * doPageWrites_p)8467 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8468 {
8469 *RedoRecPtr_p = RedoRecPtr;
8470 *doPageWrites_p = doPageWrites;
8471 }
8472
8473 /*
8474 * GetInsertRecPtr -- Returns the current insert position.
8475 *
8476 * NOTE: The value *actually* returned is the position of the last full
8477 * xlog page. It lags behind the real insert position by at most 1 page.
8478 * For that, we don't need to scan through WAL insertion locks, and an
8479 * approximation is enough for the current usage of this function.
8480 */
8481 XLogRecPtr
GetInsertRecPtr(void)8482 GetInsertRecPtr(void)
8483 {
8484 XLogRecPtr recptr;
8485
8486 SpinLockAcquire(&XLogCtl->info_lck);
8487 recptr = XLogCtl->LogwrtRqst.Write;
8488 SpinLockRelease(&XLogCtl->info_lck);
8489
8490 return recptr;
8491 }
8492
8493 /*
8494 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
8495 * position known to be fsync'd to disk.
8496 */
8497 XLogRecPtr
GetFlushRecPtr(void)8498 GetFlushRecPtr(void)
8499 {
8500 SpinLockAcquire(&XLogCtl->info_lck);
8501 LogwrtResult = XLogCtl->LogwrtResult;
8502 SpinLockRelease(&XLogCtl->info_lck);
8503
8504 return LogwrtResult.Flush;
8505 }
8506
8507 /*
8508 * GetLastImportantRecPtr -- Returns the LSN of the last important record
8509 * inserted. All records not explicitly marked as unimportant are considered
8510 * important.
8511 *
8512 * The LSN is determined by computing the maximum of
8513 * WALInsertLocks[i].lastImportantAt.
8514 */
8515 XLogRecPtr
GetLastImportantRecPtr(void)8516 GetLastImportantRecPtr(void)
8517 {
8518 XLogRecPtr res = InvalidXLogRecPtr;
8519 int i;
8520
8521 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8522 {
8523 XLogRecPtr last_important;
8524
8525 /*
8526 * Need to take a lock to prevent torn reads of the LSN, which are
8527 * possible on some of the supported platforms. WAL insert locks only
8528 * support exclusive mode, so we have to use that.
8529 */
8530 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8531 last_important = WALInsertLocks[i].l.lastImportantAt;
8532 LWLockRelease(&WALInsertLocks[i].l.lock);
8533
8534 if (res < last_important)
8535 res = last_important;
8536 }
8537
8538 return res;
8539 }
8540
8541 /*
8542 * Get the time and LSN of the last xlog segment switch
8543 */
8544 pg_time_t
GetLastSegSwitchData(XLogRecPtr * lastSwitchLSN)8545 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8546 {
8547 pg_time_t result;
8548
8549 /* Need WALWriteLock, but shared lock is sufficient */
8550 LWLockAcquire(WALWriteLock, LW_SHARED);
8551 result = XLogCtl->lastSegSwitchTime;
8552 *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8553 LWLockRelease(WALWriteLock);
8554
8555 return result;
8556 }
8557
8558 /*
8559 * This must be called ONCE during postmaster or standalone-backend shutdown
8560 */
8561 void
ShutdownXLOG(int code,Datum arg)8562 ShutdownXLOG(int code, Datum arg)
8563 {
8564 /*
8565 * We should have an aux process resource owner to use, and we should not
8566 * be in a transaction that's installed some other resowner.
8567 */
8568 Assert(AuxProcessResourceOwner != NULL);
8569 Assert(CurrentResourceOwner == NULL ||
8570 CurrentResourceOwner == AuxProcessResourceOwner);
8571 CurrentResourceOwner = AuxProcessResourceOwner;
8572
8573 /* Don't be chatty in standalone mode */
8574 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
8575 (errmsg("shutting down")));
8576
8577 /*
8578 * Signal walsenders to move to stopping state.
8579 */
8580 WalSndInitStopping();
8581
8582 /*
8583 * Wait for WAL senders to be in stopping state. This prevents commands
8584 * from writing new WAL.
8585 */
8586 WalSndWaitStopping();
8587
8588 if (RecoveryInProgress())
8589 CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8590 else
8591 {
8592 /*
8593 * If archiving is enabled, rotate the last XLOG file so that all the
8594 * remaining records are archived (postmaster wakes up the archiver
8595 * process one more time at the end of shutdown). The checkpoint
8596 * record will go to the next XLOG file and won't be archived (yet).
8597 */
8598 if (XLogArchivingActive() && XLogArchiveCommandSet())
8599 RequestXLogSwitch(false);
8600
8601 CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
8602 }
8603 ShutdownCLOG();
8604 ShutdownCommitTs();
8605 ShutdownSUBTRANS();
8606 ShutdownMultiXact();
8607 }
8608
8609 /*
8610 * Log start of a checkpoint.
8611 */
8612 static void
LogCheckpointStart(int flags,bool restartpoint)8613 LogCheckpointStart(int flags, bool restartpoint)
8614 {
8615 elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
8616 restartpoint ? "restartpoint" : "checkpoint",
8617 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8618 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8619 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8620 (flags & CHECKPOINT_FORCE) ? " force" : "",
8621 (flags & CHECKPOINT_WAIT) ? " wait" : "",
8622 (flags & CHECKPOINT_CAUSE_XLOG) ? " wal" : "",
8623 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
8624 (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
8625 }
8626
8627 /*
8628 * Log end of a checkpoint.
8629 */
8630 static void
LogCheckpointEnd(bool restartpoint)8631 LogCheckpointEnd(bool restartpoint)
8632 {
8633 long write_msecs,
8634 sync_msecs,
8635 total_msecs,
8636 longest_msecs,
8637 average_msecs;
8638 uint64 average_sync_time;
8639
8640 CheckpointStats.ckpt_end_t = GetCurrentTimestamp();
8641
8642 write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
8643 CheckpointStats.ckpt_sync_t);
8644
8645 sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
8646 CheckpointStats.ckpt_sync_end_t);
8647
8648 /* Accumulate checkpoint timing summary data, in milliseconds. */
8649 BgWriterStats.m_checkpoint_write_time += write_msecs;
8650 BgWriterStats.m_checkpoint_sync_time += sync_msecs;
8651
8652 /*
8653 * All of the published timing statistics are accounted for. Only
8654 * continue if a log message is to be written.
8655 */
8656 if (!log_checkpoints)
8657 return;
8658
8659 total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
8660 CheckpointStats.ckpt_end_t);
8661
8662 /*
8663 * Timing values returned from CheckpointStats are in microseconds.
8664 * Convert to milliseconds for consistent printing.
8665 */
8666 longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);
8667
8668 average_sync_time = 0;
8669 if (CheckpointStats.ckpt_sync_rels > 0)
8670 average_sync_time = CheckpointStats.ckpt_agg_sync_time /
8671 CheckpointStats.ckpt_sync_rels;
8672 average_msecs = (long) ((average_sync_time + 999) / 1000);
8673
8674 elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
8675 "%d WAL file(s) added, %d removed, %d recycled; "
8676 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
8677 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
8678 "distance=%d kB, estimate=%d kB",
8679 restartpoint ? "restartpoint" : "checkpoint",
8680 CheckpointStats.ckpt_bufs_written,
8681 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
8682 CheckpointStats.ckpt_segs_added,
8683 CheckpointStats.ckpt_segs_removed,
8684 CheckpointStats.ckpt_segs_recycled,
8685 write_msecs / 1000, (int) (write_msecs % 1000),
8686 sync_msecs / 1000, (int) (sync_msecs % 1000),
8687 total_msecs / 1000, (int) (total_msecs % 1000),
8688 CheckpointStats.ckpt_sync_rels,
8689 longest_msecs / 1000, (int) (longest_msecs % 1000),
8690 average_msecs / 1000, (int) (average_msecs % 1000),
8691 (int) (PrevCheckPointDistance / 1024.0),
8692 (int) (CheckPointDistanceEstimate / 1024.0));
8693 }
8694
8695 /*
8696 * Update the estimate of distance between checkpoints.
8697 *
8698 * The estimate is used to calculate the number of WAL segments to keep
8699 * preallocated, see XLOGfileslop().
8700 */
8701 static void
UpdateCheckPointDistanceEstimate(uint64 nbytes)8702 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8703 {
8704 /*
8705 * To estimate the number of segments consumed between checkpoints, keep a
8706 * moving average of the amount of WAL generated in previous checkpoint
8707 * cycles. However, if the load is bursty, with quiet periods and busy
8708 * periods, we want to cater for the peak load. So instead of a plain
8709 * moving average, let the average decline slowly if the previous cycle
8710 * used less WAL than estimated, but bump it up immediately if it used
8711 * more.
8712 *
8713 * When checkpoints are triggered by max_wal_size, this should converge to
8714 * CheckpointSegments * wal_segment_size,
8715 *
8716 * Note: This doesn't pay any attention to what caused the checkpoint.
8717 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8718 * starting a base backup, are counted the same as those created
8719 * automatically. The slow-decline will largely mask them out, if they are
8720 * not frequent. If they are frequent, it seems reasonable to count them
8721 * in as any others; if you issue a manual checkpoint every 5 minutes and
8722 * never let a timed checkpoint happen, it makes sense to base the
8723 * preallocation on that 5 minute interval rather than whatever
8724 * checkpoint_timeout is set to.
8725 */
8726 PrevCheckPointDistance = nbytes;
8727 if (CheckPointDistanceEstimate < nbytes)
8728 CheckPointDistanceEstimate = nbytes;
8729 else
8730 CheckPointDistanceEstimate =
8731 (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8732 }
8733
8734 /*
8735 * Perform a checkpoint --- either during shutdown, or on-the-fly
8736 *
8737 * flags is a bitwise OR of the following:
8738 * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8739 * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8740 * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8741 * ignoring checkpoint_completion_target parameter.
8742 * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8743 * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8744 * CHECKPOINT_END_OF_RECOVERY).
8745 * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8746 *
8747 * Note: flags contains other bits, of interest here only for logging purposes.
8748 * In particular note that this routine is synchronous and does not pay
8749 * attention to CHECKPOINT_WAIT.
8750 *
8751 * If !shutdown then we are writing an online checkpoint. This is a very special
8752 * kind of operation and WAL record because the checkpoint action occurs over
8753 * a period of time yet logically occurs at just a single LSN. The logical
8754 * position of the WAL record (redo ptr) is the same or earlier than the
8755 * physical position. When we replay WAL we locate the checkpoint via its
8756 * physical position then read the redo ptr and actually start replay at the
8757 * earlier logical position. Note that we don't write *anything* to WAL at
8758 * the logical position, so that location could be any other kind of WAL record.
8759 * All of this mechanism allows us to continue working while we checkpoint.
8760 * As a result, timing of actions is critical here and be careful to note that
8761 * this function will likely take minutes to execute on a busy system.
8762 */
8763 void
CreateCheckPoint(int flags)8764 CreateCheckPoint(int flags)
8765 {
8766 bool shutdown;
8767 CheckPoint checkPoint;
8768 XLogRecPtr recptr;
8769 XLogSegNo _logSegNo;
8770 XLogCtlInsert *Insert = &XLogCtl->Insert;
8771 uint32 freespace;
8772 XLogRecPtr PriorRedoPtr;
8773 XLogRecPtr curInsert;
8774 XLogRecPtr last_important_lsn;
8775 VirtualTransactionId *vxids;
8776 int nvxids;
8777
8778 /*
8779 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8780 * issued at a different time.
8781 */
8782 if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8783 shutdown = true;
8784 else
8785 shutdown = false;
8786
8787 /* sanity check */
8788 if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8789 elog(ERROR, "can't create a checkpoint during recovery");
8790
8791 /*
8792 * Initialize InitXLogInsert working areas before entering the critical
8793 * section. Normally, this is done by the first call to
8794 * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
8795 * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
8796 * done below in a critical section, and InitXLogInsert cannot be called
8797 * in a critical section.
8798 */
8799 InitXLogInsert();
8800
8801 /*
8802 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8803 * (This is just pro forma, since in the present system structure there is
8804 * only one process that is allowed to issue checkpoints at any given
8805 * time.)
8806 */
8807 LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8808
8809 /*
8810 * Prepare to accumulate statistics.
8811 *
8812 * Note: because it is possible for log_checkpoints to change while a
8813 * checkpoint proceeds, we always accumulate stats, even if
8814 * log_checkpoints is currently off.
8815 */
8816 MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8817 CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8818
8819 /*
8820 * Use a critical section to force system panic if we have trouble.
8821 */
8822 START_CRIT_SECTION();
8823
8824 if (shutdown)
8825 {
8826 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8827 ControlFile->state = DB_SHUTDOWNING;
8828 ControlFile->time = (pg_time_t) time(NULL);
8829 UpdateControlFile();
8830 LWLockRelease(ControlFileLock);
8831 }
8832
8833 /*
8834 * Let smgr prepare for checkpoint; this has to happen before we determine
8835 * the REDO pointer. Note that smgr must not do anything that'd have to
8836 * be undone if we decide no checkpoint is needed.
8837 */
8838 SyncPreCheckpoint();
8839
8840 /* Begin filling in the checkpoint WAL record */
8841 MemSet(&checkPoint, 0, sizeof(checkPoint));
8842 checkPoint.time = (pg_time_t) time(NULL);
8843
8844 /*
8845 * For Hot Standby, derive the oldestActiveXid before we fix the redo
8846 * pointer. This allows us to begin accumulating changes to assemble our
8847 * starting snapshot of locks and transactions.
8848 */
8849 if (!shutdown && XLogStandbyInfoActive())
8850 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8851 else
8852 checkPoint.oldestActiveXid = InvalidTransactionId;
8853
8854 /*
8855 * Get location of last important record before acquiring insert locks (as
8856 * GetLastImportantRecPtr() also locks WAL locks).
8857 */
8858 last_important_lsn = GetLastImportantRecPtr();
8859
8860 /*
8861 * We must block concurrent insertions while examining insert state to
8862 * determine the checkpoint REDO pointer.
8863 */
8864 WALInsertLockAcquireExclusive();
8865 curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8866
8867 /*
8868 * If this isn't a shutdown or forced checkpoint, and if there has been no
8869 * WAL activity requiring a checkpoint, skip it. The idea here is to
8870 * avoid inserting duplicate checkpoints when the system is idle.
8871 */
8872 if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8873 CHECKPOINT_FORCE)) == 0)
8874 {
8875 if (last_important_lsn == ControlFile->checkPoint)
8876 {
8877 WALInsertLockRelease();
8878 LWLockRelease(CheckpointLock);
8879 END_CRIT_SECTION();
8880 ereport(DEBUG1,
8881 (errmsg("checkpoint skipped because system is idle")));
8882 return;
8883 }
8884 }
8885
8886 /*
8887 * An end-of-recovery checkpoint is created before anyone is allowed to
8888 * write WAL. To allow us to write the checkpoint record, temporarily
8889 * enable XLogInsertAllowed. (This also ensures ThisTimeLineID is
8890 * initialized, which we need here and in AdvanceXLInsertBuffer.)
8891 */
8892 if (flags & CHECKPOINT_END_OF_RECOVERY)
8893 LocalSetXLogInsertAllowed();
8894
8895 checkPoint.ThisTimeLineID = ThisTimeLineID;
8896 if (flags & CHECKPOINT_END_OF_RECOVERY)
8897 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8898 else
8899 checkPoint.PrevTimeLineID = ThisTimeLineID;
8900
8901 checkPoint.fullPageWrites = Insert->fullPageWrites;
8902
8903 /*
8904 * Compute new REDO record ptr = location of next XLOG record.
8905 *
8906 * NB: this is NOT necessarily where the checkpoint record itself will be,
8907 * since other backends may insert more XLOG records while we're off doing
8908 * the buffer flush work. Those XLOG records are logically after the
8909 * checkpoint, even though physically before it. Got that?
8910 */
8911 freespace = INSERT_FREESPACE(curInsert);
8912 if (freespace == 0)
8913 {
8914 if (XLogSegmentOffset(curInsert, wal_segment_size) == 0)
8915 curInsert += SizeOfXLogLongPHD;
8916 else
8917 curInsert += SizeOfXLogShortPHD;
8918 }
8919 checkPoint.redo = curInsert;
8920
8921 /*
8922 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8923 * must be done while holding all the insertion locks.
8924 *
8925 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8926 * pointing past where it really needs to point. This is okay; the only
8927 * consequence is that XLogInsert might back up whole buffers that it
8928 * didn't really need to. We can't postpone advancing RedoRecPtr because
8929 * XLogInserts that happen while we are dumping buffers must assume that
8930 * their buffer changes are not included in the checkpoint.
8931 */
8932 RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
8933
8934 /*
8935 * Now we can release the WAL insertion locks, allowing other xacts to
8936 * proceed while we are flushing disk buffers.
8937 */
8938 WALInsertLockRelease();
8939
8940 /* Update the info_lck-protected copy of RedoRecPtr as well */
8941 SpinLockAcquire(&XLogCtl->info_lck);
8942 XLogCtl->RedoRecPtr = checkPoint.redo;
8943 SpinLockRelease(&XLogCtl->info_lck);
8944
8945 /*
8946 * If enabled, log checkpoint start. We postpone this until now so as not
8947 * to log anything if we decided to skip the checkpoint.
8948 */
8949 if (log_checkpoints)
8950 LogCheckpointStart(flags, false);
8951
8952 TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8953
8954 /*
8955 * Get the other info we need for the checkpoint record.
8956 *
8957 * We don't need to save oldestClogXid in the checkpoint, it only matters
8958 * for the short period in which clog is being truncated, and if we crash
8959 * during that we'll redo the clog truncation and fix up oldestClogXid
8960 * there.
8961 */
8962 LWLockAcquire(XidGenLock, LW_SHARED);
8963 checkPoint.nextFullXid = ShmemVariableCache->nextFullXid;
8964 checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8965 checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8966 LWLockRelease(XidGenLock);
8967
8968 LWLockAcquire(CommitTsLock, LW_SHARED);
8969 checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
8970 checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
8971 LWLockRelease(CommitTsLock);
8972
8973 LWLockAcquire(OidGenLock, LW_SHARED);
8974 checkPoint.nextOid = ShmemVariableCache->nextOid;
8975 if (!shutdown)
8976 checkPoint.nextOid += ShmemVariableCache->oidCount;
8977 LWLockRelease(OidGenLock);
8978
8979 MultiXactGetCheckptMulti(shutdown,
8980 &checkPoint.nextMulti,
8981 &checkPoint.nextMultiOffset,
8982 &checkPoint.oldestMulti,
8983 &checkPoint.oldestMultiDB);
8984
8985 /*
8986 * Having constructed the checkpoint record, ensure all shmem disk buffers
8987 * and commit-log buffers are flushed to disk.
8988 *
8989 * This I/O could fail for various reasons. If so, we will fail to
8990 * complete the checkpoint, but there is no reason to force a system
8991 * panic. Accordingly, exit critical section while doing it.
8992 */
8993 END_CRIT_SECTION();
8994
8995 /*
8996 * In some cases there are groups of actions that must all occur on one
8997 * side or the other of a checkpoint record. Before flushing the
8998 * checkpoint record we must explicitly wait for any backend currently
8999 * performing those groups of actions.
9000 *
9001 * One example is end of transaction, so we must wait for any transactions
9002 * that are currently in commit critical sections. If an xact inserted
9003 * its commit record into XLOG just before the REDO point, then a crash
9004 * restart from the REDO point would not replay that record, which means
9005 * that our flushing had better include the xact's update of pg_xact. So
9006 * we wait till he's out of his commit critical section before proceeding.
9007 * See notes in RecordTransactionCommit().
9008 *
9009 * Because we've already released the insertion locks, this test is a bit
9010 * fuzzy: it is possible that we will wait for xacts we didn't really need
9011 * to wait for. But the delay should be short and it seems better to make
9012 * checkpoint take a bit longer than to hold off insertions longer than
9013 * necessary. (In fact, the whole reason we have this issue is that xact.c
9014 * does commit record XLOG insertion and clog update as two separate steps
9015 * protected by different locks, but again that seems best on grounds of
9016 * minimizing lock contention.)
9017 *
9018 * A transaction that has not yet set delayChkpt when we look cannot be at
9019 * risk, since he's not inserted his commit record yet; and one that's
9020 * already cleared it is not at risk either, since he's done fixing clog
9021 * and we will correctly flush the update below. So we cannot miss any
9022 * xacts we need to wait for.
9023 */
9024 vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
9025 if (nvxids > 0)
9026 {
9027 do
9028 {
9029 pg_usleep(10000L); /* wait for 10 msec */
9030 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
9031 }
9032 pfree(vxids);
9033
9034 CheckPointGuts(checkPoint.redo, flags);
9035
9036 /*
9037 * Take a snapshot of running transactions and write this to WAL. This
9038 * allows us to reconstruct the state of running transactions during
9039 * archive recovery, if required. Skip, if this info disabled.
9040 *
9041 * If we are shutting down, or Startup process is completing crash
9042 * recovery we don't need to write running xact data.
9043 */
9044 if (!shutdown && XLogStandbyInfoActive())
9045 LogStandbySnapshot();
9046
9047 START_CRIT_SECTION();
9048
9049 /*
9050 * Now insert the checkpoint record into XLOG.
9051 */
9052 XLogBeginInsert();
9053 XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
9054 recptr = XLogInsert(RM_XLOG_ID,
9055 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
9056 XLOG_CHECKPOINT_ONLINE);
9057
9058 XLogFlush(recptr);
9059
9060 /*
9061 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
9062 * overwritten at next startup. No-one should even try, this just allows
9063 * sanity-checking. In the case of an end-of-recovery checkpoint, we want
9064 * to just temporarily disable writing until the system has exited
9065 * recovery.
9066 */
9067 if (shutdown)
9068 {
9069 if (flags & CHECKPOINT_END_OF_RECOVERY)
9070 LocalXLogInsertAllowed = -1; /* return to "check" state */
9071 else
9072 LocalXLogInsertAllowed = 0; /* never again write WAL */
9073 }
9074
9075 /*
9076 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
9077 * = end of actual checkpoint record.
9078 */
9079 if (shutdown && checkPoint.redo != ProcLastRecPtr)
9080 ereport(PANIC,
9081 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
9082
9083 /*
9084 * Remember the prior checkpoint's redo ptr for
9085 * UpdateCheckPointDistanceEstimate()
9086 */
9087 PriorRedoPtr = ControlFile->checkPointCopy.redo;
9088
9089 /*
9090 * Update the control file.
9091 */
9092 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9093 if (shutdown)
9094 ControlFile->state = DB_SHUTDOWNED;
9095 ControlFile->checkPoint = ProcLastRecPtr;
9096 ControlFile->checkPointCopy = checkPoint;
9097 ControlFile->time = (pg_time_t) time(NULL);
9098 /* crash recovery should always recover to the end of WAL */
9099 ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
9100 ControlFile->minRecoveryPointTLI = 0;
9101
9102 /*
9103 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
9104 * unused on non-shutdown checkpoints, but seems useful to store it always
9105 * for debugging purposes.
9106 */
9107 SpinLockAcquire(&XLogCtl->ulsn_lck);
9108 ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
9109 SpinLockRelease(&XLogCtl->ulsn_lck);
9110
9111 UpdateControlFile();
9112 LWLockRelease(ControlFileLock);
9113
9114 /* Update shared-memory copy of checkpoint XID/epoch */
9115 SpinLockAcquire(&XLogCtl->info_lck);
9116 XLogCtl->ckptFullXid = checkPoint.nextFullXid;
9117 SpinLockRelease(&XLogCtl->info_lck);
9118
9119 /*
9120 * We are now done with critical updates; no need for system panic if we
9121 * have trouble while fooling with old log segments.
9122 */
9123 END_CRIT_SECTION();
9124
9125 /*
9126 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
9127 */
9128 SyncPostCheckpoint();
9129
9130 /*
9131 * Update the average distance between checkpoints if the prior checkpoint
9132 * exists.
9133 */
9134 if (PriorRedoPtr != InvalidXLogRecPtr)
9135 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9136
9137 /*
9138 * Delete old log files, those no longer needed for last checkpoint to
9139 * prevent the disk holding the xlog from growing full.
9140 */
9141 XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9142 KeepLogSeg(recptr, &_logSegNo);
9143 if (InvalidateObsoleteReplicationSlots(_logSegNo))
9144 {
9145 /*
9146 * Some slots have been invalidated; recalculate the old-segment
9147 * horizon, starting again from RedoRecPtr.
9148 */
9149 XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
9150 KeepLogSeg(recptr, &_logSegNo);
9151 }
9152 _logSegNo--;
9153 RemoveOldXlogFiles(_logSegNo, RedoRecPtr, recptr);
9154
9155 /*
9156 * Make more log segments if needed. (Do this after recycling old log
9157 * segments, since that may supply some of the needed files.)
9158 */
9159 if (!shutdown)
9160 PreallocXlogFiles(recptr);
9161
9162 /*
9163 * Truncate pg_subtrans if possible. We can throw away all data before
9164 * the oldest XMIN of any running transaction. No future transaction will
9165 * attempt to reference any pg_subtrans entry older than that (see Asserts
9166 * in subtrans.c). During recovery, though, we mustn't do this because
9167 * StartupSUBTRANS hasn't been called yet.
9168 */
9169 if (!RecoveryInProgress())
9170 TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9171
9172 /* Real work is done, but log and update stats before releasing lock. */
9173 LogCheckpointEnd(false);
9174
9175 TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
9176 NBuffers,
9177 CheckpointStats.ckpt_segs_added,
9178 CheckpointStats.ckpt_segs_removed,
9179 CheckpointStats.ckpt_segs_recycled);
9180
9181 LWLockRelease(CheckpointLock);
9182 }
9183
9184 /*
9185 * Mark the end of recovery in WAL though without running a full checkpoint.
9186 * We can expect that a restartpoint is likely to be in progress as we
9187 * do this, though we are unwilling to wait for it to complete. So be
9188 * careful to avoid taking the CheckpointLock anywhere here.
9189 *
9190 * CreateRestartPoint() allows for the case where recovery may end before
9191 * the restartpoint completes so there is no concern of concurrent behaviour.
9192 */
9193 static void
CreateEndOfRecoveryRecord(void)9194 CreateEndOfRecoveryRecord(void)
9195 {
9196 xl_end_of_recovery xlrec;
9197 XLogRecPtr recptr;
9198
9199 /* sanity check */
9200 if (!RecoveryInProgress())
9201 elog(ERROR, "can only be used to end recovery");
9202
9203 xlrec.end_time = GetCurrentTimestamp();
9204
9205 WALInsertLockAcquireExclusive();
9206 xlrec.ThisTimeLineID = ThisTimeLineID;
9207 xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
9208 WALInsertLockRelease();
9209
9210 LocalSetXLogInsertAllowed();
9211
9212 START_CRIT_SECTION();
9213
9214 XLogBeginInsert();
9215 XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
9216 recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);
9217
9218 XLogFlush(recptr);
9219
9220 /*
9221 * Update the control file so that crash recovery can follow the timeline
9222 * changes to this point.
9223 */
9224 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9225 ControlFile->time = (pg_time_t) time(NULL);
9226 ControlFile->minRecoveryPoint = recptr;
9227 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
9228 UpdateControlFile();
9229 LWLockRelease(ControlFileLock);
9230
9231 END_CRIT_SECTION();
9232
9233 LocalXLogInsertAllowed = -1; /* return to "check" state */
9234 }
9235
9236 /*
9237 * Write an OVERWRITE_CONTRECORD message.
9238 *
9239 * When on WAL replay we expect a continuation record at the start of a page
9240 * that is not there, recovery ends and WAL writing resumes at that point.
9241 * But it's wrong to resume writing new WAL back at the start of the record
9242 * that was broken, because downstream consumers of that WAL (physical
9243 * replicas) are not prepared to "rewind". So the first action after
9244 * finishing replay of all valid WAL must be to write a record of this type
9245 * at the point where the contrecord was missing; to support xlogreader
9246 * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
9247 * to the page header where the record occurs. xlogreader has an ad-hoc
9248 * mechanism to report metadata about the broken record, which is what we
9249 * use here.
9250 *
9251 * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
9252 * skip the record it was reading, and pass back the LSN of the skipped
9253 * record, so that its caller can verify (on "replay" of that record) that the
9254 * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
9255 */
9256 static XLogRecPtr
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)9257 CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)
9258 {
9259 xl_overwrite_contrecord xlrec;
9260 XLogRecPtr recptr;
9261
9262 /* sanity check */
9263 if (!RecoveryInProgress())
9264 elog(ERROR, "can only be used at end of recovery");
9265
9266 xlrec.overwritten_lsn = aborted_lsn;
9267 xlrec.overwrite_time = GetCurrentTimestamp();
9268
9269 START_CRIT_SECTION();
9270
9271 XLogBeginInsert();
9272 XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));
9273
9274 recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);
9275
9276 XLogFlush(recptr);
9277
9278 END_CRIT_SECTION();
9279
9280 return recptr;
9281 }
9282
/*
 * Flush all data in shared memory to disk, and fsync
 *
 * This is the common code shared between regular checkpoints and
 * recovery restartpoints.
 *
 * Each subsystem's checkpoint routine is invoked in turn.  Only two of them
 * take arguments: 'flags' is handed to CheckPointBuffers(), and
 * 'checkPointRedo' is handed to CheckPointTwoPhase().
 *
 * The call order is deliberate (see the 2PC note below); do not reorder
 * without understanding the dependencies between the subsystems.
 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointBuffers(flags);	/* performs all required fsyncs */
	CheckPointReplicationOrigin();
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
9306
9307 /*
9308 * Save a checkpoint for recovery restart if appropriate
9309 *
9310 * This function is called each time a checkpoint record is read from XLOG.
9311 * It must determine whether the checkpoint represents a safe restartpoint or
9312 * not. If so, the checkpoint record is stashed in shared memory so that
9313 * CreateRestartPoint can consult it. (Note that the latter function is
9314 * executed by the checkpointer, while this one will be executed by the
9315 * startup process.)
9316 */
9317 static void
RecoveryRestartPoint(const CheckPoint * checkPoint)9318 RecoveryRestartPoint(const CheckPoint *checkPoint)
9319 {
9320 /*
9321 * Also refrain from creating a restartpoint if we have seen any
9322 * references to non-existent pages. Restarting recovery from the
9323 * restartpoint would not see the references, so we would lose the
9324 * cross-check that the pages belonged to a relation that was dropped
9325 * later.
9326 */
9327 if (XLogHaveInvalidPages())
9328 {
9329 elog(trace_recovery(DEBUG2),
9330 "could not record restart point at %X/%X because there "
9331 "are unresolved references to invalid pages",
9332 (uint32) (checkPoint->redo >> 32),
9333 (uint32) checkPoint->redo);
9334 return;
9335 }
9336
9337 /*
9338 * Copy the checkpoint record to shared memory, so that checkpointer can
9339 * work out the next time it wants to perform a restartpoint.
9340 */
9341 SpinLockAcquire(&XLogCtl->info_lck);
9342 XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
9343 XLogCtl->lastCheckPointEndPtr = EndRecPtr;
9344 XLogCtl->lastCheckPoint = *checkPoint;
9345 SpinLockRelease(&XLogCtl->info_lck);
9346 }
9347
/*
 * Establish a restartpoint if possible.
 *
 * This is similar to CreateCheckPoint, but is used during WAL recovery
 * to establish a point from which recovery can roll forward without
 * replaying the entire recovery log.
 *
 * 'flags' is a bitmask of checkpoint flags.  It is passed through to
 * LogCheckpointStart() and CheckPointGuts(); the only bit tested directly
 * here is CHECKPOINT_IS_SHUTDOWN, which causes pg_control's state to be set
 * to DB_SHUTDOWNED_IN_RECOVERY.
 *
 * Returns true if a new restartpoint was established.  We can only establish
 * a restartpoint if we have replayed a safe checkpoint record since last
 * restartpoint.  Returns false if recovery has already ended, or if the last
 * safe checkpoint record is not newer than the one already in pg_control.
 *
 * CheckpointLock is held for the duration of the work and released before
 * archive_cleanup_command (if any) is executed.
 */
bool
CreateRestartPoint(int flags)
{
	XLogRecPtr	lastCheckPointRecPtr;
	XLogRecPtr	lastCheckPointEndPtr;
	CheckPoint	lastCheckPoint;
	XLogRecPtr	PriorRedoPtr;
	XLogRecPtr	receivePtr;
	XLogRecPtr	replayPtr;
	TimeLineID	replayTLI;
	XLogRecPtr	endptr;
	XLogSegNo	_logSegNo;
	TimestampTz xtime;

	/*
	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
	 * happens at a time.
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/* Get a local copy of the last safe checkpoint record. */
	SpinLockAcquire(&XLogCtl->info_lck);
	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
	lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
	lastCheckPoint = XLogCtl->lastCheckPoint;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, recovery has already ended")));
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
	 * necessary, but when hot standby is enabled, it would be rather weird if
	 * the database opened up for read-only connections at a point-in-time
	 * before the last shutdown. Such time travel is still possible in case of
	 * immediate shutdown, though.
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
	 *
	 * NOTE(review): ControlFile->checkPointCopy.redo is read here without
	 * holding ControlFileLock; the same comparison is repeated under the
	 * lock further down before pg_control is actually modified.
	 */
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
	{
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, already performed at %X/%X",
						(uint32) (lastCheckPoint.redo >> 32),
						(uint32) lastCheckPoint.redo)));

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
		if (flags & CHECKPOINT_IS_SHUTDOWN)
		{
			/* Still record the clean shutdown-in-recovery in pg_control. */
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
			ControlFile->time = (pg_time_t) time(NULL);
			UpdateControlFile();
			LWLockRelease(ControlFileLock);
		}
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * Update the shared RedoRecPtr so that the startup process can calculate
	 * the number of segments replayed since last restartpoint, and request a
	 * restartpoint if it exceeds CheckPointSegments.
	 *
	 * Like in CreateCheckPoint(), hold off insertions to update it, although
	 * during recovery this is just pro forma, because no WAL insertions are
	 * happening.
	 */
	WALInsertLockAcquireExclusive();
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
	WALInsertLockRelease();

	/* Also update the info_lck-protected copy */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	if (log_checkpoints)
		LogCheckpointStart(flags, true);

	/* Do the actual flushing work, shared with regular checkpoints. */
	CheckPointGuts(lastCheckPoint.redo, flags);

	/*
	 * Remember the prior checkpoint's redo ptr for
	 * UpdateCheckPointDistanceEstimate()
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update pg_control, using current time.  Check that it still shows
	 * DB_IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
	 * this is a quick hack to make sure nothing really bad happens if somehow
	 * we get here after the end-of-recovery checkpoint.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
	{
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);

		/*
		 * Ensure minRecoveryPoint is past the checkpoint record.  Normally,
		 * this will have happened already while writing out dirty buffers,
		 * but not necessarily - e.g. because no buffers were dirtied.  We do
		 * this because a non-exclusive base backup uses minRecoveryPoint to
		 * determine which WAL files must be included in the backup, and the
		 * file (or files) containing the checkpoint record must be included,
		 * at a minimum. Note that for an ordinary restart of recovery there's
		 * no value in having the minimum recovery point any earlier than this
		 * anyway, because redo will begin just after the checkpoint record.
		 */
		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
		{
			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
			ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;

			/* update local copy */
			minRecoveryPoint = ControlFile->minRecoveryPoint;
			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		}
		if (flags & CHECKPOINT_IS_SHUTDOWN)
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
		UpdateControlFile();
	}
	LWLockRelease(ControlFileLock);

	/*
	 * Update the average distance between checkpoints/restartpoints if the
	 * prior checkpoint exists.
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

	/*
	 * Delete old log files, those no longer needed for last restartpoint to
	 * prevent the disk holding the xlog from growing full.
	 */
	XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);

	/*
	 * Retreat _logSegNo using the current end of xlog replayed or received,
	 * whichever is later.
	 */
	receivePtr = GetWalRcvFlushRecPtr(NULL, NULL);
	replayPtr = GetXLogReplayRecPtr(&replayTLI);
	endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
	KeepLogSeg(endptr, &_logSegNo);
	if (InvalidateObsoleteReplicationSlots(_logSegNo))
	{
		/*
		 * Some slots have been invalidated; recalculate the old-segment
		 * horizon, starting again from RedoRecPtr.
		 */
		XLByteToSeg(RedoRecPtr, _logSegNo, wal_segment_size);
		KeepLogSeg(endptr, &_logSegNo);
	}
	/* Step back one segment before handing the horizon to the remover. */
	_logSegNo--;

	/*
	 * Try to recycle segments on a useful timeline. If we've been promoted
	 * since the beginning of this restartpoint, use the new timeline chosen
	 * at end of recovery (RecoveryInProgress() sets ThisTimeLineID in that
	 * case). If we're still in recovery, use the timeline we're currently
	 * replaying.
	 *
	 * There is no guarantee that the WAL segments will be useful on the
	 * current timeline; if recovery proceeds to a new timeline right after
	 * this, the pre-allocated WAL segments on this timeline will not be used,
	 * and will go wasted until recycled on the next restartpoint. We'll live
	 * with that.
	 */
	if (RecoveryInProgress())
		ThisTimeLineID = replayTLI;

	RemoveOldXlogFiles(_logSegNo, RedoRecPtr, endptr);

	/*
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
	 */
	PreallocXlogFiles(endptr);

	/*
	 * ThisTimeLineID is normally not set when we're still in recovery.
	 * However, recycling/preallocating segments above needed ThisTimeLineID
	 * to determine which timeline to install the segments on. Reset it now,
	 * to restore the normal state of affairs for debugging purposes.
	 */
	if (RecoveryInProgress())
		ThisTimeLineID = 0;

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.  No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).  When hot standby is disabled, though, we mustn't do
	 * this because StartupSUBTRANS hasn't been called yet.
	 */
	if (EnableHotStandby)
		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));

	/* Real work is done, but log and update before releasing lock. */
	LogCheckpointEnd(true);

	/*
	 * Report the restartpoint; the errdetail is attached only when
	 * GetLatestXTime() returned a nonzero timestamp.
	 */
	xtime = GetLatestXTime();
	ereport((log_checkpoints ? LOG : DEBUG2),
			(errmsg("recovery restart point at %X/%X",
					(uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
			 xtime ? errdetail("Last completed transaction was at log time %s.",
							   timestamptz_to_str(xtime)) : 0));

	LWLockRelease(CheckpointLock);

	/*
	 * Finally, execute archive_cleanup_command, if any.  This happens after
	 * CheckpointLock has been released.
	 */
	if (archiveCleanupCommand && strcmp(archiveCleanupCommand, "") != 0)
		ExecuteRecoveryCommand(archiveCleanupCommand,
							   "archive_cleanup_command",
							   false);

	return true;
}
9609
9610 /*
9611 * Report availability of WAL for the given target LSN
9612 * (typically a slot's restart_lsn)
9613 *
9614 * Returns one of the following enum values:
9615 *
9616 * * WALAVAIL_RESERVED means targetLSN is available and it is in the range of
9617 * max_wal_size.
9618 *
9619 * * WALAVAIL_EXTENDED means it is still available by preserving extra
9620 * segments beyond max_wal_size. If max_slot_wal_keep_size is smaller
9621 * than max_wal_size, this state is not returned.
9622 *
9623 * * WALAVAIL_UNRESERVED means it is being lost and the next checkpoint will
9624 * remove reserved segments. The walsender using this slot may return to the
9625 * above.
9626 *
9627 * * WALAVAIL_REMOVED means it has been removed. A replication stream on
9628 * a slot with this LSN cannot continue after a restart.
9629 *
9630 * * WALAVAIL_INVALID_LSN means the slot hasn't been set to reserve WAL.
9631 */
9632 WALAvailability
GetWALAvailability(XLogRecPtr targetLSN)9633 GetWALAvailability(XLogRecPtr targetLSN)
9634 {
9635 XLogRecPtr currpos; /* current write LSN */
9636 XLogSegNo currSeg; /* segid of currpos */
9637 XLogSegNo targetSeg; /* segid of targetLSN */
9638 XLogSegNo oldestSeg; /* actual oldest segid */
9639 XLogSegNo oldestSegMaxWalSize; /* oldest segid kept by max_wal_size */
9640 XLogSegNo oldestSlotSeg; /* oldest segid kept by slot */
9641 uint64 keepSegs;
9642
9643 /*
9644 * slot does not reserve WAL. Either deactivated, or has never been active
9645 */
9646 if (XLogRecPtrIsInvalid(targetLSN))
9647 return WALAVAIL_INVALID_LSN;
9648
9649 /*
9650 * Calculate the oldest segment currently reserved by all slots,
9651 * considering wal_keep_size and max_slot_wal_keep_size. Initialize
9652 * oldestSlotSeg to the current segment.
9653 */
9654 currpos = GetXLogWriteRecPtr();
9655 XLByteToSeg(currpos, oldestSlotSeg, wal_segment_size);
9656 KeepLogSeg(currpos, &oldestSlotSeg);
9657
9658 /*
9659 * Find the oldest extant segment file. We get 1 until checkpoint removes
9660 * the first WAL segment file since startup, which causes the status being
9661 * wrong under certain abnormal conditions but that doesn't actually harm.
9662 */
9663 oldestSeg = XLogGetLastRemovedSegno() + 1;
9664
9665 /* calculate oldest segment by max_wal_size */
9666 XLByteToSeg(currpos, currSeg, wal_segment_size);
9667 keepSegs = ConvertToXSegs(max_wal_size_mb, wal_segment_size) + 1;
9668
9669 if (currSeg > keepSegs)
9670 oldestSegMaxWalSize = currSeg - keepSegs;
9671 else
9672 oldestSegMaxWalSize = 1;
9673
9674 /* the segment we care about */
9675 XLByteToSeg(targetLSN, targetSeg, wal_segment_size);
9676
9677 /*
9678 * No point in returning reserved or extended status values if the
9679 * targetSeg is known to be lost.
9680 */
9681 if (targetSeg >= oldestSlotSeg)
9682 {
9683 /* show "reserved" when targetSeg is within max_wal_size */
9684 if (targetSeg >= oldestSegMaxWalSize)
9685 return WALAVAIL_RESERVED;
9686
9687 /* being retained by slots exceeding max_wal_size */
9688 return WALAVAIL_EXTENDED;
9689 }
9690
9691 /* WAL segments are no longer retained but haven't been removed yet */
9692 if (targetSeg >= oldestSeg)
9693 return WALAVAIL_UNRESERVED;
9694
9695 /* Definitely lost */
9696 return WALAVAIL_REMOVED;
9697 }
9698
9699
9700 /*
9701 * Retreat *logSegNo to the last segment that we need to retain because of
9702 * either wal_keep_size or replication slots.
9703 *
9704 * This is calculated by subtracting wal_keep_size from the given xlog
9705 * location, recptr and by making sure that that result is below the
9706 * requirement of replication slots. For the latter criterion we do consider
9707 * the effects of max_slot_wal_keep_size: reserve at most that much space back
9708 * from recptr.
9709 *
9710 * Note about replication slots: if this function calculates a value
9711 * that's further ahead than what slots need reserved, then affected
9712 * slots need to be invalidated and this function invoked again.
9713 * XXX it might be a good idea to rewrite this function so that
9714 * invalidation is optionally done here, instead.
9715 */
9716 static void
KeepLogSeg(XLogRecPtr recptr,XLogSegNo * logSegNo)9717 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9718 {
9719 XLogSegNo currSegNo;
9720 XLogSegNo segno;
9721 XLogRecPtr keep;
9722
9723 XLByteToSeg(recptr, currSegNo, wal_segment_size);
9724 segno = currSegNo;
9725
9726 /*
9727 * Calculate how many segments are kept by slots first, adjusting for
9728 * max_slot_wal_keep_size.
9729 */
9730 keep = XLogGetReplicationSlotMinimumLSN();
9731 if (keep != InvalidXLogRecPtr)
9732 {
9733 XLByteToSeg(keep, segno, wal_segment_size);
9734
9735 /* Cap by max_slot_wal_keep_size ... */
9736 if (max_slot_wal_keep_size_mb >= 0)
9737 {
9738 uint64 slot_keep_segs;
9739
9740 slot_keep_segs =
9741 ConvertToXSegs(max_slot_wal_keep_size_mb, wal_segment_size);
9742
9743 if (currSegNo - segno > slot_keep_segs)
9744 segno = currSegNo - slot_keep_segs;
9745 }
9746 }
9747
9748 /* but, keep at least wal_keep_size if that's set */
9749 if (wal_keep_size_mb > 0)
9750 {
9751 uint64 keep_segs;
9752
9753 keep_segs = ConvertToXSegs(wal_keep_size_mb, wal_segment_size);
9754 if (currSegNo - segno < keep_segs)
9755 {
9756 /* avoid underflow, don't go below 1 */
9757 if (currSegNo <= keep_segs)
9758 segno = 1;
9759 else
9760 segno = currSegNo - keep_segs;
9761 }
9762 }
9763
9764 /* don't delete WAL segments newer than the calculated segment */
9765 if (segno < *logSegNo)
9766 *logSegNo = segno;
9767 }
9768
9769 /*
9770 * Write a NEXTOID log record
9771 */
9772 void
XLogPutNextOid(Oid nextOid)9773 XLogPutNextOid(Oid nextOid)
9774 {
9775 XLogBeginInsert();
9776 XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9777 (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9778
9779 /*
9780 * We need not flush the NEXTOID record immediately, because any of the
9781 * just-allocated OIDs could only reach disk as part of a tuple insert or
9782 * update that would have its own XLOG record that must follow the NEXTOID
9783 * record. Therefore, the standard buffer LSN interlock applied to those
9784 * records will ensure no such OID reaches disk before the NEXTOID record
9785 * does.
9786 *
9787 * Note, however, that the above statement only covers state "within" the
9788 * database. When we use a generated OID as a file or directory name, we
9789 * are in a sense violating the basic WAL rule, because that filesystem
9790 * change may reach disk before the NEXTOID WAL record does. The impact
9791 * of this is that if a database crash occurs immediately afterward, we
9792 * might after restart re-generate the same OID and find that it conflicts
9793 * with the leftover file or directory. But since for safety's sake we
9794 * always loop until finding a nonconflicting filename, this poses no real
9795 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9796 */
9797 }
9798
9799 /*
9800 * Write an XLOG SWITCH record.
9801 *
9802 * Here we just blindly issue an XLogInsert request for the record.
9803 * All the magic happens inside XLogInsert.
9804 *
9805 * The return value is either the end+1 address of the switch record,
9806 * or the end+1 address of the prior segment if we did not need to
9807 * write a switch record because we are already at segment start.
9808 */
9809 XLogRecPtr
RequestXLogSwitch(bool mark_unimportant)9810 RequestXLogSwitch(bool mark_unimportant)
9811 {
9812 XLogRecPtr RecPtr;
9813
9814 /* XLOG SWITCH has no data */
9815 XLogBeginInsert();
9816
9817 if (mark_unimportant)
9818 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9819 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9820
9821 return RecPtr;
9822 }
9823
9824 /*
9825 * Write a RESTORE POINT record
9826 */
9827 XLogRecPtr
XLogRestorePoint(const char * rpName)9828 XLogRestorePoint(const char *rpName)
9829 {
9830 XLogRecPtr RecPtr;
9831 xl_restore_point xlrec;
9832
9833 xlrec.rp_time = GetCurrentTimestamp();
9834 strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9835
9836 XLogBeginInsert();
9837 XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9838
9839 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9840
9841 ereport(LOG,
9842 (errmsg("restore point \"%s\" created at %X/%X",
9843 rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9844
9845 return RecPtr;
9846 }
9847
9848 /*
9849 * Check if any of the GUC parameters that are critical for hot standby
9850 * have changed, and update the value in pg_control file if necessary.
9851 */
9852 static void
XLogReportParameters(void)9853 XLogReportParameters(void)
9854 {
9855 if (wal_level != ControlFile->wal_level ||
9856 wal_log_hints != ControlFile->wal_log_hints ||
9857 MaxConnections != ControlFile->MaxConnections ||
9858 max_worker_processes != ControlFile->max_worker_processes ||
9859 max_wal_senders != ControlFile->max_wal_senders ||
9860 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9861 max_locks_per_xact != ControlFile->max_locks_per_xact ||
9862 track_commit_timestamp != ControlFile->track_commit_timestamp)
9863 {
9864 /*
9865 * The change in number of backend slots doesn't need to be WAL-logged
9866 * if archiving is not enabled, as you can't start archive recovery
9867 * with wal_level=minimal anyway. We don't really care about the
9868 * values in pg_control either if wal_level=minimal, but seems better
9869 * to keep them up-to-date to avoid confusion.
9870 */
9871 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9872 {
9873 xl_parameter_change xlrec;
9874 XLogRecPtr recptr;
9875
9876 xlrec.MaxConnections = MaxConnections;
9877 xlrec.max_worker_processes = max_worker_processes;
9878 xlrec.max_wal_senders = max_wal_senders;
9879 xlrec.max_prepared_xacts = max_prepared_xacts;
9880 xlrec.max_locks_per_xact = max_locks_per_xact;
9881 xlrec.wal_level = wal_level;
9882 xlrec.wal_log_hints = wal_log_hints;
9883 xlrec.track_commit_timestamp = track_commit_timestamp;
9884
9885 XLogBeginInsert();
9886 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
9887
9888 recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
9889 XLogFlush(recptr);
9890 }
9891
9892 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9893
9894 ControlFile->MaxConnections = MaxConnections;
9895 ControlFile->max_worker_processes = max_worker_processes;
9896 ControlFile->max_wal_senders = max_wal_senders;
9897 ControlFile->max_prepared_xacts = max_prepared_xacts;
9898 ControlFile->max_locks_per_xact = max_locks_per_xact;
9899 ControlFile->wal_level = wal_level;
9900 ControlFile->wal_log_hints = wal_log_hints;
9901 ControlFile->track_commit_timestamp = track_commit_timestamp;
9902 UpdateControlFile();
9903
9904 LWLockRelease(ControlFileLock);
9905 }
9906 }
9907
9908 /*
9909 * Update full_page_writes in shared memory, and write an
9910 * XLOG_FPW_CHANGE record if necessary.
9911 *
9912 * Note: this function assumes there is no other process running
9913 * concurrently that could update it.
9914 */
9915 void
UpdateFullPageWrites(void)9916 UpdateFullPageWrites(void)
9917 {
9918 XLogCtlInsert *Insert = &XLogCtl->Insert;
9919 bool recoveryInProgress;
9920
9921 /*
9922 * Do nothing if full_page_writes has not been changed.
9923 *
9924 * It's safe to check the shared full_page_writes without the lock,
9925 * because we assume that there is no concurrently running process which
9926 * can update it.
9927 */
9928 if (fullPageWrites == Insert->fullPageWrites)
9929 return;
9930
9931 /*
9932 * Perform this outside critical section so that the WAL insert
9933 * initialization done by RecoveryInProgress() doesn't trigger an
9934 * assertion failure.
9935 */
9936 recoveryInProgress = RecoveryInProgress();
9937
9938 START_CRIT_SECTION();
9939
9940 /*
9941 * It's always safe to take full page images, even when not strictly
9942 * required, but not the other round. So if we're setting full_page_writes
9943 * to true, first set it true and then write the WAL record. If we're
9944 * setting it to false, first write the WAL record and then set the global
9945 * flag.
9946 */
9947 if (fullPageWrites)
9948 {
9949 WALInsertLockAcquireExclusive();
9950 Insert->fullPageWrites = true;
9951 WALInsertLockRelease();
9952 }
9953
9954 /*
9955 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
9956 * full_page_writes during archive recovery, if required.
9957 */
9958 if (XLogStandbyInfoActive() && !recoveryInProgress)
9959 {
9960 XLogBeginInsert();
9961 XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));
9962
9963 XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
9964 }
9965
9966 if (!fullPageWrites)
9967 {
9968 WALInsertLockAcquireExclusive();
9969 Insert->fullPageWrites = false;
9970 WALInsertLockRelease();
9971 }
9972 END_CRIT_SECTION();
9973 }
9974
9975 /*
9976 * Check that it's OK to switch to new timeline during recovery.
9977 *
9978 * 'lsn' is the address of the shutdown checkpoint record we're about to
9979 * replay. (Currently, timeline can only change at a shutdown checkpoint).
9980 */
9981 static void
checkTimeLineSwitch(XLogRecPtr lsn,TimeLineID newTLI,TimeLineID prevTLI)9982 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9983 {
9984 /* Check that the record agrees on what the current (old) timeline is */
9985 if (prevTLI != ThisTimeLineID)
9986 ereport(PANIC,
9987 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9988 prevTLI, ThisTimeLineID)));
9989
9990 /*
9991 * The new timeline better be in the list of timelines we expect to see,
9992 * according to the timeline history. It should also not decrease.
9993 */
9994 if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9995 ereport(PANIC,
9996 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9997 newTLI, ThisTimeLineID)));
9998
9999 /*
10000 * If we have not yet reached min recovery point, and we're about to
10001 * switch to a timeline greater than the timeline of the min recovery
10002 * point: trouble. After switching to the new timeline, we could not
10003 * possibly visit the min recovery point on the correct timeline anymore.
10004 * This can happen if there is a newer timeline in the archive that
10005 * branched before the timeline the min recovery point is on, and you
10006 * attempt to do PITR to the new timeline.
10007 */
10008 if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
10009 lsn < minRecoveryPoint &&
10010 newTLI > minRecoveryPointTLI)
10011 ereport(PANIC,
10012 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
10013 newTLI,
10014 (uint32) (minRecoveryPoint >> 32),
10015 (uint32) minRecoveryPoint,
10016 minRecoveryPointTLI)));
10017
10018 /* Looks good */
10019 }
10020
10021 /*
10022 * XLOG resource manager's routines
10023 *
10024 * Definitions of info values are in include/catalog/pg_control.h, though
10025 * not all record types are related to control file updates.
10026 */
10027 void
xlog_redo(XLogReaderState * record)10028 xlog_redo(XLogReaderState *record)
10029 {
10030 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
10031 XLogRecPtr lsn = record->EndRecPtr;
10032
10033 /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
10034 Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
10035 !XLogRecHasAnyBlockRefs(record));
10036
10037 if (info == XLOG_NEXTOID)
10038 {
10039 Oid nextOid;
10040
10041 /*
10042 * We used to try to take the maximum of ShmemVariableCache->nextOid
10043 * and the recorded nextOid, but that fails if the OID counter wraps
10044 * around. Since no OID allocation should be happening during replay
10045 * anyway, better to just believe the record exactly. We still take
10046 * OidGenLock while setting the variable, just in case.
10047 */
10048 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
10049 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
10050 ShmemVariableCache->nextOid = nextOid;
10051 ShmemVariableCache->oidCount = 0;
10052 LWLockRelease(OidGenLock);
10053 }
10054 else if (info == XLOG_CHECKPOINT_SHUTDOWN)
10055 {
10056 CheckPoint checkPoint;
10057
10058 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
10059 /* In a SHUTDOWN checkpoint, believe the counters exactly */
10060 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
10061 ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
10062 LWLockRelease(XidGenLock);
10063 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
10064 ShmemVariableCache->nextOid = checkPoint.nextOid;
10065 ShmemVariableCache->oidCount = 0;
10066 LWLockRelease(OidGenLock);
10067 MultiXactSetNextMXact(checkPoint.nextMulti,
10068 checkPoint.nextMultiOffset);
10069
10070 MultiXactAdvanceOldest(checkPoint.oldestMulti,
10071 checkPoint.oldestMultiDB);
10072
10073 /*
10074 * No need to set oldestClogXid here as well; it'll be set when we
10075 * redo an xl_clog_truncate if it changed since initialization.
10076 */
10077 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
10078
10079 /*
10080 * If we see a shutdown checkpoint while waiting for an end-of-backup
10081 * record, the backup was canceled and the end-of-backup record will
10082 * never arrive.
10083 */
10084 if (ArchiveRecoveryRequested &&
10085 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
10086 XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
10087 ereport(PANIC,
10088 (errmsg("online backup was canceled, recovery cannot continue")));
10089
10090 /*
10091 * If we see a shutdown checkpoint, we know that nothing was running
10092 * on the master at this point. So fake-up an empty running-xacts
10093 * record and use that here and now. Recover additional standby state
10094 * for prepared transactions.
10095 */
10096 if (standbyState >= STANDBY_INITIALIZED)
10097 {
10098 TransactionId *xids;
10099 int nxids;
10100 TransactionId oldestActiveXID;
10101 TransactionId latestCompletedXid;
10102 RunningTransactionsData running;
10103
10104 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
10105
10106 /*
10107 * Construct a RunningTransactions snapshot representing a shut
10108 * down server, with only prepared transactions still alive. We're
10109 * never overflowed at this point because all subxids are listed
10110 * with their parent prepared transactions.
10111 */
10112 running.xcnt = nxids;
10113 running.subxcnt = 0;
10114 running.subxid_overflow = false;
10115 running.nextXid = XidFromFullTransactionId(checkPoint.nextFullXid);
10116 running.oldestRunningXid = oldestActiveXID;
10117 latestCompletedXid = XidFromFullTransactionId(checkPoint.nextFullXid);
10118 TransactionIdRetreat(latestCompletedXid);
10119 Assert(TransactionIdIsNormal(latestCompletedXid));
10120 running.latestCompletedXid = latestCompletedXid;
10121 running.xids = xids;
10122
10123 ProcArrayApplyRecoveryInfo(&running);
10124
10125 StandbyRecoverPreparedTransactions();
10126 }
10127
10128 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
10129 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10130 ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
10131 LWLockRelease(ControlFileLock);
10132
10133 /* Update shared-memory copy of checkpoint XID/epoch */
10134 SpinLockAcquire(&XLogCtl->info_lck);
10135 XLogCtl->ckptFullXid = checkPoint.nextFullXid;
10136 SpinLockRelease(&XLogCtl->info_lck);
10137
10138 /*
10139 * We should've already switched to the new TLI before replaying this
10140 * record.
10141 */
10142 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
10143 ereport(PANIC,
10144 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10145 checkPoint.ThisTimeLineID, ThisTimeLineID)));
10146
10147 RecoveryRestartPoint(&checkPoint);
10148 }
10149 else if (info == XLOG_CHECKPOINT_ONLINE)
10150 {
10151 CheckPoint checkPoint;
10152
10153 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
10154 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
10155 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
10156 if (FullTransactionIdPrecedes(ShmemVariableCache->nextFullXid,
10157 checkPoint.nextFullXid))
10158 ShmemVariableCache->nextFullXid = checkPoint.nextFullXid;
10159 LWLockRelease(XidGenLock);
10160
10161 /*
10162 * We ignore the nextOid counter in an ONLINE checkpoint, preferring
10163 * to track OID assignment through XLOG_NEXTOID records. The nextOid
10164 * counter is from the start of the checkpoint and might well be stale
10165 * compared to later XLOG_NEXTOID records. We could try to take the
10166 * maximum of the nextOid counter and our latest value, but since
10167 * there's no particular guarantee about the speed with which the OID
10168 * counter wraps around, that's a risky thing to do. In any case,
10169 * users of the nextOid counter are required to avoid assignment of
10170 * duplicates, so that a somewhat out-of-date value should be safe.
10171 */
10172
10173 /* Handle multixact */
10174 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
10175 checkPoint.nextMultiOffset);
10176
10177 /*
10178 * NB: This may perform multixact truncation when replaying WAL
10179 * generated by an older primary.
10180 */
10181 MultiXactAdvanceOldest(checkPoint.oldestMulti,
10182 checkPoint.oldestMultiDB);
10183 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
10184 checkPoint.oldestXid))
10185 SetTransactionIdLimit(checkPoint.oldestXid,
10186 checkPoint.oldestXidDB);
10187 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
10188 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10189 ControlFile->checkPointCopy.nextFullXid = checkPoint.nextFullXid;
10190 LWLockRelease(ControlFileLock);
10191
10192 /* Update shared-memory copy of checkpoint XID/epoch */
10193 SpinLockAcquire(&XLogCtl->info_lck);
10194 XLogCtl->ckptFullXid = checkPoint.nextFullXid;
10195 SpinLockRelease(&XLogCtl->info_lck);
10196
10197 /* TLI should not change in an on-line checkpoint */
10198 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
10199 ereport(PANIC,
10200 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10201 checkPoint.ThisTimeLineID, ThisTimeLineID)));
10202
10203 RecoveryRestartPoint(&checkPoint);
10204 }
10205 else if (info == XLOG_OVERWRITE_CONTRECORD)
10206 {
10207 xl_overwrite_contrecord xlrec;
10208
10209 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
10210 VerifyOverwriteContrecord(&xlrec, record);
10211 }
10212 else if (info == XLOG_END_OF_RECOVERY)
10213 {
10214 xl_end_of_recovery xlrec;
10215
10216 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
10217
10218 /*
10219 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
10220 * but this case is rarer and harder to test, so the benefit doesn't
10221 * outweigh the potential extra cost of maintenance.
10222 */
10223
10224 /*
10225 * We should've already switched to the new TLI before replaying this
10226 * record.
10227 */
10228 if (xlrec.ThisTimeLineID != ThisTimeLineID)
10229 ereport(PANIC,
10230 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10231 xlrec.ThisTimeLineID, ThisTimeLineID)));
10232 }
10233 else if (info == XLOG_NOOP)
10234 {
10235 /* nothing to do here */
10236 }
10237 else if (info == XLOG_SWITCH)
10238 {
10239 /* nothing to do here */
10240 }
10241 else if (info == XLOG_RESTORE_POINT)
10242 {
10243 /* nothing to do here */
10244 }
10245 else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT)
10246 {
10247 /*
10248 * Full-page image (FPI) records contain nothing else but a backup
10249 * block (or multiple backup blocks). Every block reference must
10250 * include a full-page image - otherwise there would be no point in
10251 * this record.
10252 *
10253 * No recovery conflicts are generated by these generic records - if a
10254 * resource manager needs to generate conflicts, it has to define a
10255 * separate WAL record type and redo routine.
10256 *
10257 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
10258 * WAL- logged because of a hint bit update. They are only generated
10259 * when checksums are enabled. There is no difference in handling
10260 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
10261 * code just to distinguish them for statistics purposes.
10262 */
10263 for (uint8 block_id = 0; block_id <= record->max_block_id; block_id++)
10264 {
10265 Buffer buffer;
10266
10267 if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
10268 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
10269 UnlockReleaseBuffer(buffer);
10270 }
10271 }
10272 else if (info == XLOG_BACKUP_END)
10273 {
10274 XLogRecPtr startpoint;
10275
10276 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
10277
10278 if (ControlFile->backupStartPoint == startpoint)
10279 {
10280 /*
10281 * We have reached the end of base backup, the point where
10282 * pg_stop_backup() was done. The data on disk is now consistent.
10283 * Reset backupStartPoint, and update minRecoveryPoint to make
10284 * sure we don't allow starting up at an earlier point even if
10285 * recovery is stopped and restarted soon after this.
10286 */
10287 elog(DEBUG1, "end of backup reached");
10288
10289 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10290
10291 if (ControlFile->minRecoveryPoint < lsn)
10292 {
10293 ControlFile->minRecoveryPoint = lsn;
10294 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10295 }
10296 ControlFile->backupStartPoint = InvalidXLogRecPtr;
10297 ControlFile->backupEndRequired = false;
10298 UpdateControlFile();
10299
10300 LWLockRelease(ControlFileLock);
10301 }
10302 }
10303 else if (info == XLOG_PARAMETER_CHANGE)
10304 {
10305 xl_parameter_change xlrec;
10306
10307 /* Update our copy of the parameters in pg_control */
10308 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
10309
10310 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10311 ControlFile->MaxConnections = xlrec.MaxConnections;
10312 ControlFile->max_worker_processes = xlrec.max_worker_processes;
10313 ControlFile->max_wal_senders = xlrec.max_wal_senders;
10314 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
10315 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
10316 ControlFile->wal_level = xlrec.wal_level;
10317 ControlFile->wal_log_hints = xlrec.wal_log_hints;
10318
10319 /*
10320 * Update minRecoveryPoint to ensure that if recovery is aborted, we
10321 * recover back up to this point before allowing hot standby again.
10322 * This is important if the max_* settings are decreased, to ensure
10323 * you don't run queries against the WAL preceding the change. The
10324 * local copies cannot be updated as long as crash recovery is
10325 * happening and we expect all the WAL to be replayed.
10326 */
10327 if (InArchiveRecovery)
10328 {
10329 minRecoveryPoint = ControlFile->minRecoveryPoint;
10330 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
10331 }
10332 if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
10333 {
10334 ControlFile->minRecoveryPoint = lsn;
10335 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10336 }
10337
10338 CommitTsParameterChange(xlrec.track_commit_timestamp,
10339 ControlFile->track_commit_timestamp);
10340 ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
10341
10342 UpdateControlFile();
10343 LWLockRelease(ControlFileLock);
10344
10345 /* Check to see if any parameter change gives a problem on recovery */
10346 CheckRequiredParameterValues();
10347 }
10348 else if (info == XLOG_FPW_CHANGE)
10349 {
10350 bool fpw;
10351
10352 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
10353
10354 /*
10355 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
10356 * do_pg_start_backup() and do_pg_stop_backup() can check whether
10357 * full_page_writes has been disabled during online backup.
10358 */
10359 if (!fpw)
10360 {
10361 SpinLockAcquire(&XLogCtl->info_lck);
10362 if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
10363 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
10364 SpinLockRelease(&XLogCtl->info_lck);
10365 }
10366
10367 /* Keep track of full_page_writes */
10368 lastFullPageWrites = fpw;
10369 }
10370 }
10371
10372 /*
10373 * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record.
10374 */
10375 static void
VerifyOverwriteContrecord(xl_overwrite_contrecord * xlrec,XLogReaderState * state)10376 VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
10377 {
10378 if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
10379 elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
10380 (uint32) (xlrec->overwritten_lsn >> 32),
10381 (uint32) xlrec->overwritten_lsn,
10382 (uint32) (state->overwrittenRecPtr >> 32),
10383 (uint32) state->overwrittenRecPtr);
10384
10385 ereport(LOG,
10386 (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
10387 (uint32) (xlrec->overwritten_lsn >> 32),
10388 (uint32) xlrec->overwritten_lsn,
10389 timestamptz_to_str(xlrec->overwrite_time))));
10390
10391 /* Verifying the record should only happen once */
10392 state->overwrittenRecPtr = InvalidXLogRecPtr;
10393 }
10394
#ifdef WAL_DEBUG

/*
 * Append a debugging summary of 'record' to 'buf'.
 *
 * The summary holds the previous-record pointer, the xid and the data
 * length, followed by one entry per block reference present in the record
 * (relation, fork when it is not the main fork, block number, and an " FPW"
 * marker when the reference carries a block image).
 */
static void
xlog_outrec(StringInfo buf, XLogReaderState *record)
{
	uint32		prev_hi = (uint32) (XLogRecGetPrev(record) >> 32);
	uint32		prev_lo = (uint32) XLogRecGetPrev(record);

	appendStringInfo(buf, "prev %X/%X; xid %u",
					 prev_hi,
					 prev_lo,
					 XLogRecGetXid(record));

	appendStringInfo(buf, "; len %u",
					 XLogRecGetDataLen(record));

	/* walk all possible block ids, describing the ones actually in use */
	for (int block_id = 0; block_id <= record->max_block_id; block_id++)
	{
		RelFileNode rnode;
		ForkNumber	forknum;
		BlockNumber blk;

		if (XLogRecHasBlockRef(record, block_id))
		{
			XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);

			/* the fork number is only printed for non-main forks */
			if (forknum == MAIN_FORKNUM)
				appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
								 block_id,
								 rnode.spcNode, rnode.dbNode, rnode.relNode,
								 blk);
			else
				appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
								 block_id,
								 rnode.spcNode, rnode.dbNode, rnode.relNode,
								 forknum,
								 blk);

			if (XLogRecHasBlockImage(record, block_id))
				appendStringInfoString(buf, " FPW");
		}
	}
}
#endif							/* WAL_DEBUG */
10437
10438 /*
10439 * Returns a string describing an XLogRecord, consisting of its identity
10440 * optionally followed by a colon, a space, and a further description.
10441 */
10442 static void
xlog_outdesc(StringInfo buf,XLogReaderState * record)10443 xlog_outdesc(StringInfo buf, XLogReaderState *record)
10444 {
10445 RmgrId rmid = XLogRecGetRmid(record);
10446 uint8 info = XLogRecGetInfo(record);
10447 const char *id;
10448
10449 appendStringInfoString(buf, RmgrTable[rmid].rm_name);
10450 appendStringInfoChar(buf, '/');
10451
10452 id = RmgrTable[rmid].rm_identify(info);
10453 if (id == NULL)
10454 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
10455 else
10456 appendStringInfo(buf, "%s: ", id);
10457
10458 RmgrTable[rmid].rm_desc(buf, record);
10459 }
10460
10461
10462 /*
10463 * Return the (possible) sync flag used for opening a file, depending on the
10464 * value of the GUC wal_sync_method.
10465 */
10466 static int
get_sync_bit(int method)10467 get_sync_bit(int method)
10468 {
10469 int o_direct_flag = 0;
10470
10471 /* If fsync is disabled, never open in sync mode */
10472 if (!enableFsync)
10473 return 0;
10474
10475 /*
10476 * Optimize writes by bypassing kernel cache with O_DIRECT when using
10477 * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
10478 * disabled, otherwise the archive command or walsender process will read
10479 * the WAL soon after writing it, which is guaranteed to cause a physical
10480 * read if we bypassed the kernel cache. We also skip the
10481 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
10482 * reason.
10483 *
10484 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
10485 * written by walreceiver is normally read by the startup process soon
10486 * after its written. Also, walreceiver performs unaligned writes, which
10487 * don't work with O_DIRECT, so it is required for correctness too.
10488 */
10489 if (!XLogIsNeeded() && !AmWalReceiverProcess())
10490 o_direct_flag = PG_O_DIRECT;
10491
10492 switch (method)
10493 {
10494 /*
10495 * enum values for all sync options are defined even if they are
10496 * not supported on the current platform. But if not, they are
10497 * not included in the enum option array, and therefore will never
10498 * be seen here.
10499 */
10500 case SYNC_METHOD_FSYNC:
10501 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10502 case SYNC_METHOD_FDATASYNC:
10503 return 0;
10504 #ifdef OPEN_SYNC_FLAG
10505 case SYNC_METHOD_OPEN:
10506 return OPEN_SYNC_FLAG | o_direct_flag;
10507 #endif
10508 #ifdef OPEN_DATASYNC_FLAG
10509 case SYNC_METHOD_OPEN_DSYNC:
10510 return OPEN_DATASYNC_FLAG | o_direct_flag;
10511 #endif
10512 default:
10513 /* can't happen (unless we are out of sync with option array) */
10514 elog(ERROR, "unrecognized wal_sync_method: %d", method);
10515 return 0; /* silence warning */
10516 }
10517 }
10518
10519 /*
10520 * GUC support
10521 */
10522 void
assign_xlog_sync_method(int new_sync_method,void * extra)10523 assign_xlog_sync_method(int new_sync_method, void *extra)
10524 {
10525 if (sync_method != new_sync_method)
10526 {
10527 /*
10528 * To ensure that no blocks escape unsynced, force an fsync on the
10529 * currently open log segment (if any). Also, if the open flag is
10530 * changing, close the log file so it will be reopened (with new flag
10531 * bit) at next use.
10532 */
10533 if (openLogFile >= 0)
10534 {
10535 pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10536 if (pg_fsync(openLogFile) != 0)
10537 {
10538 char xlogfname[MAXFNAMELEN];
10539 int save_errno;
10540
10541 save_errno = errno;
10542 XLogFileName(xlogfname, ThisTimeLineID, openLogSegNo,
10543 wal_segment_size);
10544 errno = save_errno;
10545 ereport(PANIC,
10546 (errcode_for_file_access(),
10547 errmsg("could not fsync file \"%s\": %m", xlogfname)));
10548 }
10549
10550 pgstat_report_wait_end();
10551 if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10552 XLogFileClose();
10553 }
10554 }
10555 }
10556
10557
10558 /*
10559 * Issue appropriate kind of fsync (if any) for an XLOG output file.
10560 *
10561 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10562 * 'segno' is for error reporting purposes.
10563 */
10564 void
issue_xlog_fsync(int fd,XLogSegNo segno)10565 issue_xlog_fsync(int fd, XLogSegNo segno)
10566 {
10567 char *msg = NULL;
10568
10569 pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC);
10570 switch (sync_method)
10571 {
10572 case SYNC_METHOD_FSYNC:
10573 if (pg_fsync_no_writethrough(fd) != 0)
10574 msg = _("could not fsync file \"%s\": %m");
10575 break;
10576 #ifdef HAVE_FSYNC_WRITETHROUGH
10577 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10578 if (pg_fsync_writethrough(fd) != 0)
10579 msg = _("could not fsync write-through file \"%s\": %m");
10580 break;
10581 #endif
10582 #ifdef HAVE_FDATASYNC
10583 case SYNC_METHOD_FDATASYNC:
10584 if (pg_fdatasync(fd) != 0)
10585 msg = _("could not fdatasync file \"%s\": %m");
10586 break;
10587 #endif
10588 case SYNC_METHOD_OPEN:
10589 case SYNC_METHOD_OPEN_DSYNC:
10590 /* write synced it already */
10591 break;
10592 default:
10593 elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
10594 break;
10595 }
10596
10597 /* PANIC if failed to fsync */
10598 if (msg)
10599 {
10600 char xlogfname[MAXFNAMELEN];
10601 int save_errno = errno;
10602
10603 XLogFileName(xlogfname, ThisTimeLineID, segno,
10604 wal_segment_size);
10605 errno = save_errno;
10606 ereport(PANIC,
10607 (errcode_for_file_access(),
10608 errmsg(msg, xlogfname)));
10609 }
10610
10611 pgstat_report_wait_end();
10612 }
10613
10614 /*
10615 * do_pg_start_backup
10616 *
10617 * Utility function called at the start of an online backup. It creates the
10618 * necessary starting checkpoint and constructs the backup label file.
10619 *
 * There are two kinds of backups: exclusive and non-exclusive. An exclusive
10621 * backup is started with pg_start_backup(), and there can be only one active
10622 * at a time. The backup and tablespace map files of an exclusive backup are
10623 * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10624 * removed by pg_stop_backup().
10625 *
10626 * A non-exclusive backup is used for the streaming base backups (see
10627 * src/backend/replication/basebackup.c). The difference to exclusive backups
10628 * is that the backup label and tablespace map files are not written to disk.
10629 * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10630 * and the caller is responsible for including them in the backup archive as
10631 * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10632 * active at the same time, and they don't conflict with an exclusive backup
10633 * either.
10634 *
10635 * tablespaces is required only when this function is called while
10636 * the streaming base backup requested by pg_basebackup is running.
10637 * NULL should be specified otherwise.
10638 *
10639 * tblspcmapfile is required mainly for tar format in windows as native windows
10640 * utilities are not able to create symlinks while extracting files from tar.
10641 * However for consistency, the same is used for all platforms.
10642 *
10643 * needtblspcmapfile is true for the cases (exclusive backup and for
10644 * non-exclusive backup only when tar format is used for taking backup)
10645 * when backup needs to generate tablespace_map file, it is used to
10646 * embed escape character before newline character in tablespace path.
10647 *
10648 * Returns the minimum WAL location that must be present to restore from this
10649 * backup, and the corresponding timeline ID in *starttli_p.
10650 *
10651 * Every successfully started non-exclusive backup must be stopped by calling
10652 * do_pg_stop_backup() or do_pg_abort_backup().
10653 *
10654 * It is the responsibility of the caller of this function to verify the
10655 * permissions of the calling user!
10656 */
10657 XLogRecPtr
do_pg_start_backup(const char * backupidstr,bool fast,TimeLineID * starttli_p,StringInfo labelfile,List ** tablespaces,StringInfo tblspcmapfile,bool infotbssize,bool needtblspcmapfile)10658 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
10659 StringInfo labelfile, List **tablespaces,
10660 StringInfo tblspcmapfile, bool infotbssize,
10661 bool needtblspcmapfile)
10662 {
10663 bool exclusive = (labelfile == NULL);
10664 bool backup_started_in_recovery = false;
10665 XLogRecPtr checkpointloc;
10666 XLogRecPtr startpoint;
10667 TimeLineID starttli;
10668 pg_time_t stamp_time;
10669 char strfbuf[128];
10670 char xlogfilename[MAXFNAMELEN];
10671 XLogSegNo _logSegNo;
10672 struct stat stat_buf;
10673 FILE *fp;
10674
10675 backup_started_in_recovery = RecoveryInProgress();
10676
10677 /*
10678 * Currently only non-exclusive backup can be taken during recovery.
10679 */
10680 if (backup_started_in_recovery && exclusive)
10681 ereport(ERROR,
10682 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10683 errmsg("recovery is in progress"),
10684 errhint("WAL control functions cannot be executed during recovery.")));
10685
10686 /*
10687 * During recovery, we don't need to check WAL level. Because, if WAL
10688 * level is not sufficient, it's impossible to get here during recovery.
10689 */
10690 if (!backup_started_in_recovery && !XLogIsNeeded())
10691 ereport(ERROR,
10692 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10693 errmsg("WAL level not sufficient for making an online backup"),
10694 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10695
10696 if (strlen(backupidstr) > MAXPGPATH)
10697 ereport(ERROR,
10698 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10699 errmsg("backup label too long (max %d bytes)",
10700 MAXPGPATH)));
10701
10702 /*
10703 * Mark backup active in shared memory. We must do full-page WAL writes
10704 * during an on-line backup even if not doing so at other times, because
10705 * it's quite possible for the backup dump to obtain a "torn" (partially
10706 * written) copy of a database page if it reads the page concurrently with
10707 * our write to the same page. This can be fixed as long as the first
10708 * write to the page in the WAL sequence is a full-page write. Hence, we
10709 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10710 * are no dirty pages in shared memory that might get dumped while the
10711 * backup is in progress without having a corresponding WAL record. (Once
10712 * the backup is complete, we need not force full-page writes anymore,
10713 * since we expect that any pages not modified during the backup interval
10714 * must have been correctly captured by the backup.)
10715 *
10716 * Note that forcePageWrites has no effect during an online backup from
10717 * the standby.
10718 *
10719 * We must hold all the insertion locks to change the value of
10720 * forcePageWrites, to ensure adequate interlocking against
10721 * XLogInsertRecord().
10722 */
10723 WALInsertLockAcquireExclusive();
10724 if (exclusive)
10725 {
10726 /*
10727 * At first, mark that we're now starting an exclusive backup, to
10728 * ensure that there are no other sessions currently running
10729 * pg_start_backup() or pg_stop_backup().
10730 */
10731 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10732 {
10733 WALInsertLockRelease();
10734 ereport(ERROR,
10735 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10736 errmsg("a backup is already in progress"),
10737 errhint("Run pg_stop_backup() and try again.")));
10738 }
10739 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10740 }
10741 else
10742 XLogCtl->Insert.nonExclusiveBackups++;
10743 XLogCtl->Insert.forcePageWrites = true;
10744 WALInsertLockRelease();
10745
10746 /* Ensure we release forcePageWrites if fail below */
10747 PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10748 {
10749 bool gotUniqueStartpoint = false;
10750 DIR *tblspcdir;
10751 struct dirent *de;
10752 tablespaceinfo *ti;
10753 int datadirpathlen;
10754
10755 /*
10756 * Force an XLOG file switch before the checkpoint, to ensure that the
10757 * WAL segment the checkpoint is written to doesn't contain pages with
10758 * old timeline IDs. That would otherwise happen if you called
10759 * pg_start_backup() right after restoring from a PITR archive: the
10760 * first WAL segment containing the startup checkpoint has pages in
10761 * the beginning with the old timeline ID. That can cause trouble at
10762 * recovery: we won't have a history file covering the old timeline if
10763 * pg_wal directory was not included in the base backup and the WAL
10764 * archive was cleared too before starting the backup.
10765 *
10766 * This also ensures that we have emitted a WAL page header that has
10767 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10768 * Therefore, if a WAL archiver (such as pglesslog) is trying to
10769 * compress out removable backup blocks, it won't remove any that
10770 * occur after this point.
10771 *
10772 * During recovery, we skip forcing XLOG file switch, which means that
10773 * the backup taken during recovery is not available for the special
10774 * recovery case described above.
10775 */
10776 if (!backup_started_in_recovery)
10777 RequestXLogSwitch(false);
10778
10779 do
10780 {
10781 bool checkpointfpw;
10782
10783 /*
10784 * Force a CHECKPOINT. Aside from being necessary to prevent torn
10785 * page problems, this guarantees that two successive backup runs
10786 * will have different checkpoint positions and hence different
10787 * history file names, even if nothing happened in between.
10788 *
10789 * During recovery, establish a restartpoint if possible. We use
10790 * the last restartpoint as the backup starting checkpoint. This
10791 * means that two successive backup runs can have same checkpoint
10792 * positions.
10793 *
10794 * Since the fact that we are executing do_pg_start_backup()
10795 * during recovery means that checkpointer is running, we can use
10796 * RequestCheckpoint() to establish a restartpoint.
10797 *
10798 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10799 * passing fast = true). Otherwise this can take awhile.
10800 */
10801 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10802 (fast ? CHECKPOINT_IMMEDIATE : 0));
10803
10804 /*
10805 * Now we need to fetch the checkpoint record location, and also
10806 * its REDO pointer. The oldest point in WAL that would be needed
10807 * to restore starting from the checkpoint is precisely the REDO
10808 * pointer.
10809 */
10810 LWLockAcquire(ControlFileLock, LW_SHARED);
10811 checkpointloc = ControlFile->checkPoint;
10812 startpoint = ControlFile->checkPointCopy.redo;
10813 starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10814 checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10815 LWLockRelease(ControlFileLock);
10816
10817 if (backup_started_in_recovery)
10818 {
10819 XLogRecPtr recptr;
10820
10821 /*
10822 * Check to see if all WAL replayed during online backup
10823 * (i.e., since last restartpoint used as backup starting
10824 * checkpoint) contain full-page writes.
10825 */
10826 SpinLockAcquire(&XLogCtl->info_lck);
10827 recptr = XLogCtl->lastFpwDisableRecPtr;
10828 SpinLockRelease(&XLogCtl->info_lck);
10829
10830 if (!checkpointfpw || startpoint <= recptr)
10831 ereport(ERROR,
10832 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10833 errmsg("WAL generated with full_page_writes=off was replayed "
10834 "since last restartpoint"),
10835 errhint("This means that the backup being taken on the standby "
10836 "is corrupt and should not be used. "
10837 "Enable full_page_writes and run CHECKPOINT on the master, "
10838 "and then try an online backup again.")));
10839
10840 /*
10841 * During recovery, since we don't use the end-of-backup WAL
10842 * record and don't write the backup history file, the
10843 * starting WAL location doesn't need to be unique. This means
10844 * that two base backups started at the same time might use
10845 * the same checkpoint as starting locations.
10846 */
10847 gotUniqueStartpoint = true;
10848 }
10849
10850 /*
10851 * If two base backups are started at the same time (in WAL sender
10852 * processes), we need to make sure that they use different
10853 * checkpoints as starting locations, because we use the starting
10854 * WAL location as a unique identifier for the base backup in the
10855 * end-of-backup WAL record and when we write the backup history
10856 * file. Perhaps it would be better generate a separate unique ID
10857 * for each backup instead of forcing another checkpoint, but
10858 * taking a checkpoint right after another is not that expensive
10859 * either because only few buffers have been dirtied yet.
10860 */
10861 WALInsertLockAcquireExclusive();
10862 if (XLogCtl->Insert.lastBackupStart < startpoint)
10863 {
10864 XLogCtl->Insert.lastBackupStart = startpoint;
10865 gotUniqueStartpoint = true;
10866 }
10867 WALInsertLockRelease();
10868 } while (!gotUniqueStartpoint);
10869
10870 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
10871 XLogFileName(xlogfilename, starttli, _logSegNo, wal_segment_size);
10872
10873 /*
10874 * Construct tablespace_map file
10875 */
10876 if (exclusive)
10877 tblspcmapfile = makeStringInfo();
10878
10879 datadirpathlen = strlen(DataDir);
10880
10881 /*
10882 * Report that we are now estimating the total backup size if we're
10883 * streaming base backup as requested by pg_basebackup
10884 */
10885 if (tablespaces)
10886 pgstat_progress_update_param(PROGRESS_BASEBACKUP_PHASE,
10887 PROGRESS_BASEBACKUP_PHASE_ESTIMATE_BACKUP_SIZE);
10888
10889 /* Collect information about all tablespaces */
10890 tblspcdir = AllocateDir("pg_tblspc");
10891 while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
10892 {
10893 char fullpath[MAXPGPATH + 10];
10894 char linkpath[MAXPGPATH];
10895 char *relpath = NULL;
10896 int rllen;
10897 StringInfoData buflinkpath;
10898 char *s = linkpath;
10899
10900 /* Skip special stuff */
10901 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
10902 continue;
10903
10904 snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
10905
10906 #if defined(HAVE_READLINK) || defined(WIN32)
10907 rllen = readlink(fullpath, linkpath, sizeof(linkpath));
10908 if (rllen < 0)
10909 {
10910 ereport(WARNING,
10911 (errmsg("could not read symbolic link \"%s\": %m",
10912 fullpath)));
10913 continue;
10914 }
10915 else if (rllen >= sizeof(linkpath))
10916 {
10917 ereport(WARNING,
10918 (errmsg("symbolic link \"%s\" target is too long",
10919 fullpath)));
10920 continue;
10921 }
10922 linkpath[rllen] = '\0';
10923
10924 /*
10925 * Add the escape character '\\' before newline in a string to
10926 * ensure that we can distinguish between the newline in the
10927 * tablespace path and end of line while reading tablespace_map
10928 * file during archive recovery.
10929 */
10930 initStringInfo(&buflinkpath);
10931
10932 while (*s)
10933 {
10934 if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
10935 appendStringInfoChar(&buflinkpath, '\\');
10936 appendStringInfoChar(&buflinkpath, *s++);
10937 }
10938
10939 /*
10940 * Relpath holds the relative path of the tablespace directory
10941 * when it's located within PGDATA, or NULL if it's located
10942 * elsewhere.
10943 */
10944 if (rllen > datadirpathlen &&
10945 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
10946 IS_DIR_SEP(linkpath[datadirpathlen]))
10947 relpath = linkpath + datadirpathlen + 1;
10948
10949 ti = palloc(sizeof(tablespaceinfo));
10950 ti->oid = pstrdup(de->d_name);
10951 ti->path = pstrdup(buflinkpath.data);
10952 ti->rpath = relpath ? pstrdup(relpath) : NULL;
10953 ti->size = infotbssize ?
10954 sendTablespace(fullpath, ti->oid, true, NULL) : -1;
10955
10956 if (tablespaces)
10957 *tablespaces = lappend(*tablespaces, ti);
10958
10959 appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
10960
10961 pfree(buflinkpath.data);
10962 #else
10963
10964 /*
10965 * If the platform does not have symbolic links, it should not be
10966 * possible to have tablespaces - clearly somebody else created
10967 * them. Warn about it and ignore.
10968 */
10969 ereport(WARNING,
10970 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
10971 errmsg("tablespaces are not supported on this platform")));
10972 #endif
10973 }
10974 FreeDir(tblspcdir);
10975
10976 /*
10977 * Construct backup label file
10978 */
10979 if (exclusive)
10980 labelfile = makeStringInfo();
10981
10982 /* Use the log timezone here, not the session timezone */
10983 stamp_time = (pg_time_t) time(NULL);
10984 pg_strftime(strfbuf, sizeof(strfbuf),
10985 "%Y-%m-%d %H:%M:%S %Z",
10986 pg_localtime(&stamp_time, log_timezone));
10987 appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
10988 (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10989 appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
10990 (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10991 appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
10992 exclusive ? "pg_start_backup" : "streamed");
10993 appendStringInfo(labelfile, "BACKUP FROM: %s\n",
10994 backup_started_in_recovery ? "standby" : "master");
10995 appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
10996 appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
10997 appendStringInfo(labelfile, "START TIMELINE: %u\n", starttli);
10998
10999 /*
11000 * Okay, write the file, or return its contents to caller.
11001 */
11002 if (exclusive)
11003 {
11004 /*
11005 * Check for existing backup label --- implies a backup is already
11006 * running. (XXX given that we checked exclusiveBackupState
11007 * above, maybe it would be OK to just unlink any such label
11008 * file?)
11009 */
11010 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
11011 {
11012 if (errno != ENOENT)
11013 ereport(ERROR,
11014 (errcode_for_file_access(),
11015 errmsg("could not stat file \"%s\": %m",
11016 BACKUP_LABEL_FILE)));
11017 }
11018 else
11019 ereport(ERROR,
11020 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11021 errmsg("a backup is already in progress"),
11022 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
11023 BACKUP_LABEL_FILE)));
11024
11025 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
11026
11027 if (!fp)
11028 ereport(ERROR,
11029 (errcode_for_file_access(),
11030 errmsg("could not create file \"%s\": %m",
11031 BACKUP_LABEL_FILE)));
11032 if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
11033 fflush(fp) != 0 ||
11034 pg_fsync(fileno(fp)) != 0 ||
11035 ferror(fp) ||
11036 FreeFile(fp))
11037 ereport(ERROR,
11038 (errcode_for_file_access(),
11039 errmsg("could not write file \"%s\": %m",
11040 BACKUP_LABEL_FILE)));
11041 /* Allocated locally for exclusive backups, so free separately */
11042 pfree(labelfile->data);
11043 pfree(labelfile);
11044
11045 /* Write backup tablespace_map file. */
11046 if (tblspcmapfile->len > 0)
11047 {
11048 if (stat(TABLESPACE_MAP, &stat_buf) != 0)
11049 {
11050 if (errno != ENOENT)
11051 ereport(ERROR,
11052 (errcode_for_file_access(),
11053 errmsg("could not stat file \"%s\": %m",
11054 TABLESPACE_MAP)));
11055 }
11056 else
11057 ereport(ERROR,
11058 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11059 errmsg("a backup is already in progress"),
11060 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
11061 TABLESPACE_MAP)));
11062
11063 fp = AllocateFile(TABLESPACE_MAP, "w");
11064
11065 if (!fp)
11066 ereport(ERROR,
11067 (errcode_for_file_access(),
11068 errmsg("could not create file \"%s\": %m",
11069 TABLESPACE_MAP)));
11070 if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
11071 fflush(fp) != 0 ||
11072 pg_fsync(fileno(fp)) != 0 ||
11073 ferror(fp) ||
11074 FreeFile(fp))
11075 ereport(ERROR,
11076 (errcode_for_file_access(),
11077 errmsg("could not write file \"%s\": %m",
11078 TABLESPACE_MAP)));
11079 }
11080
11081 /* Allocated locally for exclusive backups, so free separately */
11082 pfree(tblspcmapfile->data);
11083 pfree(tblspcmapfile);
11084 }
11085 }
11086 PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
11087
11088 /*
11089 * Mark that start phase has correctly finished for an exclusive backup.
11090 * Session-level locks are updated as well to reflect that state.
11091 *
11092 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating backup
11093 * counters and session-level lock. Otherwise they can be updated
11094 * inconsistently, and which might cause do_pg_abort_backup() to fail.
11095 */
11096 if (exclusive)
11097 {
11098 WALInsertLockAcquireExclusive();
11099 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
11100
11101 /* Set session-level lock */
11102 sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
11103 WALInsertLockRelease();
11104 }
11105 else
11106 sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
11107
11108 /*
11109 * We're done. As a convenience, return the starting WAL location.
11110 */
11111 if (starttli_p)
11112 *starttli_p = starttli;
11113 return startpoint;
11114 }
11115
11116 /* Error cleanup callback for pg_start_backup */
11117 static void
pg_start_backup_callback(int code,Datum arg)11118 pg_start_backup_callback(int code, Datum arg)
11119 {
11120 bool exclusive = DatumGetBool(arg);
11121
11122 /* Update backup counters and forcePageWrites on failure */
11123 WALInsertLockAcquireExclusive();
11124 if (exclusive)
11125 {
11126 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
11127 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
11128 }
11129 else
11130 {
11131 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11132 XLogCtl->Insert.nonExclusiveBackups--;
11133 }
11134
11135 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11136 XLogCtl->Insert.nonExclusiveBackups == 0)
11137 {
11138 XLogCtl->Insert.forcePageWrites = false;
11139 }
11140 WALInsertLockRelease();
11141 }
11142
11143 /*
11144 * Error cleanup callback for pg_stop_backup
11145 */
11146 static void
pg_stop_backup_callback(int code,Datum arg)11147 pg_stop_backup_callback(int code, Datum arg)
11148 {
11149 bool exclusive = DatumGetBool(arg);
11150
11151 /* Update backup status on failure */
11152 WALInsertLockAcquireExclusive();
11153 if (exclusive)
11154 {
11155 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
11156 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
11157 }
11158 WALInsertLockRelease();
11159 }
11160
11161 /*
11162 * Utility routine to fetch the session-level status of a backup running.
11163 */
11164 SessionBackupState
get_backup_status(void)11165 get_backup_status(void)
11166 {
11167 return sessionBackupState;
11168 }
11169
11170 /*
11171 * do_pg_stop_backup
11172 *
11173 * Utility function called at the end of an online backup. It cleans up the
11174 * backup state and can optionally wait for WAL segments to be archived.
11175 *
11176 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
11177 * the non-exclusive backup specified by 'labelfile'.
11178 *
11179 * Returns the last WAL location that must be present to restore from this
11180 * backup, and the corresponding timeline ID in *stoptli_p.
11181 *
11182 * It is the responsibility of the caller of this function to verify the
11183 * permissions of the calling user!
11184 */
11185 XLogRecPtr
do_pg_stop_backup(char * labelfile,bool waitforarchive,TimeLineID * stoptli_p)11186 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
11187 {
11188 bool exclusive = (labelfile == NULL);
11189 bool backup_started_in_recovery = false;
11190 XLogRecPtr startpoint;
11191 XLogRecPtr stoppoint;
11192 TimeLineID stoptli;
11193 pg_time_t stamp_time;
11194 char strfbuf[128];
11195 char histfilepath[MAXPGPATH];
11196 char startxlogfilename[MAXFNAMELEN];
11197 char stopxlogfilename[MAXFNAMELEN];
11198 char lastxlogfilename[MAXFNAMELEN];
11199 char histfilename[MAXFNAMELEN];
11200 char backupfrom[20];
11201 XLogSegNo _logSegNo;
11202 FILE *lfp;
11203 FILE *fp;
11204 char ch;
11205 int seconds_before_warning;
11206 int waits = 0;
11207 bool reported_waiting = false;
11208 char *remaining;
11209 char *ptr;
11210 uint32 hi,
11211 lo;
11212
11213 backup_started_in_recovery = RecoveryInProgress();
11214
11215 /*
11216 * Currently only non-exclusive backup can be taken during recovery.
11217 */
11218 if (backup_started_in_recovery && exclusive)
11219 ereport(ERROR,
11220 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11221 errmsg("recovery is in progress"),
11222 errhint("WAL control functions cannot be executed during recovery.")));
11223
11224 /*
11225 * During recovery, we don't need to check WAL level. Because, if WAL
11226 * level is not sufficient, it's impossible to get here during recovery.
11227 */
11228 if (!backup_started_in_recovery && !XLogIsNeeded())
11229 ereport(ERROR,
11230 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11231 errmsg("WAL level not sufficient for making an online backup"),
11232 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
11233
11234 if (exclusive)
11235 {
11236 /*
11237 * At first, mark that we're now stopping an exclusive backup, to
11238 * ensure that there are no other sessions currently running
11239 * pg_start_backup() or pg_stop_backup().
11240 */
11241 WALInsertLockAcquireExclusive();
11242 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
11243 {
11244 WALInsertLockRelease();
11245 ereport(ERROR,
11246 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11247 errmsg("exclusive backup not in progress")));
11248 }
11249 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
11250 WALInsertLockRelease();
11251
11252 /*
11253 * Remove backup_label. In case of failure, the state for an exclusive
11254 * backup is switched back to in-progress.
11255 */
11256 PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
11257 {
11258 /*
11259 * Read the existing label file into memory.
11260 */
11261 struct stat statbuf;
11262 int r;
11263
11264 if (stat(BACKUP_LABEL_FILE, &statbuf))
11265 {
11266 /* should not happen per the upper checks */
11267 if (errno != ENOENT)
11268 ereport(ERROR,
11269 (errcode_for_file_access(),
11270 errmsg("could not stat file \"%s\": %m",
11271 BACKUP_LABEL_FILE)));
11272 ereport(ERROR,
11273 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11274 errmsg("a backup is not in progress")));
11275 }
11276
11277 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11278 if (!lfp)
11279 {
11280 ereport(ERROR,
11281 (errcode_for_file_access(),
11282 errmsg("could not read file \"%s\": %m",
11283 BACKUP_LABEL_FILE)));
11284 }
11285 labelfile = palloc(statbuf.st_size + 1);
11286 r = fread(labelfile, statbuf.st_size, 1, lfp);
11287 labelfile[statbuf.st_size] = '\0';
11288
11289 /*
11290 * Close and remove the backup label file
11291 */
11292 if (r != 1 || ferror(lfp) || FreeFile(lfp))
11293 ereport(ERROR,
11294 (errcode_for_file_access(),
11295 errmsg("could not read file \"%s\": %m",
11296 BACKUP_LABEL_FILE)));
11297 durable_unlink(BACKUP_LABEL_FILE, ERROR);
11298
11299 /*
11300 * Remove tablespace_map file if present, it is created only if
11301 * there are tablespaces.
11302 */
11303 durable_unlink(TABLESPACE_MAP, DEBUG1);
11304 }
11305 PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
11306 }
11307
11308 /*
11309 * OK to update backup counters, forcePageWrites and session-level lock.
11310 *
11311 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
11312 * Otherwise they can be updated inconsistently, and which might cause
11313 * do_pg_abort_backup() to fail.
11314 */
11315 WALInsertLockAcquireExclusive();
11316 if (exclusive)
11317 {
11318 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
11319 }
11320 else
11321 {
11322 /*
11323 * The user-visible pg_start/stop_backup() functions that operate on
11324 * exclusive backups can be called at any time, but for non-exclusive
11325 * backups, it is expected that each do_pg_start_backup() call is
11326 * matched by exactly one do_pg_stop_backup() call.
11327 */
11328 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11329 XLogCtl->Insert.nonExclusiveBackups--;
11330 }
11331
11332 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11333 XLogCtl->Insert.nonExclusiveBackups == 0)
11334 {
11335 XLogCtl->Insert.forcePageWrites = false;
11336 }
11337
11338 /*
11339 * Clean up session-level lock.
11340 *
11341 * You might think that WALInsertLockRelease() can be called before
11342 * cleaning up session-level lock because session-level lock doesn't need
11343 * to be protected with WAL insertion lock. But since
11344 * CHECK_FOR_INTERRUPTS() can occur in it, session-level lock must be
11345 * cleaned up before it.
11346 */
11347 sessionBackupState = SESSION_BACKUP_NONE;
11348
11349 WALInsertLockRelease();
11350
11351 /*
11352 * Read and parse the START WAL LOCATION line (this code is pretty crude,
11353 * but we are not expecting any variability in the file format).
11354 */
11355 if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
11356 &hi, &lo, startxlogfilename,
11357 &ch) != 4 || ch != '\n')
11358 ereport(ERROR,
11359 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11360 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11361 startpoint = ((uint64) hi) << 32 | lo;
11362 remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
11363
11364 /*
11365 * Parse the BACKUP FROM line. If we are taking an online backup from the
11366 * standby, we confirm that the standby has not been promoted during the
11367 * backup.
11368 */
11369 ptr = strstr(remaining, "BACKUP FROM:");
11370 if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
11371 ereport(ERROR,
11372 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11373 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11374 if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
11375 ereport(ERROR,
11376 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11377 errmsg("the standby was promoted during online backup"),
11378 errhint("This means that the backup being taken is corrupt "
11379 "and should not be used. "
11380 "Try taking another online backup.")));
11381
11382 /*
11383 * During recovery, we don't write an end-of-backup record. We assume that
11384 * pg_control was backed up last and its minimum recovery point can be
11385 * available as the backup end location. Since we don't have an
11386 * end-of-backup record, we use the pg_control value to check whether
11387 * we've reached the end of backup when starting recovery from this
11388 * backup. We have no way of checking if pg_control wasn't backed up last
11389 * however.
11390 *
11391 * We don't force a switch to new WAL file but it is still possible to
11392 * wait for all the required files to be archived if waitforarchive is
11393 * true. This is okay if we use the backup to start a standby and fetch
11394 * the missing WAL using streaming replication. But in the case of an
11395 * archive recovery, a user should set waitforarchive to true and wait for
11396 * them to be archived to ensure that all the required files are
11397 * available.
11398 *
11399 * We return the current minimum recovery point as the backup end
11400 * location. Note that it can be greater than the exact backup end
11401 * location if the minimum recovery point is updated after the backup of
11402 * pg_control. This is harmless for current uses.
11403 *
11404 * XXX currently a backup history file is for informational and debug
11405 * purposes only. It's not essential for an online backup. Furthermore,
11406 * even if it's created, it will not be archived during recovery because
11407 * an archiver is not invoked. So it doesn't seem worthwhile to write a
11408 * backup history file during recovery.
11409 */
11410 if (backup_started_in_recovery)
11411 {
11412 XLogRecPtr recptr;
11413
11414 /*
11415 * Check to see if all WAL replayed during online backup contain
11416 * full-page writes.
11417 */
11418 SpinLockAcquire(&XLogCtl->info_lck);
11419 recptr = XLogCtl->lastFpwDisableRecPtr;
11420 SpinLockRelease(&XLogCtl->info_lck);
11421
11422 if (startpoint <= recptr)
11423 ereport(ERROR,
11424 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11425 errmsg("WAL generated with full_page_writes=off was replayed "
11426 "during online backup"),
11427 errhint("This means that the backup being taken on the standby "
11428 "is corrupt and should not be used. "
11429 "Enable full_page_writes and run CHECKPOINT on the master, "
11430 "and then try an online backup again.")));
11431
11432
11433 LWLockAcquire(ControlFileLock, LW_SHARED);
11434 stoppoint = ControlFile->minRecoveryPoint;
11435 stoptli = ControlFile->minRecoveryPointTLI;
11436 LWLockRelease(ControlFileLock);
11437 }
11438 else
11439 {
11440 /*
11441 * Write the backup-end xlog record
11442 */
11443 XLogBeginInsert();
11444 XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
11445 stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
11446 stoptli = ThisTimeLineID;
11447
11448 /*
11449 * Force a switch to a new xlog segment file, so that the backup is
11450 * valid as soon as archiver moves out the current segment file.
11451 */
11452 RequestXLogSwitch(false);
11453
11454 XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11455 XLogFileName(stopxlogfilename, stoptli, _logSegNo, wal_segment_size);
11456
11457 /* Use the log timezone here, not the session timezone */
11458 stamp_time = (pg_time_t) time(NULL);
11459 pg_strftime(strfbuf, sizeof(strfbuf),
11460 "%Y-%m-%d %H:%M:%S %Z",
11461 pg_localtime(&stamp_time, log_timezone));
11462
11463 /*
11464 * Write the backup history file
11465 */
11466 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11467 BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
11468 startpoint, wal_segment_size);
11469 fp = AllocateFile(histfilepath, "w");
11470 if (!fp)
11471 ereport(ERROR,
11472 (errcode_for_file_access(),
11473 errmsg("could not create file \"%s\": %m",
11474 histfilepath)));
11475 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
11476 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
11477 fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
11478 (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
11479
11480 /*
11481 * Transfer remaining lines including label and start timeline to
11482 * history file.
11483 */
11484 fprintf(fp, "%s", remaining);
11485 fprintf(fp, "STOP TIME: %s\n", strfbuf);
11486 fprintf(fp, "STOP TIMELINE: %u\n", stoptli);
11487 if (fflush(fp) || ferror(fp) || FreeFile(fp))
11488 ereport(ERROR,
11489 (errcode_for_file_access(),
11490 errmsg("could not write file \"%s\": %m",
11491 histfilepath)));
11492
11493 /*
11494 * Clean out any no-longer-needed history files. As a side effect,
11495 * this will post a .ready file for the newly created history file,
11496 * notifying the archiver that history file may be archived
11497 * immediately.
11498 */
11499 CleanupBackupHistory();
11500 }
11501
11502 /*
11503 * If archiving is enabled, wait for all the required WAL files to be
11504 * archived before returning. If archiving isn't enabled, the required WAL
11505 * needs to be transported via streaming replication (hopefully with
11506 * wal_keep_size set high enough), or some more exotic mechanism like
11507 * polling and copying files from pg_wal with script. We have no knowledge
11508 * of those mechanisms, so it's up to the user to ensure that he gets all
11509 * the required WAL.
11510 *
11511 * We wait until both the last WAL file filled during backup and the
11512 * history file have been archived, and assume that the alphabetic sorting
11513 * property of the WAL files ensures any earlier WAL files are safely
11514 * archived as well.
11515 *
11516 * We wait forever, since archive_command is supposed to work and we
11517 * assume the admin wanted his backup to work completely. If you don't
11518 * wish to wait, then either waitforarchive should be passed in as false,
11519 * or you can set statement_timeout. Also, some notices are issued to
11520 * clue in anyone who might be doing this interactively.
11521 */
11522
11523 if (waitforarchive &&
11524 ((!backup_started_in_recovery && XLogArchivingActive()) ||
11525 (backup_started_in_recovery && XLogArchivingAlways())))
11526 {
11527 XLByteToPrevSeg(stoppoint, _logSegNo, wal_segment_size);
11528 XLogFileName(lastxlogfilename, stoptli, _logSegNo, wal_segment_size);
11529
11530 XLByteToSeg(startpoint, _logSegNo, wal_segment_size);
11531 BackupHistoryFileName(histfilename, stoptli, _logSegNo,
11532 startpoint, wal_segment_size);
11533
11534 seconds_before_warning = 60;
11535 waits = 0;
11536
11537 while (XLogArchiveIsBusy(lastxlogfilename) ||
11538 XLogArchiveIsBusy(histfilename))
11539 {
11540 CHECK_FOR_INTERRUPTS();
11541
11542 if (!reported_waiting && waits > 5)
11543 {
11544 ereport(NOTICE,
11545 (errmsg("base backup done, waiting for required WAL segments to be archived")));
11546 reported_waiting = true;
11547 }
11548
11549 pgstat_report_wait_start(WAIT_EVENT_BACKUP_WAIT_WAL_ARCHIVE);
11550 pg_usleep(1000000L);
11551 pgstat_report_wait_end();
11552
11553 if (++waits >= seconds_before_warning)
11554 {
11555 seconds_before_warning *= 2; /* This wraps in >10 years... */
11556 ereport(WARNING,
11557 (errmsg("still waiting for all required WAL segments to be archived (%d seconds elapsed)",
11558 waits),
11559 errhint("Check that your archive_command is executing properly. "
11560 "You can safely cancel this backup, "
11561 "but the database backup will not be usable without all the WAL segments.")));
11562 }
11563 }
11564
11565 ereport(NOTICE,
11566 (errmsg("all required WAL segments have been archived")));
11567 }
11568 else if (waitforarchive)
11569 ereport(NOTICE,
11570 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11571
11572 /*
11573 * We're done. As a convenience, return the ending WAL location.
11574 */
11575 if (stoptli_p)
11576 *stoptli_p = stoptli;
11577 return stoppoint;
11578 }
11579
11580
11581 /*
11582 * do_pg_abort_backup: abort a running backup
11583 *
11584 * This does just the most basic steps of do_pg_stop_backup(), by taking the
11585 * system out of backup mode, thus making it a lot more safe to call from
11586 * an error handler.
11587 *
11588 * The caller can pass 'arg' as 'true' or 'false' to control whether a warning
11589 * is emitted.
11590 *
11591 * NB: This is only for aborting a non-exclusive backup that doesn't write
11592 * backup_label. A backup started with pg_start_backup() needs to be finished
11593 * with pg_stop_backup().
11594 *
11595 * NB: This gets used as a before_shmem_exit handler, hence the odd-looking
11596 * signature.
11597 */
11598 void
do_pg_abort_backup(int code,Datum arg)11599 do_pg_abort_backup(int code, Datum arg)
11600 {
11601 bool emit_warning = DatumGetBool(arg);
11602
11603 /*
11604 * Quick exit if session is not keeping around a non-exclusive backup
11605 * already started.
11606 */
11607 if (sessionBackupState != SESSION_BACKUP_NON_EXCLUSIVE)
11608 return;
11609
11610 WALInsertLockAcquireExclusive();
11611 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11612 XLogCtl->Insert.nonExclusiveBackups--;
11613
11614 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11615 XLogCtl->Insert.nonExclusiveBackups == 0)
11616 {
11617 XLogCtl->Insert.forcePageWrites = false;
11618 }
11619 WALInsertLockRelease();
11620
11621 if (emit_warning)
11622 ereport(WARNING,
11623 (errmsg("aborting backup due to backend exiting before pg_stop_backup was called")));
11624 }
11625
11626 /*
11627 * Register a handler that will warn about unterminated backups at end of
11628 * session, unless this has already been done.
11629 */
11630 void
register_persistent_abort_backup_handler(void)11631 register_persistent_abort_backup_handler(void)
11632 {
11633 static bool already_done = false;
11634
11635 if (already_done)
11636 return;
11637 before_shmem_exit(do_pg_abort_backup, DatumGetBool(true));
11638 already_done = true;
11639 }
11640
11641 /*
11642 * Get latest redo apply position.
11643 *
11644 * Exported to allow WALReceiver to read the pointer directly.
11645 */
11646 XLogRecPtr
GetXLogReplayRecPtr(TimeLineID * replayTLI)11647 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11648 {
11649 XLogRecPtr recptr;
11650 TimeLineID tli;
11651
11652 SpinLockAcquire(&XLogCtl->info_lck);
11653 recptr = XLogCtl->lastReplayedEndRecPtr;
11654 tli = XLogCtl->lastReplayedTLI;
11655 SpinLockRelease(&XLogCtl->info_lck);
11656
11657 if (replayTLI)
11658 *replayTLI = tli;
11659 return recptr;
11660 }
11661
11662 /*
11663 * Get latest WAL insert pointer
11664 */
11665 XLogRecPtr
GetXLogInsertRecPtr(void)11666 GetXLogInsertRecPtr(void)
11667 {
11668 XLogCtlInsert *Insert = &XLogCtl->Insert;
11669 uint64 current_bytepos;
11670
11671 SpinLockAcquire(&Insert->insertpos_lck);
11672 current_bytepos = Insert->CurrBytePos;
11673 SpinLockRelease(&Insert->insertpos_lck);
11674
11675 return XLogBytePosToRecPtr(current_bytepos);
11676 }
11677
11678 /*
11679 * Get latest WAL write pointer
11680 */
11681 XLogRecPtr
GetXLogWriteRecPtr(void)11682 GetXLogWriteRecPtr(void)
11683 {
11684 SpinLockAcquire(&XLogCtl->info_lck);
11685 LogwrtResult = XLogCtl->LogwrtResult;
11686 SpinLockRelease(&XLogCtl->info_lck);
11687
11688 return LogwrtResult.Write;
11689 }
11690
11691 /*
11692 * Returns the redo pointer of the last checkpoint or restartpoint. This is
11693 * the oldest point in WAL that we still need, if we have to restart recovery.
11694 */
11695 void
GetOldestRestartPoint(XLogRecPtr * oldrecptr,TimeLineID * oldtli)11696 GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
11697 {
11698 LWLockAcquire(ControlFileLock, LW_SHARED);
11699 *oldrecptr = ControlFile->checkPointCopy.redo;
11700 *oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
11701 LWLockRelease(ControlFileLock);
11702 }
11703
11704 /*
11705 * read_backup_label: check to see if a backup_label file is present
11706 *
11707 * If we see a backup_label during recovery, we assume that we are recovering
11708 * from a backup dump file, and we therefore roll forward from the checkpoint
11709 * identified by the label file, NOT what pg_control says. This avoids the
11710 * problem that pg_control might have been archived one or more checkpoints
11711 * later than the start of the dump, and so if we rely on it as the start
11712 * point, we will fail to restore a consistent database state.
11713 *
11714 * Returns true if a backup_label was found (and fills the checkpoint
11715 * location and its REDO location into *checkPointLoc and RedoStartLSN,
11716 * respectively); returns false if not. If this backup_label came from a
11717 * streamed backup, *backupEndRequired is set to true. If this backup_label
11718 * was created during recovery, *backupFromStandby is set to true.
11719 */
11720 static bool
read_backup_label(XLogRecPtr * checkPointLoc,bool * backupEndRequired,bool * backupFromStandby)11721 read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
11722 bool *backupFromStandby)
11723 {
11724 char startxlogfilename[MAXFNAMELEN];
11725 TimeLineID tli_from_walseg,
11726 tli_from_file;
11727 FILE *lfp;
11728 char ch;
11729 char backuptype[20];
11730 char backupfrom[20];
11731 char backuplabel[MAXPGPATH];
11732 char backuptime[128];
11733 uint32 hi,
11734 lo;
11735
11736 *backupEndRequired = false;
11737 *backupFromStandby = false;
11738
11739 /*
11740 * See if label file is present
11741 */
11742 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
11743 if (!lfp)
11744 {
11745 if (errno != ENOENT)
11746 ereport(FATAL,
11747 (errcode_for_file_access(),
11748 errmsg("could not read file \"%s\": %m",
11749 BACKUP_LABEL_FILE)));
11750 return false; /* it's not there, all is fine */
11751 }
11752
11753 /*
11754 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
11755 * is pretty crude, but we are not expecting any variability in the file
11756 * format).
11757 */
11758 if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
11759 &hi, &lo, &tli_from_walseg, startxlogfilename, &ch) != 5 || ch != '\n')
11760 ereport(FATAL,
11761 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11762 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11763 RedoStartLSN = ((uint64) hi) << 32 | lo;
11764 if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
11765 &hi, &lo, &ch) != 3 || ch != '\n')
11766 ereport(FATAL,
11767 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11768 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
11769 *checkPointLoc = ((uint64) hi) << 32 | lo;
11770
11771 /*
11772 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
11773 * from an older backup anyway, but since the information on it is not
11774 * strictly required, don't error out if it's missing for some reason.
11775 */
11776 if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
11777 {
11778 if (strcmp(backuptype, "streamed") == 0)
11779 *backupEndRequired = true;
11780 }
11781
11782 if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
11783 {
11784 if (strcmp(backupfrom, "standby") == 0)
11785 *backupFromStandby = true;
11786 }
11787
11788 /*
11789 * Parse START TIME and LABEL. Those are not mandatory fields for recovery
11790 * but checking for their presence is useful for debugging and the next
11791 * sanity checks. Cope also with the fact that the result buffers have a
11792 * pre-allocated size, hence if the backup_label file has been generated
11793 * with strings longer than the maximum assumed here an incorrect parsing
11794 * happens. That's fine as only minor consistency checks are done
11795 * afterwards.
11796 */
11797 if (fscanf(lfp, "START TIME: %127[^\n]\n", backuptime) == 1)
11798 ereport(DEBUG1,
11799 (errmsg("backup time %s in file \"%s\"",
11800 backuptime, BACKUP_LABEL_FILE)));
11801
11802 if (fscanf(lfp, "LABEL: %1023[^\n]\n", backuplabel) == 1)
11803 ereport(DEBUG1,
11804 (errmsg("backup label %s in file \"%s\"",
11805 backuplabel, BACKUP_LABEL_FILE)));
11806
11807 /*
11808 * START TIMELINE is new as of 11. Its parsing is not mandatory, still use
11809 * it as a sanity check if present.
11810 */
11811 if (fscanf(lfp, "START TIMELINE: %u\n", &tli_from_file) == 1)
11812 {
11813 if (tli_from_walseg != tli_from_file)
11814 ereport(FATAL,
11815 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11816 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE),
11817 errdetail("Timeline ID parsed is %u, but expected %u.",
11818 tli_from_file, tli_from_walseg)));
11819
11820 ereport(DEBUG1,
11821 (errmsg("backup timeline %u in file \"%s\"",
11822 tli_from_file, BACKUP_LABEL_FILE)));
11823 }
11824
11825 if (ferror(lfp) || FreeFile(lfp))
11826 ereport(FATAL,
11827 (errcode_for_file_access(),
11828 errmsg("could not read file \"%s\": %m",
11829 BACKUP_LABEL_FILE)));
11830
11831 return true;
11832 }
11833
11834 /*
11835 * read_tablespace_map: check to see if a tablespace_map file is present
11836 *
11837 * If we see a tablespace_map file during recovery, we assume that we are
11838 * recovering from a backup dump file, and we therefore need to create symlinks
11839 * as per the information present in tablespace_map file.
11840 *
11841 * Returns true if a tablespace_map file was found (and fills the link
11842 * information for all the tablespace links present in file); returns false
11843 * if not.
11844 */
11845 static bool
read_tablespace_map(List ** tablespaces)11846 read_tablespace_map(List **tablespaces)
11847 {
11848 tablespaceinfo *ti;
11849 FILE *lfp;
11850 char tbsoid[MAXPGPATH];
11851 char *tbslinkpath;
11852 char str[MAXPGPATH];
11853 int ch,
11854 prev_ch = -1,
11855 i = 0,
11856 n;
11857
11858 /*
11859 * See if tablespace_map file is present
11860 */
11861 lfp = AllocateFile(TABLESPACE_MAP, "r");
11862 if (!lfp)
11863 {
11864 if (errno != ENOENT)
11865 ereport(FATAL,
11866 (errcode_for_file_access(),
11867 errmsg("could not read file \"%s\": %m",
11868 TABLESPACE_MAP)));
11869 return false; /* it's not there, all is fine */
11870 }
11871
11872 /*
11873 * Read and parse the link name and path lines from tablespace_map file
11874 * (this code is pretty crude, but we are not expecting any variability in
11875 * the file format). While taking backup we embed escape character '\\'
11876 * before newline in tablespace path, so that during reading of
11877 * tablespace_map file, we could distinguish newline in tablespace path
11878 * and end of line. Now while reading tablespace_map file, remove the
11879 * escape character that has been added in tablespace path during backup.
11880 */
11881 while ((ch = fgetc(lfp)) != EOF)
11882 {
11883 if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
11884 {
11885 str[i] = '\0';
11886 if (sscanf(str, "%s %n", tbsoid, &n) != 1)
11887 ereport(FATAL,
11888 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
11889 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
11890 tbslinkpath = str + n;
11891 i = 0;
11892
11893 ti = palloc(sizeof(tablespaceinfo));
11894 ti->oid = pstrdup(tbsoid);
11895 ti->path = pstrdup(tbslinkpath);
11896
11897 *tablespaces = lappend(*tablespaces, ti);
11898 continue;
11899 }
11900 else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
11901 str[i - 1] = ch;
11902 else if (i < sizeof(str) - 1)
11903 str[i++] = ch;
11904 prev_ch = ch;
11905 }
11906
11907 if (ferror(lfp) || FreeFile(lfp))
11908 ereport(FATAL,
11909 (errcode_for_file_access(),
11910 errmsg("could not read file \"%s\": %m",
11911 TABLESPACE_MAP)));
11912
11913 return true;
11914 }
11915
11916 /*
11917 * Error context callback for errors occurring during rm_redo().
11918 */
11919 static void
rm_redo_error_callback(void * arg)11920 rm_redo_error_callback(void *arg)
11921 {
11922 XLogReaderState *record = (XLogReaderState *) arg;
11923 StringInfoData buf;
11924
11925 initStringInfo(&buf);
11926 xlog_outdesc(&buf, record);
11927
11928 /* translator: %s is a WAL record description */
11929 errcontext("WAL redo at %X/%X for %s",
11930 (uint32) (record->ReadRecPtr >> 32),
11931 (uint32) record->ReadRecPtr,
11932 buf.data);
11933
11934 pfree(buf.data);
11935 }
11936
11937 /*
11938 * BackupInProgress: check if online backup mode is active
11939 *
11940 * This is done by checking for existence of the "backup_label" file.
11941 */
11942 bool
BackupInProgress(void)11943 BackupInProgress(void)
11944 {
11945 struct stat stat_buf;
11946
11947 return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11948 }
11949
11950 /*
11951 * CancelBackup: rename the "backup_label" and "tablespace_map"
11952 * files to cancel backup mode
11953 *
11954 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11955 * Similarly, if the "tablespace_map" file exists, it will be renamed to
11956 * "tablespace_map.old".
11957 *
11958 * Note that this will render an online backup in progress
11959 * useless. To correctly finish an online backup, pg_stop_backup must be
11960 * called.
11961 */
11962 void
CancelBackup(void)11963 CancelBackup(void)
11964 {
11965 struct stat stat_buf;
11966
11967 /* if the backup_label file is not there, return */
11968 if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
11969 return;
11970
11971 /* remove leftover file from previously canceled backup if it exists */
11972 unlink(BACKUP_LABEL_OLD);
11973
11974 if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
11975 {
11976 ereport(WARNING,
11977 (errcode_for_file_access(),
11978 errmsg("online backup mode was not canceled"),
11979 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
11980 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11981 return;
11982 }
11983
11984 /* if the tablespace_map file is not there, return */
11985 if (stat(TABLESPACE_MAP, &stat_buf) < 0)
11986 {
11987 ereport(LOG,
11988 (errmsg("online backup mode canceled"),
11989 errdetail("File \"%s\" was renamed to \"%s\".",
11990 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11991 return;
11992 }
11993
11994 /* remove leftover file from previously canceled backup if it exists */
11995 unlink(TABLESPACE_MAP_OLD);
11996
11997 if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
11998 {
11999 ereport(LOG,
12000 (errmsg("online backup mode canceled"),
12001 errdetail("Files \"%s\" and \"%s\" were renamed to "
12002 "\"%s\" and \"%s\", respectively.",
12003 BACKUP_LABEL_FILE, TABLESPACE_MAP,
12004 BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
12005 }
12006 else
12007 {
12008 ereport(WARNING,
12009 (errcode_for_file_access(),
12010 errmsg("online backup mode canceled"),
12011 errdetail("File \"%s\" was renamed to \"%s\", but "
12012 "file \"%s\" could not be renamed to \"%s\": %m.",
12013 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
12014 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
12015 }
12016 }
12017
12018 /*
12019 * Read the XLOG page containing RecPtr into readBuf (if not read already).
12020 * Returns number of bytes read, if the page is read successfully, or -1
12021 * in case of errors. When errors occur, they are ereport'ed, but only
12022 * if they have not been previously reported.
12023 *
12024 * This is responsible for restoring files from archive as needed, as well
12025 * as for waiting for the requested WAL record to arrive in standby mode.
12026 *
12027 * 'emode' specifies the log level used for reporting "file not found" or
12028 * "end of WAL" situations in archive recovery, or in standby mode when a
12029 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
12030 * false in those situations, on higher log levels the ereport() won't
12031 * return.
12032 *
12033 * In standby mode, if after a successful return of XLogPageRead() the
12034 * caller finds the record it's interested in to be broken, it should
12035 * ereport the error with the level determined by
12036 * emode_for_corrupt_record(), and then set lastSourceFailed
12037 * and call XLogPageRead() again with the same arguments. This lets
12038 * XLogPageRead() to try fetching the record from another source, or to
12039 * sleep and retry.
12040 */
12041 static int
XLogPageRead(XLogReaderState * xlogreader,XLogRecPtr targetPagePtr,int reqLen,XLogRecPtr targetRecPtr,char * readBuf)12042 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
12043 XLogRecPtr targetRecPtr, char *readBuf)
12044 {
12045 XLogPageReadPrivate *private =
12046 (XLogPageReadPrivate *) xlogreader->private_data;
12047 int emode = private->emode;
12048 uint32 targetPageOff;
12049 XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
12050 int r;
12051
12052 XLByteToSeg(targetPagePtr, targetSegNo, wal_segment_size);
12053 targetPageOff = XLogSegmentOffset(targetPagePtr, wal_segment_size);
12054
12055 /*
12056 * See if we need to switch to a new segment because the requested record
12057 * is not in the currently open one.
12058 */
12059 if (readFile >= 0 &&
12060 !XLByteInSeg(targetPagePtr, readSegNo, wal_segment_size))
12061 {
12062 /*
12063 * Request a restartpoint if we've replayed too much xlog since the
12064 * last one.
12065 */
12066 if (bgwriterLaunched)
12067 {
12068 if (XLogCheckpointNeeded(readSegNo))
12069 {
12070 (void) GetRedoRecPtr();
12071 if (XLogCheckpointNeeded(readSegNo))
12072 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
12073 }
12074 }
12075
12076 close(readFile);
12077 readFile = -1;
12078 readSource = XLOG_FROM_ANY;
12079 }
12080
12081 XLByteToSeg(targetPagePtr, readSegNo, wal_segment_size);
12082
12083 retry:
12084 /* See if we need to retrieve more data */
12085 if (readFile < 0 ||
12086 (readSource == XLOG_FROM_STREAM &&
12087 flushedUpto < targetPagePtr + reqLen))
12088 {
12089 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
12090 private->randAccess,
12091 private->fetching_ckpt,
12092 targetRecPtr))
12093 {
12094 if (readFile >= 0)
12095 close(readFile);
12096 readFile = -1;
12097 readLen = 0;
12098 readSource = XLOG_FROM_ANY;
12099
12100 return -1;
12101 }
12102 }
12103
12104 /*
12105 * At this point, we have the right segment open and if we're streaming we
12106 * know the requested record is in it.
12107 */
12108 Assert(readFile != -1);
12109
12110 /*
12111 * If the current segment is being streamed from master, calculate how
12112 * much of the current page we have received already. We know the
12113 * requested record has been received, but this is for the benefit of
12114 * future calls, to allow quick exit at the top of this function.
12115 */
12116 if (readSource == XLOG_FROM_STREAM)
12117 {
12118 if (((targetPagePtr) / XLOG_BLCKSZ) != (flushedUpto / XLOG_BLCKSZ))
12119 readLen = XLOG_BLCKSZ;
12120 else
12121 readLen = XLogSegmentOffset(flushedUpto, wal_segment_size) -
12122 targetPageOff;
12123 }
12124 else
12125 readLen = XLOG_BLCKSZ;
12126
12127 /* Read the requested page */
12128 readOff = targetPageOff;
12129
12130 pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
12131 r = pg_pread(readFile, readBuf, XLOG_BLCKSZ, (off_t) readOff);
12132 if (r != XLOG_BLCKSZ)
12133 {
12134 char fname[MAXFNAMELEN];
12135 int save_errno = errno;
12136
12137 pgstat_report_wait_end();
12138 XLogFileName(fname, curFileTLI, readSegNo, wal_segment_size);
12139 if (r < 0)
12140 {
12141 errno = save_errno;
12142 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
12143 (errcode_for_file_access(),
12144 errmsg("could not read from log segment %s, offset %u: %m",
12145 fname, readOff)));
12146 }
12147 else
12148 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
12149 (errcode(ERRCODE_DATA_CORRUPTED),
12150 errmsg("could not read from log segment %s, offset %u: read %d of %zu",
12151 fname, readOff, r, (Size) XLOG_BLCKSZ)));
12152 goto next_record_is_invalid;
12153 }
12154 pgstat_report_wait_end();
12155
12156 Assert(targetSegNo == readSegNo);
12157 Assert(targetPageOff == readOff);
12158 Assert(reqLen <= readLen);
12159
12160 xlogreader->seg.ws_tli = curFileTLI;
12161
12162 /*
12163 * Check the page header immediately, so that we can retry immediately if
12164 * it's not valid. This may seem unnecessary, because XLogReadRecord()
12165 * validates the page header anyway, and would propagate the failure up to
12166 * ReadRecord(), which would retry. However, there's a corner case with
12167 * continuation records, if a record is split across two pages such that
12168 * we would need to read the two pages from different sources. For
12169 * example, imagine a scenario where a streaming replica is started up,
12170 * and replay reaches a record that's split across two WAL segments. The
12171 * first page is only available locally, in pg_wal, because it's already
12172 * been recycled in the master. The second page, however, is not present
12173 * in pg_wal, and we should stream it from the master. There is a recycled
12174 * WAL segment present in pg_wal, with garbage contents, however. We would
12175 * read the first page from the local WAL segment, but when reading the
12176 * second page, we would read the bogus, recycled, WAL segment. If we
12177 * didn't catch that case here, we would never recover, because
12178 * ReadRecord() would retry reading the whole record from the beginning.
12179 *
12180 * Of course, this only catches errors in the page header, which is what
12181 * happens in the case of a recycled WAL segment. Other kinds of errors or
12182 * corruption still has the same problem. But this at least fixes the
12183 * common case, which can happen as part of normal operation.
12184 *
12185 * Validating the page header is cheap enough that doing it twice
12186 * shouldn't be a big deal from a performance point of view.
12187 */
12188 if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
12189 {
12190 /* reset any error XLogReaderValidatePageHeader() might have set */
12191 xlogreader->errormsg_buf[0] = '\0';
12192 goto next_record_is_invalid;
12193 }
12194
12195 return readLen;
12196
12197 next_record_is_invalid:
12198 lastSourceFailed = true;
12199
12200 if (readFile >= 0)
12201 close(readFile);
12202 readFile = -1;
12203 readLen = 0;
12204 readSource = XLOG_FROM_ANY;
12205
12206 /* In standby-mode, keep trying */
12207 if (StandbyMode)
12208 goto retry;
12209 else
12210 return -1;
12211 }
12212
12213 /*
12214 * Open the WAL segment containing WAL location 'RecPtr'.
12215 *
12216 * The segment can be fetched via restore_command, or via walreceiver having
12217 * streamed the record, or it can already be present in pg_wal. Checking
12218 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
12219 * too, in case someone copies a new segment directly to pg_wal. That is not
12220 * documented or recommended, though.
12221 *
12222 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
12223 * prepare to read WAL starting from RedoStartLSN after this.
12224 *
12225 * 'RecPtr' might not point to the beginning of the record we're interested
12226 * in, it might also point to the page or segment header. In that case,
12227 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
12228 * used to decide which timeline to stream the requested WAL from.
12229 *
12230 * If the record is not immediately available, the function returns false
12231 * if we're not in standby mode. In standby mode, waits for it to become
12232 * available.
12233 *
12234 * When the requested record becomes available, the function opens the file
12235 * containing it (if not open already), and returns true. When end of standby
12236 * mode is triggered by the user, and there is no more WAL available, returns
12237 * false.
 *
 * If 'randAccess' is true, curFileTLI is reset to 0 before each attempt to
 * read the segment from archive or pg_wal.
 *
 * Besides its arguments, the function reads and updates state declared
 * outside of it, notably currentSource, lastSourceFailed, readFile,
 * curFileTLI, flushedUpto, receiveTLI and pendingWalRcvRestart.  The segment
 * that gets opened is the one identified by readSegNo.
12238 */
12239 static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,bool randAccess,bool fetching_ckpt,XLogRecPtr tliRecPtr)12240 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
12241 bool fetching_ckpt, XLogRecPtr tliRecPtr)
12242 {
/*
 * last_fail_time is static, so it survives across calls: it records when
 * the streaming state was last abandoned and is used to pace retries.
 * streaming_reply_sent limits WalRcvForceReply() to once per call.
 */
12243 static TimestampTz last_fail_time = 0;
12244 TimestampTz now;
12245 bool streaming_reply_sent = false;
12246
12247 /*-------
12248 * Standby mode is implemented by a state machine:
12249 *
12250 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
12251 * pg_wal (XLOG_FROM_PG_WAL)
12252 * 2. Check trigger file
12253 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
12254 * 4. Rescan timelines
12255 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
12256 *
12257 * Failure to read from the current source advances the state machine to
12258 * the next state.
12259 *
12260 * 'currentSource' indicates the current state. There are no currentSource
12261 * values for "check trigger", "rescan timelines", and "sleep" states,
12262 * those actions are taken when reading from the previous source fails, as
12263 * part of advancing to the next state.
12264 *
12265 * If standby mode is turned off while reading WAL from stream, we move
12266 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
12267 * the files (which would be required at end of recovery, e.g., timeline
12268 * history file) from archive or pg_wal. We don't need to kill WAL receiver
12269 * here because it's already stopped when standby mode is turned off at
12270 * the end of recovery.
12271 *-------
12272 */
12273 if (!InArchiveRecovery)
12274 currentSource = XLOG_FROM_PG_WAL;
12275 else if (currentSource == XLOG_FROM_ANY ||
12276 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
12277 {
12278 lastSourceFailed = false;
12279 currentSource = XLOG_FROM_ARCHIVE;
12280 }
12281
12282 for (;;)
12283 {
/*
 * oldSource is kept only for the DEBUG2 "switched WAL source" message.
 * startWalReceiver is set when this iteration should request streaming
 * from the primary.
 */
12284 XLogSource oldSource = currentSource;
12285 bool startWalReceiver = false;
12286
12287 /*
12288 * First check if we failed to read from the current source, and
12289 * advance the state machine if so. The failure to read might've
12290 * happened outside this function, e.g when a CRC check fails on a
12291 * record, or within this loop.
12292 */
12293 if (lastSourceFailed)
12294 {
12295 switch (currentSource)
12296 {
12297 case XLOG_FROM_ARCHIVE:
12298 case XLOG_FROM_PG_WAL:
12299
12300 /*
12301 * Check to see if the trigger file exists. Note that we
12302 * do this only after failure, so when you create the
12303 * trigger file, we still finish replaying as much as we
12304 * can from archive and pg_wal before failover.
12305 */
12306 if (StandbyMode && CheckForStandbyTrigger())
12307 {
12308 ShutdownWalRcv();
12309 return false;
12310 }
12311
12312 /*
12313 * Not in standby mode, and we've now tried the archive
12314 * and pg_wal.
12315 */
12316 if (!StandbyMode)
12317 return false;
12318
12319 /*
12320 * Move to XLOG_FROM_STREAM state, and set to start a
12321 * walreceiver if necessary.
12322 */
12323 currentSource = XLOG_FROM_STREAM;
12324 startWalReceiver = true;
12325 break;
12326
12327 case XLOG_FROM_STREAM:
12328
12329 /*
12330 * Failure while streaming. Most likely, we got here
12331 * because streaming replication was terminated, or
12332 * promotion was triggered. But we also get here if we
12333 * find an invalid record in the WAL streamed from master,
12334 * in which case something is seriously wrong. There's
12335 * little chance that the problem will just go away, but
12336 * PANIC is not good for availability either, especially
12337 * in hot standby mode. So, we treat that the same as
12338 * disconnection, and retry from archive/pg_wal again. The
12339 * WAL in the archive should be identical to what was
12340 * streamed, so it's unlikely that it helps, but one can
12341 * hope...
12342 */
12343
12344 /*
12345 * We should be able to move to XLOG_FROM_STREAM only in
12346 * standby mode.
12347 */
12348 Assert(StandbyMode);
12349
12350 /*
12351 * Before we leave XLOG_FROM_STREAM state, make sure that
12352 * walreceiver is not active, so that it won't overwrite
12353 * WAL that we restore from archive.
12354 */
12355 if (WalRcvStreaming())
12356 ShutdownWalRcv();
12357
12358 /*
12359 * Before we sleep, re-scan for possible new timelines if
12360 * we were requested to recover to the latest timeline.
12361 */
12362 if (recoveryTargetTimeLineGoal == RECOVERY_TARGET_TIMELINE_LATEST)
12363 {
12364 if (rescanLatestTimeLine())
12365 {
/* The rescan reported success: go back to the archive without sleeping. */
12366 currentSource = XLOG_FROM_ARCHIVE;
12367 break;
12368 }
12369 }
12370
12371 /*
12372 * XLOG_FROM_STREAM is the last state in our state
12373 * machine, so we've exhausted all the options for
12374 * obtaining the requested WAL. We're going to loop back
12375 * and retry from the archive, but if it hasn't been long
12376 * since last attempt, sleep wal_retrieve_retry_interval
12377 * milliseconds to avoid busy-waiting.
12378 */
12379 now = GetCurrentTimestamp();
12380 if (!TimestampDifferenceExceeds(last_fail_time, now,
12381 wal_retrieve_retry_interval))
12382 {
12383 long wait_time;
12384
/* Sleep only for what is left of the interval since the last failure. */
12385 wait_time = wal_retrieve_retry_interval -
12386 TimestampDifferenceMilliseconds(last_fail_time, now);
12387
12388 (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
12389 WL_LATCH_SET | WL_TIMEOUT |
12390 WL_EXIT_ON_PM_DEATH,
12391 wait_time,
12392 WAIT_EVENT_RECOVERY_RETRIEVE_RETRY_INTERVAL);
12393 ResetLatch(&XLogCtl->recoveryWakeupLatch);
12394 now = GetCurrentTimestamp();
12395
12396 /* Handle interrupt signals of startup process */
12397 HandleStartupProcInterrupts();
12398 }
12399 last_fail_time = now;
12400 currentSource = XLOG_FROM_ARCHIVE;
12401 break;
12402
12403 default:
12404 elog(ERROR, "unexpected WAL source %d", currentSource);
12405 }
12406 }
12407 else if (currentSource == XLOG_FROM_PG_WAL)
12408 {
12409 /*
12410 * We just successfully read a file in pg_wal. We prefer files in
12411 * the archive over ones in pg_wal, so try the next file again
12412 * from the archive first.
12413 */
12414 if (InArchiveRecovery)
12415 currentSource = XLOG_FROM_ARCHIVE;
12416 }
12417
12418 if (currentSource != oldSource)
12419 elog(DEBUG2, "switched WAL source from %s to %s after %s",
12420 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
12421 lastSourceFailed ? "failure" : "success");
12422
12423 /*
12424 * We've now handled possible failure. Try to read from the chosen
12425 * source.
12426 */
12427 lastSourceFailed = false;
12428
12429 switch (currentSource)
12430 {
12431 case XLOG_FROM_ARCHIVE:
12432 case XLOG_FROM_PG_WAL:
12433
12434 /*
12435 * WAL receiver must not be running when reading WAL from
12436 * archive or pg_wal.
12437 */
12438 Assert(!WalRcvStreaming());
12439
12440 /* Close any old file we might have open. */
12441 if (readFile >= 0)
12442 {
12443 close(readFile);
12444 readFile = -1;
12445 }
12446 /* Reset curFileTLI if random fetch. */
12447 if (randAccess)
12448 curFileTLI = 0;
12449
12450 /*
12451 * Try to restore the file from archive, or read an existing
12452 * file from pg_wal.
12453 */
/*
 * In the XLOG_FROM_ARCHIVE state the source passed down is XLOG_FROM_ANY;
 * in the XLOG_FROM_PG_WAL state it is XLOG_FROM_PG_WAL itself.
 */
12454 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
12455 currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
12456 currentSource);
12457 if (readFile >= 0)
12458 return true; /* success! */
12459
12460 /*
12461 * Nope, not found in archive or pg_wal.
12462 */
12463 lastSourceFailed = true;
12464 break;
12465
12466 case XLOG_FROM_STREAM:
12467 {
12468 bool havedata;
12469
12470 /*
12471 * We should be able to move to XLOG_FROM_STREAM only in
12472 * standby mode.
12473 */
12474 Assert(StandbyMode);
12475
12476 /*
12477 * First, shutdown walreceiver if its restart has been
12478 * requested -- but no point if we're already slated for
12479 * starting it.
12480 */
12481 if (pendingWalRcvRestart && !startWalReceiver)
12482 {
12483 ShutdownWalRcv();
12484
12485 /*
12486 * Re-scan for possible new timelines if we were
12487 * requested to recover to the latest timeline.
12488 */
12489 if (recoveryTargetTimeLineGoal ==
12490 RECOVERY_TARGET_TIMELINE_LATEST)
12491 rescanLatestTimeLine();
12492
12493 startWalReceiver = true;
12494 }
/* The restart request is consumed here whether or not it was acted upon. */
12495 pendingWalRcvRestart = false;
12496
12497 /*
12498 * Launch walreceiver if needed.
12499 *
12500 * If fetching_ckpt is true, RecPtr points to the initial
12501 * checkpoint location. In that case, we use RedoStartLSN
12502 * as the streaming start position instead of RecPtr, so
12503 * that when we later jump backwards to start redo at
12504 * RedoStartLSN, we will have the logs streamed already.
12505 */
12506 if (startWalReceiver &&
12507 PrimaryConnInfo && strcmp(PrimaryConnInfo, "") != 0)
12508 {
12509 XLogRecPtr ptr;
12510 TimeLineID tli;
12511
12512 if (fetching_ckpt)
12513 {
12514 ptr = RedoStartLSN;
12515 tli = ControlFile->checkPointCopy.ThisTimeLineID;
12516 }
12517 else
12518 {
12519 ptr = RecPtr;
12520
12521 /*
12522 * Use the record begin position to determine the
12523 * TLI, rather than the position we're reading.
12524 */
12525 tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
12526
12527 if (curFileTLI > 0 && tli < curFileTLI)
12528 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
12529 (uint32) (tliRecPtr >> 32),
12530 (uint32) tliRecPtr,
12531 tli, curFileTLI);
12532 }
12533 curFileTLI = tli;
12534 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
12535 PrimarySlotName,
12536 wal_receiver_create_temp_slot);
/*
 * Zero the cached flush position, so that the "RecPtr < flushedUpto" test
 * below fails and the position is fetched afresh from the walreceiver.
 */
12537 flushedUpto = 0;
12538 }
12539
12540 /*
12541 * Check if WAL receiver is active or wait to start up.
12542 */
12543 if (!WalRcvStreaming())
12544 {
12545 lastSourceFailed = true;
12546 break;
12547 }
12548
12549 /*
12550 * Walreceiver is active, so see if new data has arrived.
12551 *
12552 * We only advance XLogReceiptTime when we obtain fresh
12553 * WAL from walreceiver and observe that we had already
12554 * processed everything before the most recent "chunk"
12555 * that it flushed to disk. In steady state where we are
12556 * keeping up with the incoming data, XLogReceiptTime will
12557 * be updated on each cycle. When we are behind,
12558 * XLogReceiptTime will not advance, so the grace time
12559 * allotted to conflicting queries will decrease.
12560 */
12561 if (RecPtr < flushedUpto)
12562 havedata = true;
12563 else
12564 {
12565 XLogRecPtr latestChunkStart;
12566
12567 flushedUpto = GetWalRcvFlushRecPtr(&latestChunkStart, &receiveTLI);
12568 if (RecPtr < flushedUpto && receiveTLI == curFileTLI)
12569 {
12570 havedata = true;
12571 if (latestChunkStart <= RecPtr)
12572 {
12573 XLogReceiptTime = GetCurrentTimestamp();
12574 SetCurrentChunkStartTime(XLogReceiptTime);
12575 }
12576 }
12577 else
12578 havedata = false;
12579 }
12580 if (havedata)
12581 {
12582 /*
12583 * Great, streamed far enough. Open the file if it's
12584 * not open already. Also read the timeline history
12585 * file if we haven't initialized timeline history
12586 * yet; it should be streamed over and present in
12587 * pg_wal by now. Use XLOG_FROM_STREAM so that source
12588 * info is set correctly and XLogReceiptTime isn't
12589 * changed.
12590 *
12591 * NB: We must set readTimeLineHistory based on
12592 * recoveryTargetTLI, not receiveTLI. Normally they'll
12593 * be the same, but if recovery_target_timeline is
12594 * 'latest' and archiving is configured, then it's
12595 * possible that we managed to retrieve one or more
12596 * new timeline history files from the archive,
12597 * updating recoveryTargetTLI.
12598 */
12599 if (readFile < 0)
12600 {
12601 if (!expectedTLEs)
12602 expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
12603 readFile = XLogFileRead(readSegNo, PANIC,
12604 receiveTLI,
12605 XLOG_FROM_STREAM, false);
12606 Assert(readFile >= 0);
12607 }
12608 else
12609 {
12610 /* just make sure source info is correct... */
12611 readSource = XLOG_FROM_STREAM;
12612 XLogReceiptSource = XLOG_FROM_STREAM;
12613 return true;
12614 }
/*
 * Reached only when the file was opened just above.  We do not return
 * from here; the loop runs again, and if the data is still available the
 * next pass takes the "readFile >= 0" branch and returns true.
 */
12615 break;
12616 }
12617
12618 /*
12619 * Data not here yet. Check for trigger, then wait for
12620 * walreceiver to wake us up when new WAL arrives.
12621 */
12622 if (CheckForStandbyTrigger())
12623 {
12624 /*
12625 * Note that we don't "return false" immediately here.
12626 * After being triggered, we still want to replay all
12627 * the WAL that was already streamed. It's in pg_wal
12628 * now, so we just treat this as a failure, and the
12629 * state machine will move on to replay the streamed
12630 * WAL from pg_wal, and then recheck the trigger and
12631 * exit replay.
12632 */
12633 lastSourceFailed = true;
12634 break;
12635 }
12636
12637 /*
12638 * Since we have replayed everything we have received so
12639 * far and are about to start waiting for more WAL, let's
12640 * tell the upstream server our replay location now so
12641 * that pg_stat_replication doesn't show stale
12642 * information.
12643 */
12644 if (!streaming_reply_sent)
12645 {
12646 WalRcvForceReply();
12647 streaming_reply_sent = true;
12648 }
12649
12650 /*
12651 * Wait for more WAL to arrive. Time out after 5 seconds
12652 * to react to a trigger file promptly and to check if the
12653 * WAL receiver is still active.
12654 */
12655 (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
12656 WL_LATCH_SET | WL_TIMEOUT |
12657 WL_EXIT_ON_PM_DEATH,
12658 5000L, WAIT_EVENT_RECOVERY_WAL_STREAM);
12659 ResetLatch(&XLogCtl->recoveryWakeupLatch);
12660 break;
12661 }
12662
12663 default:
12664 elog(ERROR, "unexpected WAL source %d", currentSource);
12665 }
12666
12667 /*
12668 * This possibly-long loop needs to handle interrupts of startup
12669 * process.
12670 */
12671 HandleStartupProcInterrupts();
12672 }
12673
12674 return false; /* not reached */
12675 }
12676
12677 /*
12678 * Set flag to signal the walreceiver to restart. (The startup process calls
12679 * this on noticing a relevant configuration change.)
12680 */
12681 void
StartupRequestWalReceiverRestart(void)12682 StartupRequestWalReceiverRestart(void)
12683 {
12684 if (currentSource == XLOG_FROM_STREAM && WalRcvRunning())
12685 {
12686 ereport(LOG,
12687 (errmsg("WAL receiver process shutdown requested")));
12688
12689 pendingWalRcvRestart = true;
12690 }
12691 }
12692
12693 /*
12694 * Determine what log level should be used to report a corrupt WAL record
12695 * in the current WAL page, previously read by XLogPageRead().
12696 *
12697 * 'emode' is the error mode that would be used to report a file-not-found
12698 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
12699 * we're retrying the exact same record that we've tried previously, only
12700 * complain the first time to keep the noise down. However, we only do when
12701 * reading from pg_wal, because we don't expect any invalid records in archive
12702 * or in records streamed from master. Files in the archive should be complete,
12703 * and we should never hit the end of WAL because we stop and wait for more WAL
12704 * to arrive before replaying it.
12705 *
12706 * NOTE: This function remembers the RecPtr value it was last called with,
12707 * to suppress repeated messages about the same record. Only call this when
12708 * you are about to ereport(), or you might cause a later message to be
12709 * erroneously suppressed.
12710 */
12711 static int
emode_for_corrupt_record(int emode,XLogRecPtr RecPtr)12712 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12713 {
12714 static XLogRecPtr lastComplaint = 0;
12715
12716 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12717 {
12718 if (RecPtr == lastComplaint)
12719 emode = DEBUG1;
12720 else
12721 lastComplaint = RecPtr;
12722 }
12723 return emode;
12724 }
12725
12726 /*
12727 * Has a standby promotion already been triggered?
12728 *
12729 * Unlike CheckForStandbyTrigger(), this works in any process
12730 * that's connected to shared memory.
12731 */
12732 bool
PromoteIsTriggered(void)12733 PromoteIsTriggered(void)
12734 {
12735 /*
12736 * We check shared state each time only until a standby promotion is
12737 * triggered. We can't trigger a promotion again, so there's no need to
12738 * keep checking after the shared variable has once been seen true.
12739 */
12740 if (LocalPromoteIsTriggered)
12741 return true;
12742
12743 SpinLockAcquire(&XLogCtl->info_lck);
12744 LocalPromoteIsTriggered = XLogCtl->SharedPromoteIsTriggered;
12745 SpinLockRelease(&XLogCtl->info_lck);
12746
12747 return LocalPromoteIsTriggered;
12748 }
12749
12750 static void
SetPromoteIsTriggered(void)12751 SetPromoteIsTriggered(void)
12752 {
12753 SpinLockAcquire(&XLogCtl->info_lck);
12754 XLogCtl->SharedPromoteIsTriggered = true;
12755 SpinLockRelease(&XLogCtl->info_lck);
12756
12757 LocalPromoteIsTriggered = true;
12758 }
12759
12760 /*
12761 * Check to see whether the user-specified trigger file exists and whether a
12762 * promote request has arrived. If either condition holds, return true.
12763 */
12764 static bool
CheckForStandbyTrigger(void)12765 CheckForStandbyTrigger(void)
12766 {
12767 struct stat stat_buf;
12768
12769 if (LocalPromoteIsTriggered)
12770 return true;
12771
12772 if (IsPromoteSignaled())
12773 {
12774 /*
12775 * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
12776 * signal handler. It now leaves the file in place and lets the
12777 * Startup process do the unlink. This allows Startup to know whether
12778 * it should create a full checkpoint before starting up (fallback
12779 * mode). Fast promotion takes precedence.
12780 */
12781 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12782 {
12783 unlink(PROMOTE_SIGNAL_FILE);
12784 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12785 fast_promote = true;
12786 }
12787 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12788 {
12789 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12790 fast_promote = false;
12791 }
12792
12793 ereport(LOG, (errmsg("received promote request")));
12794
12795 ResetPromoteSignaled();
12796 SetPromoteIsTriggered();
12797 return true;
12798 }
12799
12800 if (PromoteTriggerFile == NULL || strcmp(PromoteTriggerFile, "") == 0)
12801 return false;
12802
12803 if (stat(PromoteTriggerFile, &stat_buf) == 0)
12804 {
12805 ereport(LOG,
12806 (errmsg("promote trigger file found: %s", PromoteTriggerFile)));
12807 unlink(PromoteTriggerFile);
12808 SetPromoteIsTriggered();
12809 fast_promote = true;
12810 return true;
12811 }
12812 else if (errno != ENOENT)
12813 ereport(ERROR,
12814 (errcode_for_file_access(),
12815 errmsg("could not stat promote trigger file \"%s\": %m",
12816 PromoteTriggerFile)));
12817
12818 return false;
12819 }
12820
12821 /*
12822 * Remove the files signaling a standby promotion request.
12823 */
12824 void
RemovePromoteSignalFiles(void)12825 RemovePromoteSignalFiles(void)
12826 {
12827 unlink(PROMOTE_SIGNAL_FILE);
12828 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12829 }
12830
12831 /*
12832 * Check to see if a promote request has arrived. Should be
12833 * called by postmaster after receiving SIGUSR1.
12834 */
12835 bool
CheckPromoteSignal(void)12836 CheckPromoteSignal(void)
12837 {
12838 struct stat stat_buf;
12839
12840 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
12841 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12842 return true;
12843
12844 return false;
12845 }
12846
12847 /*
12848 * Wake up startup process to replay newly arrived WAL, or to notice that
12849 * failover has been requested.
12850 */
12851 void
WakeupRecovery(void)12852 WakeupRecovery(void)
12853 {
12854 SetLatch(&XLogCtl->recoveryWakeupLatch);
12855 }
12856
12857 /*
12858 * Update the WalWriterSleeping flag.
12859 */
12860 void
SetWalWriterSleeping(bool sleeping)12861 SetWalWriterSleeping(bool sleeping)
12862 {
12863 SpinLockAcquire(&XLogCtl->info_lck);
12864 XLogCtl->WalWriterSleeping = sleeping;
12865 SpinLockRelease(&XLogCtl->info_lck);
12866 }
12867
12868 /*
12869 * Schedule a walreceiver wakeup in the main recovery loop.
12870 */
12871 void
XLogRequestWalReceiverReply(void)12872 XLogRequestWalReceiverReply(void)
12873 {
12874 doRequestWalReceiverReply = true;
12875 }
12876