1 /*-------------------------------------------------------------------------
2 *
3 * xlog.c
4 * PostgreSQL write-ahead log manager
5 *
6 *
7 * Portions Copyright (c) 1996-2017, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * src/backend/access/transam/xlog.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <math.h>
19 #include <time.h>
20 #include <fcntl.h>
21 #include <sys/stat.h>
22 #include <sys/time.h>
23 #include <unistd.h>
24
25 #include "access/clog.h"
26 #include "access/commit_ts.h"
27 #include "access/multixact.h"
28 #include "access/rewriteheap.h"
29 #include "access/subtrans.h"
30 #include "access/timeline.h"
31 #include "access/transam.h"
32 #include "access/tuptoaster.h"
33 #include "access/twophase.h"
34 #include "access/xact.h"
35 #include "access/xlog_internal.h"
36 #include "access/xloginsert.h"
37 #include "access/xlogreader.h"
38 #include "access/xlogutils.h"
39 #include "catalog/catversion.h"
40 #include "catalog/pg_control.h"
41 #include "catalog/pg_database.h"
42 #include "commands/tablespace.h"
43 #include "miscadmin.h"
44 #include "pgstat.h"
45 #include "port/atomics.h"
46 #include "postmaster/bgwriter.h"
47 #include "postmaster/walwriter.h"
48 #include "postmaster/startup.h"
49 #include "replication/basebackup.h"
50 #include "replication/logical.h"
51 #include "replication/slot.h"
52 #include "replication/origin.h"
53 #include "replication/snapbuild.h"
54 #include "replication/walreceiver.h"
55 #include "replication/walsender.h"
56 #include "storage/bufmgr.h"
57 #include "storage/fd.h"
58 #include "storage/ipc.h"
59 #include "storage/large_object.h"
60 #include "storage/latch.h"
61 #include "storage/pmsignal.h"
62 #include "storage/predicate.h"
63 #include "storage/proc.h"
64 #include "storage/procarray.h"
65 #include "storage/reinit.h"
66 #include "storage/smgr.h"
67 #include "storage/spin.h"
68 #include "utils/backend_random.h"
69 #include "utils/builtins.h"
70 #include "utils/guc.h"
71 #include "utils/memutils.h"
72 #include "utils/pg_lsn.h"
73 #include "utils/ps_status.h"
74 #include "utils/relmapper.h"
75 #include "utils/snapmgr.h"
76 #include "utils/timestamp.h"
77 #include "pg_trace.h"
78
/* Data-checksum version used at bootstrap; defined outside this file. */
extern uint32 bootstrap_data_checksum_version;

/* File path names (all relative to $PGDATA) */
#define RECOVERY_COMMAND_FILE	"recovery.conf"
#define RECOVERY_COMMAND_DONE	"recovery.done"
#define PROMOTE_SIGNAL_FILE		"promote"
#define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"


/* User-settable parameters (GUCs) */
int			max_wal_size_mb = 1024; /* 1 GB */
int			min_wal_size_mb = 80;	/* 80 MB */
int			wal_keep_segments = 0;
int			XLOGbuffers = -1;	/* -1 means "choose automatically" */
int			XLogArchiveTimeout = 0;
int			XLogArchiveMode = ARCHIVE_MODE_OFF;
char	   *XLogArchiveCommand = NULL;
bool		EnableHotStandby = false;
bool		fullPageWrites = true;
bool		wal_log_hints = false;
bool		wal_compression = false;
char	   *wal_consistency_checking_string = NULL;
bool	   *wal_consistency_checking = NULL;
bool		log_checkpoints = false;
int			sync_method = DEFAULT_SYNC_METHOD;
int			wal_level = WAL_LEVEL_MINIMAL;
int			CommitDelay = 0;	/* precommit delay in microseconds */
int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int			wal_retrieve_retry_interval = 5000;	/* in milliseconds */

#ifdef WAL_DEBUG
bool		XLOG_DEBUG = false;
#endif
112
/*
 * Number of WAL insertion locks to use.  A higher value allows more
 * insertions to happen concurrently, but adds some CPU overhead to flushing
 * the WAL, which needs to iterate all the locks.
 */
#define NUM_XLOGINSERT_LOCKS  8

/*
 * Max distance from last checkpoint, before triggering a new xlog-based
 * checkpoint.  (Derived from max_wal_size_mb; not set directly by users.)
 */
int			CheckPointSegments;

/* Estimated distance between checkpoints, in bytes */
static double CheckPointDistanceEstimate = 0;
static double PrevCheckPointDistance = 0;
129
/*
 * GUC support
 *
 * Table of valid values for the wal_sync_method GUC.  Which entries exist
 * depends on what the platform supports; the list is NULL-terminated.
 */
const struct config_enum_entry sync_method_options[] = {
	{"fsync", SYNC_METHOD_FSYNC, false},
#ifdef HAVE_FSYNC_WRITETHROUGH
	{"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
#endif
#ifdef HAVE_FDATASYNC
	{"fdatasync", SYNC_METHOD_FDATASYNC, false},
#endif
#ifdef OPEN_SYNC_FLAG
	{"open_sync", SYNC_METHOD_OPEN, false},
#endif
#ifdef OPEN_DATASYNC_FLAG
	{"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
#endif
	{NULL, 0, false}
};
149
150
151 /*
152 * Although only "on", "off", and "always" are documented,
153 * we accept all the likely variants of "on" and "off".
154 */
155 const struct config_enum_entry archive_mode_options[] = {
156 {"always", ARCHIVE_MODE_ALWAYS, false},
157 {"on", ARCHIVE_MODE_ON, false},
158 {"off", ARCHIVE_MODE_OFF, false},
159 {"true", ARCHIVE_MODE_ON, true},
160 {"false", ARCHIVE_MODE_OFF, true},
161 {"yes", ARCHIVE_MODE_ON, true},
162 {"no", ARCHIVE_MODE_OFF, true},
163 {"1", ARCHIVE_MODE_ON, true},
164 {"0", ARCHIVE_MODE_OFF, true},
165 {NULL, 0, false}
166 };
167
/*
 * Statistics for current checkpoint are collected in this global struct.
 * Because only the checkpointer or a stand-alone backend can perform
 * checkpoints, this will be unused in normal backends.
 */
CheckpointStatsData CheckpointStats;

/*
 * ThisTimeLineID will be same in all backends --- it identifies current
 * WAL timeline for the database system.
 */
TimeLineID	ThisTimeLineID = 0;

/*
 * Are we doing recovery from XLOG?
 *
 * This is only ever true in the startup process; it should be read as meaning
 * "this process is replaying WAL records", rather than "the system is in
 * recovery mode".  It should be examined primarily by functions that need
 * to act differently when called from a WAL redo function (e.g., to skip WAL
 * logging).  To check whether the system is in recovery regardless of which
 * process you're running in, use RecoveryInProgress() but only after shared
 * memory startup and lock initialization.
 */
bool		InRecovery = false;

/* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
HotStandbyState standbyState = STANDBY_DISABLED;

/* Start LSN of the last record read (presumably maintained by the main
 * recovery loop later in this file — not visible in this chunk). */
static XLogRecPtr LastRec;

/* Local copy of WalRcv->receivedUpto */
static XLogRecPtr receivedUpto = 0;
static TimeLineID receiveTLI = 0;

/*
 * abortedRecPtr is the start pointer of a broken record at end of WAL when
 * recovery completes; missingContrecPtr is the location of the first
 * contrecord that went missing.  See CreateOverwriteContrecordRecord for
 * details.
 */
static XLogRecPtr abortedRecPtr;
static XLogRecPtr missingContrecPtr;

/*
 * During recovery, lastFullPageWrites keeps track of full_page_writes that
 * the replayed WAL records indicate.  It's initialized with full_page_writes
 * that the recovery starting checkpoint record indicates, and then updated
 * each time XLOG_FPW_CHANGE record is replayed.
 */
static bool lastFullPageWrites;

/*
 * Local copy of the state tracked by SharedRecoveryState in shared memory.
 * It is false if SharedRecoveryState is RECOVERY_STATE_DONE.  True actually
 * means "not known, need to check the shared state".
 */
static bool LocalRecoveryInProgress = true;

/*
 * Local copy of SharedHotStandbyActive variable.  False actually means "not
 * known, need to check the shared state".
 */
static bool LocalHotStandbyActive = false;

/*
 * Local state for XLogInsertAllowed():
 *		1: unconditionally allowed to insert XLOG
 *		0: unconditionally not allowed to insert XLOG
 *		-1: must check RecoveryInProgress(); disallow until it is false
 * Most processes start with -1 and transition to 1 after seeing that recovery
 * is not in progress.  But we can also force the value for special cases.
 * The coding in XLogInsertAllowed() depends on the first two of these states
 * being numerically the same as bool true and false.
 */
static int	LocalXLogInsertAllowed = -1;
244
/*
 * When ArchiveRecoveryRequested is set, archive recovery was requested,
 * ie. recovery.conf file was present.  When InArchiveRecovery is set, we are
 * currently recovering using offline XLOG archives.  These variables are only
 * valid in the startup process.
 *
 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
 * currently performing crash recovery using only XLOG files in pg_wal, but
 * will switch to using offline XLOG archives as soon as we reach the end of
 * WAL in pg_wal.
 */
bool		ArchiveRecoveryRequested = false;
bool		InArchiveRecovery = false;

/* Was the last xlog file restored from archive, or local? */
static bool restoredFromArchive = false;

/* Buffers dedicated to consistency checks of size BLCKSZ */
static char *replay_image_masked = NULL;
static char *master_image_masked = NULL;

/* options taken from recovery.conf for archive recovery */
char	   *recoveryRestoreCommand = NULL;
static char *recoveryEndCommand = NULL;
static char *archiveCleanupCommand = NULL;
static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
static bool recoveryTargetInclusive = true;
static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
static TransactionId recoveryTargetXid;
static TimestampTz recoveryTargetTime;
static char *recoveryTargetName;
static XLogRecPtr recoveryTargetLSN;
static int	recovery_min_apply_delay = 0;	/* in milliseconds */
static TimestampTz recoveryDelayUntilTime;

/* options taken from recovery.conf for XLOG streaming */
static bool StandbyModeRequested = false;
static char *PrimaryConnInfo = NULL;
static char *PrimarySlotName = NULL;
static char *TriggerFile = NULL;

/* are we currently in standby mode? */
bool		StandbyMode = false;

/* whether request for fast promotion has been made yet */
static bool fast_promote = false;

/*
 * if recoveryStopsBefore/After returns true, it saves information of the stop
 * point here
 */
static TransactionId recoveryStopXid;
static TimestampTz recoveryStopTime;
static XLogRecPtr recoveryStopLSN;
static char recoveryStopName[MAXFNAMELEN];
static bool recoveryStopAfter;
301
/*
 * During normal operation, the only timeline we care about is ThisTimeLineID.
 * During recovery, however, things are more complicated.  To simplify life
 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
 * scan through the WAL history (that is, it is the line that was active when
 * the currently-scanned WAL record was generated).  We also need these
 * timeline values:
 *
 * recoveryTargetTLI: the desired timeline that we want to end in.
 *
 * recoveryTargetIsLatest: was the requested target timeline 'latest'?
 *
 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and
 * the timelines of its known parents, newest first (so recoveryTargetTLI is
 * always the first list member).  Only these TLIs are expected to be seen in
 * the WAL segments we read, and indeed only these TLIs will be considered as
 * candidate WAL files to open at all.
 *
 * curFileTLI: the TLI appearing in the name of the current input WAL file.
 * (This is not necessarily the same as ThisTimeLineID, because we could be
 * scanning data that was copied from an ancestor timeline when the current
 * file was created.)  During a sequential scan we do not allow this value
 * to decrease.
 */
static TimeLineID recoveryTargetTLI;
static bool recoveryTargetIsLatest = false;
static List *expectedTLEs;
static TimeLineID curFileTLI;
330
/*
 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
 * current backend.  It is updated for all inserts.  XactLastRecEnd points to
 * end+1 of the last record, and is reset when we end a top-level transaction,
 * or start a new one; so it can be used to tell if the current transaction
 * has created any XLOG records.
 *
 * While in parallel mode, this may not be fully up to date.  When committing,
 * a transaction can assume this covers all xlog records written either by the
 * user backend or by any parallel worker which was present at any point
 * during the transaction.  But when aborting, or when still in parallel mode,
 * other parallel backends may have written WAL records at later LSNs than the
 * value stored here.  The parallel leader advances its own copy, when
 * necessary, in WaitForParallelWorkersToFinish.
 */
XLogRecPtr	ProcLastRecPtr = InvalidXLogRecPtr;
XLogRecPtr	XactLastRecEnd = InvalidXLogRecPtr;
XLogRecPtr	XactLastCommitEnd = InvalidXLogRecPtr;

/*
 * RedoRecPtr is this backend's local copy of the REDO record pointer
 * (which is almost but not quite the same as a pointer to the most recent
 * CHECKPOINT record).  We update this from the shared-memory copy,
 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
 * hold an insertion lock).  See XLogInsertRecord for details.  We are also
 * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
 * see GetRedoRecPtr.  A freshly spawned backend obtains the value during
 * InitXLOGAccess.
 */
static XLogRecPtr RedoRecPtr;

/*
 * doPageWrites is this backend's local copy of (forcePageWrites ||
 * fullPageWrites).  It is used together with RedoRecPtr to decide whether
 * a full-page image of a page need to be taken.
 */
static bool doPageWrites;

/* Has the recovery code requested a walreceiver wakeup? */
static bool doRequestWalReceiverReply;

/*
 * RedoStartLSN points to the checkpoint's REDO location which is specified
 * in a backup label file, backup history file or control file.  In standby
 * mode, XLOG streaming usually starts from the position where an invalid
 * record was found.  But if we fail to read even the initial checkpoint
 * record, we use the REDO location instead of the checkpoint location as
 * the start position of XLOG streaming.  Otherwise we would have to jump
 * backwards to the REDO location after reading the checkpoint record,
 * because the REDO record can precede the checkpoint record.
 */
static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
383
384 /*----------
385 * Shared-memory data structures for XLOG control
386 *
387 * LogwrtRqst indicates a byte position that we need to write and/or fsync
388 * the log up to (all records before that point must be written or fsynced).
389 * LogwrtResult indicates the byte positions we have already written/fsynced.
390 * These structs are identical but are declared separately to indicate their
391 * slightly different functions.
392 *
393 * To read XLogCtl->LogwrtResult, you must hold either info_lck or
394 * WALWriteLock. To update it, you need to hold both locks. The point of
395 * this arrangement is that the value can be examined by code that already
396 * holds WALWriteLock without needing to grab info_lck as well. In addition
397 * to the shared variable, each backend has a private copy of LogwrtResult,
398 * which is updated when convenient.
399 *
400 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
401 * (protected by info_lck), but we don't need to cache any copies of it.
402 *
403 * info_lck is only held long enough to read/update the protected variables,
404 * so it's a plain spinlock. The other locks are held longer (potentially
405 * over I/O operations), so we use LWLocks for them. These locks are:
406 *
407 * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
408 * It is only held while initializing and changing the mapping. If the
409 * contents of the buffer being replaced haven't been written yet, the mapping
410 * lock is released while the write is done, and reacquired afterwards.
411 *
412 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
413 * XLogFlush).
414 *
415 * ControlFileLock: must be held to read/update control file or create
416 * new log file.
417 *
418 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
419 * only one checkpointer at a time; currently, with all checkpoints done by
420 * the checkpointer, this is just pro forma).
421 *
422 *----------
423 */
424
/* Byte positions we have been requested to write/flush up to (see above). */
typedef struct XLogwrtRqst
{
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
} XLogwrtRqst;
430
/* Byte positions already written/flushed; same shape as XLogwrtRqst. */
typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;
436
/*
 * Inserting to WAL is protected by a small fixed number of WAL insertion
 * locks.  To insert to the WAL, you must hold one of the locks - it doesn't
 * matter which one.  To lock out other concurrent insertions, you must hold
 * all of them.  Each WAL insertion lock consists of a lightweight lock, plus
 * an indicator of how far the insertion has progressed (insertingAt).
 *
 * The insertingAt values are read when a process wants to flush WAL from
 * the in-memory buffers to disk, to check that all the insertions to the
 * region the process is about to write out have finished.  You could simply
 * wait for all currently in-progress insertions to finish, but the
 * insertingAt indicator allows you to ignore insertions to later in the WAL,
 * so that you only wait for the insertions that are modifying the buffers
 * you're about to write out.
 *
 * This isn't just an optimization.  If all the WAL buffers are dirty, an
 * inserter that's holding a WAL insert lock might need to evict an old WAL
 * buffer, which requires flushing the WAL.  If it's possible for an inserter
 * to block on another inserter unnecessarily, deadlock can arise when two
 * inserters holding a WAL insert lock wait for each other to finish their
 * insertion.
 *
 * Small WAL records that don't cross a page boundary never update the value,
 * the WAL record is just copied to the page and the lock is released.  But
 * to avoid the deadlock-scenario explained above, the indicator is always
 * updated before sleeping while holding an insertion lock.
 *
 * lastImportantAt contains the LSN of the last important WAL record inserted
 * using a given lock.  This value is used to detect if there has been
 * important WAL activity since the last time some action, like a checkpoint,
 * was performed - allowing to not repeat the action if not.  The LSN is
 * updated for all insertions, unless the XLOG_MARK_UNIMPORTANT flag was
 * set.  lastImportantAt is never cleared, only overwritten by the LSN of
 * newer records.  Tracking the WAL activity directly in WALInsertLock has
 * the advantage of not needing any additional locks to update the value.
 */
typedef struct
{
	LWLock		lock;			/* must be held to insert WAL */
	XLogRecPtr	insertingAt;	/* how far the holder's insertion has got */
	XLogRecPtr	lastImportantAt;	/* LSN of last important record inserted
									 * under this lock */
} WALInsertLock;
479
/*
 * All the WAL insertion locks are allocated as an array in shared memory.  We
 * force the array stride to be a power of 2, which saves a few cycles in
 * indexing, but more importantly also ensures that individual slots don't
 * cross cache line boundaries.  (Of course, we have to also ensure that the
 * array start address is suitably aligned.)
 */
typedef union WALInsertLockPadded
{
	WALInsertLock l;
	char		pad[PG_CACHE_LINE_SIZE];	/* pads each slot to a full
											 * cache line */
} WALInsertLockPadded;
492
/*
 * State of an exclusive backup, necessary to control concurrent activities
 * across sessions when working on exclusive backups.
 *
 * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
 * running, to be more precise pg_start_backup() is not being executed for
 * an exclusive backup and there is no exclusive backup in progress.
 * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
 * exclusive backup.
 * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
 * running and an exclusive backup is in progress.  pg_stop_backup() is
 * needed to finish it.
 * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
 * exclusive backup.
 */
typedef enum ExclusiveBackupState
{
	EXCLUSIVE_BACKUP_NONE = 0,
	EXCLUSIVE_BACKUP_STARTING,
	EXCLUSIVE_BACKUP_IN_PROGRESS,
	EXCLUSIVE_BACKUP_STOPPING
} ExclusiveBackupState;

/*
 * Session status of running backup, used for sanity checks in SQL-callable
 * functions to start and stop backups.
 */
static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
521
/*
 * Shared state data for WAL insertion.
 */
typedef struct XLogCtlInsert
{
	slock_t		insertpos_lck;	/* protects CurrBytePos and PrevBytePos */

	/*
	 * CurrBytePos is the end of reserved WAL.  The next record will be
	 * inserted at that position.  PrevBytePos is the start position of the
	 * previously inserted (or rather, reserved) record - it is copied to the
	 * prev-link of the next record.  These are stored as "usable byte
	 * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
	 */
	uint64		CurrBytePos;
	uint64		PrevBytePos;

	/*
	 * Make sure the above heavily-contended spinlock and byte positions are
	 * on their own cache line.  In particular, the RedoRecPtr and full page
	 * write variables below should be on a different cache line.  They are
	 * read on every WAL insertion, but updated rarely, and we don't want
	 * those reads to steal the cache line containing Curr/PrevBytePos.
	 */
	char		pad[PG_CACHE_LINE_SIZE];

	/*
	 * fullPageWrites is the master copy used by all backends to determine
	 * whether to write full-page to WAL, instead of using process-local one.
	 * This is required because, when full_page_writes is changed by SIGHUP,
	 * we must WAL-log it before it actually affects WAL-logging by backends.
	 * Checkpointer sets at startup or after SIGHUP.
	 *
	 * To read these fields, you must hold an insertion lock.  To modify
	 * them, you must hold ALL the locks.
	 */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
	bool		fullPageWrites;

	/*
	 * exclusiveBackupState indicates the state of an exclusive backup (see
	 * comments of ExclusiveBackupState for more details).
	 * nonExclusiveBackups is a counter indicating the number of streaming
	 * base backups currently in progress.  forcePageWrites is set to true
	 * when either of these is non-zero.  lastBackupStart is the latest
	 * checkpoint redo location used as a starting point for an online
	 * backup.
	 */
	ExclusiveBackupState exclusiveBackupState;
	int			nonExclusiveBackups;
	XLogRecPtr	lastBackupStart;

	/*
	 * WAL insertion locks.  (Also cached in the file-level WALInsertLocks
	 * variable, for convenience.)
	 */
	WALInsertLockPadded *WALInsertLocks;
} XLogCtlInsert;
579
/*
 * Total shared-memory state for XLOG.
 */
typedef struct XLogCtlData
{
	XLogCtlInsert Insert;

	/* Protected by info_lck: */
	XLogwrtRqst LogwrtRqst;
	XLogRecPtr	RedoRecPtr;		/* a recent copy of Insert->RedoRecPtr */
	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
	TransactionId ckptXid;
	XLogRecPtr	asyncXactLSN;	/* LSN of newest async commit/abort */
	XLogRecPtr	replicationSlotMinLSN;	/* oldest LSN needed by any slot */

	XLogSegNo	lastRemovedSegNo;	/* latest removed/recycled XLOG segment */

	/* Fake LSN counter, for unlogged relations.  Protected by ulsn_lck. */
	XLogRecPtr	unloggedLSN;
	slock_t		ulsn_lck;

	/* Time and LSN of last xlog segment switch.  Protected by WALWriteLock. */
	pg_time_t	lastSegSwitchTime;
	XLogRecPtr	lastSegSwitchLSN;

	/*
	 * Protected by info_lck and WALWriteLock (you must hold either lock to
	 * read it, but both to update)
	 */
	XLogwrtResult LogwrtResult;

	/*
	 * Latest initialized page in the cache (last byte position + 1).
	 *
	 * To change the identity of a buffer (and InitializedUpTo), you need to
	 * hold WALBufMappingLock.  To change the identity of a buffer that's
	 * still dirty, the old page needs to be written out first, and for that
	 * you need WALWriteLock, and you need to ensure that there are no
	 * in-progress insertions to the page by calling
	 * WaitXLogInsertionsToFinish().
	 */
	XLogRecPtr	InitializedUpTo;

	/*
	 * These values do not change after startup, although the pointed-to
	 * pages and xlblocks values certainly do.  xlblock values are protected
	 * by WALBufMappingLock.
	 */
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */

	/*
	 * Shared copy of ThisTimeLineID.  Does not change after end-of-recovery.
	 * If we created a new timeline when the system was started up,
	 * PrevTimeLineID is the old timeline's ID that we forked off from.
	 * Otherwise it's equal to ThisTimeLineID.
	 */
	TimeLineID	ThisTimeLineID;
	TimeLineID	PrevTimeLineID;

	/*
	 * archiveCleanupCommand is read from recovery.conf but needs to be in
	 * shared memory so that the checkpointer process can access it.
	 */
	char		archiveCleanupCommand[MAXPGPATH];

	/*
	 * SharedRecoveryState indicates if we're still in crash or archive
	 * recovery.  Protected by info_lck.
	 */
	RecoveryState SharedRecoveryState;

	/*
	 * SharedHotStandbyActive indicates if we allow hot-standby queries to be
	 * run.  Protected by info_lck.  (NOTE(review): the previous comment here
	 * was a copy-paste duplicate of SharedRecoveryState's; see also the
	 * LocalHotStandbyActive variable above.)
	 */
	bool		SharedHotStandbyActive;

	/*
	 * WalWriterSleeping indicates whether the WAL writer is currently in
	 * low-power mode (and hence should be nudged if an async commit occurs).
	 * Protected by info_lck.
	 */
	bool		WalWriterSleeping;

	/*
	 * recoveryWakeupLatch is used to wake up the startup process to continue
	 * WAL replay, if it is waiting for WAL to arrive or failover trigger
	 * file to appear.
	 */
	Latch		recoveryWakeupLatch;

	/*
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * lastCheckPointRecPtr points to start of checkpoint record and
	 * lastCheckPointEndPtr points to end+1 of checkpoint record.  Used by
	 * the checkpointer when it wants to create a restartpoint.
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	XLogRecPtr	lastCheckPointEndPtr;
	CheckPoint	lastCheckPoint;

	/*
	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
	 * replayed.  When we're currently replaying a record, ie. in a redo
	 * function, replayEndRecPtr points to the end+1 of the record being
	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
	 */
	XLogRecPtr	lastReplayedEndRecPtr;
	TimeLineID	lastReplayedTLI;
	XLogRecPtr	replayEndRecPtr;
	TimeLineID	replayEndTLI;
	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
	TimestampTz recoveryLastXTime;

	/*
	 * timestamp of when we started replaying the current chunk of WAL data,
	 * only relevant for replication or archive recovery
	 */
	TimestampTz currentChunkStartTime;
	/* Are we requested to pause recovery? */
	bool		recoveryPause;

	/*
	 * lastFpwDisableRecPtr points to the start of the last replayed
	 * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
	 */
	XLogRecPtr	lastFpwDisableRecPtr;

	slock_t		info_lck;		/* locks shared variables shown above */
} XLogCtlData;
714
static XLogCtlData *XLogCtl = NULL;

/* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
static WALInsertLockPadded *WALInsertLocks = NULL;

/*
 * We maintain an image of pg_control in shared memory.
 */
static ControlFileData *ControlFile = NULL;

/*
 * Calculate the amount of space left on the page after 'endptr'.  Beware
 * multiple evaluation!
 */
#define INSERT_FREESPACE(endptr)	\
	(((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))

/* Macro to advance to next buffer index (wraps around after the last one). */
#define NextBufIdx(idx)		\
	(((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))

/*
 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
 * would hold if it was in cache, the page containing 'recptr'.
 */
#define XLogRecPtrToBufIdx(recptr)	\
	(((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))

/*
 * These are the number of bytes in a WAL page and segment usable for WAL
 * data (i.e., excluding the short/long page headers).
 */
#define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
#define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
748
/*
 * Convert values of GUCs measured in megabytes (min_wal_size_mb,
 * max_wal_size_mb) to the equivalent WAL segment count.
 *
 * The argument is parenthesized so that compound expressions such as
 * ConvertToXSegs(a + b) expand correctly; the previous form divided only
 * the last operand.
 */
#define ConvertToXSegs(x)	\
	((x) / (XLOG_SEG_SIZE / (1024 * 1024)))
752
/*
 * Private, possibly out-of-date copy of shared LogwrtResult.
 * See discussion above.
 */
static XLogwrtResult LogwrtResult = {0, 0};

/*
 * Codes indicating where we got a WAL file from during recovery, or where
 * to attempt to get one.
 */
typedef enum
{
	XLOG_FROM_ANY = 0,			/* request to read WAL from any source */
	XLOG_FROM_ARCHIVE,			/* restored using restore_command */
	XLOG_FROM_PG_WAL,			/* existing file in pg_wal */
	XLOG_FROM_STREAM			/* streamed from master */
} XLogSource;

/*
 * human-readable names for XLogSources, for debugging output; entries must
 * stay in the same order as the XLogSource enum values that index them
 */
static const char *xlogSourceNames[] = {"any", "archive", "pg_wal", "stream"};
773
/*
 * openLogFile is -1 or a kernel FD for an open log file segment.
 * When it's open, openLogOff is the current seek offset in the file.
 * openLogSegNo identifies the segment.  These variables are only
 * used to write the XLOG, and so will normally refer to the active segment.
 */
static int	openLogFile = -1;
static XLogSegNo openLogSegNo = 0;
static uint32 openLogOff = 0;

/*
 * These variables are used similarly to the ones above, but for reading
 * the XLOG.  Note, however, that readOff generally represents the offset
 * of the page just read, not the seek position of the FD itself, which
 * will be just past that page.  readLen indicates how much of the current
 * page has been read into readBuf, and readSource indicates where we got
 * the currently open file from.
 */
static int	readFile = -1;
static XLogSegNo readSegNo = 0;
static uint32 readOff = 0;
static uint32 readLen = 0;
static XLogSource readSource = 0;	/* XLOG_FROM_* code */

/*
 * Keeps track of which source we're currently reading from.  This is
 * different from readSource in that this is always set, even when we don't
 * currently have a WAL file open.  If lastSourceFailed is set, our last
 * attempt to read from currentSource failed, and we should try another
 * source next.
 */
static XLogSource currentSource = 0;	/* XLOG_FROM_* code */
static bool lastSourceFailed = false;

/* Private state passed through the xlogreader page-read callback. */
typedef struct XLogPageReadPrivate
{
	int			emode;			/* elog level to use for read failures */
	bool		fetching_ckpt;	/* are we fetching a checkpoint record? */
	bool		randAccess;		/* random (non-sequential) access? */
} XLogPageReadPrivate;
814
/*
 * These variables track when we last obtained some WAL data to process,
 * and where we got it from.  (XLogReceiptSource is initially the same as
 * readSource, but readSource gets reset to zero when we don't have data
 * to process right now.  It is also different from currentSource, which
 * also changes when we try to read from a source and fail, while
 * XLogReceiptSource tracks where we last successfully read some WAL.)
 */
static TimestampTz XLogReceiptTime = 0;
static XLogSource XLogReceiptSource = 0;	/* XLOG_FROM_* code */

/* State information for XLOG reading */
static XLogRecPtr ReadRecPtr;	/* start of last record read */
static XLogRecPtr EndRecPtr;	/* end+1 of last record read */

/*
 * Local copies of equivalent fields in the control file.  When running
 * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
 * expect to replay all the WAL available, and updateMinRecoveryPoint is
 * switched to false to prevent any updates while replaying records.
 * Those values are kept consistent as long as crash recovery runs.
 */
static XLogRecPtr minRecoveryPoint;
static TimeLineID minRecoveryPointTLI;
static bool updateMinRecoveryPoint = true;

/*
 * Have we reached a consistent database state?  In crash recovery, we have
 * to replay all the WAL, so reachedConsistency is never set.  During archive
 * recovery, the database is consistent once minRecoveryPoint is reached.
 */
bool		reachedConsistency = false;

/* Are we currently inside the main redo apply loop? */
static bool InRedo = false;

/* Have we launched bgwriter during recovery? */
static bool bgwriterLaunched = false;

/* For WALInsertLockAcquire/Release functions */
static int	MyLockNo = 0;		/* insertion lock index this backend holds */
static bool holdingAllLocks = false;	/* holding every insertion lock? */

#ifdef WAL_DEBUG
static MemoryContext walDebugCxt = NULL;
#endif
860
/* Forward declarations for local routines (recovery control) */
static void readRecoveryCommandFile(void);
static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
static bool recoveryStopsBefore(XLogReaderState *record);
static bool recoveryStopsAfter(XLogReaderState *record);
static void recoveryPausesHere(void);
static bool recoveryApplyDelay(XLogReaderState *record);
static void SetLatestXTime(TimestampTz xtime);
static void SetCurrentChunkStartTime(TimestampTz xtime);
static void CheckRequiredParameterValues(void);
static void XLogReportParameters(void);
static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
					TimeLineID prevTLI);
static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
						  XLogReaderState *state);
static void LocalSetXLogInsertAllowed(void);
static void CreateEndOfRecoveryRecord(void);
static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);

/* WAL segment/file management and low-level read/write helpers */
static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
static bool XLogCheckpointNeeded(XLogSegNo new_segno);
static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
					   bool find_free, XLogSegNo max_segno,
					   bool use_lock);
static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
			 int source, bool notfoundOk);
static int	XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
			 int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
			 TimeLineID *readTLI);
static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
							bool fetching_ckpt, XLogRecPtr tliRecPtr);
static int	emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
static void XLogFileClose(void);
static void PreallocXlogFiles(XLogRecPtr endptr);
static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
static void RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
static void UpdateLastRemovedPtr(char *filename);
static void ValidateXLOGDirectoryStructure(void);
static void CleanupBackupHistory(void);
static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
		   int emode, bool fetching_ckpt);
static void CheckRecoveryConsistency(void);
static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
					 XLogRecPtr RecPtr, int whichChkpti, bool report);
static bool rescanLatestTimeLine(void);
static void WriteControlFile(void);
static void ReadControlFile(void);
static char *str_time(pg_time_t tnow);
static bool CheckForStandbyTrigger(void);

#ifdef WAL_DEBUG
static void xlog_outrec(StringInfo buf, XLogReaderState *record);
#endif
static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
static void pg_start_backup_callback(int code, Datum arg);
static void pg_stop_backup_callback(int code, Datum arg);
static bool read_backup_label(XLogRecPtr *checkPointLoc,
				  bool *backupEndRequired, bool *backupFromStandby);
static bool read_tablespace_map(List **tablespaces);

static void rm_redo_error_callback(void *arg);
static int	get_sync_bit(int method);

/* WAL insertion: space reservation and copying into shared buffers */
static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
					XLogRecData *rdata,
					XLogRecPtr StartPos, XLogRecPtr EndPos);
static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
						  XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
				  XLogRecPtr *PrevPtr);
static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
static char *GetXLogBuffer(XLogRecPtr ptr);
static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
static void checkXLogConsistency(XLogReaderState *record);

/* WAL insertion-lock management */
static void WALInsertLockAcquire(void);
static void WALInsertLockAcquireExclusive(void);
static void WALInsertLockRelease(void);
static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
947
/*
 * Insert an XLOG record represented by an already-constructed chain of data
 * chunks.  This is a low-level routine; to construct the WAL record header
 * and data, use the higher-level routines in xloginsert.c.
 *
 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 * WAL record applies to, that were not included in the record as full page
 * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
 * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 * record is always inserted.
 *
 * 'flags' gives more in-depth control on the record being inserted. See
 * XLogSetRecordFlags() for details.
 *
 * The first XLogRecData in the chain must be for the record header, and its
 * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 * xl_crc fields in the header, the rest of the header must already be filled
 * by the caller.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata,
				 XLogRecPtr fpw_lsn,
				 uint8 flags)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	pg_crc32c	rdata_crc;
	bool		inserted;
	XLogRecord *rechdr = (XLogRecord *) rdata->data;
	uint8		info = rechdr->xl_info & ~XLR_INFO_MASK;
	/* xlog-switch records get special treatment throughout this function */
	bool		isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
							   info == XLOG_SWITCH);
	XLogRecPtr	StartPos;
	XLogRecPtr	EndPos;
	/* remember doPageWrites as it was when the caller assembled the record */
	bool		prevDoPageWrites = doPageWrites;

	/* we assume that all of the record header is in the first chunk */
	Assert(rdata->len >= SizeOfXLogRecord);

	/* cross-check on whether we should be here or not */
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");

	/*----------
	 *
	 * We have now done all the preparatory work we can without holding a
	 * lock or modifying shared state. From here on, inserting the new WAL
	 * record to the shared WAL buffer cache is a two-step process:
	 *
	 * 1. Reserve the right amount of space from the WAL. The current head of
	 *	  reserved space is kept in Insert->CurrBytePos, and is protected by
	 *	  insertpos_lck.
	 *
	 * 2. Copy the record to the reserved WAL space. This involves finding the
	 *	  correct WAL buffer containing the reserved space, and copying the
	 *	  record in place. This can be done concurrently in multiple processes.
	 *
	 * To keep track of which insertions are still in-progress, each concurrent
	 * inserter acquires an insertion lock. In addition to just indicating that
	 * an insertion is in progress, the lock tells others how far the inserter
	 * has progressed. There is a small fixed number of insertion locks,
	 * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
	 * boundary, it updates the value stored in the lock to the how far it has
	 * inserted, to allow the previous buffer to be flushed.
	 *
	 * Holding onto an insertion lock also protects RedoRecPtr and
	 * fullPageWrites from changing until the insertion is finished.
	 *
	 * Step 2 can usually be done completely in parallel. If the required WAL
	 * page is not initialized yet, you have to grab WALBufMappingLock to
	 * initialize it, but the WAL writer tries to do that ahead of insertions
	 * to avoid that from happening in the critical path.
	 *
	 *----------
	 */
	START_CRIT_SECTION();
	if (isLogSwitch)
		WALInsertLockAcquireExclusive();
	else
		WALInsertLockAcquire();

	/*
	 * Check to see if my copy of RedoRecPtr is out of date. If so, may have
	 * to go back and have the caller recompute everything. This can only
	 * happen just after a checkpoint, so it's better to be slow in this case
	 * and fast otherwise.
	 *
	 * Also check to see if fullPageWrites or forcePageWrites was just turned
	 * on; if we weren't already doing full-page writes then go back and
	 * recompute.
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.  (If doPageWrites was just turned off,
	 * we could recompute the record without full pages, but we choose not to
	 * bother.)
	 */
	if (RedoRecPtr != Insert->RedoRecPtr)
	{
		Assert(RedoRecPtr < Insert->RedoRecPtr);
		RedoRecPtr = Insert->RedoRecPtr;
	}
	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

	if (doPageWrites &&
		(!prevDoPageWrites ||
		 (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
	{
		/*
		 * Oops, some buffer now needs to be backed up that the caller didn't
		 * back up.  Start over.
		 */
		WALInsertLockRelease();
		END_CRIT_SECTION();
		return InvalidXLogRecPtr;
	}

	/*
	 * Reserve space for the record in the WAL. This also sets the xl_prev
	 * pointer.
	 */
	if (isLogSwitch)
		inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
	else
	{
		ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
								  &rechdr->xl_prev);
		inserted = true;
	}

	if (inserted)
	{
		/*
		 * Now that xl_prev has been filled in, calculate CRC of the record
		 * header.
		 */
		rdata_crc = rechdr->xl_crc;
		COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
		FIN_CRC32C(rdata_crc);
		rechdr->xl_crc = rdata_crc;

		/*
		 * All the record data, including the header, is now ready to be
		 * inserted. Copy the record in the space reserved.
		 */
		CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
							StartPos, EndPos);

		/*
		 * Unless record is flagged as not important, update LSN of last
		 * important record in the current slot. When holding all locks, just
		 * update the first one.
		 */
		if ((flags & XLOG_MARK_UNIMPORTANT) == 0)
		{
			int			lockno = holdingAllLocks ? 0 : MyLockNo;

			WALInsertLocks[lockno].l.lastImportantAt = StartPos;
		}
	}
	else
	{
		/*
		 * This was an xlog-switch record, but the current insert location was
		 * already exactly at the beginning of a segment, so there was no need
		 * to do anything.
		 */
	}

	/*
	 * Done! Let others know that we're finished.
	 */
	WALInsertLockRelease();

	MarkCurrentTransactionIdLoggedIfAny();

	END_CRIT_SECTION();

	/*
	 * Update shared LogwrtRqst.Write, if we crossed page boundary.
	 */
	if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		/* advance global request to include new block(s) */
		if (XLogCtl->LogwrtRqst.Write < EndPos)
			XLogCtl->LogwrtRqst.Write = EndPos;
		/* update local result copy while I have the chance */
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);
	}

	/*
	 * If this was an XLOG_SWITCH record, flush the record and the empty
	 * padding space that fills the rest of the segment, and perform
	 * end-of-segment actions (eg, notifying archiver).
	 */
	if (isLogSwitch)
	{
		TRACE_POSTGRESQL_WAL_SWITCH();
		XLogFlush(EndPos);

		/*
		 * Even though we reserved the rest of the segment for us, which is
		 * reflected in EndPos, we return a pointer to just the end of the
		 * xlog-switch record.
		 */
		if (inserted)
		{
			EndPos = StartPos + SizeOfXLogRecord;
			if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
			{
				/*
				 * The record ended just past a page boundary; account for
				 * the page header that follows (long header at a segment
				 * boundary, short header otherwise).
				 */
				if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
					EndPos += SizeOfXLogLongPHD;
				else
					EndPos += SizeOfXLogShortPHD;
			}
		}
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
	{
		static XLogReaderState *debug_reader = NULL;
		StringInfoData buf;
		StringInfoData recordBuf;
		char	   *errormsg = NULL;
		MemoryContext oldCxt;

		/* use a dedicated context so debug allocations don't pollute others */
		oldCxt = MemoryContextSwitchTo(walDebugCxt);

		initStringInfo(&buf);
		appendStringInfo(&buf, "INSERT @ %X/%X: ",
						 (uint32) (EndPos >> 32), (uint32) EndPos);

		/*
		 * We have to piece together the WAL record data from the XLogRecData
		 * entries, so that we can pass it to the rm_desc function as one
		 * contiguous chunk.
		 */
		initStringInfo(&recordBuf);
		for (; rdata != NULL; rdata = rdata->next)
			appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

		if (!debug_reader)
			debug_reader = XLogReaderAllocate(NULL, NULL);

		if (!debug_reader)
		{
			appendStringInfoString(&buf, "error decoding record: out of memory");
		}
		else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
								   &errormsg))
		{
			appendStringInfo(&buf, "error decoding record: %s",
							 errormsg ? errormsg : "no error message");
		}
		else
		{
			appendStringInfoString(&buf, " - ");
			xlog_outdesc(&buf, debug_reader);
		}
		elog(LOG, "%s", buf.data);

		pfree(buf.data);
		pfree(recordBuf.data);
		MemoryContextSwitchTo(oldCxt);
	}
#endif

	/*
	 * Update our global variables
	 */
	ProcLastRecPtr = StartPos;
	XactLastRecEnd = EndPos;

	return EndPos;
}
1232
1233 /*
1234 * Reserves the right amount of space for a record of given size from the WAL.
1235 * *StartPos is set to the beginning of the reserved section, *EndPos to
1236 * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1237 * used to set the xl_prev of this record.
1238 *
1239 * This is the performance critical part of XLogInsert that must be serialized
1240 * across backends. The rest can happen mostly in parallel. Try to keep this
1241 * section as short as possible, insertpos_lck can be heavily contended on a
1242 * busy system.
1243 *
1244 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1245 * where we actually copy the record to the reserved space.
1246 */
1247 static void
ReserveXLogInsertLocation(int size,XLogRecPtr * StartPos,XLogRecPtr * EndPos,XLogRecPtr * PrevPtr)1248 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1249 XLogRecPtr *PrevPtr)
1250 {
1251 XLogCtlInsert *Insert = &XLogCtl->Insert;
1252 uint64 startbytepos;
1253 uint64 endbytepos;
1254 uint64 prevbytepos;
1255
1256 size = MAXALIGN(size);
1257
1258 /* All (non xlog-switch) records should contain data. */
1259 Assert(size > SizeOfXLogRecord);
1260
1261 /*
1262 * The duration the spinlock needs to be held is minimized by minimizing
1263 * the calculations that have to be done while holding the lock. The
1264 * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1265 * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1266 * page headers. The mapping between "usable" byte positions and physical
1267 * positions (XLogRecPtrs) can be done outside the locked region, and
1268 * because the usable byte position doesn't include any headers, reserving
1269 * X bytes from WAL is almost as simple as "CurrBytePos += X".
1270 */
1271 SpinLockAcquire(&Insert->insertpos_lck);
1272
1273 startbytepos = Insert->CurrBytePos;
1274 endbytepos = startbytepos + size;
1275 prevbytepos = Insert->PrevBytePos;
1276 Insert->CurrBytePos = endbytepos;
1277 Insert->PrevBytePos = startbytepos;
1278
1279 SpinLockRelease(&Insert->insertpos_lck);
1280
1281 *StartPos = XLogBytePosToRecPtr(startbytepos);
1282 *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1283 *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1284
1285 /*
1286 * Check that the conversions between "usable byte positions" and
1287 * XLogRecPtrs work consistently in both directions.
1288 */
1289 Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1290 Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1291 Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1292 }
1293
/*
 * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
 *
 * A log-switch record is handled slightly differently. The rest of the
 * segment will be reserved for this insertion, as indicated by the returned
 * *EndPos value. However, if we are already at the beginning of the current
 * segment, *StartPos and *EndPos are set to the current location without
 * reserving any space, and the function returns false.
 */
static bool
ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint64		startbytepos;
	uint64		endbytepos;
	uint64		prevbytepos;
	uint32		size = MAXALIGN(SizeOfXLogRecord);	/* switch record is header-only */
	XLogRecPtr	ptr;
	uint32		segleft;

	/*
	 * These calculations are a bit heavy-weight to be done while holding a
	 * spinlock, but since we're holding all the WAL insertion locks, there
	 * are no other inserters competing for it. GetXLogInsertRecPtr() does
	 * compete for it, but that's not called very frequently.
	 */
	SpinLockAcquire(&Insert->insertpos_lck);

	startbytepos = Insert->CurrBytePos;

	/* Already at a segment boundary?  Then there's nothing to do. */
	ptr = XLogBytePosToEndRecPtr(startbytepos);
	if (ptr % XLOG_SEG_SIZE == 0)
	{
		SpinLockRelease(&Insert->insertpos_lck);
		*EndPos = *StartPos = ptr;
		return false;
	}

	endbytepos = startbytepos + size;
	prevbytepos = Insert->PrevBytePos;

	*StartPos = XLogBytePosToRecPtr(startbytepos);
	*EndPos = XLogBytePosToEndRecPtr(endbytepos);

	/* Extend the reservation to consume the rest of the current segment. */
	segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
	if (segleft != XLOG_SEG_SIZE)
	{
		/* consume the rest of the segment */
		*EndPos += segleft;
		endbytepos = XLogRecPtrToBytePos(*EndPos);
	}
	Insert->CurrBytePos = endbytepos;
	Insert->PrevBytePos = startbytepos;

	SpinLockRelease(&Insert->insertpos_lck);

	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);

	Assert((*EndPos) % XLOG_SEG_SIZE == 0);
	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);

	return true;
}
1359
/*
 * Checks whether the current buffer page and backup page stored in the
 * WAL record are consistent or not. Before comparing the two pages, a
 * masking can be applied to the pages to ignore certain areas like hint bits,
 * unused space between pd_lower and pd_upper among other things. This
 * function should be called once WAL replay has been completed for a
 * given record.
 *
 * Raises FATAL if a masked replayed page differs from the master's full-page
 * image carried in the record.
 */
static void
checkXLogConsistency(XLogReaderState *record)
{
	RmgrId		rmid = XLogRecGetRmid(record);
	RelFileNode rnode;
	ForkNumber	forknum;
	BlockNumber blkno;
	int			block_id;

	/* Records with no backup blocks have no need for consistency checks. */
	if (!XLogRecHasAnyBlockRefs(record))
		return;

	Assert((XLogRecGetInfo(record) & XLR_CHECK_CONSISTENCY) != 0);

	/* Check each block reference carried by the record. */
	for (block_id = 0; block_id <= record->max_block_id; block_id++)
	{
		Buffer		buf;
		Page		page;

		if (!XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blkno))
		{
			/*
			 * WAL record doesn't contain a block reference with the given id.
			 * Do nothing.
			 */
			continue;
		}

		Assert(XLogRecHasBlockImage(record, block_id));

		if (XLogRecBlockImageApply(record, block_id))
		{
			/*
			 * WAL record has already applied the page, so bypass the
			 * consistency check as that would result in comparing the full
			 * page stored in the record with itself.
			 */
			continue;
		}

		/*
		 * Read the contents from the current buffer and store it in a
		 * temporary page.
		 */
		buf = XLogReadBufferExtended(rnode, forknum, blkno,
									 RBM_NORMAL_NO_LOG);
		if (!BufferIsValid(buf))
			continue;

		LockBuffer(buf, BUFFER_LOCK_EXCLUSIVE);
		page = BufferGetPage(buf);

		/*
		 * Take a copy of the local page where WAL has been applied to have a
		 * comparison base before masking it...
		 * (replay_image_masked is a page-sized scratch buffer declared
		 * elsewhere in this file.)
		 */
		memcpy(replay_image_masked, page, BLCKSZ);

		/* No need for this page anymore now that a copy is in. */
		UnlockReleaseBuffer(buf);

		/*
		 * If the block LSN is already ahead of this WAL record, we can't
		 * expect contents to match.  This can happen if recovery is
		 * restarted.
		 */
		if (PageGetLSN(replay_image_masked) > record->EndRecPtr)
			continue;

		/*
		 * Read the contents from the backup copy, stored in WAL record and
		 * store it in a temporary page. There is no need to allocate a new
		 * page here, a local buffer is fine to hold its contents and a mask
		 * can be directly applied on it.
		 */
		if (!RestoreBlockImage(record, block_id, master_image_masked))
			elog(ERROR, "failed to restore block image");

		/*
		 * If masking function is defined, mask both the master and replay
		 * images
		 */
		if (RmgrTable[rmid].rm_mask != NULL)
		{
			RmgrTable[rmid].rm_mask(replay_image_masked, blkno);
			RmgrTable[rmid].rm_mask(master_image_masked, blkno);
		}

		/* Time to compare the master and replay images. */
		if (memcmp(replay_image_masked, master_image_masked, BLCKSZ) != 0)
		{
			elog(FATAL,
				 "inconsistent page found, rel %u/%u/%u, forknum %u, blkno %u",
				 rnode.spcNode, rnode.dbNode, rnode.relNode,
				 forknum, blkno);
		}
	}
}
1467
/*
 * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
 * area in the WAL.
 *
 * 'write_len' is the total record length; 'rdata' is the chain of data
 * chunks making up the record.  [StartPos, EndPos) is the space reserved by
 * ReserveXLogInsertLocation()/ReserveXLogSwitch().  The record may span page
 * boundaries, in which case continuation page headers are filled in here.
 */
static void
CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
					XLogRecPtr StartPos, XLogRecPtr EndPos)
{
	char	   *currpos;		/* write position within the WAL buffer */
	int			freespace;		/* bytes left on the current page */
	int			written;		/* bytes of the record copied so far */
	XLogRecPtr	CurrPos;		/* WAL position corresponding to currpos */
	XLogPageHeader pagehdr;

	/*
	 * Get a pointer to the right place in the right WAL buffer to start
	 * inserting to.
	 */
	CurrPos = StartPos;
	currpos = GetXLogBuffer(CurrPos);
	freespace = INSERT_FREESPACE(CurrPos);

	/*
	 * there should be enough space for at least the first field (xl_tot_len)
	 * on this page.
	 */
	Assert(freespace >= sizeof(uint32));

	/* Copy record data */
	written = 0;
	while (rdata != NULL)
	{
		char	   *rdata_data = rdata->data;
		int			rdata_len = rdata->len;

		/* Copy as much of this chunk as fits, page by page. */
		while (rdata_len > freespace)
		{
			/*
			 * Write what fits on this page, and continue on the next page.
			 */
			Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
			memcpy(currpos, rdata_data, freespace);
			rdata_data += freespace;
			rdata_len -= freespace;
			written += freespace;
			CurrPos += freespace;

			/*
			 * Get pointer to beginning of next page, and set the xlp_rem_len
			 * in the page header. Set XLP_FIRST_IS_CONTRECORD.
			 *
			 * It's safe to set the contrecord flag and xlp_rem_len without a
			 * lock on the page. All the other flags were already set when the
			 * page was initialized, in AdvanceXLInsertBuffer, and we're the
			 * only backend that needs to set the contrecord flag.
			 */
			currpos = GetXLogBuffer(CurrPos);
			pagehdr = (XLogPageHeader) currpos;
			pagehdr->xlp_rem_len = write_len - written;
			pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;

			/* skip over the page header (long header at segment start) */
			if (CurrPos % XLogSegSize == 0)
			{
				CurrPos += SizeOfXLogLongPHD;
				currpos += SizeOfXLogLongPHD;
			}
			else
			{
				CurrPos += SizeOfXLogShortPHD;
				currpos += SizeOfXLogShortPHD;
			}
			freespace = INSERT_FREESPACE(CurrPos);
		}

		/* The rest of this chunk fits on the current page. */
		Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
		memcpy(currpos, rdata_data, rdata_len);
		currpos += rdata_len;
		CurrPos += rdata_len;
		freespace -= rdata_len;
		written += rdata_len;

		rdata = rdata->next;
	}
	Assert(written == write_len);

	/*
	 * If this was an xlog-switch, it's not enough to write the switch record,
	 * we also have to consume all the remaining space in the WAL segment. We
	 * have already reserved it for us, but we still need to make sure it's
	 * allocated and zeroed in the WAL buffers so that when the caller (or
	 * someone else) does XLogWrite(), it can really write out all the zeros.
	 */
	if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
	{
		/* An xlog-switch record doesn't contain any data besides the header */
		Assert(write_len == SizeOfXLogRecord);

		/*
		 * We do this one page at a time, to make sure we don't deadlock
		 * against ourselves if wal_buffers < XLOG_SEG_SIZE.
		 */
		Assert(EndPos % XLogSegSize == 0);

		/* Use up all the remaining space on the first page */
		CurrPos += freespace;

		while (CurrPos < EndPos)
		{
			/* initialize the next page (if not initialized already) */
			WALInsertLockUpdateInsertingAt(CurrPos);
			AdvanceXLInsertBuffer(CurrPos, false);
			CurrPos += XLOG_BLCKSZ;
		}
	}
	else
	{
		/* Align the end position, so that the next record starts aligned */
		CurrPos = MAXALIGN64(CurrPos);
	}

	if (CurrPos != EndPos)
		elog(PANIC, "space reserved for WAL record does not match what was written");
}
1592
1593 /*
1594 * Acquire a WAL insertion lock, for inserting to WAL.
1595 */
1596 static void
WALInsertLockAcquire(void)1597 WALInsertLockAcquire(void)
1598 {
1599 bool immed;
1600
1601 /*
1602 * It doesn't matter which of the WAL insertion locks we acquire, so try
1603 * the one we used last time. If the system isn't particularly busy, it's
1604 * a good bet that it's still available, and it's good to have some
1605 * affinity to a particular lock so that you don't unnecessarily bounce
1606 * cache lines between processes when there's no contention.
1607 *
1608 * If this is the first time through in this backend, pick a lock
1609 * (semi-)randomly. This allows the locks to be used evenly if you have a
1610 * lot of very short connections.
1611 */
1612 static int lockToTry = -1;
1613
1614 if (lockToTry == -1)
1615 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1616 MyLockNo = lockToTry;
1617
1618 /*
1619 * The insertingAt value is initially set to 0, as we don't know our
1620 * insert location yet.
1621 */
1622 immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1623 if (!immed)
1624 {
1625 /*
1626 * If we couldn't get the lock immediately, try another lock next
1627 * time. On a system with more insertion locks than concurrent
1628 * inserters, this causes all the inserters to eventually migrate to a
1629 * lock that no-one else is using. On a system with more inserters
1630 * than locks, it still helps to distribute the inserters evenly
1631 * across the locks.
1632 */
1633 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1634 }
1635 }
1636
1637 /*
1638 * Acquire all WAL insertion locks, to prevent other backends from inserting
1639 * to WAL.
1640 */
1641 static void
WALInsertLockAcquireExclusive(void)1642 WALInsertLockAcquireExclusive(void)
1643 {
1644 int i;
1645
1646 /*
1647 * When holding all the locks, all but the last lock's insertingAt
1648 * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1649 * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1650 */
1651 for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1652 {
1653 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1654 LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1655 &WALInsertLocks[i].l.insertingAt,
1656 PG_UINT64_MAX);
1657 }
1658 /* Variable value reset to 0 at release */
1659 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1660
1661 holdingAllLocks = true;
1662 }
1663
1664 /*
1665 * Release our insertion lock (or locks, if we're holding them all).
1666 *
1667 * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1668 * next time the lock is acquired.
1669 */
1670 static void
WALInsertLockRelease(void)1671 WALInsertLockRelease(void)
1672 {
1673 if (holdingAllLocks)
1674 {
1675 int i;
1676
1677 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1678 LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1679 &WALInsertLocks[i].l.insertingAt,
1680 0);
1681
1682 holdingAllLocks = false;
1683 }
1684 else
1685 {
1686 LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1687 &WALInsertLocks[MyLockNo].l.insertingAt,
1688 0);
1689 }
1690 }
1691
1692 /*
1693 * Update our insertingAt value, to let others know that we've finished
1694 * inserting up to that point.
1695 */
1696 static void
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)1697 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1698 {
1699 if (holdingAllLocks)
1700 {
1701 /*
1702 * We use the last lock to mark our actual position, see comments in
1703 * WALInsertLockAcquireExclusive.
1704 */
1705 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1706 &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1707 insertingAt);
1708 }
1709 else
1710 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1711 &WALInsertLocks[MyLockNo].l.insertingAt,
1712 insertingAt);
1713 }
1714
/*
 * Wait for any WAL insertions < upto to finish.
 *
 * Returns the location of the oldest insertion that is still in-progress.
 * Any WAL prior to that point has been fully copied into WAL buffers, and
 * can be flushed out to disk. Because this waits for any insertions older
 * than 'upto' to finish, the return value is always >= 'upto'.
 *
 * Note: When you are about to write out WAL, you must call this function
 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
 * need to wait for an insertion to finish (or at least advance to next
 * uninitialized page), and the inserter might need to evict an old WAL buffer
 * to make room for a new one, which in turn requires WALWriteLock.
 */
static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)
{
	uint64		bytepos;
	XLogRecPtr	reservedUpto;
	XLogRecPtr	finishedUpto;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			i;

	/* LWLockWaitForVar() below needs a PGPROC to sleep on */
	if (MyProc == NULL)
		elog(PANIC, "cannot wait without a PGPROC structure");

	/* Read the current insert position (the head of reserved WAL) */
	SpinLockAcquire(&Insert->insertpos_lck);
	bytepos = Insert->CurrBytePos;
	SpinLockRelease(&Insert->insertpos_lck);
	reservedUpto = XLogBytePosToEndRecPtr(bytepos);

	/*
	 * No-one should request to flush a piece of WAL that hasn't even been
	 * reserved yet. However, it can happen if there is a block with a bogus
	 * LSN on disk, for example. XLogFlush checks for that situation and
	 * complains, but only after the flush. Here we just assume that to mean
	 * that all WAL that has been reserved needs to be finished. In this
	 * corner-case, the return value can be smaller than 'upto' argument.
	 */
	if (upto > reservedUpto)
	{
		elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
			 (uint32) (upto >> 32), (uint32) upto,
			 (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
		upto = reservedUpto;
	}

	/*
	 * Loop through all the locks, sleeping on any in-progress insert older
	 * than 'upto'.
	 *
	 * finishedUpto is our return value, indicating the point upto which all
	 * the WAL insertions have been finished. Initialize it to the head of
	 * reserved WAL, and as we iterate through the insertion locks, back it
	 * out for any insertion that's still in progress.
	 */
	finishedUpto = reservedUpto;
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		XLogRecPtr	insertingat = InvalidXLogRecPtr;

		do
		{
			/*
			 * See if this insertion is in progress. LWLockWait will wait for
			 * the lock to be released, or for the 'value' to be set by a
			 * LWLockUpdateVar call. When a lock is initially acquired, its
			 * value is 0 (InvalidXLogRecPtr), which means that we don't know
			 * where it's inserting yet. We will have to wait for it. If
			 * it's a small insertion, the record will most likely fit on the
			 * same page and the inserter will release the lock without ever
			 * calling LWLockUpdateVar. But if it has to sleep, it will
			 * advertise the insertion point with LWLockUpdateVar before
			 * sleeping.
			 */
			if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
								 &WALInsertLocks[i].l.insertingAt,
								 insertingat, &insertingat))
			{
				/* the lock was free, so no insertion in progress */
				insertingat = InvalidXLogRecPtr;
				break;
			}

			/*
			 * This insertion is still in progress. Have to wait, unless the
			 * inserter has proceeded past 'upto'.
			 */
		} while (insertingat < upto);

		/* an in-progress insertion holds our return value back */
		if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
			finishedUpto = insertingat;
	}
	return finishedUpto;
}
1811
/*
 * Get a pointer to the right location in the WAL buffer containing the
 * given XLogRecPtr.
 *
 * If the page is not initialized yet, it is initialized. That might require
 * evicting an old dirty buffer from the buffer cache, which means I/O.
 *
 * The caller must ensure that the page containing the requested location
 * isn't evicted yet, and won't be evicted. The way to ensure that is to
 * hold onto a WAL insertion lock with the insertingAt position set to
 * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
 * to evict an old page from the buffer. (This means that once you call
 * GetXLogBuffer() with a given 'ptr', you must not access anything before
 * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
 * later, because older buffers might be recycled already)
 */
static char *
GetXLogBuffer(XLogRecPtr ptr)
{
	int			idx;
	XLogRecPtr	endptr;

	/* per-backend cache of the last page looked up (page number and its
	 * address within the shared WAL buffers) */
	static uint64 cachedPage = 0;
	static char *cachedPos = NULL;
	XLogRecPtr	expectedEndPtr;

	/*
	 * Fast path for the common case that we need to access again the same
	 * page as last time.
	 */
	if (ptr / XLOG_BLCKSZ == cachedPage)
	{
		Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
		Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
		return cachedPos + ptr % XLOG_BLCKSZ;
	}

	/*
	 * The XLog buffer cache is organized so that a page is always loaded to a
	 * particular buffer. That way we can easily calculate the buffer a given
	 * page must be loaded into, from the XLogRecPtr alone.
	 */
	idx = XLogRecPtrToBufIdx(ptr);

	/*
	 * See what page is loaded in the buffer at the moment. It could be the
	 * page we're looking for, or something older. It can't be anything newer
	 * - that would imply the page we're looking for has already been written
	 * out to disk and evicted, and the caller is responsible for making sure
	 * that doesn't happen.
	 *
	 * However, we don't hold a lock while we read the value. If someone has
	 * just initialized the page, it's possible that we get a "torn read" of
	 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
	 * that case we will see a bogus value. That's ok, we'll grab the mapping
	 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
	 * the page we're looking for. But it means that when we do this unlocked
	 * read, we might see a value that appears to be ahead of the page we're
	 * looking for. Don't PANIC on that, until we've verified the value while
	 * holding the lock.
	 */
	expectedEndPtr = ptr;
	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;

	endptr = XLogCtl->xlblocks[idx];
	if (expectedEndPtr != endptr)
	{
		XLogRecPtr	initializedUpto;

		/*
		 * Before calling AdvanceXLInsertBuffer(), which can block, let others
		 * know how far we're finished with inserting the record.
		 *
		 * NB: If 'ptr' points to just after the page header, advertise a
		 * position at the beginning of the page rather than 'ptr' itself. If
		 * there are no other insertions running, someone might try to flush
		 * up to our advertised location. If we advertised a position after
		 * the page header, someone might try to flush the page header, even
		 * though page might actually not be initialized yet. As the first
		 * inserter on the page, we are effectively responsible for making
		 * sure that it's initialized, before we let insertingAt to move past
		 * the page header.
		 */
		if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
			ptr % XLOG_SEG_SIZE > XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogShortPHD;
		else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
				 ptr % XLOG_SEG_SIZE < XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogLongPHD;
		else
			initializedUpto = ptr;

		WALInsertLockUpdateInsertingAt(initializedUpto);

		/* Initialize (and possibly evict+write) pages up to 'ptr'. */
		AdvanceXLInsertBuffer(ptr, false);
		endptr = XLogCtl->xlblocks[idx];

		/* Now verified under the mapping lock, so a mismatch is real. */
		if (expectedEndPtr != endptr)
			elog(PANIC, "could not find WAL buffer for %X/%X",
				 (uint32) (ptr >> 32), (uint32) ptr);
	}
	else
	{
		/*
		 * Make sure the initialization of the page is visible to us, and
		 * won't arrive later to overwrite the WAL data we write on the page.
		 */
		pg_memory_barrier();
	}

	/*
	 * Found the buffer holding this page. Return a pointer to the right
	 * offset within the page.
	 */
	cachedPage = ptr / XLOG_BLCKSZ;
	cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;

	Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
	Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));

	return cachedPos + ptr % XLOG_BLCKSZ;
}
1933
1934 /*
1935 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1936 * is the position starting from the beginning of WAL, excluding all WAL
1937 * page headers.
1938 */
1939 static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)1940 XLogBytePosToRecPtr(uint64 bytepos)
1941 {
1942 uint64 fullsegs;
1943 uint64 fullpages;
1944 uint64 bytesleft;
1945 uint32 seg_offset;
1946 XLogRecPtr result;
1947
1948 fullsegs = bytepos / UsableBytesInSegment;
1949 bytesleft = bytepos % UsableBytesInSegment;
1950
1951 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1952 {
1953 /* fits on first page of segment */
1954 seg_offset = bytesleft + SizeOfXLogLongPHD;
1955 }
1956 else
1957 {
1958 /* account for the first page on segment with long header */
1959 seg_offset = XLOG_BLCKSZ;
1960 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1961
1962 fullpages = bytesleft / UsableBytesInPage;
1963 bytesleft = bytesleft % UsableBytesInPage;
1964
1965 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1966 }
1967
1968 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1969
1970 return result;
1971 }
1972
1973 /*
1974 * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1975 * returns a pointer to the beginning of the page (ie. before page header),
1976 * not to where the first xlog record on that page would go to. This is used
1977 * when converting a pointer to the end of a record.
1978 */
1979 static XLogRecPtr
XLogBytePosToEndRecPtr(uint64 bytepos)1980 XLogBytePosToEndRecPtr(uint64 bytepos)
1981 {
1982 uint64 fullsegs;
1983 uint64 fullpages;
1984 uint64 bytesleft;
1985 uint32 seg_offset;
1986 XLogRecPtr result;
1987
1988 fullsegs = bytepos / UsableBytesInSegment;
1989 bytesleft = bytepos % UsableBytesInSegment;
1990
1991 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1992 {
1993 /* fits on first page of segment */
1994 if (bytesleft == 0)
1995 seg_offset = 0;
1996 else
1997 seg_offset = bytesleft + SizeOfXLogLongPHD;
1998 }
1999 else
2000 {
2001 /* account for the first page on segment with long header */
2002 seg_offset = XLOG_BLCKSZ;
2003 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
2004
2005 fullpages = bytesleft / UsableBytesInPage;
2006 bytesleft = bytesleft % UsableBytesInPage;
2007
2008 if (bytesleft == 0)
2009 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
2010 else
2011 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
2012 }
2013
2014 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
2015
2016 return result;
2017 }
2018
2019 /*
2020 * Convert an XLogRecPtr to a "usable byte position".
2021 */
2022 static uint64
XLogRecPtrToBytePos(XLogRecPtr ptr)2023 XLogRecPtrToBytePos(XLogRecPtr ptr)
2024 {
2025 uint64 fullsegs;
2026 uint32 fullpages;
2027 uint32 offset;
2028 uint64 result;
2029
2030 XLByteToSeg(ptr, fullsegs);
2031
2032 fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
2033 offset = ptr % XLOG_BLCKSZ;
2034
2035 if (fullpages == 0)
2036 {
2037 result = fullsegs * UsableBytesInSegment;
2038 if (offset > 0)
2039 {
2040 Assert(offset >= SizeOfXLogLongPHD);
2041 result += offset - SizeOfXLogLongPHD;
2042 }
2043 }
2044 else
2045 {
2046 result = fullsegs * UsableBytesInSegment +
2047 (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
2048 (fullpages - 1) * UsableBytesInPage; /* full pages */
2049 if (offset > 0)
2050 {
2051 Assert(offset >= SizeOfXLogShortPHD);
2052 result += offset - SizeOfXLogShortPHD;
2053 }
2054 }
2055
2056 return result;
2057 }
2058
/*
 * Initialize XLOG buffers, writing out old buffers if they still contain
 * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
 * true, initialize as many pages as we can without having to write out
 * unwritten data. Any new pages are initialized to zeros, with pages headers
 * initialized properly.
 */
static void
AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			nextidx;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
	XLogRecPtr	NewPageEndPtr = InvalidXLogRecPtr;
	XLogRecPtr	NewPageBeginPtr;
	XLogPageHeader NewPage;
	int			npages = 0;

	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);

	/*
	 * Now that we have the lock, check if someone initialized the page
	 * already.
	 */
	while (upto >= XLogCtl->InitializedUpTo || opportunistic)
	{
		nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);

		/*
		 * Get ending-offset of the buffer page we need to replace (this may
		 * be zero if the buffer hasn't been used yet). Fall through if it's
		 * already written out.
		 */
		OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
		if (LogwrtResult.Write < OldPageRqstPtr)
		{
			/*
			 * Nope, got work to do. If we just want to pre-initialize as much
			 * as we can without flushing, give up now.
			 */
			if (opportunistic)
				break;

			/* Before waiting, get info_lck and update LogwrtResult */
			SpinLockAcquire(&XLogCtl->info_lck);
			if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
				XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
			LogwrtResult = XLogCtl->LogwrtResult;
			SpinLockRelease(&XLogCtl->info_lck);

			/*
			 * Now that we have an up-to-date LogwrtResult value, see if we
			 * still need to write it or if someone else already did.
			 */
			if (LogwrtResult.Write < OldPageRqstPtr)
			{
				/*
				 * Must acquire write lock. Release WALBufMappingLock first,
				 * to make sure that all insertions that we need to wait for
				 * can finish (up to this same position). Otherwise we risk
				 * deadlock.
				 */
				LWLockRelease(WALBufMappingLock);

				WaitXLogInsertionsToFinish(OldPageRqstPtr);

				LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

				LogwrtResult = XLogCtl->LogwrtResult;
				if (LogwrtResult.Write >= OldPageRqstPtr)
				{
					/* OK, someone wrote it already */
					LWLockRelease(WALWriteLock);
				}
				else
				{
					/* Have to write it ourselves */
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
					WriteRqst.Write = OldPageRqstPtr;
					WriteRqst.Flush = 0;
					XLogWrite(WriteRqst, false);
					LWLockRelease(WALWriteLock);
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
				}
				/* Re-acquire WALBufMappingLock and retry */
				LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
				continue;
			}
		}

		/*
		 * Now the next buffer slot is free and we can set it up to be the
		 * next output page.
		 */
		NewPageBeginPtr = XLogCtl->InitializedUpTo;
		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;

		Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);

		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);

		/*
		 * Be sure to re-zero the buffer so that bytes beyond what we've
		 * written will look like zeroes and not valid XLOG records...
		 */
		MemSet((char *) NewPage, 0, XLOG_BLCKSZ);

		/*
		 * Fill the new page's header
		 */
		NewPage->xlp_magic = XLOG_PAGE_MAGIC;

		/* NewPage->xlp_info = 0; */	/* done by memset */
		NewPage->xlp_tli = ThisTimeLineID;
		NewPage->xlp_pageaddr = NewPageBeginPtr;

		/* NewPage->xlp_rem_len = 0; */	/* done by memset */

		/*
		 * If online backup is not in progress, mark the header to indicate
		 * that WAL records beginning in this page have removable backup
		 * blocks. This allows the WAL archiver to know whether it is safe to
		 * compress archived WAL data by transforming full-block records into
		 * the non-full-block format. It is sufficient to record this at the
		 * page level because we force a page switch (in fact a segment
		 * switch) when starting a backup, so the flag will be off before any
		 * records can be written during the backup. At the end of a backup,
		 * the last page will be marked as all unsafe when perhaps only part
		 * is unsafe, but at worst the archiver would miss the opportunity to
		 * compress a few records.
		 */
		if (!Insert->forcePageWrites)
			NewPage->xlp_info |= XLP_BKP_REMOVABLE;

		/*
		 * If a record was found to be broken at the end of recovery, and
		 * we're going to write on the page where its first contrecord was
		 * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page
		 * header. See CreateOverwriteContrecordRecord().
		 */
		if (missingContrecPtr == NewPageBeginPtr)
		{
			NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
			missingContrecPtr = InvalidXLogRecPtr;
		}

		/*
		 * If first page of an XLOG segment file, make it a long header.
		 */
		if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
		{
			XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;

			NewLongPage->xlp_sysid = ControlFile->system_identifier;
			NewLongPage->xlp_seg_size = XLogSegSize;
			NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
			NewPage->xlp_info |= XLP_LONG_HEADER;
		}

		/*
		 * Make sure the initialization of the page becomes visible to others
		 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
		 * holding a lock.
		 */
		pg_write_barrier();

		*((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;

		XLogCtl->InitializedUpTo = NewPageEndPtr;

		npages++;
	}
	LWLockRelease(WALBufMappingLock);

#ifdef WAL_DEBUG
	if (XLOG_DEBUG && npages > 0)
	{
		elog(DEBUG1, "initialized %d pages, up to %X/%X",
			 npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
	}
#endif
}
2242
2243 /*
2244 * Calculate CheckPointSegments based on max_wal_size_mb and
2245 * checkpoint_completion_target.
2246 */
2247 static void
CalculateCheckpointSegments(void)2248 CalculateCheckpointSegments(void)
2249 {
2250 double target;
2251
2252 /*-------
2253 * Calculate the distance at which to trigger a checkpoint, to avoid
2254 * exceeding max_wal_size_mb. This is based on two assumptions:
2255 *
2256 * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
2257 * b) during checkpoint, we consume checkpoint_completion_target *
2258 * number of segments consumed between checkpoints.
2259 *-------
2260 */
2261 target = (double) ConvertToXSegs(max_wal_size_mb) / (2.0 + CheckPointCompletionTarget);
2262
2263 /* round down */
2264 CheckPointSegments = (int) target;
2265
2266 if (CheckPointSegments < 1)
2267 CheckPointSegments = 1;
2268 }
2269
/*
 * Set max_wal_size_mb to the new value, and recompute CheckPointSegments,
 * which is derived from it (see CalculateCheckpointSegments).
 */
void
assign_max_wal_size(int newval, void *extra)
{
	max_wal_size_mb = newval;
	CalculateCheckpointSegments();
}
2276
/*
 * Set CheckPointCompletionTarget to the new value, and recompute
 * CheckPointSegments, which is derived from it.
 */
void
assign_checkpoint_completion_target(double newval, void *extra)
{
	CheckPointCompletionTarget = newval;
	CalculateCheckpointSegments();
}
2283
2284 /*
2285 * At a checkpoint, how many WAL segments to recycle as preallocated future
2286 * XLOG segments? Returns the highest segment that should be preallocated.
2287 */
2288 static XLogSegNo
XLOGfileslop(XLogRecPtr PriorRedoPtr)2289 XLOGfileslop(XLogRecPtr PriorRedoPtr)
2290 {
2291 XLogSegNo minSegNo;
2292 XLogSegNo maxSegNo;
2293 double distance;
2294 XLogSegNo recycleSegNo;
2295
2296 /*
2297 * Calculate the segment numbers that min_wal_size_mb and max_wal_size_mb
2298 * correspond to. Always recycle enough segments to meet the minimum, and
2299 * remove enough segments to stay below the maximum.
2300 */
2301 minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + ConvertToXSegs(min_wal_size_mb) - 1;
2302 maxSegNo = PriorRedoPtr / XLOG_SEG_SIZE + ConvertToXSegs(max_wal_size_mb) - 1;
2303
2304 /*
2305 * Between those limits, recycle enough segments to get us through to the
2306 * estimated end of next checkpoint.
2307 *
2308 * To estimate where the next checkpoint will finish, assume that the
2309 * system runs steadily consuming CheckPointDistanceEstimate bytes between
2310 * every checkpoint.
2311 *
2312 * The reason this calculation is done from the prior checkpoint, not the
2313 * one that just finished, is that this behaves better if some checkpoint
2314 * cycles are abnormally short, like if you perform a manual checkpoint
2315 * right after a timed one. The manual checkpoint will make almost a full
2316 * cycle's worth of WAL segments available for recycling, because the
2317 * segments from the prior's prior, fully-sized checkpoint cycle are no
2318 * longer needed. However, the next checkpoint will make only few segments
2319 * available for recycling, the ones generated between the timed
2320 * checkpoint and the manual one right after that. If at the manual
2321 * checkpoint we only retained enough segments to get us to the next timed
2322 * one, and removed the rest, then at the next checkpoint we would not
2323 * have enough segments around for recycling, to get us to the checkpoint
2324 * after that. Basing the calculations on the distance from the prior redo
2325 * pointer largely fixes that problem.
2326 */
2327 distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2328 /* add 10% for good measure. */
2329 distance *= 1.10;
2330
2331 recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE);
2332
2333 if (recycleSegNo < minSegNo)
2334 recycleSegNo = minSegNo;
2335 if (recycleSegNo > maxSegNo)
2336 recycleSegNo = maxSegNo;
2337
2338 return recycleSegNo;
2339 }
2340
2341 /*
2342 * Check whether we've consumed enough xlog space that a checkpoint is needed.
2343 *
2344 * new_segno indicates a log file that has just been filled up (or read
2345 * during recovery). We measure the distance from RedoRecPtr to new_segno
2346 * and see if that exceeds CheckPointSegments.
2347 *
2348 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2349 */
2350 static bool
XLogCheckpointNeeded(XLogSegNo new_segno)2351 XLogCheckpointNeeded(XLogSegNo new_segno)
2352 {
2353 XLogSegNo old_segno;
2354
2355 XLByteToSeg(RedoRecPtr, old_segno);
2356
2357 if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2358 return true;
2359 return false;
2360 }
2361
/*
 * Write and/or fsync the log at least as far as WriteRqst indicates.
 *
 * If flexible == TRUE, we don't have to write as far as WriteRqst, but
 * may stop at any convenient boundary (such as a cache or logfile boundary).
 * This option allows us to avoid uselessly issuing multiple writes when a
 * single one would do.
 *
 * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
 * must be called before grabbing the lock, to make sure the data is ready to
 * write.
 */
static void
XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
	bool		ispartialpage;
	bool		last_iteration;
	bool		finishing_seg;
	bool		use_existent;
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;

	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

	/*
	 * Update local LogwrtResult (caller probably did this already, but...)
	 */
	LogwrtResult = XLogCtl->LogwrtResult;

	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
	 * write() call. npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing. Begin at the buffer containing the next unwritten
	 * page, or last partially written page.
	 */
	curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);

	while (LogwrtResult.Write < WriteRqst.Write)
	{
		/*
		 * Make sure we're not ahead of the insert process. This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
		 */
		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];

		if (LogwrtResult.Write >= EndPtr)
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
				 (uint32) (LogwrtResult.Write >> 32),
				 (uint32) LogwrtResult.Write,
				 (uint32) (EndPtr >> 32), (uint32) EndPtr);

		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = EndPtr;
		ispartialpage = WriteRqst.Write < LogwrtResult.Write;

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
		{
			/*
			 * Switch to new logfile segment. We cannot have any pending
			 * pages here (since we dump what we have at segment end).
			 */
			Assert(npages == 0);
			if (openLogFile >= 0)
				XLogFileClose();
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);

			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
			openLogOff = 0;
		}

		/* Make sure we have the current logfile open */
		if (openLogFile < 0)
		{
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
			openLogFile = XLogFileOpen(openLogSegNo);
			openLogOff = 0;
		}

		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
			startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
		}
		npages++;

		/*
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
		 */
		last_iteration = WriteRqst.Write <= LogwrtResult.Write;

		finishing_seg = !ispartialpage &&
			(startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;

		if (last_iteration ||
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
		{
			char	   *from;
			Size		nbytes;
			Size		nleft;
			int			written;

			/* Need to seek in the file? */
			if (openLogOff != startoffset)
			{
				if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not seek in log file %s to offset %u: %m",
									XLogFileNameP(ThisTimeLineID, openLogSegNo),
									startoffset)));
				openLogOff = startoffset;
			}

			/* OK to write the page(s) */
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
			nleft = nbytes;
			/* loop to handle partial writes */
			do
			{
				errno = 0;
				pgstat_report_wait_start(WAIT_EVENT_WAL_WRITE);
				written = write(openLogFile, from, nleft);
				pgstat_report_wait_end();
				if (written <= 0)
				{
					/* if write() was interrupted by a signal, just retry */
					if (errno == EINTR)
						continue;
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not write to log file %s "
									"at offset %u, length %zu: %m",
									XLogFileNameP(ThisTimeLineID, openLogSegNo),
									openLogOff, nbytes)));
				}
				nleft -= written;
				from += written;
			} while (nleft > 0);

			/* Update state for write */
			openLogOff += nbytes;
			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately. This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
			 * This is also the right place to notify the Archiver that the
			 * segment is ready to copy to archival storage, and to update the
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
			 */
			if (finishing_seg)
			{
				issue_xlog_fsync(openLogFile, openLogSegNo);

				/* signal that we need to wakeup walsenders later */
				WalSndWakeupRequest();

				LogwrtResult.Flush = LogwrtResult.Write;	/* end of page */

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogSegNo);

				XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
				XLogCtl->lastSegSwitchLSN = LogwrtResult.Flush;

				/*
				 * Request a checkpoint if we've consumed too much xlog since
				 * the last one. For speed, we first check using the local
				 * copy of RedoRecPtr, which might be out of date; if it looks
				 * like a checkpoint is needed, forcibly update RedoRecPtr and
				 * recheck.
				 */
				if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
				{
					(void) GetRedoRecPtr();
					if (XLogCheckpointNeeded(openLogSegNo))
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
				}
			}
		}

		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
	}

	Assert(npages == 0);

	/*
	 * If asked to flush, do so
	 */
	if (LogwrtResult.Flush < WriteRqst.Flush &&
		LogwrtResult.Flush < LogwrtResult.Write)
	{
		/*
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one. However, we do not need to
		 * fsync more than one file.
		 */
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
		{
			if (openLogFile >= 0 &&
				!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
				XLogFileClose();
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
				openLogFile = XLogFileOpen(openLogSegNo);
				openLogOff = 0;
			}

			issue_xlog_fsync(openLogFile, openLogSegNo);
		}

		/* signal that we need to wakeup walsenders later */
		WalSndWakeupRequest();

		LogwrtResult.Flush = LogwrtResult.Write;
	}

	/*
	 * Update shared-memory status
	 *
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values. This is not absolutely essential, but it saves some
	 * code in a couple of places.
	 */
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->LogwrtResult = LogwrtResult;
		if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
			XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
			XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease(&XLogCtl->info_lck);
	}
}
2638
2639 /*
2640 * Record the LSN for an asynchronous transaction commit/abort
2641 * and nudge the WALWriter if there is work for it to do.
2642 * (This should not be called for synchronous commits.)
2643 */
2644 void
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)2645 XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
2646 {
2647 XLogRecPtr WriteRqstPtr = asyncXactLSN;
2648 bool sleeping;
2649
2650 SpinLockAcquire(&XLogCtl->info_lck);
2651 LogwrtResult = XLogCtl->LogwrtResult;
2652 sleeping = XLogCtl->WalWriterSleeping;
2653 if (XLogCtl->asyncXactLSN < asyncXactLSN)
2654 XLogCtl->asyncXactLSN = asyncXactLSN;
2655 SpinLockRelease(&XLogCtl->info_lck);
2656
2657 /*
2658 * If the WALWriter is sleeping, we should kick it to make it come out of
2659 * low-power mode. Otherwise, determine whether there's a full page of
2660 * WAL available to write.
2661 */
2662 if (!sleeping)
2663 {
2664 /* back off to last completed page boundary */
2665 WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;
2666
2667 /* if we have already flushed that far, we're done */
2668 if (WriteRqstPtr <= LogwrtResult.Flush)
2669 return;
2670 }
2671
2672 /*
2673 * Nudge the WALWriter: it has a full page of WAL to write, or we want it
2674 * to come out of low-power mode so that this async commit will reach disk
2675 * within the expected amount of time.
2676 */
2677 if (ProcGlobal->walwriterLatch)
2678 SetLatch(ProcGlobal->walwriterLatch);
2679 }
2680
2681 /*
2682 * Record the LSN up to which we can remove WAL because it's not required by
2683 * any replication slot.
2684 */
2685 void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)2686 XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
2687 {
2688 SpinLockAcquire(&XLogCtl->info_lck);
2689 XLogCtl->replicationSlotMinLSN = lsn;
2690 SpinLockRelease(&XLogCtl->info_lck);
2691 }
2692
2693
2694 /*
2695 * Return the oldest LSN we must retain to satisfy the needs of some
2696 * replication slot.
2697 */
2698 static XLogRecPtr
XLogGetReplicationSlotMinimumLSN(void)2699 XLogGetReplicationSlotMinimumLSN(void)
2700 {
2701 XLogRecPtr retval;
2702
2703 SpinLockAcquire(&XLogCtl->info_lck);
2704 retval = XLogCtl->replicationSlotMinLSN;
2705 SpinLockRelease(&XLogCtl->info_lck);
2706
2707 return retval;
2708 }
2709
2710 /*
2711 * Advance minRecoveryPoint in control file.
2712 *
2713 * If we crash during recovery, we must reach this point again before the
2714 * database is consistent.
2715 *
2716 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2717 * is only updated if it's not already greater than or equal to 'lsn'.
2718 */
2719 static void
UpdateMinRecoveryPoint(XLogRecPtr lsn,bool force)2720 UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
2721 {
2722 /* Quick check using our local copy of the variable */
2723 if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
2724 return;
2725
2726 /*
2727 * An invalid minRecoveryPoint means that we need to recover all the WAL,
2728 * i.e., we're doing crash recovery. We never modify the control file's
2729 * value in that case, so we can short-circuit future checks here too. The
2730 * local values of minRecoveryPoint and minRecoveryPointTLI should not be
2731 * updated until crash recovery finishes. We only do this for the startup
2732 * process as it should not update its own reference of minRecoveryPoint
2733 * until it has finished crash recovery to make sure that all WAL
2734 * available is replayed in this case. This also saves from extra locks
2735 * taken on the control file from the startup process.
2736 */
2737 if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
2738 {
2739 updateMinRecoveryPoint = false;
2740 return;
2741 }
2742
2743 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
2744
2745 /* update local copy */
2746 minRecoveryPoint = ControlFile->minRecoveryPoint;
2747 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
2748
2749 if (XLogRecPtrIsInvalid(minRecoveryPoint))
2750 updateMinRecoveryPoint = false;
2751 else if (force || minRecoveryPoint < lsn)
2752 {
2753 XLogRecPtr newMinRecoveryPoint;
2754 TimeLineID newMinRecoveryPointTLI;
2755
2756 /*
2757 * To avoid having to update the control file too often, we update it
2758 * all the way to the last record being replayed, even though 'lsn'
2759 * would suffice for correctness. This also allows the 'force' case
2760 * to not need a valid 'lsn' value.
2761 *
2762 * Another important reason for doing it this way is that the passed
2763 * 'lsn' value could be bogus, i.e., past the end of available WAL, if
2764 * the caller got it from a corrupted heap page. Accepting such a
2765 * value as the min recovery point would prevent us from coming up at
2766 * all. Instead, we just log a warning and continue with recovery.
2767 * (See also the comments about corrupt LSNs in XLogFlush.)
2768 */
2769 SpinLockAcquire(&XLogCtl->info_lck);
2770 newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
2771 newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
2772 SpinLockRelease(&XLogCtl->info_lck);
2773
2774 if (!force && newMinRecoveryPoint < lsn)
2775 elog(WARNING,
2776 "xlog min recovery request %X/%X is past current point %X/%X",
2777 (uint32) (lsn >> 32), (uint32) lsn,
2778 (uint32) (newMinRecoveryPoint >> 32),
2779 (uint32) newMinRecoveryPoint);
2780
2781 /* update control file */
2782 if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
2783 {
2784 ControlFile->minRecoveryPoint = newMinRecoveryPoint;
2785 ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
2786 UpdateControlFile();
2787 minRecoveryPoint = newMinRecoveryPoint;
2788 minRecoveryPointTLI = newMinRecoveryPointTLI;
2789
2790 ereport(DEBUG2,
2791 (errmsg("updated min recovery point to %X/%X on timeline %u",
2792 (uint32) (minRecoveryPoint >> 32),
2793 (uint32) minRecoveryPoint,
2794 newMinRecoveryPointTLI)));
2795 }
2796 }
2797 LWLockRelease(ControlFileLock);
2798 }
2799
2800 /*
2801 * Ensure that all XLOG data through the given position is flushed to disk.
2802 *
2803 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2804 * already held, and we try to avoid acquiring it if possible.
2805 */
2806 void
XLogFlush(XLogRecPtr record)2807 XLogFlush(XLogRecPtr record)
2808 {
2809 XLogRecPtr WriteRqstPtr;
2810 XLogwrtRqst WriteRqst;
2811
2812 /*
2813 * During REDO, we are reading not writing WAL. Therefore, instead of
2814 * trying to flush the WAL, we should update minRecoveryPoint instead. We
2815 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2816 * to act this way too, and because when it tries to write the
2817 * end-of-recovery checkpoint, it should indeed flush.
2818 */
2819 if (!XLogInsertAllowed())
2820 {
2821 UpdateMinRecoveryPoint(record, false);
2822 return;
2823 }
2824
2825 /* Quick exit if already known flushed */
2826 if (record <= LogwrtResult.Flush)
2827 return;
2828
2829 #ifdef WAL_DEBUG
2830 if (XLOG_DEBUG)
2831 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2832 (uint32) (record >> 32), (uint32) record,
2833 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2834 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2835 #endif
2836
2837 START_CRIT_SECTION();
2838
2839 /*
2840 * Since fsync is usually a horribly expensive operation, we try to
2841 * piggyback as much data as we can on each fsync: if we see any more data
2842 * entered into the xlog buffer, we'll write and fsync that too, so that
2843 * the final value of LogwrtResult.Flush is as large as possible. This
2844 * gives us some chance of avoiding another fsync immediately after.
2845 */
2846
2847 /* initialize to given target; may increase below */
2848 WriteRqstPtr = record;
2849
2850 /*
2851 * Now wait until we get the write lock, or someone else does the flush
2852 * for us.
2853 */
2854 for (;;)
2855 {
2856 XLogRecPtr insertpos;
2857
2858 /* read LogwrtResult and update local state */
2859 SpinLockAcquire(&XLogCtl->info_lck);
2860 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2861 WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2862 LogwrtResult = XLogCtl->LogwrtResult;
2863 SpinLockRelease(&XLogCtl->info_lck);
2864
2865 /* done already? */
2866 if (record <= LogwrtResult.Flush)
2867 break;
2868
2869 /*
2870 * Before actually performing the write, wait for all in-flight
2871 * insertions to the pages we're about to write to finish.
2872 */
2873 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2874
2875 /*
2876 * Try to get the write lock. If we can't get it immediately, wait
2877 * until it's released, and recheck if we still need to do the flush
2878 * or if the backend that held the lock did it for us already. This
2879 * helps to maintain a good rate of group committing when the system
2880 * is bottlenecked by the speed of fsyncing.
2881 */
2882 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2883 {
2884 /*
2885 * The lock is now free, but we didn't acquire it yet. Before we
2886 * do, loop back to check if someone else flushed the record for
2887 * us already.
2888 */
2889 continue;
2890 }
2891
2892 /* Got the lock; recheck whether request is satisfied */
2893 LogwrtResult = XLogCtl->LogwrtResult;
2894 if (record <= LogwrtResult.Flush)
2895 {
2896 LWLockRelease(WALWriteLock);
2897 break;
2898 }
2899
2900 /*
2901 * Sleep before flush! By adding a delay here, we may give further
2902 * backends the opportunity to join the backlog of group commit
2903 * followers; this can significantly improve transaction throughput,
2904 * at the risk of increasing transaction latency.
2905 *
2906 * We do not sleep if enableFsync is not turned on, nor if there are
2907 * fewer than CommitSiblings other backends with active transactions.
2908 */
2909 if (CommitDelay > 0 && enableFsync &&
2910 MinimumActiveBackends(CommitSiblings))
2911 {
2912 pg_usleep(CommitDelay);
2913
2914 /*
2915 * Re-check how far we can now flush the WAL. It's generally not
2916 * safe to call WaitXLogInsertionsToFinish while holding
2917 * WALWriteLock, because an in-progress insertion might need to
2918 * also grab WALWriteLock to make progress. But we know that all
2919 * the insertions up to insertpos have already finished, because
2920 * that's what the earlier WaitXLogInsertionsToFinish() returned.
2921 * We're only calling it again to allow insertpos to be moved
2922 * further forward, not to actually wait for anyone.
2923 */
2924 insertpos = WaitXLogInsertionsToFinish(insertpos);
2925 }
2926
2927 /* try to write/flush later additions to XLOG as well */
2928 WriteRqst.Write = insertpos;
2929 WriteRqst.Flush = insertpos;
2930
2931 XLogWrite(WriteRqst, false);
2932
2933 LWLockRelease(WALWriteLock);
2934 /* done */
2935 break;
2936 }
2937
2938 END_CRIT_SECTION();
2939
2940 /* wake up walsenders now that we've released heavily contended locks */
2941 WalSndWakeupProcessRequests();
2942
2943 /*
2944 * If we still haven't flushed to the request point then we have a
2945 * problem; most likely, the requested flush point is past end of XLOG.
2946 * This has been seen to occur when a disk page has a corrupted LSN.
2947 *
2948 * Formerly we treated this as a PANIC condition, but that hurts the
2949 * system's robustness rather than helping it: we do not want to take down
2950 * the whole system due to corruption on one data page. In particular, if
2951 * the bad page is encountered again during recovery then we would be
2952 * unable to restart the database at all! (This scenario actually
2953 * happened in the field several times with 7.1 releases.) As of 8.4, bad
2954 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2955 * the only time we can reach here during recovery is while flushing the
2956 * end-of-recovery checkpoint record, and we don't expect that to have a
2957 * bad LSN.
2958 *
2959 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2960 * since xact.c calls this routine inside a critical section. However,
2961 * calls from bufmgr.c are not within critical sections and so we will not
2962 * force a restart for a bad LSN on a data page.
2963 */
2964 if (LogwrtResult.Flush < record)
2965 elog(ERROR,
2966 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2967 (uint32) (record >> 32), (uint32) record,
2968 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2969 }
2970
2971 /*
2972 * Write & flush xlog, but without specifying exactly where to.
2973 *
2974 * We normally write only completed blocks; but if there is nothing to do on
2975 * that basis, we check for unwritten async commits in the current incomplete
2976 * block, and write through the latest one of those. Thus, if async commits
2977 * are not being used, we will write complete blocks only.
2978 *
2979 * If, based on the above, there's anything to write we do so immediately. But
2980 * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2981 * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2982 * more than wal_writer_flush_after unflushed blocks.
2983 *
2984 * We can guarantee that async commits reach disk after at most three
2985 * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2986 * to write "flexibly", meaning it can stop at the end of the buffer ring;
2987 * this makes a difference only with very high load or long wal_writer_delay,
2988 * but imposes one extra cycle for the worst case for async commits.)
2989 *
2990 * This routine is invoked periodically by the background walwriter process.
2991 *
2992 * Returns TRUE if there was any work to do, even if we skipped flushing due
2993 * to wal_writer_delay/wal_writer_flush_after.
2994 */
2995 bool
XLogBackgroundFlush(void)2996 XLogBackgroundFlush(void)
2997 {
2998 XLogwrtRqst WriteRqst;
2999 bool flexible = true;
3000 static TimestampTz lastflush;
3001 TimestampTz now;
3002 int flushbytes;
3003
3004 /* XLOG doesn't need flushing during recovery */
3005 if (RecoveryInProgress())
3006 return false;
3007
3008 /* read LogwrtResult and update local state */
3009 SpinLockAcquire(&XLogCtl->info_lck);
3010 LogwrtResult = XLogCtl->LogwrtResult;
3011 WriteRqst = XLogCtl->LogwrtRqst;
3012 SpinLockRelease(&XLogCtl->info_lck);
3013
3014 /* back off to last completed page boundary */
3015 WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
3016
3017 /* if we have already flushed that far, consider async commit records */
3018 if (WriteRqst.Write <= LogwrtResult.Flush)
3019 {
3020 SpinLockAcquire(&XLogCtl->info_lck);
3021 WriteRqst.Write = XLogCtl->asyncXactLSN;
3022 SpinLockRelease(&XLogCtl->info_lck);
3023 flexible = false; /* ensure it all gets written */
3024 }
3025
3026 /*
3027 * If already known flushed, we're done. Just need to check if we are
3028 * holding an open file handle to a logfile that's no longer in use,
3029 * preventing the file from being deleted.
3030 */
3031 if (WriteRqst.Write <= LogwrtResult.Flush)
3032 {
3033 if (openLogFile >= 0)
3034 {
3035 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
3036 {
3037 XLogFileClose();
3038 }
3039 }
3040 return false;
3041 }
3042
3043 /*
3044 * Determine how far to flush WAL, based on the wal_writer_delay and
3045 * wal_writer_flush_after GUCs.
3046 */
3047 now = GetCurrentTimestamp();
3048 flushbytes =
3049 WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
3050
3051 if (WalWriterFlushAfter == 0 || lastflush == 0)
3052 {
3053 /* first call, or block based limits disabled */
3054 WriteRqst.Flush = WriteRqst.Write;
3055 lastflush = now;
3056 }
3057 else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
3058 {
3059 /*
3060 * Flush the writes at least every WalWriteDelay ms. This is important
3061 * to bound the amount of time it takes for an asynchronous commit to
3062 * hit disk.
3063 */
3064 WriteRqst.Flush = WriteRqst.Write;
3065 lastflush = now;
3066 }
3067 else if (flushbytes >= WalWriterFlushAfter)
3068 {
3069 /* exceeded wal_writer_flush_after blocks, flush */
3070 WriteRqst.Flush = WriteRqst.Write;
3071 lastflush = now;
3072 }
3073 else
3074 {
3075 /* no flushing, this time round */
3076 WriteRqst.Flush = 0;
3077 }
3078
3079 #ifdef WAL_DEBUG
3080 if (XLOG_DEBUG)
3081 elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
3082 (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
3083 (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
3084 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
3085 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
3086 #endif
3087
3088 START_CRIT_SECTION();
3089
3090 /* now wait for any in-progress insertions to finish and get write lock */
3091 WaitXLogInsertionsToFinish(WriteRqst.Write);
3092 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
3093 LogwrtResult = XLogCtl->LogwrtResult;
3094 if (WriteRqst.Write > LogwrtResult.Write ||
3095 WriteRqst.Flush > LogwrtResult.Flush)
3096 {
3097 XLogWrite(WriteRqst, flexible);
3098 }
3099 LWLockRelease(WALWriteLock);
3100
3101 END_CRIT_SECTION();
3102
3103 /* wake up walsenders now that we've released heavily contended locks */
3104 WalSndWakeupProcessRequests();
3105
3106 /*
3107 * Great, done. To take some work off the critical path, try to initialize
3108 * as many of the no-longer-needed WAL buffers for future use as we can.
3109 */
3110 AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
3111
3112 /*
3113 * If we determined that we need to write data, but somebody else
3114 * wrote/flushed already, it should be considered as being active, to
3115 * avoid hibernating too early.
3116 */
3117 return true;
3118 }
3119
3120 /*
3121 * Test whether XLOG data has been flushed up to (at least) the given position.
3122 *
3123 * Returns true if a flush is still needed. (It may be that someone else
3124 * is already in process of flushing that far, however.)
3125 */
3126 bool
XLogNeedsFlush(XLogRecPtr record)3127 XLogNeedsFlush(XLogRecPtr record)
3128 {
3129 /*
3130 * During recovery, we don't flush WAL but update minRecoveryPoint
3131 * instead. So "needs flush" is taken to mean whether minRecoveryPoint
3132 * would need to be updated.
3133 */
3134 if (RecoveryInProgress())
3135 {
3136 /*
3137 * An invalid minRecoveryPoint means that we need to recover all the
3138 * WAL, i.e., we're doing crash recovery. We never modify the control
3139 * file's value in that case, so we can short-circuit future checks
3140 * here too. This triggers a quick exit path for the startup process,
3141 * which cannot update its local copy of minRecoveryPoint as long as
3142 * it has not replayed all WAL available when doing crash recovery.
3143 */
3144 if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
3145 updateMinRecoveryPoint = false;
3146
3147 /* Quick exit if already known to be updated or cannot be updated */
3148 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3149 return false;
3150
3151 /*
3152 * Update local copy of minRecoveryPoint. But if the lock is busy,
3153 * just return a conservative guess.
3154 */
3155 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3156 return true;
3157 minRecoveryPoint = ControlFile->minRecoveryPoint;
3158 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3159 LWLockRelease(ControlFileLock);
3160
3161 /*
3162 * Check minRecoveryPoint for any other process than the startup
3163 * process doing crash recovery, which should not update the control
3164 * file value if crash recovery is still running.
3165 */
3166 if (XLogRecPtrIsInvalid(minRecoveryPoint))
3167 updateMinRecoveryPoint = false;
3168
3169 /* check again */
3170 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3171 return false;
3172 else
3173 return true;
3174 }
3175
3176 /* Quick exit if already known flushed */
3177 if (record <= LogwrtResult.Flush)
3178 return false;
3179
3180 /* read LogwrtResult and update local state */
3181 SpinLockAcquire(&XLogCtl->info_lck);
3182 LogwrtResult = XLogCtl->LogwrtResult;
3183 SpinLockRelease(&XLogCtl->info_lck);
3184
3185 /* check again */
3186 if (record <= LogwrtResult.Flush)
3187 return false;
3188
3189 return true;
3190 }
3191
3192 /*
3193 * Create a new XLOG file segment, or open a pre-existing one.
3194 *
3195 * log, seg: identify segment to be created/opened.
3196 *
3197 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3198 * pre-existing file will be deleted). On return, TRUE if a pre-existing
3199 * file was used.
3200 *
3201 * use_lock: if TRUE, acquire ControlFileLock while moving file into
3202 * place. This should be TRUE except during bootstrap log creation. The
3203 * caller must *not* hold the lock at call.
3204 *
3205 * Returns FD of opened file.
3206 *
3207 * Note: errors here are ERROR not PANIC because we might or might not be
3208 * inside a critical section (eg, during checkpoint there is no reason to
3209 * take down the system on failure). They will promote to PANIC if we are
3210 * in a critical section.
3211 */
3212 int
XLogFileInit(XLogSegNo logsegno,bool * use_existent,bool use_lock)3213 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3214 {
3215 char path[MAXPGPATH];
3216 char tmppath[MAXPGPATH];
3217 PGAlignedXLogBlock zbuffer;
3218 XLogSegNo installed_segno;
3219 XLogSegNo max_segno;
3220 int fd;
3221 int nbytes;
3222
3223 XLogFilePath(path, ThisTimeLineID, logsegno);
3224
3225 /*
3226 * Try to use existent file (checkpoint maker may have created it already)
3227 */
3228 if (*use_existent)
3229 {
3230 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3231 S_IRUSR | S_IWUSR);
3232 if (fd < 0)
3233 {
3234 if (errno != ENOENT)
3235 ereport(ERROR,
3236 (errcode_for_file_access(),
3237 errmsg("could not open file \"%s\": %m", path)));
3238 }
3239 else
3240 return fd;
3241 }
3242
3243 /*
3244 * Initialize an empty (all zeroes) segment. NOTE: it is possible that
3245 * another process is doing the same thing. If so, we will end up
3246 * pre-creating an extra log segment. That seems OK, and better than
3247 * holding the lock throughout this lengthy process.
3248 */
3249 elog(DEBUG2, "creating and filling new WAL file");
3250
3251 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3252
3253 unlink(tmppath);
3254
3255 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3256 fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3257 S_IRUSR | S_IWUSR);
3258 if (fd < 0)
3259 ereport(ERROR,
3260 (errcode_for_file_access(),
3261 errmsg("could not create file \"%s\": %m", tmppath)));
3262
3263 /*
3264 * Zero-fill the file. We have to do this the hard way to ensure that all
3265 * the file space has really been allocated --- on platforms that allow
3266 * "holes" in files, just seeking to the end doesn't allocate intermediate
3267 * space. This way, we know that we have all the space and (after the
3268 * fsync below) that all the indirect blocks are down on disk. Therefore,
3269 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3270 * log file.
3271 */
3272 memset(zbuffer.data, 0, XLOG_BLCKSZ);
3273 for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3274 {
3275 errno = 0;
3276 pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_WRITE);
3277 if ((int) write(fd, zbuffer.data, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3278 {
3279 int save_errno = errno;
3280
3281 /*
3282 * If we fail to make the file, delete it to release disk space
3283 */
3284 unlink(tmppath);
3285
3286 close(fd);
3287
3288 /* if write didn't set errno, assume problem is no disk space */
3289 errno = save_errno ? save_errno : ENOSPC;
3290
3291 ereport(ERROR,
3292 (errcode_for_file_access(),
3293 errmsg("could not write to file \"%s\": %m", tmppath)));
3294 }
3295 pgstat_report_wait_end();
3296 }
3297
3298 pgstat_report_wait_start(WAIT_EVENT_WAL_INIT_SYNC);
3299 if (pg_fsync(fd) != 0)
3300 {
3301 int save_errno = errno;
3302
3303 close(fd);
3304 errno = save_errno;
3305 ereport(ERROR,
3306 (errcode_for_file_access(),
3307 errmsg("could not fsync file \"%s\": %m", tmppath)));
3308 }
3309 pgstat_report_wait_end();
3310
3311 if (close(fd))
3312 ereport(ERROR,
3313 (errcode_for_file_access(),
3314 errmsg("could not close file \"%s\": %m", tmppath)));
3315
3316 /*
3317 * Now move the segment into place with its final name.
3318 *
3319 * If caller didn't want to use a pre-existing file, get rid of any
3320 * pre-existing file. Otherwise, cope with possibility that someone else
3321 * has created the file while we were filling ours: if so, use ours to
3322 * pre-create a future log segment.
3323 */
3324 installed_segno = logsegno;
3325
3326 /*
3327 * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3328 * that was a constant, but that was always a bit dubious: normally, at a
3329 * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3330 * here, it was the offset from the insert location. We can't do the
3331 * normal XLOGfileslop calculation here because we don't have access to
3332 * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3333 * CheckPointSegments.
3334 */
3335 max_segno = logsegno + CheckPointSegments;
3336 if (!InstallXLogFileSegment(&installed_segno, tmppath,
3337 *use_existent, max_segno,
3338 use_lock))
3339 {
3340 /*
3341 * No need for any more future segments, or InstallXLogFileSegment()
3342 * failed to rename the file into place. If the rename failed, opening
3343 * the file below will fail.
3344 */
3345 unlink(tmppath);
3346 }
3347
3348 /* Set flag to tell caller there was no existent file */
3349 *use_existent = false;
3350
3351 /* Now open original target segment (might not be file I just made) */
3352 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3353 S_IRUSR | S_IWUSR);
3354 if (fd < 0)
3355 ereport(ERROR,
3356 (errcode_for_file_access(),
3357 errmsg("could not open file \"%s\": %m", path)));
3358
3359 elog(DEBUG2, "done creating and filling new WAL file");
3360
3361 return fd;
3362 }
3363
3364 /*
3365 * Create a new XLOG file segment by copying a pre-existing one.
3366 *
3367 * destsegno: identify segment to be created.
3368 *
3369 * srcTLI, srcsegno: identify segment to be copied (could be from
3370 * a different timeline)
3371 *
3372 * upto: how much of the source file to copy (the rest is filled with
3373 * zeros)
3374 *
3375 * Currently this is only used during recovery, and so there are no locking
3376 * considerations. But we should be just as tense as XLogFileInit to avoid
3377 * emplacing a bogus file.
3378 */
3379 static void
XLogFileCopy(XLogSegNo destsegno,TimeLineID srcTLI,XLogSegNo srcsegno,int upto)3380 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3381 int upto)
3382 {
3383 char path[MAXPGPATH];
3384 char tmppath[MAXPGPATH];
3385 PGAlignedXLogBlock buffer;
3386 int srcfd;
3387 int fd;
3388 int nbytes;
3389
3390 /*
3391 * Open the source file
3392 */
3393 XLogFilePath(path, srcTLI, srcsegno);
3394 srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3395 if (srcfd < 0)
3396 ereport(ERROR,
3397 (errcode_for_file_access(),
3398 errmsg("could not open file \"%s\": %m", path)));
3399
3400 /*
3401 * Copy into a temp file name.
3402 */
3403 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3404
3405 unlink(tmppath);
3406
3407 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3408 fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3409 S_IRUSR | S_IWUSR);
3410 if (fd < 0)
3411 ereport(ERROR,
3412 (errcode_for_file_access(),
3413 errmsg("could not create file \"%s\": %m", tmppath)));
3414
3415 /*
3416 * Do the data copying.
3417 */
3418 for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3419 {
3420 int nread;
3421
3422 nread = upto - nbytes;
3423
3424 /*
3425 * The part that is not read from the source file is filled with
3426 * zeros.
3427 */
3428 if (nread < sizeof(buffer))
3429 memset(buffer.data, 0, sizeof(buffer));
3430
3431 if (nread > 0)
3432 {
3433 if (nread > sizeof(buffer))
3434 nread = sizeof(buffer);
3435 errno = 0;
3436 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_READ);
3437 if (read(srcfd, buffer.data, nread) != nread)
3438 {
3439 if (errno != 0)
3440 ereport(ERROR,
3441 (errcode_for_file_access(),
3442 errmsg("could not read file \"%s\": %m",
3443 path)));
3444 else
3445 ereport(ERROR,
3446 (errmsg("not enough data in file \"%s\"",
3447 path)));
3448 }
3449 pgstat_report_wait_end();
3450 }
3451 errno = 0;
3452 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_WRITE);
3453 if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3454 {
3455 int save_errno = errno;
3456
3457 /*
3458 * If we fail to make the file, delete it to release disk space
3459 */
3460 unlink(tmppath);
3461 /* if write didn't set errno, assume problem is no disk space */
3462 errno = save_errno ? save_errno : ENOSPC;
3463
3464 ereport(ERROR,
3465 (errcode_for_file_access(),
3466 errmsg("could not write to file \"%s\": %m", tmppath)));
3467 }
3468 pgstat_report_wait_end();
3469 }
3470
3471 pgstat_report_wait_start(WAIT_EVENT_WAL_COPY_SYNC);
3472 if (pg_fsync(fd) != 0)
3473 ereport(data_sync_elevel(ERROR),
3474 (errcode_for_file_access(),
3475 errmsg("could not fsync file \"%s\": %m", tmppath)));
3476 pgstat_report_wait_end();
3477
3478 if (CloseTransientFile(fd))
3479 ereport(ERROR,
3480 (errcode_for_file_access(),
3481 errmsg("could not close file \"%s\": %m", tmppath)));
3482
3483 CloseTransientFile(srcfd);
3484
3485 /*
3486 * Now move the segment into place with its final name.
3487 */
3488 if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3489 elog(ERROR, "InstallXLogFileSegment should not have failed");
3490 }
3491
3492 /*
3493 * Install a new XLOG segment file as a current or future log segment.
3494 *
3495 * This is used both to install a newly-created segment (which has a temp
3496 * filename while it's being created) and to recycle an old segment.
3497 *
3498 * *segno: identify segment to install as (or first possible target).
3499 * When find_free is TRUE, this is modified on return to indicate the
3500 * actual installation location or last segment searched.
3501 *
3502 * tmppath: initial name of file to install. It will be renamed into place.
3503 *
3504 * find_free: if TRUE, install the new segment at the first empty segno
3505 * number at or after the passed numbers. If FALSE, install the new segment
3506 * exactly where specified, deleting any existing segment file there.
3507 *
3508 * max_segno: maximum segment number to install the new file as. Fail if no
3509 * free slot is found between *segno and max_segno. (Ignored when find_free
3510 * is FALSE.)
3511 *
3512 * use_lock: if TRUE, acquire ControlFileLock while moving file into
3513 * place. This should be TRUE except during bootstrap log creation. The
3514 * caller must *not* hold the lock at call.
3515 *
3516 * Returns TRUE if the file was installed successfully. FALSE indicates that
3517 * max_segno limit was exceeded, or an error occurred while renaming the
3518 * file into place.
3519 */
3520 static bool
InstallXLogFileSegment(XLogSegNo * segno,char * tmppath,bool find_free,XLogSegNo max_segno,bool use_lock)3521 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3522 bool find_free, XLogSegNo max_segno,
3523 bool use_lock)
3524 {
3525 char path[MAXPGPATH];
3526 struct stat stat_buf;
3527
3528 XLogFilePath(path, ThisTimeLineID, *segno);
3529
3530 /*
3531 * We want to be sure that only one process does this at a time.
3532 */
3533 if (use_lock)
3534 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3535
3536 if (!find_free)
3537 {
3538 /* Force installation: get rid of any pre-existing segment file */
3539 durable_unlink(path, DEBUG1);
3540 }
3541 else
3542 {
3543 /* Find a free slot to put it in */
3544 while (stat(path, &stat_buf) == 0)
3545 {
3546 if ((*segno) >= max_segno)
3547 {
3548 /* Failed to find a free slot within specified range */
3549 if (use_lock)
3550 LWLockRelease(ControlFileLock);
3551 return false;
3552 }
3553 (*segno)++;
3554 XLogFilePath(path, ThisTimeLineID, *segno);
3555 }
3556 }
3557
3558 /*
3559 * Perform the rename using link if available, paranoidly trying to avoid
3560 * overwriting an existing file (there shouldn't be one).
3561 */
3562 if (durable_link_or_rename(tmppath, path, LOG) != 0)
3563 {
3564 if (use_lock)
3565 LWLockRelease(ControlFileLock);
3566 /* durable_link_or_rename already emitted log message */
3567 return false;
3568 }
3569
3570 if (use_lock)
3571 LWLockRelease(ControlFileLock);
3572
3573 return true;
3574 }
3575
3576 /*
3577 * Open a pre-existing logfile segment for writing.
3578 */
3579 int
XLogFileOpen(XLogSegNo segno)3580 XLogFileOpen(XLogSegNo segno)
3581 {
3582 char path[MAXPGPATH];
3583 int fd;
3584
3585 XLogFilePath(path, ThisTimeLineID, segno);
3586
3587 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3588 S_IRUSR | S_IWUSR);
3589 if (fd < 0)
3590 ereport(PANIC,
3591 (errcode_for_file_access(),
3592 errmsg("could not open write-ahead log file \"%s\": %m", path)));
3593
3594 return fd;
3595 }
3596
3597 /*
3598 * Open a logfile segment for reading (during recovery).
3599 *
3600 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3601 * Otherwise, it's assumed to be already available in pg_wal.
3602 */
3603 static int
XLogFileRead(XLogSegNo segno,int emode,TimeLineID tli,int source,bool notfoundOk)3604 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3605 int source, bool notfoundOk)
3606 {
3607 char xlogfname[MAXFNAMELEN];
3608 char activitymsg[MAXFNAMELEN + 16];
3609 char path[MAXPGPATH];
3610 int fd;
3611
3612 XLogFileName(xlogfname, tli, segno);
3613
3614 switch (source)
3615 {
3616 case XLOG_FROM_ARCHIVE:
3617 /* Report recovery progress in PS display */
3618 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3619 xlogfname);
3620 set_ps_display(activitymsg, false);
3621
3622 restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3623 "RECOVERYXLOG",
3624 XLogSegSize,
3625 InRedo);
3626 if (!restoredFromArchive)
3627 return -1;
3628 break;
3629
3630 case XLOG_FROM_PG_WAL:
3631 case XLOG_FROM_STREAM:
3632 XLogFilePath(path, tli, segno);
3633 restoredFromArchive = false;
3634 break;
3635
3636 default:
3637 elog(ERROR, "invalid XLogFileRead source %d", source);
3638 }
3639
3640 /*
3641 * If the segment was fetched from archival storage, replace the existing
3642 * xlog segment (if any) with the archival version.
3643 */
3644 if (source == XLOG_FROM_ARCHIVE)
3645 {
3646 KeepFileRestoredFromArchive(path, xlogfname);
3647
3648 /*
3649 * Set path to point at the new file in pg_wal.
3650 */
3651 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3652 }
3653
3654 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3655 if (fd >= 0)
3656 {
3657 /* Success! */
3658 curFileTLI = tli;
3659
3660 /* Report recovery progress in PS display */
3661 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3662 xlogfname);
3663 set_ps_display(activitymsg, false);
3664
3665 /* Track source of data in assorted state variables */
3666 readSource = source;
3667 XLogReceiptSource = source;
3668 /* In FROM_STREAM case, caller tracks receipt time, not me */
3669 if (source != XLOG_FROM_STREAM)
3670 XLogReceiptTime = GetCurrentTimestamp();
3671
3672 return fd;
3673 }
3674 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3675 ereport(PANIC,
3676 (errcode_for_file_access(),
3677 errmsg("could not open file \"%s\": %m", path)));
3678 return -1;
3679 }
3680
3681 /*
3682 * Open a logfile segment for reading (during recovery).
3683 *
3684 * This version searches for the segment with any TLI listed in expectedTLEs.
3685 */
3686 static int
XLogFileReadAnyTLI(XLogSegNo segno,int emode,int source)3687 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3688 {
3689 char path[MAXPGPATH];
3690 ListCell *cell;
3691 int fd;
3692 List *tles;
3693
3694 /*
3695 * Loop looking for a suitable timeline ID: we might need to read any of
3696 * the timelines listed in expectedTLEs.
3697 *
3698 * We expect curFileTLI on entry to be the TLI of the preceding file in
3699 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
3700 * to go backwards; this prevents us from picking up the wrong file when a
3701 * parent timeline extends to higher segment numbers than the child we
3702 * want to read.
3703 *
3704 * If we haven't read the timeline history file yet, read it now, so that
3705 * we know which TLIs to scan. We don't save the list in expectedTLEs,
3706 * however, unless we actually find a valid segment. That way if there is
3707 * neither a timeline history file nor a WAL segment in the archive, and
3708 * streaming replication is set up, we'll read the timeline history file
3709 * streamed from the master when we start streaming, instead of recovering
3710 * with a dummy history generated here.
3711 */
3712 if (expectedTLEs)
3713 tles = expectedTLEs;
3714 else
3715 tles = readTimeLineHistory(recoveryTargetTLI);
3716
3717 foreach(cell, tles)
3718 {
3719 TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
3720 TimeLineID tli = hent->tli;
3721
3722 if (tli < curFileTLI)
3723 break; /* don't bother looking at too-old TLIs */
3724
3725 /*
3726 * Skip scanning the timeline ID that the logfile segment to read
3727 * doesn't belong to
3728 */
3729 if (hent->begin != InvalidXLogRecPtr)
3730 {
3731 XLogSegNo beginseg = 0;
3732
3733 XLByteToSeg(hent->begin, beginseg);
3734
3735 /*
3736 * The logfile segment that doesn't belong to the timeline is
3737 * older or newer than the segment that the timeline started or
3738 * ended at, respectively. It's sufficient to check only the
3739 * starting segment of the timeline here. Since the timelines are
3740 * scanned in descending order in this loop, any segments newer
3741 * than the ending segment should belong to newer timeline and
3742 * have already been read before. So it's not necessary to check
3743 * the ending segment of the timeline here.
3744 */
3745 if (segno < beginseg)
3746 continue;
3747 }
3748
3749 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3750 {
3751 fd = XLogFileRead(segno, emode, tli,
3752 XLOG_FROM_ARCHIVE, true);
3753 if (fd != -1)
3754 {
3755 elog(DEBUG1, "got WAL segment from archive");
3756 if (!expectedTLEs)
3757 expectedTLEs = tles;
3758 return fd;
3759 }
3760 }
3761
3762 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_WAL)
3763 {
3764 fd = XLogFileRead(segno, emode, tli,
3765 XLOG_FROM_PG_WAL, true);
3766 if (fd != -1)
3767 {
3768 if (!expectedTLEs)
3769 expectedTLEs = tles;
3770 return fd;
3771 }
3772 }
3773 }
3774
3775 /* Couldn't find it. For simplicity, complain about front timeline */
3776 XLogFilePath(path, recoveryTargetTLI, segno);
3777 errno = ENOENT;
3778 ereport(emode,
3779 (errcode_for_file_access(),
3780 errmsg("could not open file \"%s\": %m", path)));
3781 return -1;
3782 }
3783
3784 /*
3785 * Close the current logfile segment for writing.
3786 */
3787 static void
XLogFileClose(void)3788 XLogFileClose(void)
3789 {
3790 Assert(openLogFile >= 0);
3791
3792 /*
3793 * WAL segment files will not be re-read in normal operation, so we advise
3794 * the OS to release any cached pages. But do not do so if WAL archiving
3795 * or streaming is active, because archiver and walsender process could
3796 * use the cache to read the WAL segment.
3797 */
3798 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3799 if (!XLogIsNeeded())
3800 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3801 #endif
3802
3803 if (close(openLogFile))
3804 ereport(PANIC,
3805 (errcode_for_file_access(),
3806 errmsg("could not close log file %s: %m",
3807 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3808 openLogFile = -1;
3809 }
3810
3811 /*
3812 * Preallocate log files beyond the specified log endpoint.
3813 *
3814 * XXX this is currently extremely conservative, since it forces only one
3815 * future log segment to exist, and even that only if we are 75% done with
3816 * the current one. This is only appropriate for very low-WAL-volume systems.
3817 * High-volume systems will be OK once they've built up a sufficient set of
3818 * recycled log segments, but the startup transient is likely to include
3819 * a lot of segment creations by foreground processes, which is not so good.
3820 */
3821 static void
PreallocXlogFiles(XLogRecPtr endptr)3822 PreallocXlogFiles(XLogRecPtr endptr)
3823 {
3824 XLogSegNo _logSegNo;
3825 int lf;
3826 bool use_existent;
3827
3828 XLByteToPrevSeg(endptr, _logSegNo);
3829 if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3830 {
3831 _logSegNo++;
3832 use_existent = true;
3833 lf = XLogFileInit(_logSegNo, &use_existent, true);
3834 close(lf);
3835 if (!use_existent)
3836 CheckpointStats.ckpt_segs_added++;
3837 }
3838 }
3839
3840 /*
3841 * Throws an error if the given log segment has already been removed or
3842 * recycled. The caller should only pass a segment that it knows to have
3843 * existed while the server has been running, as this function always
3844 * succeeds if no WAL segments have been removed since startup.
3845 * 'tli' is only used in the error message.
3846 *
3847 * Note: this function guarantees to keep errno unchanged on return.
3848 * This supports callers that use this to possibly deliver a better
3849 * error message about a missing file, while still being able to throw
3850 * a normal file-access error afterwards, if this does return.
3851 */
3852 void
CheckXLogRemoved(XLogSegNo segno,TimeLineID tli)3853 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3854 {
3855 int save_errno = errno;
3856 XLogSegNo lastRemovedSegNo;
3857
3858 SpinLockAcquire(&XLogCtl->info_lck);
3859 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3860 SpinLockRelease(&XLogCtl->info_lck);
3861
3862 if (segno <= lastRemovedSegNo)
3863 {
3864 char filename[MAXFNAMELEN];
3865
3866 XLogFileName(filename, tli, segno);
3867 errno = save_errno;
3868 ereport(ERROR,
3869 (errcode_for_file_access(),
3870 errmsg("requested WAL segment %s has already been removed",
3871 filename)));
3872 }
3873 errno = save_errno;
3874 }
3875
3876 /*
3877 * Return the last WAL segment removed, or 0 if no segment has been removed
3878 * since startup.
3879 *
3880 * NB: the result can be out of date arbitrarily fast, the caller has to deal
3881 * with that.
3882 */
3883 XLogSegNo
XLogGetLastRemovedSegno(void)3884 XLogGetLastRemovedSegno(void)
3885 {
3886 XLogSegNo lastRemovedSegNo;
3887
3888 SpinLockAcquire(&XLogCtl->info_lck);
3889 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3890 SpinLockRelease(&XLogCtl->info_lck);
3891
3892 return lastRemovedSegNo;
3893 }
3894
3895 /*
3896 * Update the last removed segno pointer in shared memory, to reflect
3897 * that the given XLOG file has been removed.
3898 */
3899 static void
UpdateLastRemovedPtr(char * filename)3900 UpdateLastRemovedPtr(char *filename)
3901 {
3902 uint32 tli;
3903 XLogSegNo segno;
3904
3905 XLogFromFileName(filename, &tli, &segno);
3906
3907 SpinLockAcquire(&XLogCtl->info_lck);
3908 if (segno > XLogCtl->lastRemovedSegNo)
3909 XLogCtl->lastRemovedSegNo = segno;
3910 SpinLockRelease(&XLogCtl->info_lck);
3911 }
3912
3913 /*
3914 * Recycle or remove all log files older or equal to passed segno.
3915 *
3916 * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
3917 * redo pointer of the previous checkpoint. These are used to determine
3918 * whether we want to recycle rather than delete no-longer-wanted log files.
3919 */
3920 static void
RemoveOldXlogFiles(XLogSegNo segno,XLogRecPtr PriorRedoPtr,XLogRecPtr endptr)3921 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
3922 {
3923 DIR *xldir;
3924 struct dirent *xlde;
3925 char lastoff[MAXFNAMELEN];
3926
3927 xldir = AllocateDir(XLOGDIR);
3928 if (xldir == NULL)
3929 ereport(ERROR,
3930 (errcode_for_file_access(),
3931 errmsg("could not open write-ahead log directory \"%s\": %m",
3932 XLOGDIR)));
3933
3934 /*
3935 * Construct a filename of the last segment to be kept. The timeline ID
3936 * doesn't matter, we ignore that in the comparison. (During recovery,
3937 * ThisTimeLineID isn't set, so we can't use that.)
3938 */
3939 XLogFileName(lastoff, 0, segno);
3940
3941 elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3942 lastoff);
3943
3944 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3945 {
3946 /* Ignore files that are not XLOG segments */
3947 if (!IsXLogFileName(xlde->d_name) &&
3948 !IsPartialXLogFileName(xlde->d_name))
3949 continue;
3950
3951 /*
3952 * We ignore the timeline part of the XLOG segment identifiers in
3953 * deciding whether a segment is still needed. This ensures that we
3954 * won't prematurely remove a segment from a parent timeline. We could
3955 * probably be a little more proactive about removing segments of
3956 * non-parent timelines, but that would be a whole lot more
3957 * complicated.
3958 *
3959 * We use the alphanumeric sorting property of the filenames to decide
3960 * which ones are earlier than the lastoff segment.
3961 */
3962 if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3963 {
3964 if (XLogArchiveCheckDone(xlde->d_name))
3965 {
3966 /* Update the last removed location in shared memory first */
3967 UpdateLastRemovedPtr(xlde->d_name);
3968
3969 RemoveXlogFile(xlde->d_name, PriorRedoPtr, endptr);
3970 }
3971 }
3972 }
3973
3974 FreeDir(xldir);
3975 }
3976
3977 /*
3978 * Remove WAL files that are not part of the given timeline's history.
3979 *
3980 * This is called during recovery, whenever we switch to follow a new
3981 * timeline, and at the end of recovery when we create a new timeline. We
3982 * wouldn't otherwise care about extra WAL files lying in pg_wal, but they
3983 * might be leftover pre-allocated or recycled WAL segments on the old timeline
3984 * that we haven't used yet, and contain garbage. If we just leave them in
3985 * pg_wal, they will eventually be archived, and we can't let that happen.
3986 * Files that belong to our timeline history are valid, because we have
3987 * successfully replayed them, but from others we can't be sure.
3988 *
3989 * 'switchpoint' is the current point in WAL where we switch to new timeline,
3990 * and 'newTLI' is the new timeline we switch to.
3991 */
3992 static void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint,TimeLineID newTLI)3993 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
3994 {
3995 DIR *xldir;
3996 struct dirent *xlde;
3997 char switchseg[MAXFNAMELEN];
3998 XLogSegNo endLogSegNo;
3999
4000 XLByteToPrevSeg(switchpoint, endLogSegNo);
4001
4002 xldir = AllocateDir(XLOGDIR);
4003 if (xldir == NULL)
4004 ereport(ERROR,
4005 (errcode_for_file_access(),
4006 errmsg("could not open write-ahead log directory \"%s\": %m",
4007 XLOGDIR)));
4008
4009 /*
4010 * Construct a filename of the last segment to be kept.
4011 */
4012 XLogFileName(switchseg, newTLI, endLogSegNo);
4013
4014 elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
4015 switchseg);
4016
4017 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4018 {
4019 /* Ignore files that are not XLOG segments */
4020 if (!IsXLogFileName(xlde->d_name))
4021 continue;
4022
4023 /*
4024 * Remove files that are on a timeline older than the new one we're
4025 * switching to, but with a segment number >= the first segment on the
4026 * new timeline.
4027 */
4028 if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
4029 strcmp(xlde->d_name + 8, switchseg + 8) > 0)
4030 {
4031 /*
4032 * If the file has already been marked as .ready, however, don't
4033 * remove it yet. It should be OK to remove it - files that are
4034 * not part of our timeline history are not required for recovery
4035 * - but seems safer to let them be archived and removed later.
4036 */
4037 if (!XLogArchiveIsReady(xlde->d_name))
4038 RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
4039 }
4040 }
4041
4042 FreeDir(xldir);
4043 }
4044
4045 /*
4046 * Recycle or remove a log file that's no longer needed.
4047 *
4048 * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
4049 * redo pointer of the previous checkpoint. These are used to determine
4050 * whether we want to recycle rather than delete no-longer-wanted log files.
4051 * If PriorRedoRecPtr is not known, pass invalid, and the function will
4052 * recycle, somewhat arbitrarily, 10 future segments.
4053 */
4054 static void
RemoveXlogFile(const char * segname,XLogRecPtr PriorRedoPtr,XLogRecPtr endptr)4055 RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
4056 {
4057 char path[MAXPGPATH];
4058 #ifdef WIN32
4059 char newpath[MAXPGPATH];
4060 #endif
4061 struct stat statbuf;
4062 XLogSegNo endlogSegNo;
4063 XLogSegNo recycleSegNo;
4064
4065 /*
4066 * Initialize info about where to try to recycle to.
4067 */
4068 XLByteToSeg(endptr, endlogSegNo);
4069 if (PriorRedoPtr == InvalidXLogRecPtr)
4070 recycleSegNo = endlogSegNo + 10;
4071 else
4072 recycleSegNo = XLOGfileslop(PriorRedoPtr);
4073
4074 snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);
4075
4076 /*
4077 * Before deleting the file, see if it can be recycled as a future log
4078 * segment. Only recycle normal files, pg_standby for example can create
4079 * symbolic links pointing to a separate archive directory.
4080 */
4081 if (endlogSegNo <= recycleSegNo &&
4082 lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
4083 InstallXLogFileSegment(&endlogSegNo, path,
4084 true, recycleSegNo, true))
4085 {
4086 ereport(DEBUG2,
4087 (errmsg("recycled write-ahead log file \"%s\"",
4088 segname)));
4089 CheckpointStats.ckpt_segs_recycled++;
4090 /* Needn't recheck that slot on future iterations */
4091 endlogSegNo++;
4092 }
4093 else
4094 {
4095 /* No need for any more future segments... */
4096 int rc;
4097
4098 ereport(DEBUG2,
4099 (errmsg("removing write-ahead log file \"%s\"",
4100 segname)));
4101
4102 #ifdef WIN32
4103
4104 /*
4105 * On Windows, if another process (e.g another backend) holds the file
4106 * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
4107 * will still show up in directory listing until the last handle is
4108 * closed. To avoid confusing the lingering deleted file for a live
4109 * WAL file that needs to be archived, rename it before deleting it.
4110 *
4111 * If another process holds the file open without FILE_SHARE_DELETE
4112 * flag, rename will fail. We'll try again at the next checkpoint.
4113 */
4114 snprintf(newpath, MAXPGPATH, "%s.deleted", path);
4115 if (rename(path, newpath) != 0)
4116 {
4117 ereport(LOG,
4118 (errcode_for_file_access(),
4119 errmsg("could not rename old write-ahead log file \"%s\": %m",
4120 path)));
4121 return;
4122 }
4123 rc = durable_unlink(newpath, LOG);
4124 #else
4125 rc = durable_unlink(path, LOG);
4126 #endif
4127 if (rc != 0)
4128 {
4129 /* Message already logged by durable_unlink() */
4130 return;
4131 }
4132 CheckpointStats.ckpt_segs_removed++;
4133 }
4134
4135 XLogArchiveCleanup(segname);
4136 }
4137
4138 /*
4139 * Verify whether pg_wal and pg_wal/archive_status exist.
4140 * If the latter does not exist, recreate it.
4141 *
4142 * It is not the goal of this function to verify the contents of these
4143 * directories, but to help in cases where someone has performed a cluster
4144 * copy for PITR purposes but omitted pg_wal from the copy.
4145 *
4146 * We could also recreate pg_wal if it doesn't exist, but a deliberate
4147 * policy decision was made not to. It is fairly common for pg_wal to be
4148 * a symlink, and if that was the DBA's intent then automatically making a
4149 * plain directory would result in degraded performance with no notice.
4150 */
4151 static void
ValidateXLOGDirectoryStructure(void)4152 ValidateXLOGDirectoryStructure(void)
4153 {
4154 char path[MAXPGPATH];
4155 struct stat stat_buf;
4156
4157 /* Check for pg_wal; if it doesn't exist, error out */
4158 if (stat(XLOGDIR, &stat_buf) != 0 ||
4159 !S_ISDIR(stat_buf.st_mode))
4160 ereport(FATAL,
4161 (errmsg("required WAL directory \"%s\" does not exist",
4162 XLOGDIR)));
4163
4164 /* Check for archive_status */
4165 snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4166 if (stat(path, &stat_buf) == 0)
4167 {
4168 /* Check for weird cases where it exists but isn't a directory */
4169 if (!S_ISDIR(stat_buf.st_mode))
4170 ereport(FATAL,
4171 (errmsg("required WAL directory \"%s\" does not exist",
4172 path)));
4173 }
4174 else
4175 {
4176 ereport(LOG,
4177 (errmsg("creating missing WAL directory \"%s\"", path)));
4178 if (mkdir(path, S_IRWXU) < 0)
4179 ereport(FATAL,
4180 (errmsg("could not create missing directory \"%s\": %m",
4181 path)));
4182 }
4183 }
4184
4185 /*
4186 * Remove previous backup history files. This also retries creation of
4187 * .ready files for any backup history files for which XLogArchiveNotify
4188 * failed earlier.
4189 */
4190 static void
CleanupBackupHistory(void)4191 CleanupBackupHistory(void)
4192 {
4193 DIR *xldir;
4194 struct dirent *xlde;
4195 char path[MAXPGPATH + sizeof(XLOGDIR)];
4196
4197 xldir = AllocateDir(XLOGDIR);
4198 if (xldir == NULL)
4199 ereport(ERROR,
4200 (errcode_for_file_access(),
4201 errmsg("could not open write-ahead log directory \"%s\": %m",
4202 XLOGDIR)));
4203
4204 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4205 {
4206 if (IsBackupHistoryFileName(xlde->d_name))
4207 {
4208 if (XLogArchiveCheckDone(xlde->d_name))
4209 {
4210 elog(DEBUG2, "removing WAL backup history file \"%s\"",
4211 xlde->d_name);
4212 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4213 unlink(path);
4214 XLogArchiveCleanup(xlde->d_name);
4215 }
4216 }
4217 }
4218
4219 FreeDir(xldir);
4220 }
4221
4222 /*
4223 * Attempt to read an XLOG record.
4224 *
4225 * If RecPtr is valid, try to read a record at that position. Otherwise
4226 * try to read a record just after the last one previously read.
4227 *
4228 * If no valid record is available, returns NULL, or fails if emode is PANIC.
4229 * (emode must be either PANIC, LOG). In standby mode, retries until a valid
4230 * record is available.
4231 *
4232 * The record is copied into readRecordBuf, so that on successful return,
4233 * the returned record pointer always points there.
4234 */
4235 static XLogRecord *
ReadRecord(XLogReaderState * xlogreader,XLogRecPtr RecPtr,int emode,bool fetching_ckpt)4236 ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
4237 bool fetching_ckpt)
4238 {
4239 XLogRecord *record;
4240 XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;
4241
4242 /* Pass through parameters to XLogPageRead */
4243 private->fetching_ckpt = fetching_ckpt;
4244 private->emode = emode;
4245 private->randAccess = (RecPtr != InvalidXLogRecPtr);
4246
4247 /* This is the first attempt to read this page. */
4248 lastSourceFailed = false;
4249
4250 for (;;)
4251 {
4252 char *errormsg;
4253
4254 record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
4255 ReadRecPtr = xlogreader->ReadRecPtr;
4256 EndRecPtr = xlogreader->EndRecPtr;
4257 if (record == NULL)
4258 {
4259 /*
4260 * When not in standby mode we find that WAL ends in an incomplete
4261 * record, keep track of that record. After recovery is done,
4262 * we'll write a record to indicate downstream WAL readers that
4263 * that portion is to be ignored.
4264 */
4265 if (!StandbyMode &&
4266 !XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
4267 {
4268 abortedRecPtr = xlogreader->abortedRecPtr;
4269 missingContrecPtr = xlogreader->missingContrecPtr;
4270 }
4271
4272 if (readFile >= 0)
4273 {
4274 close(readFile);
4275 readFile = -1;
4276 }
4277
4278 /*
4279 * We only end up here without a message when XLogPageRead()
4280 * failed - in that case we already logged something. In
4281 * StandbyMode that only happens if we have been triggered, so we
4282 * shouldn't loop anymore in that case.
4283 */
4284 if (errormsg)
4285 ereport(emode_for_corrupt_record(emode,
4286 RecPtr ? RecPtr : EndRecPtr),
4287 (errmsg_internal("%s", errormsg) /* already translated */ ));
4288 }
4289
4290 /*
4291 * Check page TLI is one of the expected values.
4292 */
4293 else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
4294 {
4295 char fname[MAXFNAMELEN];
4296 XLogSegNo segno;
4297 int32 offset;
4298
4299 XLByteToSeg(xlogreader->latestPagePtr, segno);
4300 offset = xlogreader->latestPagePtr % XLogSegSize;
4301 XLogFileName(fname, xlogreader->readPageTLI, segno);
4302 ereport(emode_for_corrupt_record(emode,
4303 RecPtr ? RecPtr : EndRecPtr),
4304 (errmsg("unexpected timeline ID %u in log segment %s, offset %u",
4305 xlogreader->latestPageTLI,
4306 fname,
4307 offset)));
4308 record = NULL;
4309 }
4310
4311 if (record)
4312 {
4313 /* Great, got a record */
4314 return record;
4315 }
4316 else
4317 {
4318 /* No valid record available from this source */
4319 lastSourceFailed = true;
4320
4321 /*
4322 * If archive recovery was requested, but we were still doing
4323 * crash recovery, switch to archive recovery and retry using the
4324 * offline archive. We have now replayed all the valid WAL in
4325 * pg_wal, so we are presumably now consistent.
4326 *
4327 * We require that there's at least some valid WAL present in
4328 * pg_wal, however (!fetching_ckpt). We could recover using the
4329 * WAL from the archive, even if pg_wal is completely empty, but
4330 * we'd have no idea how far we'd have to replay to reach
4331 * consistency. So err on the safe side and give up.
4332 */
4333 if (!InArchiveRecovery && ArchiveRecoveryRequested &&
4334 !fetching_ckpt)
4335 {
4336 ereport(DEBUG1,
4337 (errmsg_internal("reached end of WAL in pg_wal, entering archive recovery")));
4338 InArchiveRecovery = true;
4339 if (StandbyModeRequested)
4340 StandbyMode = true;
4341
4342 /* initialize minRecoveryPoint to this record */
4343 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
4344 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
4345 if (ControlFile->minRecoveryPoint < EndRecPtr)
4346 {
4347 ControlFile->minRecoveryPoint = EndRecPtr;
4348 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
4349 }
4350 /* update local copy */
4351 minRecoveryPoint = ControlFile->minRecoveryPoint;
4352 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
4353
4354 /*
4355 * The startup process can update its local copy of
4356 * minRecoveryPoint from this point.
4357 */
4358 updateMinRecoveryPoint = true;
4359
4360 UpdateControlFile();
4361
4362 /*
4363 * We update SharedRecoveryState while holding the lock on
4364 * ControlFileLock so both states are consistent in shared
4365 * memory.
4366 */
4367 SpinLockAcquire(&XLogCtl->info_lck);
4368 XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
4369 SpinLockRelease(&XLogCtl->info_lck);
4370
4371 LWLockRelease(ControlFileLock);
4372
4373 CheckRecoveryConsistency();
4374
4375 /*
4376 * Before we retry, reset lastSourceFailed and currentSource
4377 * so that we will check the archive next.
4378 */
4379 lastSourceFailed = false;
4380 currentSource = 0;
4381
4382 continue;
4383 }
4384
4385 /* In standby mode, loop back to retry. Otherwise, give up. */
4386 if (StandbyMode && !CheckForStandbyTrigger())
4387 continue;
4388 else
4389 return NULL;
4390 }
4391 }
4392 }
4393
4394 /*
4395 * Scan for new timelines that might have appeared in the archive since we
4396 * started recovery.
4397 *
4398 * If there are any, the function changes recovery target TLI to the latest
4399 * one and returns 'true'.
4400 */
4401 static bool
rescanLatestTimeLine(void)4402 rescanLatestTimeLine(void)
4403 {
4404 List *newExpectedTLEs;
4405 bool found;
4406 ListCell *cell;
4407 TimeLineID newtarget;
4408 TimeLineID oldtarget = recoveryTargetTLI;
4409 TimeLineHistoryEntry *currentTle = NULL;
4410
4411 newtarget = findNewestTimeLine(recoveryTargetTLI);
4412 if (newtarget == recoveryTargetTLI)
4413 {
4414 /* No new timelines found */
4415 return false;
4416 }
4417
4418 /*
4419 * Determine the list of expected TLIs for the new TLI
4420 */
4421
4422 newExpectedTLEs = readTimeLineHistory(newtarget);
4423
4424 /*
4425 * If the current timeline is not part of the history of the new timeline,
4426 * we cannot proceed to it.
4427 */
4428 found = false;
4429 foreach(cell, newExpectedTLEs)
4430 {
4431 currentTle = (TimeLineHistoryEntry *) lfirst(cell);
4432
4433 if (currentTle->tli == recoveryTargetTLI)
4434 {
4435 found = true;
4436 break;
4437 }
4438 }
4439 if (!found)
4440 {
4441 ereport(LOG,
4442 (errmsg("new timeline %u is not a child of database system timeline %u",
4443 newtarget,
4444 ThisTimeLineID)));
4445 return false;
4446 }
4447
4448 /*
4449 * The current timeline was found in the history file, but check that the
4450 * next timeline was forked off from it *after* the current recovery
4451 * location.
4452 */
4453 if (currentTle->end < EndRecPtr)
4454 {
4455 ereport(LOG,
4456 (errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
4457 newtarget,
4458 ThisTimeLineID,
4459 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
4460 return false;
4461 }
4462
4463 /* The new timeline history seems valid. Switch target */
4464 recoveryTargetTLI = newtarget;
4465 list_free_deep(expectedTLEs);
4466 expectedTLEs = newExpectedTLEs;
4467
4468 /*
4469 * As in StartupXLOG(), try to ensure we have all the history files
4470 * between the old target and new target in pg_wal.
4471 */
4472 restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);
4473
4474 ereport(LOG,
4475 (errmsg("new target timeline is %u",
4476 recoveryTargetTLI)));
4477
4478 return true;
4479 }
4480
4481 /*
4482 * I/O routines for pg_control
4483 *
4484 * *ControlFile is a buffer in shared memory that holds an image of the
4485 * contents of pg_control. WriteControlFile() initializes pg_control
4486 * given a preloaded buffer, ReadControlFile() loads the buffer from
4487 * the pg_control file (during postmaster or standalone-backend startup),
4488 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
4489 *
4490 * For simplicity, WriteControlFile() initializes the fields of pg_control
4491 * that are related to checking backend/database compatibility, and
4492 * ReadControlFile() verifies they are correct. We could split out the
4493 * I/O and compatibility-check functions, but there seems no need currently.
4494 */
4495 static void
WriteControlFile(void)4496 WriteControlFile(void)
4497 {
4498 int fd;
4499 char buffer[PG_CONTROL_FILE_SIZE]; /* need not be aligned */
4500
4501 /*
4502 * Ensure that the size of the pg_control data structure is sane. See the
4503 * comments for these symbols in pg_control.h.
4504 */
4505 StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_MAX_SAFE_SIZE,
4506 "pg_control is too large for atomic disk writes");
4507 StaticAssertStmt(sizeof(ControlFileData) <= PG_CONTROL_FILE_SIZE,
4508 "sizeof(ControlFileData) exceeds PG_CONTROL_FILE_SIZE");
4509
4510 /*
4511 * Initialize version and compatibility-check fields
4512 */
4513 ControlFile->pg_control_version = PG_CONTROL_VERSION;
4514 ControlFile->catalog_version_no = CATALOG_VERSION_NO;
4515
4516 ControlFile->maxAlign = MAXIMUM_ALIGNOF;
4517 ControlFile->floatFormat = FLOATFORMAT_VALUE;
4518
4519 ControlFile->blcksz = BLCKSZ;
4520 ControlFile->relseg_size = RELSEG_SIZE;
4521 ControlFile->xlog_blcksz = XLOG_BLCKSZ;
4522 ControlFile->xlog_seg_size = XLOG_SEG_SIZE;
4523
4524 ControlFile->nameDataLen = NAMEDATALEN;
4525 ControlFile->indexMaxKeys = INDEX_MAX_KEYS;
4526
4527 ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
4528 ControlFile->loblksize = LOBLKSIZE;
4529
4530 ControlFile->float4ByVal = FLOAT4PASSBYVAL;
4531 ControlFile->float8ByVal = FLOAT8PASSBYVAL;
4532
4533 /* Contents are protected with a CRC */
4534 INIT_CRC32C(ControlFile->crc);
4535 COMP_CRC32C(ControlFile->crc,
4536 (char *) ControlFile,
4537 offsetof(ControlFileData, crc));
4538 FIN_CRC32C(ControlFile->crc);
4539
4540 /*
4541 * We write out PG_CONTROL_FILE_SIZE bytes into pg_control, zero-padding
4542 * the excess over sizeof(ControlFileData). This reduces the odds of
4543 * premature-EOF errors when reading pg_control. We'll still fail when we
4544 * check the contents of the file, but hopefully with a more specific
4545 * error than "couldn't read pg_control".
4546 */
4547 memset(buffer, 0, PG_CONTROL_FILE_SIZE);
4548 memcpy(buffer, ControlFile, sizeof(ControlFileData));
4549
4550 fd = BasicOpenFile(XLOG_CONTROL_FILE,
4551 O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
4552 S_IRUSR | S_IWUSR);
4553 if (fd < 0)
4554 ereport(PANIC,
4555 (errcode_for_file_access(),
4556 errmsg("could not create control file \"%s\": %m",
4557 XLOG_CONTROL_FILE)));
4558
4559 errno = 0;
4560 pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE);
4561 if (write(fd, buffer, PG_CONTROL_FILE_SIZE) != PG_CONTROL_FILE_SIZE)
4562 {
4563 /* if write didn't set errno, assume problem is no disk space */
4564 if (errno == 0)
4565 errno = ENOSPC;
4566 ereport(PANIC,
4567 (errcode_for_file_access(),
4568 errmsg("could not write to control file: %m")));
4569 }
4570 pgstat_report_wait_end();
4571
4572 pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC);
4573 if (pg_fsync(fd) != 0)
4574 ereport(PANIC,
4575 (errcode_for_file_access(),
4576 errmsg("could not fsync control file: %m")));
4577 pgstat_report_wait_end();
4578
4579 if (close(fd))
4580 ereport(PANIC,
4581 (errcode_for_file_access(),
4582 errmsg("could not close control file: %m")));
4583 }
4584
/*
 * Load pg_control into the shared ControlFile buffer and validate it:
 * format version first, then CRC, then the full set of compile-time
 * compatibility checks.  Unreadable or corrupt files are PANIC;
 * incompatibilities are FATAL with a hint about initdb/recompile.
 */
static void
ReadControlFile(void)
{
	pg_crc32c	crc;
	int			fd;
	int			r;

	/*
	 * Read data...
	 */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_READ);
	r = read(fd, ControlFile, sizeof(ControlFileData));
	if (r != sizeof(ControlFileData))
	{
		/* distinguish a read error from a short read */
		if (r < 0)
			ereport(PANIC,
					(errcode_for_file_access(),
					 errmsg("could not read from control file: %m")));
		else
			ereport(PANIC,
					(errmsg("could not read from control file: read %d bytes, expected %d", r, (int) sizeof(ControlFileData))));
	}
	pgstat_report_wait_end();

	close(fd);

	/*
	 * Check for expected pg_control format version.  If this is wrong, the
	 * CRC check will likely fail because we'll be checking the wrong number
	 * of bytes.  Complaining about wrong version will probably be more
	 * enlightening than complaining about wrong CRC.
	 */

	/*
	 * A version whose low-order 16 bits are zero but whose high-order bits
	 * are not suggests the file was written with the opposite byte order,
	 * so report that case specially.
	 */
	if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
						   " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
						   ControlFile->pg_control_version, ControlFile->pg_control_version,
						   PG_CONTROL_VERSION, PG_CONTROL_VERSION),
				 errhint("This could be a problem of mismatched byte ordering.  It looks like you need to initdb.")));

	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
						   " but the server was compiled with PG_CONTROL_VERSION %d.",
						   ControlFile->pg_control_version, PG_CONTROL_VERSION),
				 errhint("It looks like you need to initdb.")));

	/* Now check the CRC. */
	INIT_CRC32C(crc);
	COMP_CRC32C(crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(crc);

	if (!EQ_CRC32C(crc, ControlFile->crc))
		ereport(FATAL,
				(errmsg("incorrect checksum in control file")));

	/*
	 * Do compatibility checking immediately.  If the database isn't
	 * compatible with the backend executable, we want to abort before we can
	 * possibly do any damage.
	 */
	if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
						   " but the server was compiled with CATALOG_VERSION_NO %d.",
						   ControlFile->catalog_version_no, CATALOG_VERSION_NO),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with MAXALIGN %d,"
						   " but the server was compiled with MAXALIGN %d.",
						   ControlFile->maxAlign, MAXIMUM_ALIGNOF),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
				 errhint("It looks like you need to initdb.")));
	if (ControlFile->blcksz != BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with BLCKSZ %d,"
						   " but the server was compiled with BLCKSZ %d.",
						   ControlFile->blcksz, BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->relseg_size != RELSEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
						   " but the server was compiled with RELSEG_SIZE %d.",
						   ControlFile->relseg_size, RELSEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
						   " but the server was compiled with XLOG_BLCKSZ %d.",
						   ControlFile->xlog_blcksz, XLOG_BLCKSZ),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
						   " but the server was compiled with XLOG_SEG_SIZE %d.",
						   ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->nameDataLen != NAMEDATALEN)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
						   " but the server was compiled with NAMEDATALEN %d.",
						   ControlFile->nameDataLen, NAMEDATALEN),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
						   " but the server was compiled with INDEX_MAX_KEYS %d.",
						   ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
						   " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
						   ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
				 errhint("It looks like you need to recompile or initdb.")));
	if (ControlFile->loblksize != LOBLKSIZE)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with LOBLKSIZE %d,"
						   " but the server was compiled with LOBLKSIZE %d.",
						   ControlFile->loblksize, (int) LOBLKSIZE),
				 errhint("It looks like you need to recompile or initdb.")));

#ifdef USE_FLOAT4_BYVAL
	if (ControlFile->float4ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
						   " but the server was compiled with USE_FLOAT4_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float4ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
						   " but the server was compiled without USE_FLOAT4_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

#ifdef USE_FLOAT8_BYVAL
	if (ControlFile->float8ByVal != true)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
						   " but the server was compiled with USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#else
	if (ControlFile->float8ByVal != false)
		ereport(FATAL,
				(errmsg("database files are incompatible with server"),
				 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
						   " but the server was compiled without USE_FLOAT8_BYVAL."),
				 errhint("It looks like you need to recompile or initdb.")));
#endif

	/* Make the initdb settings visible as GUC variables, too */
	SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
					PGC_INTERNAL, PGC_S_OVERRIDE);
}
4772
/*
 * Rewrite pg_control in place from the shared ControlFile image, after
 * recomputing its CRC.  Unlike WriteControlFile(), this writes only
 * sizeof(ControlFileData) bytes over the start of the existing file, and
 * fsyncs before closing.  Any failure is PANIC.
 */
void
UpdateControlFile(void)
{
	int			fd;

	/* refresh the CRC over everything preceding the crc field */
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(ControlFile->crc);

	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not open control file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	errno = 0;
	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_WRITE_UPDATE);
	if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write to control file: %m")));
	}
	pgstat_report_wait_end();

	pgstat_report_wait_start(WAIT_EVENT_CONTROL_FILE_SYNC_UPDATE);
	if (pg_fsync(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync control file: %m")));
	pgstat_report_wait_end();

	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close control file: %m")));
}
4818
4819 /*
4820 * Returns the unique system identifier from control file.
4821 */
4822 uint64
GetSystemIdentifier(void)4823 GetSystemIdentifier(void)
4824 {
4825 Assert(ControlFile != NULL);
4826 return ControlFile->system_identifier;
4827 }
4828
4829 /*
4830 * Returns the random nonce from control file.
4831 */
4832 char *
GetMockAuthenticationNonce(void)4833 GetMockAuthenticationNonce(void)
4834 {
4835 Assert(ControlFile != NULL);
4836 return ControlFile->mock_authentication_nonce;
4837 }
4838
4839 /*
4840 * Are checksums enabled for data pages?
4841 */
4842 bool
DataChecksumsEnabled(void)4843 DataChecksumsEnabled(void)
4844 {
4845 Assert(ControlFile != NULL);
4846 return (ControlFile->data_checksum_version > 0);
4847 }
4848
4849 /*
4850 * Returns a fake LSN for unlogged relations.
4851 *
4852 * Each call generates an LSN that is greater than any previous value
4853 * returned. The current counter value is saved and restored across clean
4854 * shutdowns, but like unlogged relations, does not survive a crash. This can
4855 * be used in lieu of real LSN values returned by XLogInsert, if you need an
4856 * LSN-like increasing sequence of numbers without writing any WAL.
4857 */
4858 XLogRecPtr
GetFakeLSNForUnloggedRel(void)4859 GetFakeLSNForUnloggedRel(void)
4860 {
4861 XLogRecPtr nextUnloggedLSN;
4862
4863 /* increment the unloggedLSN counter, need SpinLock */
4864 SpinLockAcquire(&XLogCtl->ulsn_lck);
4865 nextUnloggedLSN = XLogCtl->unloggedLSN++;
4866 SpinLockRelease(&XLogCtl->ulsn_lck);
4867
4868 return nextUnloggedLSN;
4869 }
4870
4871 /*
4872 * Auto-tune the number of XLOG buffers.
4873 *
4874 * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4875 * a maximum of one XLOG segment (there is little reason to think that more
4876 * is helpful, at least so long as we force an fsync when switching log files)
4877 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4878 * 9.1, when auto-tuning was added).
4879 *
4880 * This should not be called until NBuffers has received its final value.
4881 */
4882 static int
XLOGChooseNumBuffers(void)4883 XLOGChooseNumBuffers(void)
4884 {
4885 int xbuffers;
4886
4887 xbuffers = NBuffers / 32;
4888 if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4889 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4890 if (xbuffers < 8)
4891 xbuffers = 8;
4892 return xbuffers;
4893 }
4894
4895 /*
4896 * GUC check_hook for wal_buffers
4897 */
4898 bool
check_wal_buffers(int * newval,void ** extra,GucSource source)4899 check_wal_buffers(int *newval, void **extra, GucSource source)
4900 {
4901 /*
4902 * -1 indicates a request for auto-tune.
4903 */
4904 if (*newval == -1)
4905 {
4906 /*
4907 * If we haven't yet changed the boot_val default of -1, just let it
4908 * be. We'll fix it when XLOGShmemSize is called.
4909 */
4910 if (XLOGbuffers == -1)
4911 return true;
4912
4913 /* Otherwise, substitute the auto-tune value */
4914 *newval = XLOGChooseNumBuffers();
4915 }
4916
4917 /*
4918 * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
4919 * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4920 * the case, we just silently treat such values as a request for the
4921 * minimum. (We could throw an error instead, but that doesn't seem very
4922 * helpful.)
4923 */
4924 if (*newval < 4)
4925 *newval = 4;
4926
4927 return true;
4928 }
4929
4930 /*
4931 * Initialization of shared memory for XLOG
4932 */
4933 Size
XLOGShmemSize(void)4934 XLOGShmemSize(void)
4935 {
4936 Size size;
4937
4938 /*
4939 * If the value of wal_buffers is -1, use the preferred auto-tune value.
4940 * This isn't an amazingly clean place to do this, but we must wait till
4941 * NBuffers has received its final value, and must do it before using the
4942 * value of XLOGbuffers to do anything important.
4943 */
4944 if (XLOGbuffers == -1)
4945 {
4946 char buf[32];
4947
4948 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4949 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4950 }
4951 Assert(XLOGbuffers > 0);
4952
4953 /* XLogCtl */
4954 size = sizeof(XLogCtlData);
4955
4956 /* WAL insertion locks, plus alignment */
4957 size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4958 /* xlblocks array */
4959 size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4960 /* extra alignment padding for XLOG I/O buffers */
4961 size = add_size(size, XLOG_BLCKSZ);
4962 /* and the buffers themselves */
4963 size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4964
4965 /*
4966 * Note: we don't count ControlFileData, it comes out of the "slop factor"
4967 * added by CreateSharedMemoryAndSemaphores. This lets us use this
4968 * routine again below to compute the actual allocation size.
4969 */
4970
4971 return size;
4972 }
4973
/*
 * Allocate and initialize the XLOG-related shared memory structures:
 * the ControlFile buffer, XLogCtl, the xlblocks array, the WAL insertion
 * lock array, and the aligned WAL page buffers.  If the structures were
 * already created (e.g. postmaster restart of a backend under
 * EXEC_BACKEND), only the process-local pointers are set up.
 */
void
XLOGShmemInit(void)
{
	bool		foundCFile,
				foundXLog;
	char	   *allocptr;
	int			i;

#ifdef WAL_DEBUG

	/*
	 * Create a memory context for WAL debugging that's exempt from the normal
	 * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
	 * an allocation fails, but wal_debug is not for production use anyway.
	 */
	if (walDebugCxt == NULL)
	{
		walDebugCxt = AllocSetContextCreate(TopMemoryContext,
											"WAL Debug",
											ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(walDebugCxt, true);
	}
#endif

	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);

	if (foundCFile || foundXLog)
	{
		/* both should be present or neither */
		Assert(foundCFile && foundXLog);

		/* Initialize local copy of WALInsertLocks and register the tranche */
		WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
		LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
							  "wal_insert");
		return;
	}
	memset(XLogCtl, 0, sizeof(XLogCtlData));

	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
	 */
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;


	/* WAL insertion locks. Ensure they're aligned to the full padded size */
	allocptr += sizeof(WALInsertLockPadded) -
		((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
	WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
		(WALInsertLockPadded *) allocptr;
	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;

	LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, "wal_insert");
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
		WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
		WALInsertLocks[i].l.lastImportantAt = InvalidXLogRecPtr;
	}

	/*
	 * Align the start of the page buffers to a full xlog block size boundary.
	 * This simplifies some calculations in XLOG insertion. It is also
	 * required for O_DIRECT.
	 */
	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
	XLogCtl->pages = allocptr;
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

	/*
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
	XLogCtl->SharedHotStandbyActive = false;
	XLogCtl->WalWriterSleeping = false;

	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
	SpinLockInit(&XLogCtl->info_lck);
	SpinLockInit(&XLogCtl->ulsn_lck);
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);

	/*
	 * If we are not in bootstrap mode, pg_control should already exist. Read
	 * and validate it immediately (see comments in ReadControlFile() for the
	 * reasons why).
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
}
5073
5074 /*
5075 * This func must be called ONCE on system install. It creates pg_control
5076 * and the initial XLOG segment.
5077 */
5078 void
BootStrapXLOG(void)5079 BootStrapXLOG(void)
5080 {
5081 CheckPoint checkPoint;
5082 char *buffer;
5083 XLogPageHeader page;
5084 XLogLongPageHeader longpage;
5085 XLogRecord *record;
5086 char *recptr;
5087 bool use_existent;
5088 uint64 sysidentifier;
5089 char mock_auth_nonce[MOCK_AUTH_NONCE_LEN];
5090 struct timeval tv;
5091 pg_crc32c crc;
5092
5093 /*
5094 * Select a hopefully-unique system identifier code for this installation.
5095 * We use the result of gettimeofday(), including the fractional seconds
5096 * field, as being about as unique as we can easily get. (Think not to
5097 * use random(), since it hasn't been seeded and there's no portable way
5098 * to seed it other than the system clock value...) The upper half of the
5099 * uint64 value is just the tv_sec part, while the lower half contains the
5100 * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
5101 * PID for a little extra uniqueness. A person knowing this encoding can
5102 * determine the initialization time of the installation, which could
5103 * perhaps be useful sometimes.
5104 */
5105 gettimeofday(&tv, NULL);
5106 sysidentifier = ((uint64) tv.tv_sec) << 32;
5107 sysidentifier |= ((uint64) tv.tv_usec) << 12;
5108 sysidentifier |= getpid() & 0xFFF;
5109
5110 /*
5111 * Generate a random nonce. This is used for authentication requests that
5112 * will fail because the user does not exist. The nonce is used to create
5113 * a genuine-looking password challenge for the non-existent user, in lieu
5114 * of an actual stored password.
5115 */
5116 if (!pg_backend_random(mock_auth_nonce, MOCK_AUTH_NONCE_LEN))
5117 ereport(PANIC,
5118 (errcode(ERRCODE_INTERNAL_ERROR),
5119 errmsg("could not generate secret authorization token")));
5120
5121 /* First timeline ID is always 1 */
5122 ThisTimeLineID = 1;
5123
5124 /* page buffer must be aligned suitably for O_DIRECT */
5125 buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
5126 page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
5127 memset(page, 0, XLOG_BLCKSZ);
5128
5129 /*
5130 * Set up information for the initial checkpoint record
5131 *
5132 * The initial checkpoint record is written to the beginning of the WAL
5133 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
5134 * used, so that we can use 0/0 to mean "before any valid WAL segment".
5135 */
5136 checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
5137 checkPoint.ThisTimeLineID = ThisTimeLineID;
5138 checkPoint.PrevTimeLineID = ThisTimeLineID;
5139 checkPoint.fullPageWrites = fullPageWrites;
5140 checkPoint.nextXidEpoch = 0;
5141 checkPoint.nextXid = FirstNormalTransactionId;
5142 checkPoint.nextOid = FirstBootstrapObjectId;
5143 checkPoint.nextMulti = FirstMultiXactId;
5144 checkPoint.nextMultiOffset = 0;
5145 checkPoint.oldestXid = FirstNormalTransactionId;
5146 checkPoint.oldestXidDB = TemplateDbOid;
5147 checkPoint.oldestMulti = FirstMultiXactId;
5148 checkPoint.oldestMultiDB = TemplateDbOid;
5149 checkPoint.oldestCommitTsXid = InvalidTransactionId;
5150 checkPoint.newestCommitTsXid = InvalidTransactionId;
5151 checkPoint.time = (pg_time_t) time(NULL);
5152 checkPoint.oldestActiveXid = InvalidTransactionId;
5153
5154 ShmemVariableCache->nextXid = checkPoint.nextXid;
5155 ShmemVariableCache->nextOid = checkPoint.nextOid;
5156 ShmemVariableCache->oidCount = 0;
5157 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
5158 AdvanceOldestClogXid(checkPoint.oldestXid);
5159 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
5160 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
5161 SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
5162
5163 /* Set up the XLOG page header */
5164 page->xlp_magic = XLOG_PAGE_MAGIC;
5165 page->xlp_info = XLP_LONG_HEADER;
5166 page->xlp_tli = ThisTimeLineID;
5167 page->xlp_pageaddr = XLogSegSize;
5168 longpage = (XLogLongPageHeader) page;
5169 longpage->xlp_sysid = sysidentifier;
5170 longpage->xlp_seg_size = XLogSegSize;
5171 longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
5172
5173 /* Insert the initial checkpoint record */
5174 recptr = ((char *) page + SizeOfXLogLongPHD);
5175 record = (XLogRecord *) recptr;
5176 record->xl_prev = 0;
5177 record->xl_xid = InvalidTransactionId;
5178 record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5179 record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5180 record->xl_rmid = RM_XLOG_ID;
5181 recptr += SizeOfXLogRecord;
5182 /* fill the XLogRecordDataHeaderShort struct */
5183 *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
5184 *(recptr++) = sizeof(checkPoint);
5185 memcpy(recptr, &checkPoint, sizeof(checkPoint));
5186 recptr += sizeof(checkPoint);
5187 Assert(recptr - (char *) record == record->xl_tot_len);
5188
5189 INIT_CRC32C(crc);
5190 COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5191 COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5192 FIN_CRC32C(crc);
5193 record->xl_crc = crc;
5194
5195 /* Create first XLOG segment file */
5196 use_existent = false;
5197 openLogFile = XLogFileInit(1, &use_existent, false);
5198
5199 /* Write the first page with the initial record */
5200 errno = 0;
5201 pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_WRITE);
5202 if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5203 {
5204 /* if write didn't set errno, assume problem is no disk space */
5205 if (errno == 0)
5206 errno = ENOSPC;
5207 ereport(PANIC,
5208 (errcode_for_file_access(),
5209 errmsg("could not write bootstrap write-ahead log file: %m")));
5210 }
5211 pgstat_report_wait_end();
5212
5213 pgstat_report_wait_start(WAIT_EVENT_WAL_BOOTSTRAP_SYNC);
5214 if (pg_fsync(openLogFile) != 0)
5215 ereport(PANIC,
5216 (errcode_for_file_access(),
5217 errmsg("could not fsync bootstrap write-ahead log file: %m")));
5218 pgstat_report_wait_end();
5219
5220 if (close(openLogFile))
5221 ereport(PANIC,
5222 (errcode_for_file_access(),
5223 errmsg("could not close bootstrap write-ahead log file: %m")));
5224
5225 openLogFile = -1;
5226
5227 /* Now create pg_control */
5228
5229 memset(ControlFile, 0, sizeof(ControlFileData));
5230 /* Initialize pg_control status fields */
5231 ControlFile->system_identifier = sysidentifier;
5232 memcpy(ControlFile->mock_authentication_nonce, mock_auth_nonce, MOCK_AUTH_NONCE_LEN);
5233 ControlFile->state = DB_SHUTDOWNED;
5234 ControlFile->time = checkPoint.time;
5235 ControlFile->checkPoint = checkPoint.redo;
5236 ControlFile->checkPointCopy = checkPoint;
5237 ControlFile->unloggedLSN = 1;
5238
5239 /* Set important parameter values for use when replaying WAL */
5240 ControlFile->MaxConnections = MaxConnections;
5241 ControlFile->max_worker_processes = max_worker_processes;
5242 ControlFile->max_prepared_xacts = max_prepared_xacts;
5243 ControlFile->max_locks_per_xact = max_locks_per_xact;
5244 ControlFile->wal_level = wal_level;
5245 ControlFile->wal_log_hints = wal_log_hints;
5246 ControlFile->track_commit_timestamp = track_commit_timestamp;
5247 ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5248
5249 /* some additional ControlFile fields are set in WriteControlFile() */
5250
5251 WriteControlFile();
5252
5253 /* Bootstrap the commit log, too */
5254 BootStrapCLOG();
5255 BootStrapCommitTs();
5256 BootStrapSUBTRANS();
5257 BootStrapMultiXact();
5258
5259 pfree(buffer);
5260 }
5261
5262 static char *
str_time(pg_time_t tnow)5263 str_time(pg_time_t tnow)
5264 {
5265 static char buf[128];
5266
5267 pg_strftime(buf, sizeof(buf),
5268 "%Y-%m-%d %H:%M:%S %Z",
5269 pg_localtime(&tnow, log_timezone));
5270
5271 return buf;
5272 }
5273
5274 /*
5275 * See if there is a recovery command file (recovery.conf), and if so
5276 * read in parameters for archive recovery and XLOG streaming.
5277 *
5278 * The file is parsed using the main configuration parser.
5279 */
5280 static void
readRecoveryCommandFile(void)5281 readRecoveryCommandFile(void)
5282 {
5283 FILE *fd;
5284 TimeLineID rtli = 0;
5285 bool rtliGiven = false;
5286 ConfigVariable *item,
5287 *head = NULL,
5288 *tail = NULL;
5289 bool recoveryTargetActionSet = false;
5290
5291
5292 fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5293 if (fd == NULL)
5294 {
5295 if (errno == ENOENT)
5296 return; /* not there, so no archive recovery */
5297 ereport(FATAL,
5298 (errcode_for_file_access(),
5299 errmsg("could not open recovery command file \"%s\": %m",
5300 RECOVERY_COMMAND_FILE)));
5301 }
5302
5303 /*
5304 * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5305 * no need to check the return value.
5306 */
5307 (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5308
5309 FreeFile(fd);
5310
5311 for (item = head; item; item = item->next)
5312 {
5313 if (strcmp(item->name, "restore_command") == 0)
5314 {
5315 recoveryRestoreCommand = pstrdup(item->value);
5316 ereport(DEBUG2,
5317 (errmsg_internal("restore_command = '%s'",
5318 recoveryRestoreCommand)));
5319 }
5320 else if (strcmp(item->name, "recovery_end_command") == 0)
5321 {
5322 recoveryEndCommand = pstrdup(item->value);
5323 ereport(DEBUG2,
5324 (errmsg_internal("recovery_end_command = '%s'",
5325 recoveryEndCommand)));
5326 }
5327 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5328 {
5329 archiveCleanupCommand = pstrdup(item->value);
5330 ereport(DEBUG2,
5331 (errmsg_internal("archive_cleanup_command = '%s'",
5332 archiveCleanupCommand)));
5333 }
5334 else if (strcmp(item->name, "recovery_target_action") == 0)
5335 {
5336 if (strcmp(item->value, "pause") == 0)
5337 recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
5338 else if (strcmp(item->value, "promote") == 0)
5339 recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
5340 else if (strcmp(item->value, "shutdown") == 0)
5341 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5342 else
5343 ereport(ERROR,
5344 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5345 errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5346 "recovery_target_action",
5347 item->value),
5348 errhint("Valid values are \"pause\", \"promote\", and \"shutdown\".")));
5349
5350 ereport(DEBUG2,
5351 (errmsg_internal("recovery_target_action = '%s'",
5352 item->value)));
5353
5354 recoveryTargetActionSet = true;
5355 }
5356 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5357 {
5358 rtliGiven = true;
5359 if (strcmp(item->value, "latest") == 0)
5360 rtli = 0;
5361 else
5362 {
5363 errno = 0;
5364 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5365 if (errno == EINVAL || errno == ERANGE)
5366 ereport(FATAL,
5367 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5368 errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5369 item->value)));
5370 }
5371 if (rtli)
5372 ereport(DEBUG2,
5373 (errmsg_internal("recovery_target_timeline = %u", rtli)));
5374 else
5375 ereport(DEBUG2,
5376 (errmsg_internal("recovery_target_timeline = latest")));
5377 }
5378 else if (strcmp(item->name, "recovery_target_xid") == 0)
5379 {
5380 errno = 0;
5381 recoveryTargetXid = (TransactionId) pg_strtouint64(item->value, NULL, 0);
5382 if (errno == EINVAL || errno == ERANGE)
5383 ereport(FATAL,
5384 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5385 errmsg("recovery_target_xid is not a valid number: \"%s\"",
5386 item->value)));
5387 ereport(DEBUG2,
5388 (errmsg_internal("recovery_target_xid = %u",
5389 recoveryTargetXid)));
5390 recoveryTarget = RECOVERY_TARGET_XID;
5391 }
5392 else if (strcmp(item->name, "recovery_target_time") == 0)
5393 {
5394 recoveryTarget = RECOVERY_TARGET_TIME;
5395
5396 /*
5397 * Convert the time string given by the user to TimestampTz form.
5398 */
5399 recoveryTargetTime =
5400 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5401 CStringGetDatum(item->value),
5402 ObjectIdGetDatum(InvalidOid),
5403 Int32GetDatum(-1)));
5404 ereport(DEBUG2,
5405 (errmsg_internal("recovery_target_time = '%s'",
5406 timestamptz_to_str(recoveryTargetTime))));
5407 }
5408 else if (strcmp(item->name, "recovery_target_name") == 0)
5409 {
5410 recoveryTarget = RECOVERY_TARGET_NAME;
5411
5412 recoveryTargetName = pstrdup(item->value);
5413 if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5414 ereport(FATAL,
5415 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5416 errmsg("recovery_target_name is too long (maximum %d characters)",
5417 MAXFNAMELEN - 1)));
5418
5419 ereport(DEBUG2,
5420 (errmsg_internal("recovery_target_name = '%s'",
5421 recoveryTargetName)));
5422 }
5423 else if (strcmp(item->name, "recovery_target_lsn") == 0)
5424 {
5425 recoveryTarget = RECOVERY_TARGET_LSN;
5426
5427 /*
5428 * Convert the LSN string given by the user to XLogRecPtr form.
5429 */
5430 recoveryTargetLSN =
5431 DatumGetLSN(DirectFunctionCall3(pg_lsn_in,
5432 CStringGetDatum(item->value),
5433 ObjectIdGetDatum(InvalidOid),
5434 Int32GetDatum(-1)));
5435 ereport(DEBUG2,
5436 (errmsg_internal("recovery_target_lsn = '%X/%X'",
5437 (uint32) (recoveryTargetLSN >> 32),
5438 (uint32) recoveryTargetLSN)));
5439 }
5440 else if (strcmp(item->name, "recovery_target") == 0)
5441 {
5442 if (strcmp(item->value, "immediate") == 0)
5443 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5444 else
5445 ereport(ERROR,
5446 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5447 errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5448 "recovery_target",
5449 item->value),
5450 errhint("The only allowed value is \"immediate\".")));
5451 ereport(DEBUG2,
5452 (errmsg_internal("recovery_target = '%s'",
5453 item->value)));
5454 }
5455 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5456 {
5457 /*
5458 * does nothing if a recovery_target is not also set
5459 */
5460 if (!parse_bool(item->value, &recoveryTargetInclusive))
5461 ereport(ERROR,
5462 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5463 errmsg("parameter \"%s\" requires a Boolean value",
5464 "recovery_target_inclusive")));
5465 ereport(DEBUG2,
5466 (errmsg_internal("recovery_target_inclusive = %s",
5467 item->value)));
5468 }
5469 else if (strcmp(item->name, "standby_mode") == 0)
5470 {
5471 if (!parse_bool(item->value, &StandbyModeRequested))
5472 ereport(ERROR,
5473 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5474 errmsg("parameter \"%s\" requires a Boolean value",
5475 "standby_mode")));
5476 ereport(DEBUG2,
5477 (errmsg_internal("standby_mode = '%s'", item->value)));
5478 }
5479 else if (strcmp(item->name, "primary_conninfo") == 0)
5480 {
5481 PrimaryConnInfo = pstrdup(item->value);
5482 ereport(DEBUG2,
5483 (errmsg_internal("primary_conninfo = '%s'",
5484 PrimaryConnInfo)));
5485 }
5486 else if (strcmp(item->name, "primary_slot_name") == 0)
5487 {
5488 ReplicationSlotValidateName(item->value, ERROR);
5489 PrimarySlotName = pstrdup(item->value);
5490 ereport(DEBUG2,
5491 (errmsg_internal("primary_slot_name = '%s'",
5492 PrimarySlotName)));
5493 }
5494 else if (strcmp(item->name, "trigger_file") == 0)
5495 {
5496 TriggerFile = pstrdup(item->value);
5497 ereport(DEBUG2,
5498 (errmsg_internal("trigger_file = '%s'",
5499 TriggerFile)));
5500 }
5501 else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
5502 {
5503 const char *hintmsg;
5504
5505 if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
5506 &hintmsg))
5507 ereport(ERROR,
5508 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5509 errmsg("parameter \"%s\" requires a temporal value",
5510 "recovery_min_apply_delay"),
5511 hintmsg ? errhint("%s", _(hintmsg)) : 0));
5512 ereport(DEBUG2,
5513 (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
5514 }
5515 else
5516 ereport(FATAL,
5517 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5518 errmsg("unrecognized recovery parameter \"%s\"",
5519 item->name)));
5520 }
5521
5522 /*
5523 * Check for compulsory parameters
5524 */
5525 if (StandbyModeRequested)
5526 {
5527 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5528 ereport(WARNING,
5529 (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5530 RECOVERY_COMMAND_FILE),
5531 errhint("The database server will regularly poll the pg_wal subdirectory to check for files placed there.")));
5532 }
5533 else
5534 {
5535 if (recoveryRestoreCommand == NULL)
5536 ereport(FATAL,
5537 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5538 errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5539 RECOVERY_COMMAND_FILE)));
5540 }
5541
	/*
	 * Override any inconsistent requests. Note that this is a change of
	 * behaviour in 9.5; prior to this we simply ignored a request to pause if
	 * hot_standby = off, which was surprising behaviour.
	 */
5547 if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5548 recoveryTargetActionSet &&
5549 !EnableHotStandby)
5550 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5551
5552 /*
5553 * We don't support standby_mode in standalone backends; that requires
5554 * other processes such as the WAL receiver to be alive.
5555 */
5556 if (StandbyModeRequested && !IsUnderPostmaster)
5557 ereport(FATAL,
5558 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5559 errmsg("standby mode is not supported by single-user servers")));
5560
5561 /* Enable fetching from archive recovery area */
5562 ArchiveRecoveryRequested = true;
5563
5564 /*
5565 * If user specified recovery_target_timeline, validate it or compute the
5566 * "latest" value. We can't do this until after we've gotten the restore
5567 * command and set InArchiveRecovery, because we need to fetch timeline
5568 * history files from the archive.
5569 */
5570 if (rtliGiven)
5571 {
5572 if (rtli)
5573 {
5574 /* Timeline 1 does not have a history file, all else should */
5575 if (rtli != 1 && !existsTimeLineHistory(rtli))
5576 ereport(FATAL,
5577 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5578 errmsg("recovery target timeline %u does not exist",
5579 rtli)));
5580 recoveryTargetTLI = rtli;
5581 recoveryTargetIsLatest = false;
5582 }
5583 else
5584 {
5585 /* We start the "latest" search from pg_control's timeline */
5586 recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5587 recoveryTargetIsLatest = true;
5588 }
5589 }
5590
5591 FreeConfigVariables(head);
5592 }
5593
/*
 * Exit archive-recovery state
 *
 * endTLI/endOfLog identify the timeline and WAL location at which recovery
 * ended; they determine whether the last old-timeline segment must be
 * copied to seed the first segment of the new timeline.  The new timeline
 * itself is ThisTimeLineID (asserted to differ from endTLI).
 */
static void
exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
{
	char		xlogfname[MAXFNAMELEN];
	XLogSegNo	endLogSegNo;
	XLogSegNo	startLogSegNo;

	/* we always switch to a new timeline after archive recovery */
	Assert(endTLI != ThisTimeLineID);

	/*
	 * We are no longer in archive recovery state.
	 */
	InArchiveRecovery = false;

	/*
	 * Update min recovery point one last time.
	 */
	UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

	/*
	 * If the ending log segment is still open, close it (to avoid problems on
	 * Windows with trying to rename or delete an open file).
	 */
	if (readFile >= 0)
	{
		close(readFile);
		readFile = -1;
	}

	/*
	 * Calculate the last segment on the old timeline, and the first segment
	 * on the new timeline. If the switch happens in the middle of a segment,
	 * they are the same, but if the switch happens exactly at a segment
	 * boundary, startLogSegNo will be endLogSegNo + 1.
	 */
	XLByteToPrevSeg(endOfLog, endLogSegNo);
	XLByteToSeg(endOfLog, startLogSegNo);

	/*
	 * Initialize the starting WAL segment for the new timeline. If the switch
	 * happens in the middle of a segment, copy data from the last WAL segment
	 * of the old timeline up to the switch point, to the starting WAL segment
	 * on the new timeline.
	 */
	if (endLogSegNo == startLogSegNo)
	{
		/*
		 * Make a copy of the file on the new timeline.
		 *
		 * Writing WAL isn't allowed yet, so there are no locking
		 * considerations. But we should be just as tense as XLogFileInit to
		 * avoid emplacing a bogus file.
		 */
		XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
					 endOfLog % XLOG_SEG_SIZE);
	}
	else
	{
		/*
		 * The switch happened at a segment boundary, so just create the next
		 * segment on the new timeline.
		 */
		bool		use_existent = true;
		int			fd;

		fd = XLogFileInit(startLogSegNo, &use_existent, true);

		if (close(fd))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not close log file %s: %m",
							XLogFileNameP(ThisTimeLineID, startLogSegNo))));
	}

	/*
	 * Let's just make real sure there are not .ready or .done flags posted
	 * for the new segment.
	 */
	XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo);
	XLogArchiveCleanup(xlogfname);

	/*
	 * Rename the config file out of the way, so that we don't accidentally
	 * re-enter archive recovery mode in a subsequent crash.
	 */
	unlink(RECOVERY_COMMAND_DONE);
	durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);

	ereport(LOG,
			(errmsg("archive recovery complete")));
}
5689
5690 /*
5691 * Extract timestamp from WAL record.
5692 *
5693 * If the record contains a timestamp, returns true, and saves the timestamp
5694 * in *recordXtime. If the record type has no timestamp, returns false.
5695 * Currently, only transaction commit/abort records and restore points contain
5696 * timestamps.
5697 */
5698 static bool
getRecordTimestamp(XLogReaderState * record,TimestampTz * recordXtime)5699 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5700 {
5701 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5702 uint8 xact_info = info & XLOG_XACT_OPMASK;
5703 uint8 rmid = XLogRecGetRmid(record);
5704
5705 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5706 {
5707 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5708 return true;
5709 }
5710 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5711 xact_info == XLOG_XACT_COMMIT_PREPARED))
5712 {
5713 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5714 return true;
5715 }
5716 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5717 xact_info == XLOG_XACT_ABORT_PREPARED))
5718 {
5719 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5720 return true;
5721 }
5722 return false;
5723 }
5724
/*
 * For point-in-time recovery, this function decides whether we want to
 * stop applying the XLOG before the current record.
 *
 * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
 * information is saved in recoveryStopXid et al for use in annotating the
 * new timeline's history file.
 *
 * Apart from the "immediate" target and an exclusive LSN target, we only
 * ever stop before transaction COMMIT or ABORT records; other record
 * types are always replayed.  (Inclusive targets are handled after replay,
 * in recoveryStopsAfter.)
 */
static bool
recoveryStopsBefore(XLogReaderState *record)
{
	bool		stopsHere = false;
	uint8		xact_info;
	bool		isCommit;
	TimestampTz recordXtime = 0;
	TransactionId recordXid;

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg("recovery stopping after reaching consistency")));

		/* no specific xid/time/name applies to the "immediate" target */
		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		return true;
	}

	/* Check if target LSN has been reached (exclusive case only) */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		!recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = false;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg("recovery stopping before WAL location (LSN) \"%X/%X\"",
						(uint32) (recoveryStopLSN >> 32),
						(uint32) recoveryStopLSN)));
		return true;
	}

	/* Otherwise we only consider stopping before COMMIT or ABORT records. */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	if (xact_info == XLOG_XACT_COMMIT)
	{
		isCommit = true;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
	{
		xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
		xl_xact_parsed_commit parsed;

		isCommit = true;
		/* must parse the record to get the XID of the prepared transaction */
		ParseCommitRecord(XLogRecGetInfo(record),
						  xlrec,
						  &parsed);
		recordXid = parsed.twophase_xid;
	}
	else if (xact_info == XLOG_XACT_ABORT)
	{
		isCommit = false;
		recordXid = XLogRecGetXid(record);
	}
	else if (xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
		xl_xact_parsed_abort parsed;

		isCommit = false;
		/* must parse the record to get the XID of the prepared transaction */
		ParseAbortRecord(XLogRecGetInfo(record),
						 xlrec,
						 &parsed);
		recordXid = parsed.twophase_xid;
	}
	else
		return false;

	if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
	{
		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		stopsHere = (recordXid == recoveryTargetXid);
	}

	if (recoveryTarget == RECOVERY_TARGET_TIME &&
		getRecordTimestamp(record, &recordXtime))
	{
		/*
		 * There can be many transactions that share the same commit time, so
		 * we stop after the last one, if we are inclusive, or stop at the
		 * first one if we are exclusive
		 */
		if (recoveryTargetInclusive)
			stopsHere = (recordXtime > recoveryTargetTime);
		else
			stopsHere = (recordXtime >= recoveryTargetTime);
	}

	if (stopsHere)
	{
		/*
		 * Record the stop point for the timeline history file.  Note that
		 * recordXtime remains 0 here if we stopped on an XID target, since
		 * getRecordTimestamp() is only consulted for a time target.
		 */
		recoveryStopAfter = false;
		recoveryStopXid = recordXid;
		recoveryStopTime = recordXtime;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';

		if (isCommit)
		{
			ereport(LOG,
					(errmsg("recovery stopping before commit of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
		else
		{
			ereport(LOG,
					(errmsg("recovery stopping before abort of transaction %u, time %s",
							recoveryStopXid,
							timestamptz_to_str(recoveryStopTime))));
		}
	}

	return stopsHere;
}
5868
/*
 * Same as recoveryStopsBefore, but called after applying the record.
 *
 * We also track the timestamp of the latest applied COMMIT/ABORT
 * record in XLogCtl->recoveryLastXTime.
 *
 * Handles the targets that stop *after* a record is replayed: named
 * restore points, inclusive LSN targets, inclusive XID targets, and the
 * "immediate" target once consistency is reached.
 */
static bool
recoveryStopsAfter(XLogReaderState *record)
{
	uint8		info;
	uint8		xact_info;
	uint8		rmid;
	TimestampTz recordXtime;

	info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	rmid = XLogRecGetRmid(record);

	/*
	 * There can be many restore points that share the same name; we stop at
	 * the first one.
	 */
	if (recoveryTarget == RECOVERY_TARGET_NAME &&
		rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
	{
		xl_restore_point *recordRestorePointData;

		recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);

		if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
		{
			recoveryStopAfter = true;
			recoveryStopXid = InvalidTransactionId;
			recoveryStopLSN = InvalidXLogRecPtr;
			/* restore point records always carry a timestamp */
			(void) getRecordTimestamp(record, &recoveryStopTime);
			strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);

			ereport(LOG,
					(errmsg("recovery stopping at restore point \"%s\", time %s",
							recoveryStopName,
							timestamptz_to_str(recoveryStopTime))));
			return true;
		}
	}

	/* Check if the target LSN has been reached (inclusive case only) */
	if (recoveryTarget == RECOVERY_TARGET_LSN &&
		recoveryTargetInclusive &&
		record->ReadRecPtr >= recoveryTargetLSN)
	{
		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopLSN = record->ReadRecPtr;
		recoveryStopTime = 0;
		recoveryStopName[0] = '\0';
		ereport(LOG,
				(errmsg("recovery stopping after WAL location (LSN) \"%X/%X\"",
						(uint32) (recoveryStopLSN >> 32),
						(uint32) recoveryStopLSN)));
		return true;
	}

	/* Remaining targets only apply to transaction end records */
	if (rmid != RM_XACT_ID)
		return false;

	xact_info = info & XLOG_XACT_OPMASK;

	if (xact_info == XLOG_XACT_COMMIT ||
		xact_info == XLOG_XACT_COMMIT_PREPARED ||
		xact_info == XLOG_XACT_ABORT ||
		xact_info == XLOG_XACT_ABORT_PREPARED)
	{
		TransactionId recordXid;

		/* Update the last applied transaction timestamp */
		if (getRecordTimestamp(record, &recordXtime))
			SetLatestXTime(recordXtime);

		/* Extract the XID of the committed/aborted transaction */
		if (xact_info == XLOG_XACT_COMMIT_PREPARED)
		{
			xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
			xl_xact_parsed_commit parsed;

			ParseCommitRecord(XLogRecGetInfo(record),
							  xlrec,
							  &parsed);
			recordXid = parsed.twophase_xid;
		}
		else if (xact_info == XLOG_XACT_ABORT_PREPARED)
		{
			xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
			xl_xact_parsed_abort parsed;

			ParseAbortRecord(XLogRecGetInfo(record),
							 xlrec,
							 &parsed);
			recordXid = parsed.twophase_xid;
		}
		else
			recordXid = XLogRecGetXid(record);

		/*
		 * There can be only one transaction end record with this exact
		 * transactionid
		 *
		 * when testing for an xid, we MUST test for equality only, since
		 * transactions are numbered in the order they start, not the order
		 * they complete. A higher numbered xid will complete before you about
		 * 50% of the time...
		 */
		if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
			recordXid == recoveryTargetXid)
		{
			recoveryStopAfter = true;
			recoveryStopXid = recordXid;
			/*
			 * recordXtime was filled by getRecordTimestamp() above; all four
			 * record types handled here carry a timestamp.
			 */
			recoveryStopTime = recordXtime;
			recoveryStopLSN = InvalidXLogRecPtr;
			recoveryStopName[0] = '\0';

			if (xact_info == XLOG_XACT_COMMIT ||
				xact_info == XLOG_XACT_COMMIT_PREPARED)
			{
				ereport(LOG,
						(errmsg("recovery stopping after commit of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			else if (xact_info == XLOG_XACT_ABORT ||
					 xact_info == XLOG_XACT_ABORT_PREPARED)
			{
				ereport(LOG,
						(errmsg("recovery stopping after abort of transaction %u, time %s",
								recoveryStopXid,
								timestamptz_to_str(recoveryStopTime))));
			}
			return true;
		}
	}

	/* Check if we should stop as soon as reaching consistency */
	if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
	{
		ereport(LOG,
				(errmsg("recovery stopping after reaching consistency")));

		recoveryStopAfter = true;
		recoveryStopXid = InvalidTransactionId;
		recoveryStopTime = 0;
		recoveryStopLSN = InvalidXLogRecPtr;
		recoveryStopName[0] = '\0';
		return true;
	}

	return false;
}
6024
6025 /*
6026 * Wait until shared recoveryPause flag is cleared.
6027 *
6028 * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
6029 * Probably not worth the trouble though. This state shouldn't be one that
6030 * anyone cares about server power consumption in.
6031 */
6032 static void
recoveryPausesHere(void)6033 recoveryPausesHere(void)
6034 {
6035 /* Don't pause unless users can connect! */
6036 if (!LocalHotStandbyActive)
6037 return;
6038
6039 ereport(LOG,
6040 (errmsg("recovery has paused"),
6041 errhint("Execute pg_wal_replay_resume() to continue.")));
6042
6043 while (RecoveryIsPaused())
6044 {
6045 pg_usleep(1000000L); /* 1000 ms */
6046 HandleStartupProcInterrupts();
6047 }
6048 }
6049
6050 bool
RecoveryIsPaused(void)6051 RecoveryIsPaused(void)
6052 {
6053 bool recoveryPause;
6054
6055 SpinLockAcquire(&XLogCtl->info_lck);
6056 recoveryPause = XLogCtl->recoveryPause;
6057 SpinLockRelease(&XLogCtl->info_lck);
6058
6059 return recoveryPause;
6060 }
6061
/*
 * Set or clear the shared recoveryPause flag, under XLogCtl->info_lck.
 *
 * The flag is polled via RecoveryIsPaused() by the startup process's
 * recoveryPausesHere() loop.
 */
void
SetRecoveryPause(bool recoveryPause)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->recoveryPause = recoveryPause;
	SpinLockRelease(&XLogCtl->info_lck);
}
6069
/*
 * When recovery_min_apply_delay is set, we wait long enough to make sure
 * certain record types are applied at least that interval behind the master.
 *
 * Returns true if we waited.
 *
 * Note that the delay is calculated between the WAL record log time and
 * the current time on standby. We would prefer to keep track of when this
 * standby received each WAL record, which would allow a more consistent
 * approach and one not affected by time synchronisation issues, but that
 * is significantly more effort and complexity for little actual gain in
 * usability.
 */
static bool
recoveryApplyDelay(XLogReaderState *record)
{
	uint8		xact_info;
	TimestampTz xtime;
	long		msecs;

	/* nothing to do if no delay configured */
	if (recovery_min_apply_delay <= 0)
		return false;

	/* no delay is applied on a database not yet consistent */
	if (!reachedConsistency)
		return false;

	/*
	 * Is it a COMMIT record?
	 *
	 * We deliberately choose not to delay aborts since they have no effect on
	 * MVCC. We already allow replay of records that don't have a timestamp,
	 * so there is already opportunity for issues caused by early conflicts on
	 * standbys.
	 */
	if (XLogRecGetRmid(record) != RM_XACT_ID)
		return false;

	xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

	if (xact_info != XLOG_XACT_COMMIT &&
		xact_info != XLOG_XACT_COMMIT_PREPARED)
		return false;

	/* records without a timestamp are replayed without delay */
	if (!getRecordTimestamp(record, &xtime))
		return false;

	recoveryDelayUntilTime =
		TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

	/*
	 * Exit without arming the latch if it's already past time to apply this
	 * record
	 */
	msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
											recoveryDelayUntilTime);
	if (msecs <= 0)
		return false;

	/* Sleep in slices, waking for latch sets, interrupts, and promotion */
	while (true)
	{
		ResetLatch(&XLogCtl->recoveryWakeupLatch);

		/*
		 * This might change recovery_min_apply_delay or the trigger file's
		 * location.
		 */
		HandleStartupProcInterrupts();

		/* promotion cancels any further waiting */
		if (CheckForStandbyTrigger())
			break;

		/*
		 * Recalculate recoveryDelayUntilTime as recovery_min_apply_delay
		 * could have changed while waiting in this loop.
		 */
		recoveryDelayUntilTime =
			TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

		/*
		 * Wait for difference between GetCurrentTimestamp() and
		 * recoveryDelayUntilTime
		 */
		msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
												recoveryDelayUntilTime);

		if (msecs <= 0)
			break;

		elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);

		(void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
						 WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
						 msecs,
						 WAIT_EVENT_RECOVERY_APPLY_DELAY);
	}
	return true;
}
6169
/*
 * Save timestamp of latest processed commit/abort record.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by processes other than the startup process.  Note in particular
 * that CreateRestartPoint is executed in the checkpointer.
 *
 * Called from recoveryStopsAfter() as transaction records are replayed;
 * read back via GetLatestXTime().
 */
static void
SetLatestXTime(TimestampTz xtime)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->recoveryLastXTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6184
6185 /*
6186 * Fetch timestamp of latest processed commit/abort record.
6187 */
6188 TimestampTz
GetLatestXTime(void)6189 GetLatestXTime(void)
6190 {
6191 TimestampTz xtime;
6192
6193 SpinLockAcquire(&XLogCtl->info_lck);
6194 xtime = XLogCtl->recoveryLastXTime;
6195 SpinLockRelease(&XLogCtl->info_lck);
6196
6197 return xtime;
6198 }
6199
/*
 * Save timestamp of the next chunk of WAL records to apply.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by all backends.  Read back via GetCurrentChunkReplayStartTime().
 */
static void
SetCurrentChunkStartTime(TimestampTz xtime)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->currentChunkStartTime = xtime;
	SpinLockRelease(&XLogCtl->info_lck);
}
6213
6214 /*
6215 * Fetch timestamp of latest processed commit/abort record.
6216 * Startup process maintains an accurate local copy in XLogReceiptTime
6217 */
6218 TimestampTz
GetCurrentChunkReplayStartTime(void)6219 GetCurrentChunkReplayStartTime(void)
6220 {
6221 TimestampTz xtime;
6222
6223 SpinLockAcquire(&XLogCtl->info_lck);
6224 xtime = XLogCtl->currentChunkStartTime;
6225 SpinLockRelease(&XLogCtl->info_lck);
6226
6227 return xtime;
6228 }
6229
/*
 * Returns time of receipt of current chunk of XLOG data, as well as
 * whether it was received from streaming replication or from archives.
 *
 * Output parameters:
 *	*rtime		- receipt time of the current WAL chunk (XLogReceiptTime)
 *	*fromStream	- true if the chunk came via streaming replication
 *				  (XLogReceiptSource == XLOG_FROM_STREAM), else false
 */
void
GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
{
	/*
	 * This must be executed in the startup process, since we don't export the
	 * relevant state to shared memory.
	 */
	Assert(InRecovery);

	*rtime = XLogReceiptTime;
	*fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
}
6246
/*
 * Raise an ERROR if the given integer setting is lower than the value the
 * master was running with (as recorded in pg_control); used to refuse hot
 * standby when this server is configured too small.
 *
 * Note that the text field supplied is a parameter name and does not
 * require translation.
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
	if ((currValue) < (minValue)) \
		ereport(ERROR, \
				(errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
				 errmsg("hot standby is not possible because " \
						"%s = %d is a lower setting than on the master server " \
						"(its value was %d)", \
						param_name, \
						currValue, \
						minValue))); \
} while(0)
6263
/*
 * Check to see if required parameters are set high enough on this server
 * for various aspects of recovery operation.
 *
 * Note that all the parameters which this function tests need to be
 * listed in Administrator's Overview section in high-availability.sgml.
 * If you change them, don't forget to update the list.
 *
 * Raises WARNING (wal_level=minimal during archive recovery) or ERROR
 * (hot standby prerequisites not met); returns normally otherwise.
 */
static void
CheckRequiredParameterValues(void)
{
	/*
	 * For archive recovery, the WAL must be generated with at least 'replica'
	 * wal_level.
	 */
	if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
	{
		ereport(WARNING,
				(errmsg("WAL was generated with wal_level=minimal, data may be missing"),
				 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
	}

	/*
	 * For Hot Standby, the WAL must be generated with 'replica' mode, and we
	 * must have at least as many backend slots as the primary.
	 */
	if (ArchiveRecoveryRequested && EnableHotStandby)
	{
		if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
			ereport(ERROR,
					(errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
					 errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));

		/* We ignore autovacuum_max_workers when we make this test. */
		RecoveryRequiresIntParameter("max_connections",
									 MaxConnections,
									 ControlFile->MaxConnections);
		RecoveryRequiresIntParameter("max_worker_processes",
									 max_worker_processes,
									 ControlFile->max_worker_processes);
		RecoveryRequiresIntParameter("max_prepared_transactions",
									 max_prepared_xacts,
									 ControlFile->max_prepared_xacts);
		RecoveryRequiresIntParameter("max_locks_per_transaction",
									 max_locks_per_xact,
									 ControlFile->max_locks_per_xact);
	}
}
6312
6313 /*
6314 * This must be called ONCE during postmaster or standalone-backend startup
6315 */
6316 void
StartupXLOG(void)6317 StartupXLOG(void)
6318 {
6319 XLogCtlInsert *Insert;
6320 CheckPoint checkPoint;
6321 bool wasShutdown;
6322 bool reachedStopPoint = false;
6323 bool haveBackupLabel = false;
6324 bool haveTblspcMap = false;
6325 XLogRecPtr RecPtr,
6326 checkPointLoc,
6327 EndOfLog;
6328 TimeLineID EndOfLogTLI;
6329 TimeLineID PrevTimeLineID;
6330 XLogRecord *record;
6331 TransactionId oldestActiveXID;
6332 bool backupEndRequired = false;
6333 bool backupFromStandby = false;
6334 DBState dbstate_at_startup;
6335 XLogReaderState *xlogreader;
6336 XLogPageReadPrivate private;
6337 bool fast_promoted = false;
6338 struct stat st;
6339
6340 /*
6341 * Read control file and check XLOG status looks valid.
6342 *
6343 * Note: in most control paths, *ControlFile is already valid and we need
6344 * not do ReadControlFile() here, but might as well do it to be sure.
6345 */
6346 ReadControlFile();
6347
6348 if (ControlFile->state < DB_SHUTDOWNED ||
6349 ControlFile->state > DB_IN_PRODUCTION ||
6350 !XRecOffIsValid(ControlFile->checkPoint))
6351 ereport(FATAL,
6352 (errmsg("control file contains invalid data")));
6353
6354 if (ControlFile->state == DB_SHUTDOWNED)
6355 {
6356 /* This is the expected case, so don't be chatty in standalone mode */
6357 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6358 (errmsg("database system was shut down at %s",
6359 str_time(ControlFile->time))));
6360 }
6361 else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6362 ereport(LOG,
6363 (errmsg("database system was shut down in recovery at %s",
6364 str_time(ControlFile->time))));
6365 else if (ControlFile->state == DB_SHUTDOWNING)
6366 ereport(LOG,
6367 (errmsg("database system shutdown was interrupted; last known up at %s",
6368 str_time(ControlFile->time))));
6369 else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6370 ereport(LOG,
6371 (errmsg("database system was interrupted while in recovery at %s",
6372 str_time(ControlFile->time)),
6373 errhint("This probably means that some data is corrupted and"
6374 " you will have to use the last backup for recovery.")));
6375 else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6376 ereport(LOG,
6377 (errmsg("database system was interrupted while in recovery at log time %s",
6378 str_time(ControlFile->checkPointCopy.time)),
6379 errhint("If this has occurred more than once some data might be corrupted"
6380 " and you might need to choose an earlier recovery target.")));
6381 else if (ControlFile->state == DB_IN_PRODUCTION)
6382 ereport(LOG,
6383 (errmsg("database system was interrupted; last known up at %s",
6384 str_time(ControlFile->time))));
6385
6386 /* This is just to allow attaching to startup process with a debugger */
6387 #ifdef XLOG_REPLAY_DELAY
6388 if (ControlFile->state != DB_SHUTDOWNED)
6389 pg_usleep(60000000L);
6390 #endif
6391
6392 /*
6393 * Verify that pg_wal and pg_wal/archive_status exist. In cases where
6394 * someone has performed a copy for PITR, these directories may have been
6395 * excluded and need to be re-created.
6396 */
6397 ValidateXLOGDirectoryStructure();
6398
6399 /*
6400 * If we previously crashed, there might be data which we had written,
6401 * intending to fsync it, but which we had not actually fsync'd yet.
6402 * Therefore, a power failure in the near future might cause earlier
6403 * unflushed writes to be lost, even though more recent data written to
6404 * disk from here on would be persisted. To avoid that, fsync the entire
6405 * data directory.
6406 */
6407 if (ControlFile->state != DB_SHUTDOWNED &&
6408 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6409 SyncDataDirectory();
6410
6411 /*
6412 * Initialize on the assumption we want to recover to the latest timeline
6413 * that's active according to pg_control.
6414 */
6415 if (ControlFile->minRecoveryPointTLI >
6416 ControlFile->checkPointCopy.ThisTimeLineID)
6417 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6418 else
6419 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6420
6421 /*
6422 * Check for recovery control file, and if so set up state for offline
6423 * recovery
6424 */
6425 readRecoveryCommandFile();
6426
6427 /*
6428 * Save archive_cleanup_command in shared memory so that other processes
6429 * can see it.
6430 */
6431 strlcpy(XLogCtl->archiveCleanupCommand,
6432 archiveCleanupCommand ? archiveCleanupCommand : "",
6433 sizeof(XLogCtl->archiveCleanupCommand));
6434
6435 if (ArchiveRecoveryRequested)
6436 {
6437 if (StandbyModeRequested)
6438 ereport(LOG,
6439 (errmsg("entering standby mode")));
6440 else if (recoveryTarget == RECOVERY_TARGET_XID)
6441 ereport(LOG,
6442 (errmsg("starting point-in-time recovery to XID %u",
6443 recoveryTargetXid)));
6444 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6445 ereport(LOG,
6446 (errmsg("starting point-in-time recovery to %s",
6447 timestamptz_to_str(recoveryTargetTime))));
6448 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6449 ereport(LOG,
6450 (errmsg("starting point-in-time recovery to \"%s\"",
6451 recoveryTargetName)));
6452 else if (recoveryTarget == RECOVERY_TARGET_LSN)
6453 ereport(LOG,
6454 (errmsg("starting point-in-time recovery to WAL location (LSN) \"%X/%X\"",
6455 (uint32) (recoveryTargetLSN >> 32),
6456 (uint32) recoveryTargetLSN)));
6457 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6458 ereport(LOG,
6459 (errmsg("starting point-in-time recovery to earliest consistent point")));
6460 else
6461 ereport(LOG,
6462 (errmsg("starting archive recovery")));
6463 }
6464
6465 /*
6466 * Take ownership of the wakeup latch if we're going to sleep during
6467 * recovery.
6468 */
6469 if (ArchiveRecoveryRequested)
6470 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6471
6472 /* Set up XLOG reader facility */
6473 MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6474 xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6475 if (!xlogreader)
6476 ereport(ERROR,
6477 (errcode(ERRCODE_OUT_OF_MEMORY),
6478 errmsg("out of memory"),
6479 errdetail("Failed while allocating a WAL reading processor.")));
6480 xlogreader->system_identifier = ControlFile->system_identifier;
6481
6482 /*
6483 * Allocate two page buffers dedicated to WAL consistency checks. We do
6484 * it this way, rather than just making static arrays, for two reasons:
6485 * (1) no need to waste the storage in most instantiations of the backend;
6486 * (2) a static char array isn't guaranteed to have any particular
6487 * alignment, whereas palloc() will provide MAXALIGN'd storage.
6488 */
6489 replay_image_masked = (char *) palloc(BLCKSZ);
6490 master_image_masked = (char *) palloc(BLCKSZ);
6491
6492 if (read_backup_label(&checkPointLoc, &backupEndRequired,
6493 &backupFromStandby))
6494 {
6495 List *tablespaces = NIL;
6496
6497 /*
6498 * Archive recovery was requested, and thanks to the backup label
6499 * file, we know how far we need to replay to reach consistency. Enter
6500 * archive recovery directly.
6501 */
6502 InArchiveRecovery = true;
6503 if (StandbyModeRequested)
6504 StandbyMode = true;
6505
6506 /*
6507 * When a backup_label file is present, we want to roll forward from
6508 * the checkpoint it identifies, rather than using pg_control.
6509 */
6510 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6511 if (record != NULL)
6512 {
6513 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6514 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6515 ereport(DEBUG1,
6516 (errmsg("checkpoint record is at %X/%X",
6517 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6518 InRecovery = true; /* force recovery even if SHUTDOWNED */
6519
6520 /*
6521 * Make sure that REDO location exists. This may not be the case
6522 * if there was a crash during an online backup, which left a
6523 * backup_label around that references a WAL segment that's
6524 * already been archived.
6525 */
6526 if (checkPoint.redo < checkPointLoc)
6527 {
6528 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6529 ereport(FATAL,
6530 (errmsg("could not find redo location referenced by checkpoint record"),
6531 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6532 }
6533 }
6534 else
6535 {
6536 ereport(FATAL,
6537 (errmsg("could not locate required checkpoint record"),
6538 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6539 wasShutdown = false; /* keep compiler quiet */
6540 }
6541
6542 /* read the tablespace_map file if present and create symlinks. */
6543 if (read_tablespace_map(&tablespaces))
6544 {
6545 ListCell *lc;
6546
6547 foreach(lc, tablespaces)
6548 {
6549 tablespaceinfo *ti = lfirst(lc);
6550 char *linkloc;
6551
6552 linkloc = psprintf("pg_tblspc/%s", ti->oid);
6553
6554 /*
6555 * Remove the existing symlink if any and create the symlink
6556 * under PGDATA.
6557 */
6558 remove_tablespace_symlink(linkloc);
6559
6560 if (symlink(ti->path, linkloc) < 0)
6561 ereport(ERROR,
6562 (errcode_for_file_access(),
6563 errmsg("could not create symbolic link \"%s\": %m",
6564 linkloc)));
6565
6566 pfree(ti->oid);
6567 pfree(ti->path);
6568 pfree(ti);
6569 }
6570
6571 /* set flag to delete it later */
6572 haveTblspcMap = true;
6573 }
6574
6575 /* set flag to delete it later */
6576 haveBackupLabel = true;
6577 }
6578 else
6579 {
6580 /*
6581 * If a tablespace_map file is present without a backup_label file,
6582 * there is no use for such a file. There is no harm in retaining it,
6583 * but it is better to get rid of the map file so that we don't have
6584 * any redundant file in the data directory, avoiding any sort of
6585 * confusion. It seems prudent though to just rename the file out of
6586 * the way rather than delete it completely; we also ignore any error
6587 * that occurs in the rename operation, as even if the map file is
6588 * present without a backup_label file, it is harmless.
6589 */
6590 if (stat(TABLESPACE_MAP, &st) == 0)
6591 {
6592 unlink(TABLESPACE_MAP_OLD);
6593 if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6594 ereport(LOG,
6595 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6596 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6597 errdetail("File \"%s\" was renamed to \"%s\".",
6598 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6599 else
6600 ereport(LOG,
6601 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6602 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6603 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6604 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6605 }
6606
6607 /*
6608 * It's possible that archive recovery was requested, but we don't
6609 * know how far we need to replay the WAL before we reach consistency.
6610 * This can happen for example if a base backup is taken from a
6611 * running server using an atomic filesystem snapshot, without calling
6612 * pg_start/stop_backup. Or if you just kill a running master server
6613 * and put it into archive recovery by creating a recovery.conf file.
6614 *
6615 * Our strategy in that case is to perform crash recovery first,
6616 * replaying all the WAL present in pg_wal, and only enter archive
6617 * recovery after that.
6618 *
6619 * But usually we already know how far we need to replay the WAL (up
6620 * to minRecoveryPoint, up to backupEndPoint, or until we see an
6621 * end-of-backup record), and we can enter archive recovery directly.
6622 */
6623 if (ArchiveRecoveryRequested &&
6624 (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6625 ControlFile->backupEndRequired ||
6626 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6627 ControlFile->state == DB_SHUTDOWNED))
6628 {
6629 InArchiveRecovery = true;
6630 if (StandbyModeRequested)
6631 StandbyMode = true;
6632 }
6633
6634 /*
6635 * Get the last valid checkpoint record. If the latest one according
6636 * to pg_control is broken, try the next-to-last one.
6637 */
6638 checkPointLoc = ControlFile->checkPoint;
6639 RedoStartLSN = ControlFile->checkPointCopy.redo;
6640 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6641 if (record != NULL)
6642 {
6643 ereport(DEBUG1,
6644 (errmsg("checkpoint record is at %X/%X",
6645 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6646 }
6647 else if (StandbyMode)
6648 {
6649 /*
6650 * The last valid checkpoint record required for a streaming
6651 * recovery exists on neither the standby nor the primary.
6652 */
6653 ereport(PANIC,
6654 (errmsg("could not locate a valid checkpoint record")));
6655 }
6656 else
6657 {
6658 checkPointLoc = ControlFile->prevCheckPoint;
6659 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6660 if (record != NULL)
6661 {
6662 ereport(LOG,
6663 (errmsg("using previous checkpoint record at %X/%X",
6664 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6665 InRecovery = true; /* force recovery even if SHUTDOWNED */
6666 }
6667 else
6668 ereport(PANIC,
6669 (errmsg("could not locate a valid checkpoint record")));
6670 }
6671 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6672 wasShutdown = ((record->xl_info & ~XLR_INFO_MASK) == XLOG_CHECKPOINT_SHUTDOWN);
6673 }
6674
6675 /*
6676 * Clear out any old relcache cache files. This is *necessary* if we do
6677 * any WAL replay, since that would probably result in the cache files
6678 * being out of sync with database reality. In theory we could leave them
6679 * in place if the database had been cleanly shut down, but it seems
6680 * safest to just remove them always and let them be rebuilt during the
6681 * first backend startup. These files need to be removed from all
6682 * directories including pg_tblspc; however, the symlinks are created
6683 * only after reading the tablespace_map file in case of archive
6684 * recovery from backup, so the old relcache files must be cleared
6685 * here, after the symlinks have been created.
6686 */
6687 RelationCacheInitFileRemove();
6688
6689 /*
6690 * If the location of the checkpoint record is not on the expected
6691 * timeline in the history of the requested timeline, we cannot proceed:
6692 * the backup is not part of the history of the requested timeline.
6693 */
6694 Assert(expectedTLEs); /* was initialized by reading checkpoint
6695 * record */
6696 if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6697 checkPoint.ThisTimeLineID)
6698 {
6699 XLogRecPtr switchpoint;
6700
6701 /*
6702 * tliSwitchPoint will throw an error if the checkpoint's timeline is
6703 * not in expectedTLEs at all.
6704 */
6705 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6706 ereport(FATAL,
6707 (errmsg("requested timeline %u is not a child of this server's history",
6708 recoveryTargetTLI),
6709 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6710 (uint32) (ControlFile->checkPoint >> 32),
6711 (uint32) ControlFile->checkPoint,
6712 ControlFile->checkPointCopy.ThisTimeLineID,
6713 (uint32) (switchpoint >> 32),
6714 (uint32) switchpoint)));
6715 }
6716
6717 /*
6718 * The min recovery point should be part of the requested timeline's
6719 * history, too.
6720 */
6721 if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6722 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6723 ControlFile->minRecoveryPointTLI)
6724 ereport(FATAL,
6725 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6726 recoveryTargetTLI,
6727 (uint32) (ControlFile->minRecoveryPoint >> 32),
6728 (uint32) ControlFile->minRecoveryPoint,
6729 ControlFile->minRecoveryPointTLI)));
6730
6731 LastRec = RecPtr = checkPointLoc;
6732
6733 ereport(DEBUG1,
6734 (errmsg_internal("redo record is at %X/%X; shutdown %s",
6735 (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6736 wasShutdown ? "TRUE" : "FALSE")));
6737 ereport(DEBUG1,
6738 (errmsg_internal("next transaction ID: %u:%u; next OID: %u",
6739 checkPoint.nextXidEpoch, checkPoint.nextXid,
6740 checkPoint.nextOid)));
6741 ereport(DEBUG1,
6742 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6743 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6744 ereport(DEBUG1,
6745 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6746 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6747 ereport(DEBUG1,
6748 (errmsg_internal("oldest MultiXactId: %u, in database %u",
6749 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6750 ereport(DEBUG1,
6751 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6752 checkPoint.oldestCommitTsXid,
6753 checkPoint.newestCommitTsXid)));
6754 if (!TransactionIdIsNormal(checkPoint.nextXid))
6755 ereport(PANIC,
6756 (errmsg("invalid next transaction ID")));
6757
6758 /* initialize shared memory variables from the checkpoint record */
6759 ShmemVariableCache->nextXid = checkPoint.nextXid;
6760 ShmemVariableCache->nextOid = checkPoint.nextOid;
6761 ShmemVariableCache->oidCount = 0;
6762 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6763 AdvanceOldestClogXid(checkPoint.oldestXid);
6764 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6765 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB, true);
6766 SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6767 checkPoint.newestCommitTsXid);
6768 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6769 XLogCtl->ckptXid = checkPoint.nextXid;
6770
6771 /*
6772 * Initialize replication slots, before there's a chance to remove
6773 * required resources.
6774 */
6775 StartupReplicationSlots();
6776
6777 /*
6778 * Startup logical state, needs to be setup now so we have proper data
6779 * during crash recovery.
6780 */
6781 StartupReorderBuffer();
6782
6783 /*
6784 * Startup MultiXact. We need to do this early to be able to replay
6785 * truncations.
6786 */
6787 StartupMultiXact();
6788
6789 /*
6790 * Ditto for commit timestamps. Activate the facility if the setting is
6791 * enabled in the control file, as there should be no tracking of commit
6792 * timestamps done when the setting was disabled. This facility can be
6793 * started or stopped when replaying a XLOG_PARAMETER_CHANGE record.
6794 */
6795 if (ControlFile->track_commit_timestamp)
6796 StartupCommitTs();
6797
6798 /*
6799 * Recover knowledge about replay progress of known replication partners.
6800 */
6801 StartupReplicationOrigin();
6802
6803 /*
6804 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6805 * control file. On recovery, all unlogged relations are blown away, so
6806 * the unlogged LSN counter can be reset too.
6807 */
6808 if (ControlFile->state == DB_SHUTDOWNED)
6809 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6810 else
6811 XLogCtl->unloggedLSN = 1;
6812
6813 /*
6814 * We must replay WAL entries using the same TimeLineID they were created
6815 * under, so temporarily adopt the TLI indicated by the checkpoint (see
6816 * also xlog_redo()).
6817 */
6818 ThisTimeLineID = checkPoint.ThisTimeLineID;
6819
6820 /*
6821 * Copy any missing timeline history files between 'now' and the recovery
6822 * target timeline from archive to pg_wal. While we don't need those files
6823 * ourselves - the history file of the recovery target timeline covers all
6824 * the previous timelines in the history too - a cascading standby server
6825 * might be interested in them. Or, if you archive the WAL from this
6826 * server to a different archive than the master, it'd be good for all the
6827 * history files to get archived there after failover, so that you can use
6828 * one of the old timelines as a PITR target. Timeline history files are
6829 * small, so it's better to copy them unnecessarily than not copy them and
6830 * regret later.
6831 */
6832 restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6833
6834 /*
6835 * Before running in recovery, scan pg_twophase and fill in its status to
6836 * be able to work on entries generated by redo. Doing a scan before
6837 * taking any recovery action has the merit to discard any 2PC files that
6838 * are newer than the first record to replay, saving from any conflicts at
6839 * replay. This avoids as well any subsequent scans when doing recovery
6840 * of the on-disk two-phase data.
6841 */
6842 restoreTwoPhaseData();
6843
6844 lastFullPageWrites = checkPoint.fullPageWrites;
6845
6846 RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6847 doPageWrites = lastFullPageWrites;
6848
6849 if (RecPtr < checkPoint.redo)
6850 ereport(PANIC,
6851 (errmsg("invalid redo in checkpoint record")));
6852
6853 /*
6854 * Check whether we need to force recovery from WAL. If it appears to
6855 * have been a clean shutdown and we did not have a recovery.conf file,
6856 * then assume no recovery needed.
6857 */
6858 if (checkPoint.redo < RecPtr)
6859 {
6860 if (wasShutdown)
6861 ereport(PANIC,
6862 (errmsg("invalid redo record in shutdown checkpoint")));
6863 InRecovery = true;
6864 }
6865 else if (ControlFile->state != DB_SHUTDOWNED)
6866 InRecovery = true;
6867 else if (ArchiveRecoveryRequested)
6868 {
6869 /* force recovery due to presence of recovery.conf */
6870 InRecovery = true;
6871 }
6872
6873 /*
6874 * Start recovery assuming that the final record isn't lost.
6875 */
6876 abortedRecPtr = InvalidXLogRecPtr;
6877 missingContrecPtr = InvalidXLogRecPtr;
6878
6879 /* REDO */
6880 if (InRecovery)
6881 {
6882 int rmid;
6883
6884 /*
6885 * Update pg_control to show that we are recovering and to show the
6886 * selected checkpoint as the place we are starting from. We also mark
6887 * pg_control with any minimum recovery stop point obtained from a
6888 * backup history file.
6889 */
6890 dbstate_at_startup = ControlFile->state;
6891 if (InArchiveRecovery)
6892 {
6893 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6894
6895 SpinLockAcquire(&XLogCtl->info_lck);
6896 XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
6897 SpinLockRelease(&XLogCtl->info_lck);
6898 }
6899 else
6900 {
6901 ereport(LOG,
6902 (errmsg("database system was not properly shut down; "
6903 "automatic recovery in progress")));
6904 if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6905 ereport(LOG,
6906 (errmsg("crash recovery starts in timeline %u "
6907 "and has target timeline %u",
6908 ControlFile->checkPointCopy.ThisTimeLineID,
6909 recoveryTargetTLI)));
6910 ControlFile->state = DB_IN_CRASH_RECOVERY;
6911
6912 SpinLockAcquire(&XLogCtl->info_lck);
6913 XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
6914 SpinLockRelease(&XLogCtl->info_lck);
6915 }
6916 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6917 ControlFile->checkPoint = checkPointLoc;
6918 ControlFile->checkPointCopy = checkPoint;
6919 if (InArchiveRecovery)
6920 {
6921 /* initialize minRecoveryPoint if not set yet */
6922 if (ControlFile->minRecoveryPoint < checkPoint.redo)
6923 {
6924 ControlFile->minRecoveryPoint = checkPoint.redo;
6925 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6926 }
6927 }
6928
6929 /*
6930 * Set backupStartPoint if we're starting recovery from a base backup.
6931 *
6932 * Also set backupEndPoint and use minRecoveryPoint as the backup end
6933 * location if we're starting recovery from a base backup which was
6934 * taken from a standby. In this case, the database system status in
6935 * pg_control must indicate that the database was already in recovery.
6936 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6937 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6938 * before reaching this point; e.g. because restore_command or
6939 * primary_conninfo were faulty.
6940 *
6941 * Any other state indicates that the backup somehow became corrupted
6942 * and we can't sensibly continue with recovery.
6943 */
6944 if (haveBackupLabel)
6945 {
6946 ControlFile->backupStartPoint = checkPoint.redo;
6947 ControlFile->backupEndRequired = backupEndRequired;
6948
6949 if (backupFromStandby)
6950 {
6951 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6952 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6953 ereport(FATAL,
6954 (errmsg("backup_label contains data inconsistent with control file"),
6955 errhint("This means that the backup is corrupted and you will "
6956 "have to use another backup for recovery.")));
6957 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6958 }
6959 }
6960 ControlFile->time = (pg_time_t) time(NULL);
6961 /* No need to hold ControlFileLock yet, we aren't up far enough */
6962 UpdateControlFile();
6963
6964 /*
6965 * Initialize our local copy of minRecoveryPoint. When doing crash
6966 * recovery we want to replay up to the end of WAL. Particularly, in
6967 * the case of a promoted standby minRecoveryPoint value in the
6968 * control file is only updated after the first checkpoint. However,
6969 * if the instance crashes before the first post-recovery checkpoint
6970 * is completed then recovery will use a stale location causing the
6971 * startup process to think that there are still invalid page
6972 * references when checking for data consistency.
6973 */
6974 if (InArchiveRecovery)
6975 {
6976 minRecoveryPoint = ControlFile->minRecoveryPoint;
6977 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6978 }
6979 else
6980 {
6981 minRecoveryPoint = InvalidXLogRecPtr;
6982 minRecoveryPointTLI = 0;
6983 }
6984
6985 /*
6986 * Reset pgstat data, because it may be invalid after recovery.
6987 */
6988 pgstat_reset_all();
6989
6990 /*
6991 * If there was a backup label file, it's done its job and the info
6992 * has now been propagated into pg_control. We must get rid of the
6993 * label file so that if we crash during recovery, we'll pick up at
6994 * the latest recovery restartpoint instead of going all the way back
6995 * to the backup start point. It seems prudent though to just rename
6996 * the file out of the way rather than delete it completely.
6997 */
6998 if (haveBackupLabel)
6999 {
7000 unlink(BACKUP_LABEL_OLD);
7001 durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
7002 }
7003
7004 /*
7005 * If there was a tablespace_map file, it's done its job and the
7006 * symlinks have been created. We must get rid of the map file so
7007 * that if we crash during recovery, we don't create symlinks again.
7008 * It seems prudent though to just rename the file out of the way
7009 * rather than delete it completely.
7010 */
7011 if (haveTblspcMap)
7012 {
7013 unlink(TABLESPACE_MAP_OLD);
7014 durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
7015 }
7016
7017 /* Check that the GUCs used to generate the WAL allow recovery */
7018 CheckRequiredParameterValues();
7019
7020 /*
7021 * We're in recovery, so unlogged relations may be trashed and must be
7022 * reset. This should be done BEFORE allowing Hot Standby
7023 * connections, so that read-only backends don't try to read whatever
7024 * garbage is left over from before.
7025 */
7026 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
7027
7028 /*
7029 * Likewise, delete any saved transaction snapshot files that got left
7030 * behind by crashed backends.
7031 */
7032 DeleteAllExportedSnapshotFiles();
7033
7034 /*
7035 * Initialize for Hot Standby, if enabled. We won't let backends in
7036 * yet, not until we've reached the min recovery point specified in
7037 * control file and we've established a recovery snapshot from a
7038 * running-xacts WAL record.
7039 */
7040 if (ArchiveRecoveryRequested && EnableHotStandby)
7041 {
7042 TransactionId *xids;
7043 int nxids;
7044
7045 ereport(DEBUG1,
7046 (errmsg("initializing for hot standby")));
7047
7048 InitRecoveryTransactionEnvironment();
7049
7050 if (wasShutdown)
7051 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
7052 else
7053 oldestActiveXID = checkPoint.oldestActiveXid;
7054 Assert(TransactionIdIsValid(oldestActiveXID));
7055
7056 /* Tell procarray about the range of xids it has to deal with */
7057 ProcArrayInitRecovery(ShmemVariableCache->nextXid);
7058
7059 /*
7060 * Startup commit log and subtrans only. MultiXact and commit
7061 * timestamp have already been started up and other SLRUs are not
7062 * maintained during recovery and need not be started yet.
7063 */
7064 StartupCLOG();
7065 StartupSUBTRANS(oldestActiveXID);
7066
7067 /*
7068 * If we're beginning at a shutdown checkpoint, we know that
7069 * nothing was running on the master at this point. So fake-up an
7070 * empty running-xacts record and use that here and now. Recover
7071 * additional standby state for prepared transactions.
7072 */
7073 if (wasShutdown)
7074 {
7075 RunningTransactionsData running;
7076 TransactionId latestCompletedXid;
7077
7078 /*
7079 * Construct a RunningTransactions snapshot representing a
7080 * shut down server, with only prepared transactions still
7081 * alive. We're never overflowed at this point because all
7082 * subxids are listed with their parent prepared transactions.
7083 */
7084 running.xcnt = nxids;
7085 running.subxcnt = 0;
7086 running.subxid_overflow = false;
7087 running.nextXid = checkPoint.nextXid;
7088 running.oldestRunningXid = oldestActiveXID;
7089 latestCompletedXid = checkPoint.nextXid;
7090 TransactionIdRetreat(latestCompletedXid);
7091 Assert(TransactionIdIsNormal(latestCompletedXid));
7092 running.latestCompletedXid = latestCompletedXid;
7093 running.xids = xids;
7094
7095 ProcArrayApplyRecoveryInfo(&running);
7096
7097 StandbyRecoverPreparedTransactions();
7098 }
7099 }
7100
7101 /* Initialize resource managers */
7102 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7103 {
7104 if (RmgrTable[rmid].rm_startup != NULL)
7105 RmgrTable[rmid].rm_startup();
7106 }
7107
7108 /*
7109 * Initialize shared variables for tracking progress of WAL replay, as
7110 * if we had just replayed the record before the REDO location (or the
7111 * checkpoint record itself, if it's a shutdown checkpoint).
7112 */
7113 SpinLockAcquire(&XLogCtl->info_lck);
7114 if (checkPoint.redo < RecPtr)
7115 XLogCtl->replayEndRecPtr = checkPoint.redo;
7116 else
7117 XLogCtl->replayEndRecPtr = EndRecPtr;
7118 XLogCtl->replayEndTLI = ThisTimeLineID;
7119 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
7120 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
7121 XLogCtl->recoveryLastXTime = 0;
7122 XLogCtl->currentChunkStartTime = 0;
7123 XLogCtl->recoveryPause = false;
7124 SpinLockRelease(&XLogCtl->info_lck);
7125
7126 /* Also ensure XLogReceiptTime has a sane value */
7127 XLogReceiptTime = GetCurrentTimestamp();
7128
7129 /*
7130 * Let postmaster know we've started redo now, so that it can launch
7131 * checkpointer to perform restartpoints. We don't bother during
7132 * crash recovery as restartpoints can only be performed during
7133 * archive recovery. And we'd like to keep crash recovery simple, to
7134 * avoid introducing bugs that could affect you when recovering after
7135 * crash.
7136 *
7137 * After this point, we can no longer assume that we're the only
7138 * process in addition to postmaster! Also, fsync requests are
7139 * subsequently to be handled by the checkpointer, not locally.
7140 */
7141 if (ArchiveRecoveryRequested && IsUnderPostmaster)
7142 {
7143 PublishStartupProcessInformation();
7144 SetForwardFsyncRequests();
7145 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
7146 bgwriterLaunched = true;
7147 }
7148
7149 /*
7150 * Allow read-only connections immediately if we're consistent
7151 * already.
7152 */
7153 CheckRecoveryConsistency();
7154
7155 /*
7156 * Find the first record that logically follows the checkpoint --- it
7157 * might physically precede it, though.
7158 */
7159 if (checkPoint.redo < RecPtr)
7160 {
7161 /* back up to find the record */
7162 record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
7163 }
7164 else
7165 {
7166 /* just have to read next record after CheckPoint */
7167 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7168 }
7169
7170 if (record != NULL)
7171 {
7172 ErrorContextCallback errcallback;
7173 TimestampTz xtime;
7174
7175 InRedo = true;
7176
7177 ereport(LOG,
7178 (errmsg("redo starts at %X/%X",
7179 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7180
7181 /*
7182 * main redo apply loop
7183 */
7184 do
7185 {
7186 bool switchedTLI = false;
7187
7188 #ifdef WAL_DEBUG
7189 if (XLOG_DEBUG ||
7190 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
7191 (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
7192 {
7193 StringInfoData buf;
7194
7195 initStringInfo(&buf);
7196 appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
7197 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
7198 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
7199 xlog_outrec(&buf, xlogreader);
7200 appendStringInfoString(&buf, " - ");
7201 xlog_outdesc(&buf, xlogreader);
7202 elog(LOG, "%s", buf.data);
7203 pfree(buf.data);
7204 }
7205 #endif
7206
7207 /* Handle interrupt signals of startup process */
7208 HandleStartupProcInterrupts();
7209
7210 /*
7211 * Pause WAL replay, if requested by a hot-standby session via
7212 * SetRecoveryPause().
7213 *
7214 * Note that we intentionally don't take the info_lck spinlock
7215 * here. We might therefore read a slightly stale value of
7216 * the recoveryPause flag, but it can't be very stale (no
7217 * worse than the last spinlock we did acquire). Since a
7218 * pause request is a pretty asynchronous thing anyway,
7219 * possibly responding to it one WAL record later than we
7220 * otherwise would is a minor issue, so it doesn't seem worth
7221 * adding another spinlock cycle to prevent that.
7222 */
7223 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7224 recoveryPausesHere();
7225
7226 /*
7227 * Have we reached our recovery target?
7228 */
7229 if (recoveryStopsBefore(xlogreader))
7230 {
7231 reachedStopPoint = true; /* see below */
7232 break;
7233 }
7234
7235 /*
7236 * If we've been asked to lag the master, wait on latch until
7237 * enough time has passed.
7238 */
7239 if (recoveryApplyDelay(xlogreader))
7240 {
7241 /*
7242 * We test for paused recovery again here. If user sets
7243 * delayed apply, it may be because they expect to pause
7244 * recovery in case of problems, so we must test again
7245 * here otherwise pausing during the delay-wait wouldn't
7246 * work.
7247 */
7248 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
7249 recoveryPausesHere();
7250 }
7251
7252 /* Setup error traceback support for ereport() */
7253 errcallback.callback = rm_redo_error_callback;
7254 errcallback.arg = (void *) xlogreader;
7255 errcallback.previous = error_context_stack;
7256 error_context_stack = &errcallback;
7257
7258 /*
7259 * ShmemVariableCache->nextXid must be beyond record's xid.
7260 *
7261 * We don't expect anyone else to modify nextXid, hence we
7262 * don't need to hold a lock while examining it. We still
7263 * acquire the lock to modify it, though.
7264 */
7265 if (TransactionIdFollowsOrEquals(record->xl_xid,
7266 ShmemVariableCache->nextXid))
7267 {
7268 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7269 ShmemVariableCache->nextXid = record->xl_xid;
7270 TransactionIdAdvance(ShmemVariableCache->nextXid);
7271 LWLockRelease(XidGenLock);
7272 }
7273
7274 /*
7275 * Before replaying this record, check if this record causes
7276 * the current timeline to change. The record is already
7277 * considered to be part of the new timeline, so we update
7278 * ThisTimeLineID before replaying it. That's important so
7279 * that replayEndTLI, which is recorded as the minimum
7280 * recovery point's TLI if recovery stops after this record,
7281 * is set correctly.
7282 */
7283 if (record->xl_rmid == RM_XLOG_ID)
7284 {
7285 TimeLineID newTLI = ThisTimeLineID;
7286 TimeLineID prevTLI = ThisTimeLineID;
7287 uint8 info = record->xl_info & ~XLR_INFO_MASK;
7288
7289 if (info == XLOG_CHECKPOINT_SHUTDOWN)
7290 {
7291 CheckPoint checkPoint;
7292
7293 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7294 newTLI = checkPoint.ThisTimeLineID;
7295 prevTLI = checkPoint.PrevTimeLineID;
7296 }
7297 else if (info == XLOG_END_OF_RECOVERY)
7298 {
7299 xl_end_of_recovery xlrec;
7300
7301 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7302 newTLI = xlrec.ThisTimeLineID;
7303 prevTLI = xlrec.PrevTimeLineID;
7304 }
7305
7306 if (newTLI != ThisTimeLineID)
7307 {
7308 /* Check that it's OK to switch to this TLI */
7309 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7310
7311 /* Following WAL records should be run with new TLI */
7312 ThisTimeLineID = newTLI;
7313 switchedTLI = true;
7314 }
7315 }
7316
7317 /*
7318 * Update shared replayEndRecPtr before replaying this record,
7319 * so that XLogFlush will update minRecoveryPoint correctly.
7320 */
7321 SpinLockAcquire(&XLogCtl->info_lck);
7322 XLogCtl->replayEndRecPtr = EndRecPtr;
7323 XLogCtl->replayEndTLI = ThisTimeLineID;
7324 SpinLockRelease(&XLogCtl->info_lck);
7325
7326 /*
7327 * If we are attempting to enter Hot Standby mode, process
7328 * XIDs we see
7329 */
7330 if (standbyState >= STANDBY_INITIALIZED &&
7331 TransactionIdIsValid(record->xl_xid))
7332 RecordKnownAssignedTransactionIds(record->xl_xid);
7333
7334 /* Now apply the WAL record itself */
7335 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7336
7337 /*
7338 * After redo, check whether the backup pages associated with
7339 * the WAL record are consistent with the existing pages. This
7340 * check is done only if consistency check is enabled for this
7341 * record.
7342 */
7343 if ((record->xl_info & XLR_CHECK_CONSISTENCY) != 0)
7344 checkXLogConsistency(xlogreader);
7345
7346 /* Pop the error context stack */
7347 error_context_stack = errcallback.previous;
7348
7349 /*
7350 * Update lastReplayedEndRecPtr after this record has been
7351 * successfully replayed.
7352 */
7353 SpinLockAcquire(&XLogCtl->info_lck);
7354 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7355 XLogCtl->lastReplayedTLI = ThisTimeLineID;
7356 SpinLockRelease(&XLogCtl->info_lck);
7357
7358 /*
7359 * If rm_redo called XLogRequestWalReceiverReply, then we wake
7360 * up the receiver so that it notices the updated
7361 * lastReplayedEndRecPtr and sends a reply to the master.
7362 */
7363 if (doRequestWalReceiverReply)
7364 {
7365 doRequestWalReceiverReply = false;
7366 WalRcvForceReply();
7367 }
7368
7369 /* Remember this record as the last-applied one */
7370 LastRec = ReadRecPtr;
7371
7372 /* Allow read-only connections if we're consistent now */
7373 CheckRecoveryConsistency();
7374
7375 /* Is this a timeline switch? */
7376 if (switchedTLI)
7377 {
7378 /*
7379 * Before we continue on the new timeline, clean up any
7380 * (possibly bogus) future WAL segments on the old
7381 * timeline.
7382 */
7383 RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7384
7385 /*
7386 * Wake up any walsenders to notice that we are on a new
7387 * timeline.
7388 */
7389 if (switchedTLI && AllowCascadeReplication())
7390 WalSndWakeup();
7391 }
7392
7393 /* Exit loop if we reached inclusive recovery target */
7394 if (recoveryStopsAfter(xlogreader))
7395 {
7396 reachedStopPoint = true;
7397 break;
7398 }
7399
7400 /* Else, try to fetch the next WAL record */
7401 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7402 } while (record != NULL);
7403
7404 /*
7405 * end of main redo apply loop
7406 */
7407
7408 if (reachedStopPoint)
7409 {
7410 if (!reachedConsistency)
7411 ereport(FATAL,
7412 (errmsg("requested recovery stop point is before consistent recovery point")));
7413
7414 /*
7415 * This is the last point where we can restart recovery with a
7416 * new recovery target, if we shutdown and begin again. After
7417 * this, Resource Managers may choose to do permanent
7418 * corrective actions at end of recovery.
7419 */
7420 switch (recoveryTargetAction)
7421 {
7422 case RECOVERY_TARGET_ACTION_SHUTDOWN:
7423
7424 /*
7425 * exit with special return code to request shutdown
7426 * of postmaster. Log messages issued from
7427 * postmaster.
7428 */
7429 proc_exit(3);
7430
7431 case RECOVERY_TARGET_ACTION_PAUSE:
7432 SetRecoveryPause(true);
7433 recoveryPausesHere();
7434
7435 /* drop into promote */
7436
7437 case RECOVERY_TARGET_ACTION_PROMOTE:
7438 break;
7439 }
7440 }
7441
7442 /* Allow resource managers to do any required cleanup. */
7443 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7444 {
7445 if (RmgrTable[rmid].rm_cleanup != NULL)
7446 RmgrTable[rmid].rm_cleanup();
7447 }
7448
7449 ereport(LOG,
7450 (errmsg("redo done at %X/%X",
7451 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7452 xtime = GetLatestXTime();
7453 if (xtime)
7454 ereport(LOG,
7455 (errmsg("last completed transaction was at log time %s",
7456 timestamptz_to_str(xtime))));
7457
7458 InRedo = false;
7459 }
7460 else
7461 {
7462 /* there are no WAL records following the checkpoint */
7463 ereport(LOG,
7464 (errmsg("redo is not required")));
7465 }
7466 }
7467
7468 /*
7469 * Kill WAL receiver, if it's still running, before we continue to write
7470 * the startup checkpoint and aborted-contrecord records. It will trump
7471 * over these records and subsequent ones if it's still alive when we
7472 * start writing WAL.
7473 */
7474 ShutdownWalRcv();
7475
7476 /*
7477 * Reset unlogged relations to the contents of their INIT fork. This is
7478 * done AFTER recovery is complete so as to include any unlogged relations
7479 * created during recovery, but BEFORE recovery is marked as having
7480 * completed successfully. Otherwise we'd not retry if any of the post
7481 * end-of-recovery steps fail.
7482 */
7483 if (InRecovery)
7484 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7485
7486 /*
7487 * We don't need the latch anymore. It's not strictly necessary to disown
7488 * it, but let's do it for the sake of tidiness.
7489 */
7490 if (ArchiveRecoveryRequested)
7491 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7492
7493 /*
7494 * We are now done reading the xlog from stream. Turn off streaming
7495 * recovery to force fetching the files (which would be required at end of
7496 * recovery, e.g., timeline history file) from archive or pg_wal.
7497 *
7498 * Note that standby mode must be turned off after killing WAL receiver,
7499 * i.e., calling ShutdownWalRcv().
7500 */
7501 Assert(!WalRcvStreaming());
7502 StandbyMode = false;
7503
7504 /*
7505 * Determine where to start writing WAL next.
7506 *
7507 * When recovery ended in an incomplete record, write a WAL record about
7508 * that and continue after it. In all other cases, re-fetch the last
7509 * valid or last applied record, so we can identify the exact endpoint of
7510 * what we consider the valid portion of WAL.
7511 */
7512 record = ReadRecord(xlogreader, LastRec, PANIC, false);
7513 EndOfLog = EndRecPtr;
7514
7515 /*
7516 * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7517 * the end-of-log. It could be different from the timeline that EndOfLog
7518 * nominally belongs to, if there was a timeline switch in that segment,
7519 * and we were reading the old WAL from a segment belonging to a higher
7520 * timeline.
7521 */
7522 EndOfLogTLI = xlogreader->readPageTLI;
7523
7524 /*
7525 * Complain if we did not roll forward far enough to render the backup
7526 * dump consistent. Note: it is indeed okay to look at the local variable
7527 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7528 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7529 * advanced beyond the WAL we processed.
7530 */
7531 if (InRecovery &&
7532 (EndOfLog < minRecoveryPoint ||
7533 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7534 {
7535 /*
7536 * Ran off end of WAL before reaching end-of-backup WAL record, or
7537 * minRecoveryPoint. That's usually a bad sign, indicating that you
7538 * tried to recover from an online backup but never called
7539 * pg_stop_backup(), or you didn't archive all the WAL up to that
7540 * point. However, this also happens in crash recovery, if the system
7541 * crashes while an online backup is in progress. We must not treat
7542 * that as an error, or the database will refuse to start up.
7543 */
7544 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7545 {
7546 if (ControlFile->backupEndRequired)
7547 ereport(FATAL,
7548 (errmsg("WAL ends before end of online backup"),
7549 errhint("All WAL generated while online backup was taken must be available at recovery.")));
7550 else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7551 ereport(FATAL,
7552 (errmsg("WAL ends before end of online backup"),
7553 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7554 else
7555 ereport(FATAL,
7556 (errmsg("WAL ends before consistent recovery point")));
7557 }
7558 }
7559
7560 /*
7561 * Pre-scan prepared transactions to find out the range of XIDs present.
7562 * This information is not quite needed yet, but it is positioned here so
7563 * as potential problems are detected before any on-disk change is done.
7564 */
7565 oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7566
7567 /*
7568 * Consider whether we need to assign a new timeline ID.
7569 *
7570 * If we are doing an archive recovery, we always assign a new ID. This
7571 * handles a couple of issues. If we stopped short of the end of WAL
7572 * during recovery, then we are clearly generating a new timeline and must
7573 * assign it a unique new ID. Even if we ran to the end, modifying the
7574 * current last segment is problematic because it may result in trying to
7575 * overwrite an already-archived copy of that segment, and we encourage
7576 * DBAs to make their archive_commands reject that. We can dodge the
7577 * problem by making the new active segment have a new timeline ID.
7578 *
7579 * In a normal crash recovery, we can just extend the timeline we were in.
7580 */
7581 PrevTimeLineID = ThisTimeLineID;
7582 if (ArchiveRecoveryRequested)
7583 {
7584 char reason[200];
7585 char recoveryPath[MAXPGPATH];
7586
7587 Assert(InArchiveRecovery);
7588
7589 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7590 ereport(LOG,
7591 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7592
7593 /*
7594 * Create a comment for the history file to explain why and where
7595 * timeline changed.
7596 */
7597 if (recoveryTarget == RECOVERY_TARGET_XID)
7598 snprintf(reason, sizeof(reason),
7599 "%s transaction %u",
7600 recoveryStopAfter ? "after" : "before",
7601 recoveryStopXid);
7602 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7603 snprintf(reason, sizeof(reason),
7604 "%s %s\n",
7605 recoveryStopAfter ? "after" : "before",
7606 timestamptz_to_str(recoveryStopTime));
7607 else if (recoveryTarget == RECOVERY_TARGET_LSN)
7608 snprintf(reason, sizeof(reason),
7609 "%s LSN %X/%X\n",
7610 recoveryStopAfter ? "after" : "before",
7611 (uint32) (recoveryStopLSN >> 32),
7612 (uint32) recoveryStopLSN);
7613 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7614 snprintf(reason, sizeof(reason),
7615 "at restore point \"%s\"",
7616 recoveryStopName);
7617 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7618 snprintf(reason, sizeof(reason), "reached consistency");
7619 else
7620 snprintf(reason, sizeof(reason), "no recovery target specified");
7621
7622 /*
7623 * We are now done reading the old WAL. Turn off archive fetching if
7624 * it was active, and make a writable copy of the last WAL segment.
7625 * (Note that we also have a copy of the last block of the old WAL in
7626 * readBuf; we will use that below.)
7627 */
7628 exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7629
7630 /*
7631 * Write the timeline history file, and have it archived. After this
7632 * point (or rather, as soon as the file is archived), the timeline
7633 * will appear as "taken" in the WAL archive and to any standby
7634 * servers. If we crash before actually switching to the new
7635 * timeline, standby servers will nevertheless think that we switched
7636 * to the new timeline, and will try to connect to the new timeline.
7637 * To minimize the window for that, try to do as little as possible
7638 * between here and writing the end-of-recovery record.
7639 */
7640 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7641 EndRecPtr, reason);
7642
7643 /*
7644 * Since there might be a partial WAL segment named RECOVERYXLOG, get
7645 * rid of it.
7646 */
7647 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
7648 unlink(recoveryPath); /* ignore any error */
7649
7650 /* Get rid of any remaining recovered timeline-history file, too */
7651 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
7652 unlink(recoveryPath); /* ignore any error */
7653 }
7654
7655 /* Save the selected TimeLineID in shared memory, too */
7656 XLogCtl->ThisTimeLineID = ThisTimeLineID;
7657 XLogCtl->PrevTimeLineID = PrevTimeLineID;
7658
7659 /*
7660 * Actually, if WAL ended in an incomplete record, skip the parts that
7661 * made it through and start writing after the portion that persisted.
7662 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
7663 * we'll do as soon as we're open for writing new WAL.)
7664 */
7665 if (!XLogRecPtrIsInvalid(missingContrecPtr))
7666 {
7667 Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
7668 EndOfLog = missingContrecPtr;
7669 }
7670
7671 /*
7672 * Prepare to write WAL starting at EndOfLog location, and init xlog
7673 * buffer cache using the block containing the last record from the
7674 * previous incarnation.
7675 */
7676 Insert = &XLogCtl->Insert;
7677 Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7678 Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7679
7680 /*
7681 * Tricky point here: readBuf contains the *last* block that the LastRec
7682 * record spans, not the one it starts in. The last block is indeed the
7683 * one we want to use.
7684 */
7685 if (EndOfLog % XLOG_BLCKSZ != 0)
7686 {
7687 char *page;
7688 int len;
7689 int firstIdx;
7690 XLogRecPtr pageBeginPtr;
7691
7692 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7693 Assert(readOff == pageBeginPtr % XLogSegSize);
7694
7695 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7696
7697 /* Copy the valid part of the last block, and zero the rest */
7698 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7699 len = EndOfLog % XLOG_BLCKSZ;
7700 memcpy(page, xlogreader->readBuf, len);
7701 memset(page + len, 0, XLOG_BLCKSZ - len);
7702
7703 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7704 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7705 }
7706 else
7707 {
7708 /*
7709 * There is no partial block to copy. Just set InitializedUpTo, and
7710 * let the first attempt to insert a log record to initialize the next
7711 * buffer.
7712 */
7713 XLogCtl->InitializedUpTo = EndOfLog;
7714 }
7715
7716 LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7717
7718 XLogCtl->LogwrtResult = LogwrtResult;
7719
7720 XLogCtl->LogwrtRqst.Write = EndOfLog;
7721 XLogCtl->LogwrtRqst.Flush = EndOfLog;
7722
7723 LocalSetXLogInsertAllowed();
7724
7725 /* If necessary, write overwrite-contrecord before doing anything else */
7726 if (!XLogRecPtrIsInvalid(abortedRecPtr))
7727 {
7728 Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
7729 CreateOverwriteContrecordRecord(abortedRecPtr);
7730 abortedRecPtr = InvalidXLogRecPtr;
7731 missingContrecPtr = InvalidXLogRecPtr;
7732 }
7733
7734 /*
7735 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7736 * record before resource manager writes cleanup WAL records or checkpoint
7737 * record is written.
7738 */
7739 Insert->fullPageWrites = lastFullPageWrites;
7740 UpdateFullPageWrites();
7741 LocalXLogInsertAllowed = -1;
7742
7743 if (InRecovery)
7744 {
7745 /*
7746 * Perform a checkpoint to update all our recovery activity to disk.
7747 *
7748 * Note that we write a shutdown checkpoint rather than an on-line
7749 * one. This is not particularly critical, but since we may be
7750 * assigning a new TLI, using a shutdown checkpoint allows us to have
7751 * the rule that TLI only changes in shutdown checkpoints, which
7752 * allows some extra error checking in xlog_redo.
7753 *
7754 * In fast promotion, only create a lightweight end-of-recovery record
7755 * instead of a full checkpoint. A checkpoint is requested later,
7756 * after we're fully out of recovery mode and already accepting
7757 * queries.
7758 */
7759 if (bgwriterLaunched)
7760 {
7761 if (fast_promote)
7762 {
7763 checkPointLoc = ControlFile->prevCheckPoint;
7764
7765 /*
7766 * Confirm the last checkpoint is available for us to recover
7767 * from if we fail. Note that we don't check for the secondary
7768 * checkpoint since that isn't available in most base backups.
7769 */
7770 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7771 if (record != NULL)
7772 {
7773 fast_promoted = true;
7774
7775 /*
7776 * Insert a special WAL record to mark the end of
7777 * recovery, since we aren't doing a checkpoint. That
7778 * means that the checkpointer process may likely be in
7779 * the middle of a time-smoothed restartpoint and could
7780 * continue to be for minutes after this. That sounds
7781 * strange, but the effect is roughly the same and it
7782 * would be stranger to try to come out of the
7783 * restartpoint and then checkpoint. We request a
7784 * checkpoint later anyway, just for safety.
7785 */
7786 CreateEndOfRecoveryRecord();
7787 }
7788 }
7789
7790 if (!fast_promoted)
7791 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7792 CHECKPOINT_IMMEDIATE |
7793 CHECKPOINT_WAIT);
7794 }
7795 else
7796 CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7797
7798 /*
7799 * And finally, execute the recovery_end_command, if any.
7800 */
7801 if (recoveryEndCommand)
7802 ExecuteRecoveryCommand(recoveryEndCommand,
7803 "recovery_end_command",
7804 true);
7805 }
7806
7807 if (ArchiveRecoveryRequested)
7808 {
7809 /*
7810 * We switched to a new timeline. Clean up segments on the old
7811 * timeline.
7812 *
7813 * If there are any higher-numbered segments on the old timeline,
7814 * remove them. They might contain valid WAL, but they might also be
7815 * pre-allocated files containing garbage. In any case, they are not
7816 * part of the new timeline's history so we don't need them.
7817 */
7818 RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7819
7820 /*
7821 * If the switch happened in the middle of a segment, what to do with
7822 * the last, partial segment on the old timeline? If we don't archive
7823 * it, and the server that created the WAL never archives it either
7824 * (e.g. because it was hit by a meteor), it will never make it to the
7825 * archive. That's OK from our point of view, because the new segment
7826 * that we created with the new TLI contains all the WAL from the old
7827 * timeline up to the switch point. But if you later try to do PITR to
7828 * the "missing" WAL on the old timeline, recovery won't find it in
7829 * the archive. It's physically present in the new file with new TLI,
7830 * but recovery won't look there when it's recovering to the older
7831 * timeline. On the other hand, if we archive the partial segment, and
7832 * the original server on that timeline is still running and archives
7833 * the completed version of the same segment later, it will fail. (We
7834 * used to do that in 9.4 and below, and it caused such problems).
7835 *
7836 * As a compromise, we rename the last segment with the .partial
7837 * suffix, and archive it. Archive recovery will never try to read
7838 * .partial segments, so they will normally go unused. But in the odd
7839 * PITR case, the administrator can copy them manually to the pg_wal
7840 * directory (removing the suffix). They can be useful in debugging,
7841 * too.
7842 *
7843 * If a .done or .ready file already exists for the old timeline,
7844 * however, we had already determined that the segment is complete, so
7845 * we can let it be archived normally. (In particular, if it was
7846 * restored from the archive to begin with, it's expected to have a
7847 * .done file).
7848 */
7849 if (EndOfLog % XLOG_SEG_SIZE != 0 && XLogArchivingActive())
7850 {
7851 char origfname[MAXFNAMELEN];
7852 XLogSegNo endLogSegNo;
7853
7854 XLByteToPrevSeg(EndOfLog, endLogSegNo);
7855 XLogFileName(origfname, EndOfLogTLI, endLogSegNo);
7856
7857 if (!XLogArchiveIsReadyOrDone(origfname))
7858 {
7859 char origpath[MAXPGPATH];
7860 char partialfname[MAXFNAMELEN];
7861 char partialpath[MAXPGPATH];
7862
7863 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo);
7864 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7865 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7866
7867 /*
7868 * Make sure there's no .done or .ready file for the .partial
7869 * file.
7870 */
7871 XLogArchiveCleanup(partialfname);
7872
7873 durable_rename(origpath, partialpath, ERROR);
7874 XLogArchiveNotify(partialfname);
7875 }
7876 }
7877 }
7878
7879 /*
7880 * Preallocate additional log files, if wanted.
7881 */
7882 PreallocXlogFiles(EndOfLog);
7883
7884 /*
7885 * Okay, we're officially UP.
7886 */
7887 InRecovery = false;
7888
7889 /* start the archive_timeout timer and LSN running */
7890 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7891 XLogCtl->lastSegSwitchLSN = EndOfLog;
7892
7893 /* also initialize latestCompletedXid, to nextXid - 1 */
7894 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7895 ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7896 TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7897 LWLockRelease(ProcArrayLock);
7898
7899 /*
7900 * Start up the commit log and subtrans, if not already done for hot
7901 * standby. (commit timestamps are started below, if necessary.)
7902 */
7903 if (standbyState == STANDBY_DISABLED)
7904 {
7905 StartupCLOG();
7906 StartupSUBTRANS(oldestActiveXID);
7907 }
7908
7909 /*
7910 * Perform end of recovery actions for any SLRUs that need it.
7911 */
7912 TrimCLOG();
7913 TrimMultiXact();
7914
7915 /* Reload shared-memory state for prepared transactions */
7916 RecoverPreparedTransactions();
7917
7918 /* Shut down xlogreader */
7919 if (readFile >= 0)
7920 {
7921 close(readFile);
7922 readFile = -1;
7923 }
7924 XLogReaderFree(xlogreader);
7925
7926 /*
7927 * If any of the critical GUCs have changed, log them before we allow
7928 * backends to write WAL.
7929 */
7930 LocalSetXLogInsertAllowed();
7931 XLogReportParameters();
7932
7933 /*
7934 * Local WAL inserts enabled, so it's time to finish initialization of
7935 * commit timestamp.
7936 */
7937 CompleteCommitTsInitialization();
7938
7939 /*
7940 * All done with end-of-recovery actions.
7941 *
7942 * Now allow backends to write WAL and update the control file status in
7943 * consequence. The boolean flag allowing backends to write WAL is
7944 * updated while holding ControlFileLock to prevent other backends to look
7945 * at an inconsistent state of the control file in shared memory. There
7946 * is still a small window during which backends can write WAL and the
7947 * control file is still referring to a system not in DB_IN_PRODUCTION
7948 * state while looking at the on-disk control file.
7949 *
7950 * Also, although the boolean flag to allow WAL is probably atomic in
7951 * itself, we use the info_lck here to ensure that there are no race
7952 * conditions concerning visibility of other recent updates to shared
7953 * memory.
7954 */
7955 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7956 ControlFile->state = DB_IN_PRODUCTION;
7957 ControlFile->time = (pg_time_t) time(NULL);
7958
7959 SpinLockAcquire(&XLogCtl->info_lck);
7960 XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
7961 SpinLockRelease(&XLogCtl->info_lck);
7962
7963 UpdateControlFile();
7964 LWLockRelease(ControlFileLock);
7965
7966 /*
7967 * Shutdown the recovery environment. This must occur after
7968 * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
7969 * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
7970 * any session building a snapshot will not rely on KnownAssignedXids as
7971 * RecoveryInProgress() would return false at this stage. This is
7972 * particularly critical for prepared 2PC transactions, that would still
7973 * need to be included in snapshots once recovery has ended.
7974 */
7975 if (standbyState != STANDBY_DISABLED)
7976 ShutdownRecoveryTransactionEnvironment();
7977
7978 /*
7979 * If there were cascading standby servers connected to us, nudge any wal
7980 * sender processes to notice that we've been promoted.
7981 */
7982 WalSndWakeup();
7983
7984 /*
7985 * If this was a fast promotion, request an (online) checkpoint now. This
7986 * isn't required for consistency, but the last restartpoint might be far
7987 * back, and in case of a crash, recovering from it might take a longer
7988 * than is appropriate now that we're not in standby mode anymore.
7989 */
7990 if (fast_promoted)
7991 RequestCheckpoint(CHECKPOINT_FORCE);
7992 }
7993
7994 /*
7995 * Checks if recovery has reached a consistent state. When consistency is
7996 * reached and we have a valid starting standby snapshot, tell postmaster
7997 * that it can start accepting read-only connections.
7998 */
7999 static void
CheckRecoveryConsistency(void)8000 CheckRecoveryConsistency(void)
8001 {
8002 XLogRecPtr lastReplayedEndRecPtr;
8003
8004 /*
8005 * During crash recovery, we don't reach a consistent state until we've
8006 * replayed all the WAL.
8007 */
8008 if (XLogRecPtrIsInvalid(minRecoveryPoint))
8009 return;
8010
8011 Assert(InArchiveRecovery);
8012
8013 /*
8014 * assume that we are called in the startup process, and hence don't need
8015 * a lock to read lastReplayedEndRecPtr
8016 */
8017 lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;
8018
8019 /*
8020 * Have we reached the point where our base backup was completed?
8021 */
8022 if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
8023 ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
8024 {
8025 /*
8026 * We have reached the end of base backup, as indicated by pg_control.
8027 * The data on disk is now consistent. Reset backupStartPoint and
8028 * backupEndPoint, and update minRecoveryPoint to make sure we don't
8029 * allow starting up at an earlier point even if recovery is stopped
8030 * and restarted soon after this.
8031 */
8032 elog(DEBUG1, "end of backup reached");
8033
8034 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8035
8036 if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
8037 ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;
8038
8039 ControlFile->backupStartPoint = InvalidXLogRecPtr;
8040 ControlFile->backupEndPoint = InvalidXLogRecPtr;
8041 ControlFile->backupEndRequired = false;
8042 UpdateControlFile();
8043
8044 LWLockRelease(ControlFileLock);
8045 }
8046
8047 /*
8048 * Have we passed our safe starting point? Note that minRecoveryPoint is
8049 * known to be incorrectly set if ControlFile->backupEndRequired, until
8050 * the XLOG_BACKUP_END arrives to advise us of the correct
8051 * minRecoveryPoint. All we know prior to that is that we're not
8052 * consistent yet.
8053 */
8054 if (!reachedConsistency && !ControlFile->backupEndRequired &&
8055 minRecoveryPoint <= lastReplayedEndRecPtr &&
8056 XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
8057 {
8058 /*
8059 * Check to see if the XLOG sequence contained any unresolved
8060 * references to uninitialized pages.
8061 */
8062 XLogCheckInvalidPages();
8063
8064 reachedConsistency = true;
8065 ereport(LOG,
8066 (errmsg("consistent recovery state reached at %X/%X",
8067 (uint32) (lastReplayedEndRecPtr >> 32),
8068 (uint32) lastReplayedEndRecPtr)));
8069 }
8070
8071 /*
8072 * Have we got a valid starting snapshot that will allow queries to be
8073 * run? If so, we can tell postmaster that the database is consistent now,
8074 * enabling connections.
8075 */
8076 if (standbyState == STANDBY_SNAPSHOT_READY &&
8077 !LocalHotStandbyActive &&
8078 reachedConsistency &&
8079 IsUnderPostmaster)
8080 {
8081 SpinLockAcquire(&XLogCtl->info_lck);
8082 XLogCtl->SharedHotStandbyActive = true;
8083 SpinLockRelease(&XLogCtl->info_lck);
8084
8085 LocalHotStandbyActive = true;
8086
8087 SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
8088 }
8089 }
8090
8091 /*
8092 * Is the system still in recovery?
8093 *
8094 * Unlike testing InRecovery, this works in any process that's connected to
8095 * shared memory.
8096 *
8097 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
8098 * variables the first time we see that recovery is finished.
8099 */
8100 bool
RecoveryInProgress(void)8101 RecoveryInProgress(void)
8102 {
8103 /*
8104 * We check shared state each time only until we leave recovery mode. We
8105 * can't re-enter recovery, so there's no need to keep checking after the
8106 * shared variable has once been seen false.
8107 */
8108 if (!LocalRecoveryInProgress)
8109 return false;
8110 else
8111 {
8112 /*
8113 * use volatile pointer to make sure we make a fresh read of the
8114 * shared variable.
8115 */
8116 volatile XLogCtlData *xlogctl = XLogCtl;
8117
8118 LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);
8119
8120 /*
8121 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
8122 * is finished. InitPostgres() relies upon this behaviour to ensure
8123 * that InitXLOGAccess() is called at backend startup. (If you change
8124 * this, see also LocalSetXLogInsertAllowed.)
8125 */
8126 if (!LocalRecoveryInProgress)
8127 {
8128 /*
8129 * If we just exited recovery, make sure we read TimeLineID and
8130 * RedoRecPtr after SharedRecoveryState (for machines with weak
8131 * memory ordering).
8132 */
8133 pg_memory_barrier();
8134 InitXLOGAccess();
8135 }
8136
8137 /*
8138 * Note: We don't need a memory barrier when we're still in recovery.
8139 * We might exit recovery immediately after return, so the caller
8140 * can't rely on 'true' meaning that we're still in recovery anyway.
8141 */
8142
8143 return LocalRecoveryInProgress;
8144 }
8145 }
8146
8147 /*
8148 * Returns current recovery state from shared memory.
8149 *
8150 * This returned state is kept consistent with the contents of the control
8151 * file. See details about the possible values of RecoveryState in xlog.h.
8152 */
8153 RecoveryState
GetRecoveryState(void)8154 GetRecoveryState(void)
8155 {
8156 RecoveryState retval;
8157
8158 SpinLockAcquire(&XLogCtl->info_lck);
8159 retval = XLogCtl->SharedRecoveryState;
8160 SpinLockRelease(&XLogCtl->info_lck);
8161
8162 return retval;
8163 }
8164
8165 /*
8166 * Is HotStandby active yet? This is only important in special backends
8167 * since normal backends won't ever be able to connect until this returns
8168 * true. Postmaster knows this by way of signal, not via shared memory.
8169 *
8170 * Unlike testing standbyState, this works in any process that's connected to
8171 * shared memory. (And note that standbyState alone doesn't tell the truth
8172 * anyway.)
8173 */
8174 bool
HotStandbyActive(void)8175 HotStandbyActive(void)
8176 {
8177 /*
8178 * We check shared state each time only until Hot Standby is active. We
8179 * can't de-activate Hot Standby, so there's no need to keep checking
8180 * after the shared variable has once been seen true.
8181 */
8182 if (LocalHotStandbyActive)
8183 return true;
8184 else
8185 {
8186 /* spinlock is essential on machines with weak memory ordering! */
8187 SpinLockAcquire(&XLogCtl->info_lck);
8188 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
8189 SpinLockRelease(&XLogCtl->info_lck);
8190
8191 return LocalHotStandbyActive;
8192 }
8193 }
8194
/*
 * Like HotStandbyActive(), but to be used only in WAL replay code,
 * where we don't need to ask any other process what the state is.
 *
 * Reads only the process-local flag, so no locking is required; the
 * Assert restricts callers to the startup process, or to a standalone
 * backend where there is no postmaster environment.
 */
bool
HotStandbyActiveInReplay(void)
{
	Assert(AmStartupProcess() || !IsPostmasterEnvironment);
	return LocalHotStandbyActive;
}
8205
8206 /*
8207 * Is this process allowed to insert new WAL records?
8208 *
8209 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
8210 * But we also have provisions for forcing the result "true" or "false"
8211 * within specific processes regardless of the global state.
8212 */
8213 bool
XLogInsertAllowed(void)8214 XLogInsertAllowed(void)
8215 {
8216 /*
8217 * If value is "unconditionally true" or "unconditionally false", just
8218 * return it. This provides the normal fast path once recovery is known
8219 * done.
8220 */
8221 if (LocalXLogInsertAllowed >= 0)
8222 return (bool) LocalXLogInsertAllowed;
8223
8224 /*
8225 * Else, must check to see if we're still in recovery.
8226 */
8227 if (RecoveryInProgress())
8228 return false;
8229
8230 /*
8231 * On exit from recovery, reset to "unconditionally true", since there is
8232 * no need to keep checking.
8233 */
8234 LocalXLogInsertAllowed = 1;
8235 return true;
8236 }
8237
/*
 * Make XLogInsertAllowed() return true in the current process only.
 *
 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
 * and even call LocalSetXLogInsertAllowed() again after that.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	/* caller must only invoke this while the flag is in "check" state */
	Assert(LocalXLogInsertAllowed == -1);
	LocalXLogInsertAllowed = 1;

	/* Initialize as RecoveryInProgress() would do when switching state */
	InitXLOGAccess();
}
8253
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
 *
 * whichChkpt identifies the checkpoint (merely for reporting purposes).
 * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
 *
 * Returns the validated record, or NULL if the record could not be read
 * or fails any of the sanity checks below.  When 'report' is false, the
 * first two failure cases are silent; the remaining checks always log.
 */
static XLogRecord *
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
					 int whichChkpt, bool report)
{
	XLogRecord *record;
	uint8		info;

	/* First, the pointer itself must be a plausible record address. */
	if (!XRecOffIsValid(RecPtr))
	{
		if (!report)
			return NULL;

		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid primary checkpoint link in control file")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint link in control file")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid checkpoint link in backup_label file")));
				break;
		}
		return NULL;
	}

	/* Fetch the record; ReadRecord performs CRC and header validation. */
	record = ReadRecord(xlogreader, RecPtr, LOG, true);

	if (record == NULL)
	{
		if (!report)
			return NULL;

		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid checkpoint record")));
				break;
		}
		return NULL;
	}

	/* A checkpoint record must belong to the XLOG resource manager. */
	if (record->xl_rmid != RM_XLOG_ID)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid resource manager ID in primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid resource manager ID in secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid resource manager ID in checkpoint record")));
				break;
		}
		return NULL;
	}

	/* ... and carry one of the two checkpoint info codes. */
	info = record->xl_info & ~XLR_INFO_MASK;
	if (info != XLOG_CHECKPOINT_SHUTDOWN &&
		info != XLOG_CHECKPOINT_ONLINE)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid xl_info in primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid xl_info in secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid xl_info in checkpoint record")));
				break;
		}
		return NULL;
	}

	/* Finally, the total length must match an exact CheckPoint payload. */
	if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid length of primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid length of secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid length of checkpoint record")));
				break;
		}
		return NULL;
	}
	return record;
}
8375
8376 /*
8377 * This must be called in a backend process before creating WAL records
8378 * (except in a standalone backend, which does StartupXLOG instead). We need
8379 * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8380 *
8381 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
8382 * process's copies of ThisTimeLineID and RedoRecPtr valid too. This was
8383 * unnecessary however, since the postmaster itself never touches XLOG anyway.
8384 */
8385 void
InitXLOGAccess(void)8386 InitXLOGAccess(void)
8387 {
8388 XLogCtlInsert *Insert = &XLogCtl->Insert;
8389
8390 /* ThisTimeLineID doesn't change so we need no lock to copy it */
8391 ThisTimeLineID = XLogCtl->ThisTimeLineID;
8392 Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
8393
8394 /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
8395 (void) GetRedoRecPtr();
8396 /* Also update our copy of doPageWrites. */
8397 doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
8398
8399 /* Also initialize the working areas for constructing WAL records */
8400 InitXLogInsert();
8401 }
8402
8403 /*
8404 * Return the current Redo pointer from shared memory.
8405 *
8406 * As a side-effect, the local RedoRecPtr copy is updated.
8407 */
8408 XLogRecPtr
GetRedoRecPtr(void)8409 GetRedoRecPtr(void)
8410 {
8411 XLogRecPtr ptr;
8412
8413 /*
8414 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8415 * grabbed a WAL insertion lock to read the master copy, someone might
8416 * update it just after we've released the lock.
8417 */
8418 SpinLockAcquire(&XLogCtl->info_lck);
8419 ptr = XLogCtl->RedoRecPtr;
8420 SpinLockRelease(&XLogCtl->info_lck);
8421
8422 if (RedoRecPtr < ptr)
8423 RedoRecPtr = ptr;
8424
8425 return RedoRecPtr;
8426 }
8427
8428 /*
8429 * Return information needed to decide whether a modified block needs a
8430 * full-page image to be included in the WAL record.
8431 *
8432 * The returned values are cached copies from backend-private memory, and
8433 * possibly out-of-date. XLogInsertRecord will re-check them against
8434 * up-to-date values, while holding the WAL insert lock.
8435 */
8436 void
GetFullPageWriteInfo(XLogRecPtr * RedoRecPtr_p,bool * doPageWrites_p)8437 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8438 {
8439 *RedoRecPtr_p = RedoRecPtr;
8440 *doPageWrites_p = doPageWrites;
8441 }
8442
8443 /*
8444 * GetInsertRecPtr -- Returns the current insert position.
8445 *
8446 * NOTE: The value *actually* returned is the position of the last full
8447 * xlog page. It lags behind the real insert position by at most 1 page.
8448 * For that, we don't need to scan through WAL insertion locks, and an
8449 * approximation is enough for the current usage of this function.
8450 */
8451 XLogRecPtr
GetInsertRecPtr(void)8452 GetInsertRecPtr(void)
8453 {
8454 XLogRecPtr recptr;
8455
8456 SpinLockAcquire(&XLogCtl->info_lck);
8457 recptr = XLogCtl->LogwrtRqst.Write;
8458 SpinLockRelease(&XLogCtl->info_lck);
8459
8460 return recptr;
8461 }
8462
/*
 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
 * position known to be fsync'd to disk.
 *
 * Side-effect: refreshes the backend-local LogwrtResult copy from shared
 * memory while holding the spinlock.
 */
XLogRecPtr
GetFlushRecPtr(void)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Flush;
}
8476
8477 /*
8478 * GetLastImportantRecPtr -- Returns the LSN of the last important record
8479 * inserted. All records not explicitly marked as unimportant are considered
8480 * important.
8481 *
8482 * The LSN is determined by computing the maximum of
8483 * WALInsertLocks[i].lastImportantAt.
8484 */
8485 XLogRecPtr
GetLastImportantRecPtr(void)8486 GetLastImportantRecPtr(void)
8487 {
8488 XLogRecPtr res = InvalidXLogRecPtr;
8489 int i;
8490
8491 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
8492 {
8493 XLogRecPtr last_important;
8494
8495 /*
8496 * Need to take a lock to prevent torn reads of the LSN, which are
8497 * possible on some of the supported platforms. WAL insert locks only
8498 * support exclusive mode, so we have to use that.
8499 */
8500 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
8501 last_important = WALInsertLocks[i].l.lastImportantAt;
8502 LWLockRelease(&WALInsertLocks[i].l.lock);
8503
8504 if (res < last_important)
8505 res = last_important;
8506 }
8507
8508 return res;
8509 }
8510
8511 /*
8512 * Get the time and LSN of the last xlog segment switch
8513 */
8514 pg_time_t
GetLastSegSwitchData(XLogRecPtr * lastSwitchLSN)8515 GetLastSegSwitchData(XLogRecPtr *lastSwitchLSN)
8516 {
8517 pg_time_t result;
8518
8519 /* Need WALWriteLock, but shared lock is sufficient */
8520 LWLockAcquire(WALWriteLock, LW_SHARED);
8521 result = XLogCtl->lastSegSwitchTime;
8522 *lastSwitchLSN = XLogCtl->lastSegSwitchLSN;
8523 LWLockRelease(WALWriteLock);
8524
8525 return result;
8526 }
8527
8528 /*
8529 * GetNextXidAndEpoch - get the current nextXid value and associated epoch
8530 *
8531 * This is exported for use by code that would like to have 64-bit XIDs.
8532 * We don't really support such things, but all XIDs within the system
8533 * can be presumed "close to" the result, and thus the epoch associated
8534 * with them can be determined.
8535 */
8536 void
GetNextXidAndEpoch(TransactionId * xid,uint32 * epoch)8537 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
8538 {
8539 uint32 ckptXidEpoch;
8540 TransactionId ckptXid;
8541 TransactionId nextXid;
8542
8543 /* Must read checkpoint info first, else have race condition */
8544 SpinLockAcquire(&XLogCtl->info_lck);
8545 ckptXidEpoch = XLogCtl->ckptXidEpoch;
8546 ckptXid = XLogCtl->ckptXid;
8547 SpinLockRelease(&XLogCtl->info_lck);
8548
8549 /* Now fetch current nextXid */
8550 nextXid = ReadNewTransactionId();
8551
8552 /*
8553 * nextXid is certainly logically later than ckptXid. So if it's
8554 * numerically less, it must have wrapped into the next epoch.
8555 */
8556 if (nextXid < ckptXid)
8557 ckptXidEpoch++;
8558
8559 *xid = nextXid;
8560 *epoch = ckptXidEpoch;
8561 }
8562
/*
 * This must be called ONCE during postmaster or standalone-backend shutdown
 *
 * Performs the final checkpoint (or restartpoint, if still in recovery)
 * and then shuts down the SLRU subsystems that depend on WAL.  The
 * ordering here is deliberate: walsenders must stop producing WAL before
 * the shutdown checkpoint is written.
 */
void
ShutdownXLOG(int code, Datum arg)
{
	/* Don't be chatty in standalone mode */
	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
			(errmsg("shutting down")));

	/*
	 * Signal walsenders to move to stopping state.
	 */
	WalSndInitStopping();

	/*
	 * Wait for WAL senders to be in stopping state.  This prevents commands
	 * from writing new WAL.
	 */
	WalSndWaitStopping();

	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown).  The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch(false);

		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	}

	/* Flush out remaining SLRU state now that no more WAL will be written */
	ShutdownCLOG();
	ShutdownCommitTs();
	ShutdownSUBTRANS();
	ShutdownMultiXact();
}
8604
/*
 * Log start of a checkpoint.
 *
 * Emits a single LOG line naming the operation (checkpoint vs.
 * restartpoint) followed by one tag per set flag bit; unset flags
 * contribute an empty string so the format stays fixed.
 */
static void
LogCheckpointStart(int flags, bool restartpoint)
{
	elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
		 restartpoint ? "restartpoint" : "checkpoint",
		 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
		 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
		 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
		 (flags & CHECKPOINT_FORCE) ? " force" : "",
		 (flags & CHECKPOINT_WAIT) ? " wait" : "",
		 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
		 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
		 (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
}
8622
/*
 * Log end of a checkpoint.
 *
 * Always accumulates the write/sync timing into the bgwriter statistics
 * (even when log_checkpoints is off, since the GUC can change while a
 * checkpoint is in progress); the LOG message itself is emitted only
 * when log_checkpoints is on.
 */
static void
LogCheckpointEnd(bool restartpoint)
{
	long		write_msecs,
				sync_msecs,
				total_msecs,
				longest_msecs,
				average_msecs;
	uint64		average_sync_time;

	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	/* buffer-write phase: from start of writes to start of syncs */
	write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
												  CheckpointStats.ckpt_sync_t);

	/* file-sync phase duration */
	sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
												 CheckpointStats.ckpt_sync_end_t);

	/* Accumulate checkpoint timing summary data, in milliseconds. */
	BgWriterStats.m_checkpoint_write_time += write_msecs;
	BgWriterStats.m_checkpoint_sync_time += sync_msecs;

	/*
	 * All of the published timing statistics are accounted for.  Only
	 * continue if a log message is to be written.
	 */
	if (!log_checkpoints)
		return;

	total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
												  CheckpointStats.ckpt_end_t);

	/*
	 * Timing values returned from CheckpointStats are in microseconds.
	 * Convert to milliseconds for consistent printing; the +999 rounds up.
	 */
	longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);

	/* per-relation average sync time, guarding against division by zero */
	average_sync_time = 0;
	if (CheckpointStats.ckpt_sync_rels > 0)
		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
			CheckpointStats.ckpt_sync_rels;
	average_msecs = (long) ((average_sync_time + 999) / 1000);

	elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
		 "%d WAL file(s) added, %d removed, %d recycled; "
		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
		 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
		 "distance=%d kB, estimate=%d kB",
		 restartpoint ? "restartpoint" : "checkpoint",
		 CheckpointStats.ckpt_bufs_written,
		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
		 CheckpointStats.ckpt_segs_added,
		 CheckpointStats.ckpt_segs_removed,
		 CheckpointStats.ckpt_segs_recycled,
		 write_msecs / 1000, (int) (write_msecs % 1000),
		 sync_msecs / 1000, (int) (sync_msecs % 1000),
		 total_msecs / 1000, (int) (total_msecs % 1000),
		 CheckpointStats.ckpt_sync_rels,
		 longest_msecs / 1000, (int) (longest_msecs % 1000),
		 average_msecs / 1000, (int) (average_msecs % 1000),
		 (int) (PrevCheckPointDistance / 1024.0),
		 (int) (CheckPointDistanceEstimate / 1024.0));
}
8690
8691 /*
8692 * Update the estimate of distance between checkpoints.
8693 *
8694 * The estimate is used to calculate the number of WAL segments to keep
8695 * preallocated, see XLOGFileSlop().
8696 */
8697 static void
UpdateCheckPointDistanceEstimate(uint64 nbytes)8698 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8699 {
8700 /*
8701 * To estimate the number of segments consumed between checkpoints, keep a
8702 * moving average of the amount of WAL generated in previous checkpoint
8703 * cycles. However, if the load is bursty, with quiet periods and busy
8704 * periods, we want to cater for the peak load. So instead of a plain
8705 * moving average, let the average decline slowly if the previous cycle
8706 * used less WAL than estimated, but bump it up immediately if it used
8707 * more.
8708 *
8709 * When checkpoints are triggered by max_wal_size, this should converge to
8710 * CheckpointSegments * XLOG_SEG_SIZE,
8711 *
8712 * Note: This doesn't pay any attention to what caused the checkpoint.
8713 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8714 * starting a base backup, are counted the same as those created
8715 * automatically. The slow-decline will largely mask them out, if they are
8716 * not frequent. If they are frequent, it seems reasonable to count them
8717 * in as any others; if you issue a manual checkpoint every 5 minutes and
8718 * never let a timed checkpoint happen, it makes sense to base the
8719 * preallocation on that 5 minute interval rather than whatever
8720 * checkpoint_timeout is set to.
8721 */
8722 PrevCheckPointDistance = nbytes;
8723 if (CheckPointDistanceEstimate < nbytes)
8724 CheckPointDistanceEstimate = nbytes;
8725 else
8726 CheckPointDistanceEstimate =
8727 (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8728 }
8729
8730 /*
8731 * Perform a checkpoint --- either during shutdown, or on-the-fly
8732 *
8733 * flags is a bitwise OR of the following:
8734 * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8735 * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8736 * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8737 * ignoring checkpoint_completion_target parameter.
8738 * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8739 * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8740 * CHECKPOINT_END_OF_RECOVERY).
8741 * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8742 *
8743 * Note: flags contains other bits, of interest here only for logging purposes.
8744 * In particular note that this routine is synchronous and does not pay
8745 * attention to CHECKPOINT_WAIT.
8746 *
8747 * If !shutdown then we are writing an online checkpoint. This is a very special
8748 * kind of operation and WAL record because the checkpoint action occurs over
8749 * a period of time yet logically occurs at just a single LSN. The logical
8750 * position of the WAL record (redo ptr) is the same or earlier than the
8751 * physical position. When we replay WAL we locate the checkpoint via its
8752 * physical position then read the redo ptr and actually start replay at the
8753 * earlier logical position. Note that we don't write *anything* to WAL at
8754 * the logical position, so that location could be any other kind of WAL record.
8755 * All of this mechanism allows us to continue working while we checkpoint.
8756 * As a result, timing of actions is critical here and be careful to note that
8757 * this function will likely take minutes to execute on a busy system.
8758 */
8759 void
CreateCheckPoint(int flags)8760 CreateCheckPoint(int flags)
8761 {
8762 bool shutdown;
8763 CheckPoint checkPoint;
8764 XLogRecPtr recptr;
8765 XLogCtlInsert *Insert = &XLogCtl->Insert;
8766 uint32 freespace;
8767 XLogRecPtr PriorRedoPtr;
8768 XLogRecPtr curInsert;
8769 XLogRecPtr last_important_lsn;
8770 VirtualTransactionId *vxids;
8771 int nvxids;
8772
8773 /*
8774 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
8775 * issued at a different time.
8776 */
8777 if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
8778 shutdown = true;
8779 else
8780 shutdown = false;
8781
8782 /* sanity check */
8783 if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
8784 elog(ERROR, "can't create a checkpoint during recovery");
8785
8786 /*
8787 * Initialize InitXLogInsert working areas before entering the critical
8788 * section. Normally, this is done by the first call to
8789 * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
8790 * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
8791 * done below in a critical section, and InitXLogInsert cannot be called
8792 * in a critical section.
8793 */
8794 InitXLogInsert();
8795
8796 /*
8797 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
8798 * (This is just pro forma, since in the present system structure there is
8799 * only one process that is allowed to issue checkpoints at any given
8800 * time.)
8801 */
8802 LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
8803
8804 /*
8805 * Prepare to accumulate statistics.
8806 *
8807 * Note: because it is possible for log_checkpoints to change while a
8808 * checkpoint proceeds, we always accumulate stats, even if
8809 * log_checkpoints is currently off.
8810 */
8811 MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
8812 CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
8813
8814 /*
8815 * Use a critical section to force system panic if we have trouble.
8816 */
8817 START_CRIT_SECTION();
8818
8819 if (shutdown)
8820 {
8821 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
8822 ControlFile->state = DB_SHUTDOWNING;
8823 ControlFile->time = (pg_time_t) time(NULL);
8824 UpdateControlFile();
8825 LWLockRelease(ControlFileLock);
8826 }
8827
8828 /*
8829 * Let smgr prepare for checkpoint; this has to happen before we determine
8830 * the REDO pointer. Note that smgr must not do anything that'd have to
8831 * be undone if we decide no checkpoint is needed.
8832 */
8833 smgrpreckpt();
8834
8835 /* Begin filling in the checkpoint WAL record */
8836 MemSet(&checkPoint, 0, sizeof(checkPoint));
8837 checkPoint.time = (pg_time_t) time(NULL);
8838
8839 /*
8840 * For Hot Standby, derive the oldestActiveXid before we fix the redo
8841 * pointer. This allows us to begin accumulating changes to assemble our
8842 * starting snapshot of locks and transactions.
8843 */
8844 if (!shutdown && XLogStandbyInfoActive())
8845 checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
8846 else
8847 checkPoint.oldestActiveXid = InvalidTransactionId;
8848
8849 /*
8850 * Get location of last important record before acquiring insert locks (as
8851 * GetLastImportantRecPtr() also locks WAL locks).
8852 */
8853 last_important_lsn = GetLastImportantRecPtr();
8854
8855 /*
8856 * We must block concurrent insertions while examining insert state to
8857 * determine the checkpoint REDO pointer.
8858 */
8859 WALInsertLockAcquireExclusive();
8860 curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
8861
8862 /*
8863 * If this isn't a shutdown or forced checkpoint, and if there has been no
8864 * WAL activity requiring a checkpoint, skip it. The idea here is to
8865 * avoid inserting duplicate checkpoints when the system is idle.
8866 */
8867 if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
8868 CHECKPOINT_FORCE)) == 0)
8869 {
8870 if (last_important_lsn == ControlFile->checkPoint)
8871 {
8872 WALInsertLockRelease();
8873 LWLockRelease(CheckpointLock);
8874 END_CRIT_SECTION();
8875 ereport(DEBUG1,
8876 (errmsg("checkpoint skipped because system is idle")));
8877 return;
8878 }
8879 }
8880
8881 /*
8882 * An end-of-recovery checkpoint is created before anyone is allowed to
8883 * write WAL. To allow us to write the checkpoint record, temporarily
8884 * enable XLogInsertAllowed. (This also ensures ThisTimeLineID is
8885 * initialized, which we need here and in AdvanceXLInsertBuffer.)
8886 */
8887 if (flags & CHECKPOINT_END_OF_RECOVERY)
8888 LocalSetXLogInsertAllowed();
8889
8890 checkPoint.ThisTimeLineID = ThisTimeLineID;
8891 if (flags & CHECKPOINT_END_OF_RECOVERY)
8892 checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
8893 else
8894 checkPoint.PrevTimeLineID = ThisTimeLineID;
8895
8896 checkPoint.fullPageWrites = Insert->fullPageWrites;
8897
8898 /*
8899 * Compute new REDO record ptr = location of next XLOG record.
8900 *
8901 * NB: this is NOT necessarily where the checkpoint record itself will be,
8902 * since other backends may insert more XLOG records while we're off doing
8903 * the buffer flush work. Those XLOG records are logically after the
8904 * checkpoint, even though physically before it. Got that?
8905 */
8906 freespace = INSERT_FREESPACE(curInsert);
8907 if (freespace == 0)
8908 {
8909 if (curInsert % XLogSegSize == 0)
8910 curInsert += SizeOfXLogLongPHD;
8911 else
8912 curInsert += SizeOfXLogShortPHD;
8913 }
8914 checkPoint.redo = curInsert;
8915
8916 /*
8917 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
8918 * must be done while holding all the insertion locks.
8919 *
8920 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
8921 * pointing past where it really needs to point. This is okay; the only
8922 * consequence is that XLogInsert might back up whole buffers that it
8923 * didn't really need to. We can't postpone advancing RedoRecPtr because
8924 * XLogInserts that happen while we are dumping buffers must assume that
8925 * their buffer changes are not included in the checkpoint.
8926 */
8927 RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
8928
8929 /*
8930 * Now we can release the WAL insertion locks, allowing other xacts to
8931 * proceed while we are flushing disk buffers.
8932 */
8933 WALInsertLockRelease();
8934
8935 /* Update the info_lck-protected copy of RedoRecPtr as well */
8936 SpinLockAcquire(&XLogCtl->info_lck);
8937 XLogCtl->RedoRecPtr = checkPoint.redo;
8938 SpinLockRelease(&XLogCtl->info_lck);
8939
8940 /*
8941 * If enabled, log checkpoint start. We postpone this until now so as not
8942 * to log anything if we decided to skip the checkpoint.
8943 */
8944 if (log_checkpoints)
8945 LogCheckpointStart(flags, false);
8946
8947 TRACE_POSTGRESQL_CHECKPOINT_START(flags);
8948
8949 /*
8950 * Get the other info we need for the checkpoint record.
8951 *
8952 * We don't need to save oldestClogXid in the checkpoint, it only matters
8953 * for the short period in which clog is being truncated, and if we crash
8954 * during that we'll redo the clog truncation and fix up oldestClogXid
8955 * there.
8956 */
8957 LWLockAcquire(XidGenLock, LW_SHARED);
8958 checkPoint.nextXid = ShmemVariableCache->nextXid;
8959 checkPoint.oldestXid = ShmemVariableCache->oldestXid;
8960 checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
8961 LWLockRelease(XidGenLock);
8962
8963 LWLockAcquire(CommitTsLock, LW_SHARED);
8964 checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
8965 checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
8966 LWLockRelease(CommitTsLock);
8967
8968 /* Increase XID epoch if we've wrapped around since last checkpoint */
8969 checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
8970 if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
8971 checkPoint.nextXidEpoch++;
8972
8973 LWLockAcquire(OidGenLock, LW_SHARED);
8974 checkPoint.nextOid = ShmemVariableCache->nextOid;
8975 if (!shutdown)
8976 checkPoint.nextOid += ShmemVariableCache->oidCount;
8977 LWLockRelease(OidGenLock);
8978
8979 MultiXactGetCheckptMulti(shutdown,
8980 &checkPoint.nextMulti,
8981 &checkPoint.nextMultiOffset,
8982 &checkPoint.oldestMulti,
8983 &checkPoint.oldestMultiDB);
8984
8985 /*
8986 * Having constructed the checkpoint record, ensure all shmem disk buffers
8987 * and commit-log buffers are flushed to disk.
8988 *
8989 * This I/O could fail for various reasons. If so, we will fail to
8990 * complete the checkpoint, but there is no reason to force a system
8991 * panic. Accordingly, exit critical section while doing it.
8992 */
8993 END_CRIT_SECTION();
8994
8995 /*
8996 * In some cases there are groups of actions that must all occur on one
8997 * side or the other of a checkpoint record. Before flushing the
8998 * checkpoint record we must explicitly wait for any backend currently
8999 * performing those groups of actions.
9000 *
9001 * One example is end of transaction, so we must wait for any transactions
9002 * that are currently in commit critical sections. If an xact inserted
9003 * its commit record into XLOG just before the REDO point, then a crash
9004 * restart from the REDO point would not replay that record, which means
9005 * that our flushing had better include the xact's update of pg_xact. So
9006 * we wait till he's out of his commit critical section before proceeding.
9007 * See notes in RecordTransactionCommit().
9008 *
9009 * Because we've already released the insertion locks, this test is a bit
9010 * fuzzy: it is possible that we will wait for xacts we didn't really need
9011 * to wait for. But the delay should be short and it seems better to make
9012 * checkpoint take a bit longer than to hold off insertions longer than
9013 * necessary. (In fact, the whole reason we have this issue is that xact.c
9014 * does commit record XLOG insertion and clog update as two separate steps
9015 * protected by different locks, but again that seems best on grounds of
9016 * minimizing lock contention.)
9017 *
9018 * A transaction that has not yet set delayChkpt when we look cannot be at
9019 * risk, since he's not inserted his commit record yet; and one that's
9020 * already cleared it is not at risk either, since he's done fixing clog
9021 * and we will correctly flush the update below. So we cannot miss any
9022 * xacts we need to wait for.
9023 */
9024 vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
9025 if (nvxids > 0)
9026 {
9027 do
9028 {
9029 pg_usleep(10000L); /* wait for 10 msec */
9030 } while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
9031 }
9032 pfree(vxids);
9033
9034 CheckPointGuts(checkPoint.redo, flags);
9035
9036 /*
9037 * Take a snapshot of running transactions and write this to WAL. This
9038 * allows us to reconstruct the state of running transactions during
9039 * archive recovery, if required. Skip, if this info disabled.
9040 *
9041 * If we are shutting down, or Startup process is completing crash
9042 * recovery we don't need to write running xact data.
9043 */
9044 if (!shutdown && XLogStandbyInfoActive())
9045 LogStandbySnapshot();
9046
9047 START_CRIT_SECTION();
9048
9049 /*
9050 * Now insert the checkpoint record into XLOG.
9051 */
9052 XLogBeginInsert();
9053 XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
9054 recptr = XLogInsert(RM_XLOG_ID,
9055 shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
9056 XLOG_CHECKPOINT_ONLINE);
9057
9058 XLogFlush(recptr);
9059
9060 /*
9061 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
9062 * overwritten at next startup. No-one should even try, this just allows
9063 * sanity-checking. In the case of an end-of-recovery checkpoint, we want
9064 * to just temporarily disable writing until the system has exited
9065 * recovery.
9066 */
9067 if (shutdown)
9068 {
9069 if (flags & CHECKPOINT_END_OF_RECOVERY)
9070 LocalXLogInsertAllowed = -1; /* return to "check" state */
9071 else
9072 LocalXLogInsertAllowed = 0; /* never again write WAL */
9073 }
9074
9075 /*
9076 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
9077 * = end of actual checkpoint record.
9078 */
9079 if (shutdown && checkPoint.redo != ProcLastRecPtr)
9080 ereport(PANIC,
9081 (errmsg("concurrent write-ahead log activity while database system is shutting down")));
9082
9083 /*
9084 * Remember the prior checkpoint's redo pointer, used later to determine
9085 * the point where the log can be truncated.
9086 */
9087 PriorRedoPtr = ControlFile->checkPointCopy.redo;
9088
9089 /*
9090 * Update the control file.
9091 */
9092 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9093 if (shutdown)
9094 ControlFile->state = DB_SHUTDOWNED;
9095 ControlFile->prevCheckPoint = ControlFile->checkPoint;
9096 ControlFile->checkPoint = ProcLastRecPtr;
9097 ControlFile->checkPointCopy = checkPoint;
9098 ControlFile->time = (pg_time_t) time(NULL);
9099 /* crash recovery should always recover to the end of WAL */
9100 ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
9101 ControlFile->minRecoveryPointTLI = 0;
9102
9103 /*
9104 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
9105 * unused on non-shutdown checkpoints, but seems useful to store it always
9106 * for debugging purposes.
9107 */
9108 SpinLockAcquire(&XLogCtl->ulsn_lck);
9109 ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
9110 SpinLockRelease(&XLogCtl->ulsn_lck);
9111
9112 UpdateControlFile();
9113 LWLockRelease(ControlFileLock);
9114
9115 /* Update shared-memory copy of checkpoint XID/epoch */
9116 SpinLockAcquire(&XLogCtl->info_lck);
9117 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
9118 XLogCtl->ckptXid = checkPoint.nextXid;
9119 SpinLockRelease(&XLogCtl->info_lck);
9120
9121 /*
9122 * We are now done with critical updates; no need for system panic if we
9123 * have trouble while fooling with old log segments.
9124 */
9125 END_CRIT_SECTION();
9126
9127 /*
9128 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
9129 */
9130 smgrpostckpt();
9131
9132 /*
9133 * Delete old log files (those no longer needed even for previous
9134 * checkpoint or the standbys in XLOG streaming).
9135 */
9136 if (PriorRedoPtr != InvalidXLogRecPtr)
9137 {
9138 XLogSegNo _logSegNo;
9139
9140 /* Update the average distance between checkpoints. */
9141 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9142
9143 XLByteToSeg(PriorRedoPtr, _logSegNo);
9144 KeepLogSeg(recptr, &_logSegNo);
9145 _logSegNo--;
9146 RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
9147 }
9148
9149 /*
9150 * Make more log segments if needed. (Do this after recycling old log
9151 * segments, since that may supply some of the needed files.)
9152 */
9153 if (!shutdown)
9154 PreallocXlogFiles(recptr);
9155
9156 /*
9157 * Truncate pg_subtrans if possible. We can throw away all data before
9158 * the oldest XMIN of any running transaction. No future transaction will
9159 * attempt to reference any pg_subtrans entry older than that (see Asserts
9160 * in subtrans.c). During recovery, though, we mustn't do this because
9161 * StartupSUBTRANS hasn't been called yet.
9162 */
9163 if (!RecoveryInProgress())
9164 TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));
9165
9166 /* Real work is done, but log and update stats before releasing lock. */
9167 LogCheckpointEnd(false);
9168
9169 TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
9170 NBuffers,
9171 CheckpointStats.ckpt_segs_added,
9172 CheckpointStats.ckpt_segs_removed,
9173 CheckpointStats.ckpt_segs_recycled);
9174
9175 LWLockRelease(CheckpointLock);
9176 }
9177
9178 /*
9179 * Mark the end of recovery in WAL though without running a full checkpoint.
9180 * We can expect that a restartpoint is likely to be in progress as we
9181 * do this, though we are unwilling to wait for it to complete. So be
9182 * careful to avoid taking the CheckpointLock anywhere here.
9183 *
9184 * CreateRestartPoint() allows for the case where recovery may end before
9185 * the restartpoint completes so there is no concern of concurrent behaviour.
9186 */
static void
CreateEndOfRecoveryRecord(void)
{
	xl_end_of_recovery xlrec;
	XLogRecPtr	recptr;

	/* sanity check: this record may only be written while still in recovery */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used to end recovery");

	xlrec.end_time = GetCurrentTimestamp();

	/*
	 * Capture the new and previous timeline IDs while holding all WAL
	 * insertion locks, so the pair is read consistently.
	 */
	WALInsertLockAcquireExclusive();
	xlrec.ThisTimeLineID = ThisTimeLineID;
	xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	WALInsertLockRelease();

	/*
	 * Allow this process to insert WAL even though recovery isn't formally
	 * over yet; we return to the normal "check" state at the bottom.
	 */
	LocalSetXLogInsertAllowed();

	START_CRIT_SECTION();

	/* Insert the end-of-recovery record and make sure it's on disk. */
	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
	recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);

	XLogFlush(recptr);

	/*
	 * Update the control file so that crash recovery can follow the timeline
	 * changes to this point.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	ControlFile->time = (pg_time_t) time(NULL);
	ControlFile->minRecoveryPoint = recptr;
	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	END_CRIT_SECTION();

	LocalXLogInsertAllowed = -1;	/* return to "check" state */
}
9229
9230 /*
9231 * Write an OVERWRITE_CONTRECORD message.
9232 *
9233 * When on WAL replay we expect a continuation record at the start of a page
9234 * that is not there, recovery ends and WAL writing resumes at that point.
9235 * But it's wrong to resume writing new WAL back at the start of the record
9236 * that was broken, because downstream consumers of that WAL (physical
9237 * replicas) are not prepared to "rewind". So the first action after
9238 * finishing replay of all valid WAL must be to write a record of this type
9239 * at the point where the contrecord was missing; to support xlogreader
9240 * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
9241 * to the page header where the record occurs. xlogreader has an ad-hoc
9242 * mechanism to report metadata about the broken record, which is what we
9243 * use here.
9244 *
9245 * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
9246 * skip the record it was reading, and pass back the LSN of the skipped
9247 * record, so that its caller can verify (on "replay" of that record) that the
9248 * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
9249 */
static XLogRecPtr
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)
{
	xl_overwrite_contrecord xlrec;
	XLogRecPtr	recptr;

	/* sanity check: only meaningful while finishing recovery */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used at end of recovery");

	/* Record which LSN is being overwritten, and when. */
	xlrec.overwritten_lsn = aborted_lsn;
	xlrec.overwrite_time = GetCurrentTimestamp();

	START_CRIT_SECTION();

	/* Insert the record and force it to disk before returning. */
	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));

	recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);

	XLogFlush(recptr);

	END_CRIT_SECTION();

	/* Return the location XLogInsert reported for the new record. */
	return recptr;
}
9276
9277 /*
9278 * Flush all data in shared memory to disk, and fsync
9279 *
9280 * This is the common code shared between regular checkpoints and
9281 * recovery restartpoints.
9282 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	/* Ask each subsystem in turn to flush its permanent state to disk. */
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointBuffers(flags);	/* performs all required fsyncs */
	CheckPointReplicationOrigin();
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
9300
9301 /*
9302 * Save a checkpoint for recovery restart if appropriate
9303 *
9304 * This function is called each time a checkpoint record is read from XLOG.
9305 * It must determine whether the checkpoint represents a safe restartpoint or
9306 * not. If so, the checkpoint record is stashed in shared memory so that
9307 * CreateRestartPoint can consult it. (Note that the latter function is
9308 * executed by the checkpointer, while this one will be executed by the
9309 * startup process.)
9310 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
	/*
	 * Refrain from creating a restartpoint if we have seen any references to
	 * non-existent pages.  Restarting recovery from the restartpoint would
	 * not see the references, so we would lose the cross-check that the
	 * pages belonged to a relation that was dropped later.
	 */
	if (XLogHaveInvalidPages())
	{
		elog(trace_recovery(DEBUG2),
			 "could not record restart point at %X/%X because there "
			 "are unresolved references to invalid pages",
			 (uint32) (checkPoint->redo >> 32),
			 (uint32) checkPoint->redo);
		return;
	}

	/*
	 * Copy the checkpoint record to shared memory, so that checkpointer can
	 * work out the next time it wants to perform a restartpoint.  The
	 * spinlock keeps the three fields consistent for concurrent readers.
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	XLogCtl->lastCheckPointEndPtr = EndRecPtr;
	XLogCtl->lastCheckPoint = *checkPoint;
	SpinLockRelease(&XLogCtl->info_lck);
}
9341
9342 /*
9343 * Establish a restartpoint if possible.
9344 *
9345 * This is similar to CreateCheckPoint, but is used during WAL recovery
9346 * to establish a point from which recovery can roll forward without
9347 * replaying the entire recovery log.
9348 *
9349 * Returns true if a new restartpoint was established. We can only establish
9350 * a restartpoint if we have replayed a safe checkpoint record since last
9351 * restartpoint.
9352 */
bool
CreateRestartPoint(int flags)
{
	XLogRecPtr	lastCheckPointRecPtr;
	XLogRecPtr	lastCheckPointEndPtr;
	CheckPoint	lastCheckPoint;
	XLogRecPtr	PriorRedoPtr;
	TimestampTz xtime;

	/*
	 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
	 * happens at a time.
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/*
	 * Get a local copy of the last safe checkpoint record (stashed by
	 * RecoveryRestartPoint in the startup process).
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
	lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
	lastCheckPoint = XLogCtl->lastCheckPoint;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Check that we're still in recovery mode. It's ok if we exit recovery
	 * mode after this check, the restart point is valid anyway.
	 */
	if (!RecoveryInProgress())
	{
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, recovery has already ended")));
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * If the last checkpoint record we've replayed is already our last
	 * restartpoint, we can't perform a new restart point. We still update
	 * minRecoveryPoint in that case, so that if this is a shutdown restart
	 * point, we won't start up earlier than before. That's not strictly
	 * necessary, but when hot standby is enabled, it would be rather weird if
	 * the database opened up for read-only connections at a point-in-time
	 * before the last shutdown. Such time travel is still possible in case of
	 * immediate shutdown, though.
	 *
	 * We don't explicitly advance minRecoveryPoint when we do create a
	 * restartpoint. It's assumed that flushing the buffers will do that as a
	 * side-effect.
	 */
	if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
		lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
	{
		ereport(DEBUG2,
				(errmsg("skipping restartpoint, already performed at %X/%X",
						(uint32) (lastCheckPoint.redo >> 32),
						(uint32) lastCheckPoint.redo)));

		UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
		if (flags & CHECKPOINT_IS_SHUTDOWN)
		{
			/* Record the clean shutdown-in-recovery state in pg_control. */
			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
			ControlFile->time = (pg_time_t) time(NULL);
			UpdateControlFile();
			LWLockRelease(ControlFileLock);
		}
		LWLockRelease(CheckpointLock);
		return false;
	}

	/*
	 * Update the shared RedoRecPtr so that the startup process can calculate
	 * the number of segments replayed since last restartpoint, and request a
	 * restartpoint if it exceeds CheckPointSegments.
	 *
	 * Like in CreateCheckPoint(), hold off insertions to update it, although
	 * during recovery this is just pro forma, because no WAL insertions are
	 * happening.
	 */
	WALInsertLockAcquireExclusive();
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
	WALInsertLockRelease();

	/* Also update the info_lck-protected copy */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = lastCheckPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	if (log_checkpoints)
		LogCheckpointStart(flags, true);

	/* Flush all shared-memory state to disk (buffers, SLRUs, 2PC, ...). */
	CheckPointGuts(lastCheckPoint.redo, flags);

	/*
	 * Remember the prior checkpoint's redo pointer, used later to determine
	 * the point at which we can truncate the log.
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update pg_control, using current time. Check that it still shows
	 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
	 * this is a quick hack to make sure nothing really bad happens if somehow
	 * we get here after the end-of-recovery checkpoint.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
		ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
	{
		ControlFile->prevCheckPoint = ControlFile->checkPoint;
		ControlFile->checkPoint = lastCheckPointRecPtr;
		ControlFile->checkPointCopy = lastCheckPoint;
		ControlFile->time = (pg_time_t) time(NULL);

		/*
		 * Ensure minRecoveryPoint is past the checkpoint record. Normally,
		 * this will have happened already while writing out dirty buffers,
		 * but not necessarily - e.g. because no buffers were dirtied. We do
		 * this because a non-exclusive base backup uses minRecoveryPoint to
		 * determine which WAL files must be included in the backup, and the
		 * file (or files) containing the checkpoint record must be included,
		 * at a minimum. Note that for an ordinary restart of recovery there's
		 * no value in having the minimum recovery point any earlier than this
		 * anyway, because redo will begin just after the checkpoint record.
		 */
		if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
		{
			ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
			ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;

			/* update local copy */
			minRecoveryPoint = ControlFile->minRecoveryPoint;
			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		}
		if (flags & CHECKPOINT_IS_SHUTDOWN)
			ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
		UpdateControlFile();
	}
	LWLockRelease(ControlFileLock);

	/*
	 * Delete old log files (those no longer needed even for previous
	 * checkpoint/restartpoint) to prevent the disk holding the xlog from
	 * growing full.
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
	{
		XLogRecPtr	receivePtr;
		XLogRecPtr	replayPtr;
		TimeLineID	replayTLI;
		XLogRecPtr	endptr;
		XLogSegNo	_logSegNo;

		/* Update the average distance between checkpoints/restartpoints. */
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

		XLByteToSeg(PriorRedoPtr, _logSegNo);

		/*
		 * Get the current end of xlog replayed or received, whichever is
		 * later.
		 */
		receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
		replayPtr = GetXLogReplayRecPtr(&replayTLI);
		endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;

		/* Retreat _logSegNo if wal_keep_segments or slots require it. */
		KeepLogSeg(endptr, &_logSegNo);
		_logSegNo--;

		/*
		 * Try to recycle segments on a useful timeline. If we've been
		 * promoted since the beginning of this restartpoint, use the new
		 * timeline chosen at end of recovery (RecoveryInProgress() sets
		 * ThisTimeLineID in that case). If we're still in recovery, use the
		 * timeline we're currently replaying.
		 *
		 * There is no guarantee that the WAL segments will be useful on the
		 * current timeline; if recovery proceeds to a new timeline right
		 * after this, the pre-allocated WAL segments on this timeline will
		 * not be used, and will go wasted until recycled on the next
		 * restartpoint. We'll live with that.
		 */
		if (RecoveryInProgress())
			ThisTimeLineID = replayTLI;

		RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);

		/*
		 * Make more log segments if needed. (Do this after recycling old log
		 * segments, since that may supply some of the needed files.)
		 */
		PreallocXlogFiles(endptr);

		/*
		 * ThisTimeLineID is normally not set when we're still in recovery.
		 * However, recycling/preallocating segments above needed
		 * ThisTimeLineID to determine which timeline to install the segments
		 * on. Reset it now, to restore the normal state of affairs for
		 * debugging purposes.
		 */
		if (RecoveryInProgress())
			ThisTimeLineID = 0;
	}

	/*
	 * Truncate pg_subtrans if possible. We can throw away all data before
	 * the oldest XMIN of any running transaction. No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c). When hot standby is disabled, though, we mustn't do
	 * this because StartupSUBTRANS hasn't been called yet.
	 */
	if (EnableHotStandby)
		TruncateSUBTRANS(GetOldestXmin(NULL, PROCARRAY_FLAGS_DEFAULT));

	/* Real work is done, but log and update stats before releasing lock. */
	LogCheckpointEnd(true);

	/* Fetch last-completed-transaction time for the completion message. */
	xtime = GetLatestXTime();
	ereport((log_checkpoints ? LOG : DEBUG2),
			(errmsg("recovery restart point at %X/%X",
					(uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
			 xtime ? errdetail("last completed transaction was at log time %s",
							   timestamptz_to_str(xtime)) : 0));

	LWLockRelease(CheckpointLock);

	/*
	 * Finally, execute archive_cleanup_command, if any.
	 */
	if (XLogCtl->archiveCleanupCommand[0])
		ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
							   "archive_cleanup_command",
							   false);

	return true;
}
9598
9599 /*
9600 * Retreat *logSegNo to the last segment that we need to retain because of
9601 * either wal_keep_segments or replication slots.
9602 *
 * This is calculated by subtracting wal_keep_segments from the given xlog
 * location, recptr, and by making sure that the result is below the
 * requirement of replication slots.
9606 */
9607 static void
KeepLogSeg(XLogRecPtr recptr,XLogSegNo * logSegNo)9608 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9609 {
9610 XLogSegNo segno;
9611 XLogRecPtr keep;
9612
9613 XLByteToSeg(recptr, segno);
9614 keep = XLogGetReplicationSlotMinimumLSN();
9615
9616 /* compute limit for wal_keep_segments first */
9617 if (wal_keep_segments > 0)
9618 {
9619 /* avoid underflow, don't go below 1 */
9620 if (segno <= wal_keep_segments)
9621 segno = 1;
9622 else
9623 segno = segno - wal_keep_segments;
9624 }
9625
9626 /* then check whether slots limit removal further */
9627 if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9628 {
9629 XLogSegNo slotSegNo;
9630
9631 XLByteToSeg(keep, slotSegNo);
9632
9633 if (slotSegNo <= 0)
9634 segno = 1;
9635 else if (slotSegNo < segno)
9636 segno = slotSegNo;
9637 }
9638
9639 /* don't delete WAL segments newer than the calculated segment */
9640 if (segno < *logSegNo)
9641 *logSegNo = segno;
9642 }
9643
9644 /*
9645 * Write a NEXTOID log record
9646 */
9647 void
XLogPutNextOid(Oid nextOid)9648 XLogPutNextOid(Oid nextOid)
9649 {
9650 XLogBeginInsert();
9651 XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9652 (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9653
9654 /*
9655 * We need not flush the NEXTOID record immediately, because any of the
9656 * just-allocated OIDs could only reach disk as part of a tuple insert or
9657 * update that would have its own XLOG record that must follow the NEXTOID
9658 * record. Therefore, the standard buffer LSN interlock applied to those
9659 * records will ensure no such OID reaches disk before the NEXTOID record
9660 * does.
9661 *
9662 * Note, however, that the above statement only covers state "within" the
9663 * database. When we use a generated OID as a file or directory name, we
9664 * are in a sense violating the basic WAL rule, because that filesystem
9665 * change may reach disk before the NEXTOID WAL record does. The impact
9666 * of this is that if a database crash occurs immediately afterward, we
9667 * might after restart re-generate the same OID and find that it conflicts
9668 * with the leftover file or directory. But since for safety's sake we
9669 * always loop until finding a nonconflicting filename, this poses no real
9670 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9671 */
9672 }
9673
9674 /*
9675 * Write an XLOG SWITCH record.
9676 *
9677 * Here we just blindly issue an XLogInsert request for the record.
9678 * All the magic happens inside XLogInsert.
9679 *
9680 * The return value is either the end+1 address of the switch record,
9681 * or the end+1 address of the prior segment if we did not need to
9682 * write a switch record because we are already at segment start.
9683 */
9684 XLogRecPtr
RequestXLogSwitch(bool mark_unimportant)9685 RequestXLogSwitch(bool mark_unimportant)
9686 {
9687 XLogRecPtr RecPtr;
9688
9689 /* XLOG SWITCH has no data */
9690 XLogBeginInsert();
9691
9692 if (mark_unimportant)
9693 XLogSetRecordFlags(XLOG_MARK_UNIMPORTANT);
9694 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9695
9696 return RecPtr;
9697 }
9698
9699 /*
9700 * Write a RESTORE POINT record
9701 */
9702 XLogRecPtr
XLogRestorePoint(const char * rpName)9703 XLogRestorePoint(const char *rpName)
9704 {
9705 XLogRecPtr RecPtr;
9706 xl_restore_point xlrec;
9707
9708 xlrec.rp_time = GetCurrentTimestamp();
9709 strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9710
9711 XLogBeginInsert();
9712 XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9713
9714 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9715
9716 ereport(LOG,
9717 (errmsg("restore point \"%s\" created at %X/%X",
9718 rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9719
9720 return RecPtr;
9721 }
9722
9723 /*
9724 * Check if any of the GUC parameters that are critical for hot standby
9725 * have changed, and update the value in pg_control file if necessary.
9726 */
static void
XLogReportParameters(void)
{
	/* Compare current GUC settings against what pg_control last recorded. */
	if (wal_level != ControlFile->wal_level ||
		wal_log_hints != ControlFile->wal_log_hints ||
		MaxConnections != ControlFile->MaxConnections ||
		max_worker_processes != ControlFile->max_worker_processes ||
		max_prepared_xacts != ControlFile->max_prepared_xacts ||
		max_locks_per_xact != ControlFile->max_locks_per_xact ||
		track_commit_timestamp != ControlFile->track_commit_timestamp)
	{
		/*
		 * The change in number of backend slots doesn't need to be WAL-logged
		 * if archiving is not enabled, as you can't start archive recovery
		 * with wal_level=minimal anyway. We don't really care about the
		 * values in pg_control either if wal_level=minimal, but seems better
		 * to keep them up-to-date to avoid confusion.
		 */
		if (wal_level != ControlFile->wal_level || XLogIsNeeded())
		{
			xl_parameter_change xlrec;
			XLogRecPtr	recptr;

			xlrec.MaxConnections = MaxConnections;
			xlrec.max_worker_processes = max_worker_processes;
			xlrec.max_prepared_xacts = max_prepared_xacts;
			xlrec.max_locks_per_xact = max_locks_per_xact;
			xlrec.wal_level = wal_level;
			xlrec.wal_log_hints = wal_log_hints;
			xlrec.track_commit_timestamp = track_commit_timestamp;

			XLogBeginInsert();
			XLogRegisterData((char *) &xlrec, sizeof(xlrec));

			/* Insert and flush the record before touching pg_control. */
			recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
			XLogFlush(recptr);
		}

		/* Now update the control-file copies of the parameters. */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

		ControlFile->MaxConnections = MaxConnections;
		ControlFile->max_worker_processes = max_worker_processes;
		ControlFile->max_prepared_xacts = max_prepared_xacts;
		ControlFile->max_locks_per_xact = max_locks_per_xact;
		ControlFile->wal_level = wal_level;
		ControlFile->wal_log_hints = wal_log_hints;
		ControlFile->track_commit_timestamp = track_commit_timestamp;
		UpdateControlFile();

		LWLockRelease(ControlFileLock);
	}
}
9779
9780 /*
9781 * Update full_page_writes in shared memory, and write an
9782 * XLOG_FPW_CHANGE record if necessary.
9783 *
9784 * Note: this function assumes there is no other process running
9785 * concurrently that could update it.
9786 */
void
UpdateFullPageWrites(void)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	bool		recoveryInProgress;

	/*
	 * Do nothing if full_page_writes has not been changed.
	 *
	 * It's safe to check the shared full_page_writes without the lock,
	 * because we assume that there is no concurrently running process which
	 * can update it.
	 */
	if (fullPageWrites == Insert->fullPageWrites)
		return;

	/*
	 * Perform this outside critical section so that the WAL insert
	 * initialization done by RecoveryInProgress() doesn't trigger an
	 * assertion failure.
	 */
	recoveryInProgress = RecoveryInProgress();

	START_CRIT_SECTION();

	/*
	 * It's always safe to take full page images, even when not strictly
	 * required, but not the other round. So if we're setting full_page_writes
	 * to true, first set it true and then write the WAL record. If we're
	 * setting it to false, first write the WAL record and then set the global
	 * flag.
	 */
	if (fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = true;
		WALInsertLockRelease();
	}

	/*
	 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
	 * full_page_writes during archive recovery, if required.
	 */
	if (XLogStandbyInfoActive() && !recoveryInProgress)
	{
		/* The record's only payload is the new flag value. */
		XLogBeginInsert();
		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));

		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
	}

	/* See ordering note above: clear the flag only after logging. */
	if (!fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = false;
		WALInsertLockRelease();
	}
	END_CRIT_SECTION();
}
9846
9847 /*
9848 * Check that it's OK to switch to new timeline during recovery.
9849 *
9850 * 'lsn' is the address of the shutdown checkpoint record we're about to
9851 * replay. (Currently, timeline can only change at a shutdown checkpoint).
9852 */
9853 static void
checkTimeLineSwitch(XLogRecPtr lsn,TimeLineID newTLI,TimeLineID prevTLI)9854 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9855 {
9856 /* Check that the record agrees on what the current (old) timeline is */
9857 if (prevTLI != ThisTimeLineID)
9858 ereport(PANIC,
9859 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9860 prevTLI, ThisTimeLineID)));
9861
9862 /*
9863 * The new timeline better be in the list of timelines we expect to see,
9864 * according to the timeline history. It should also not decrease.
9865 */
9866 if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9867 ereport(PANIC,
9868 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9869 newTLI, ThisTimeLineID)));
9870
9871 /*
9872 * If we have not yet reached min recovery point, and we're about to
9873 * switch to a timeline greater than the timeline of the min recovery
9874 * point: trouble. After switching to the new timeline, we could not
9875 * possibly visit the min recovery point on the correct timeline anymore.
9876 * This can happen if there is a newer timeline in the archive that
9877 * branched before the timeline the min recovery point is on, and you
9878 * attempt to do PITR to the new timeline.
9879 */
9880 if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9881 lsn < minRecoveryPoint &&
9882 newTLI > minRecoveryPointTLI)
9883 ereport(PANIC,
9884 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9885 newTLI,
9886 (uint32) (minRecoveryPoint >> 32),
9887 (uint32) minRecoveryPoint,
9888 minRecoveryPointTLI)));
9889
9890 /* Looks good */
9891 }
9892
9893 /*
9894 * XLOG resource manager's routines
9895 *
9896 * Definitions of info values are in include/catalog/pg_control.h, though
9897 * not all record types are related to control file updates.
9898 */
9899 void
xlog_redo(XLogReaderState * record)9900 xlog_redo(XLogReaderState *record)
9901 {
9902 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
9903 XLogRecPtr lsn = record->EndRecPtr;
9904
9905 /* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
9906 Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
9907 info == XLOG_FPI_MULTI || !XLogRecHasAnyBlockRefs(record));
9908
9909 if (info == XLOG_NEXTOID)
9910 {
9911 Oid nextOid;
9912
9913 /*
9914 * We used to try to take the maximum of ShmemVariableCache->nextOid
9915 * and the recorded nextOid, but that fails if the OID counter wraps
9916 * around. Since no OID allocation should be happening during replay
9917 * anyway, better to just believe the record exactly. We still take
9918 * OidGenLock while setting the variable, just in case.
9919 */
9920 memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
9921 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9922 ShmemVariableCache->nextOid = nextOid;
9923 ShmemVariableCache->oidCount = 0;
9924 LWLockRelease(OidGenLock);
9925 }
9926 else if (info == XLOG_CHECKPOINT_SHUTDOWN)
9927 {
9928 CheckPoint checkPoint;
9929
9930 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
9931 /* In a SHUTDOWN checkpoint, believe the counters exactly */
9932 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
9933 ShmemVariableCache->nextXid = checkPoint.nextXid;
9934 LWLockRelease(XidGenLock);
9935 LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
9936 ShmemVariableCache->nextOid = checkPoint.nextOid;
9937 ShmemVariableCache->oidCount = 0;
9938 LWLockRelease(OidGenLock);
9939 MultiXactSetNextMXact(checkPoint.nextMulti,
9940 checkPoint.nextMultiOffset);
9941
9942 MultiXactAdvanceOldest(checkPoint.oldestMulti,
9943 checkPoint.oldestMultiDB);
9944
9945 /*
9946 * No need to set oldestClogXid here as well; it'll be set when we
9947 * redo an xl_clog_truncate if it changed since initialization.
9948 */
9949 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
9950
9951 /*
9952 * If we see a shutdown checkpoint while waiting for an end-of-backup
9953 * record, the backup was canceled and the end-of-backup record will
9954 * never arrive.
9955 */
9956 if (ArchiveRecoveryRequested &&
9957 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
9958 XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
9959 ereport(PANIC,
9960 (errmsg("online backup was canceled, recovery cannot continue")));
9961
9962 /*
9963 * If we see a shutdown checkpoint, we know that nothing was running
9964 * on the master at this point. So fake-up an empty running-xacts
9965 * record and use that here and now. Recover additional standby state
9966 * for prepared transactions.
9967 */
9968 if (standbyState >= STANDBY_INITIALIZED)
9969 {
9970 TransactionId *xids;
9971 int nxids;
9972 TransactionId oldestActiveXID;
9973 TransactionId latestCompletedXid;
9974 RunningTransactionsData running;
9975
9976 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
9977
9978 /*
9979 * Construct a RunningTransactions snapshot representing a shut
9980 * down server, with only prepared transactions still alive. We're
9981 * never overflowed at this point because all subxids are listed
9982 * with their parent prepared transactions.
9983 */
9984 running.xcnt = nxids;
9985 running.subxcnt = 0;
9986 running.subxid_overflow = false;
9987 running.nextXid = checkPoint.nextXid;
9988 running.oldestRunningXid = oldestActiveXID;
9989 latestCompletedXid = checkPoint.nextXid;
9990 TransactionIdRetreat(latestCompletedXid);
9991 Assert(TransactionIdIsNormal(latestCompletedXid));
9992 running.latestCompletedXid = latestCompletedXid;
9993 running.xids = xids;
9994
9995 ProcArrayApplyRecoveryInfo(&running);
9996
9997 StandbyRecoverPreparedTransactions();
9998 }
9999
10000 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
10001 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10002 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
10003 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
10004 LWLockRelease(ControlFileLock);
10005
10006 /* Update shared-memory copy of checkpoint XID/epoch */
10007 SpinLockAcquire(&XLogCtl->info_lck);
10008 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
10009 XLogCtl->ckptXid = checkPoint.nextXid;
10010 SpinLockRelease(&XLogCtl->info_lck);
10011
10012 /*
10013 * We should've already switched to the new TLI before replaying this
10014 * record.
10015 */
10016 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
10017 ereport(PANIC,
10018 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10019 checkPoint.ThisTimeLineID, ThisTimeLineID)));
10020
10021 RecoveryRestartPoint(&checkPoint);
10022 }
10023 else if (info == XLOG_CHECKPOINT_ONLINE)
10024 {
10025 CheckPoint checkPoint;
10026
10027 memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
10028 /* In an ONLINE checkpoint, treat the XID counter as a minimum */
10029 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
10030 if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
10031 checkPoint.nextXid))
10032 ShmemVariableCache->nextXid = checkPoint.nextXid;
10033 LWLockRelease(XidGenLock);
10034
10035 /*
10036 * We ignore the nextOid counter in an ONLINE checkpoint, preferring
10037 * to track OID assignment through XLOG_NEXTOID records. The nextOid
10038 * counter is from the start of the checkpoint and might well be stale
10039 * compared to later XLOG_NEXTOID records. We could try to take the
10040 * maximum of the nextOid counter and our latest value, but since
10041 * there's no particular guarantee about the speed with which the OID
10042 * counter wraps around, that's a risky thing to do. In any case,
10043 * users of the nextOid counter are required to avoid assignment of
10044 * duplicates, so that a somewhat out-of-date value should be safe.
10045 */
10046
10047 /* Handle multixact */
10048 MultiXactAdvanceNextMXact(checkPoint.nextMulti,
10049 checkPoint.nextMultiOffset);
10050
10051 /*
10052 * NB: This may perform multixact truncation when replaying WAL
10053 * generated by an older primary.
10054 */
10055 MultiXactAdvanceOldest(checkPoint.oldestMulti,
10056 checkPoint.oldestMultiDB);
10057 if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
10058 checkPoint.oldestXid))
10059 SetTransactionIdLimit(checkPoint.oldestXid,
10060 checkPoint.oldestXidDB);
10061 /* ControlFile->checkPointCopy always tracks the latest ckpt XID */
10062 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10063 ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
10064 ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
10065 LWLockRelease(ControlFileLock);
10066
10067 /* Update shared-memory copy of checkpoint XID/epoch */
10068 SpinLockAcquire(&XLogCtl->info_lck);
10069 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
10070 XLogCtl->ckptXid = checkPoint.nextXid;
10071 SpinLockRelease(&XLogCtl->info_lck);
10072
10073 /* TLI should not change in an on-line checkpoint */
10074 if (checkPoint.ThisTimeLineID != ThisTimeLineID)
10075 ereport(PANIC,
10076 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10077 checkPoint.ThisTimeLineID, ThisTimeLineID)));
10078
10079 RecoveryRestartPoint(&checkPoint);
10080 }
10081 else if (info == XLOG_OVERWRITE_CONTRECORD)
10082 {
10083 xl_overwrite_contrecord xlrec;
10084
10085 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
10086 VerifyOverwriteContrecord(&xlrec, record);
10087 }
10088 else if (info == XLOG_END_OF_RECOVERY)
10089 {
10090 xl_end_of_recovery xlrec;
10091
10092 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));
10093
10094 /*
10095 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
10096 * but this case is rarer and harder to test, so the benefit doesn't
10097 * outweigh the potential extra cost of maintenance.
10098 */
10099
10100 /*
10101 * We should've already switched to the new TLI before replaying this
10102 * record.
10103 */
10104 if (xlrec.ThisTimeLineID != ThisTimeLineID)
10105 ereport(PANIC,
10106 (errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
10107 xlrec.ThisTimeLineID, ThisTimeLineID)));
10108 }
10109 else if (info == XLOG_NOOP)
10110 {
10111 /* nothing to do here */
10112 }
10113 else if (info == XLOG_SWITCH)
10114 {
10115 /* nothing to do here */
10116 }
10117 else if (info == XLOG_RESTORE_POINT)
10118 {
10119 /* nothing to do here */
10120 }
10121 else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
10122 info == XLOG_FPI_MULTI)
10123 {
10124 uint8 block_id;
10125
10126 /*
10127 * Full-page image (FPI) records contain nothing else but a backup
10128 * block (or multiple backup blocks). Every block reference must
10129 * include a full-page image - otherwise there would be no point in
10130 * this record.
10131 *
10132 * No recovery conflicts are generated by these generic records - if a
10133 * resource manager needs to generate conflicts, it has to define a
10134 * separate WAL record type and redo routine.
10135 *
10136 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
10137 * WAL- logged because of a hint bit update. They are only generated
10138 * when checksums are enabled. There is no difference in handling
10139 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
10140 * code just to distinguish them for statistics purposes.
10141 */
10142 for (block_id = 0; block_id <= record->max_block_id; block_id++)
10143 {
10144 Buffer buffer;
10145
10146 if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
10147 elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
10148 UnlockReleaseBuffer(buffer);
10149 }
10150 }
10151 else if (info == XLOG_BACKUP_END)
10152 {
10153 XLogRecPtr startpoint;
10154
10155 memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));
10156
10157 if (ControlFile->backupStartPoint == startpoint)
10158 {
10159 /*
10160 * We have reached the end of base backup, the point where
10161 * pg_stop_backup() was done. The data on disk is now consistent.
10162 * Reset backupStartPoint, and update minRecoveryPoint to make
10163 * sure we don't allow starting up at an earlier point even if
10164 * recovery is stopped and restarted soon after this.
10165 */
10166 elog(DEBUG1, "end of backup reached");
10167
10168 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10169
10170 if (ControlFile->minRecoveryPoint < lsn)
10171 {
10172 ControlFile->minRecoveryPoint = lsn;
10173 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10174 }
10175 ControlFile->backupStartPoint = InvalidXLogRecPtr;
10176 ControlFile->backupEndRequired = false;
10177 UpdateControlFile();
10178
10179 LWLockRelease(ControlFileLock);
10180 }
10181 }
10182 else if (info == XLOG_PARAMETER_CHANGE)
10183 {
10184 xl_parameter_change xlrec;
10185
10186 /* Update our copy of the parameters in pg_control */
10187 memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));
10188
10189 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
10190 ControlFile->MaxConnections = xlrec.MaxConnections;
10191 ControlFile->max_worker_processes = xlrec.max_worker_processes;
10192 ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
10193 ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
10194 ControlFile->wal_level = xlrec.wal_level;
10195 ControlFile->wal_log_hints = xlrec.wal_log_hints;
10196
10197 /*
10198 * Update minRecoveryPoint to ensure that if recovery is aborted, we
10199 * recover back up to this point before allowing hot standby again.
10200 * This is important if the max_* settings are decreased, to ensure
10201 * you don't run queries against the WAL preceding the change. The
10202 * local copies cannot be updated as long as crash recovery is
10203 * happening and we expect all the WAL to be replayed.
10204 */
10205 if (InArchiveRecovery)
10206 {
10207 minRecoveryPoint = ControlFile->minRecoveryPoint;
10208 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
10209 }
10210 if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
10211 {
10212 ControlFile->minRecoveryPoint = lsn;
10213 ControlFile->minRecoveryPointTLI = ThisTimeLineID;
10214 }
10215
10216 CommitTsParameterChange(xlrec.track_commit_timestamp,
10217 ControlFile->track_commit_timestamp);
10218 ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;
10219
10220 UpdateControlFile();
10221 LWLockRelease(ControlFileLock);
10222
10223 /* Check to see if any changes to max_connections give problems */
10224 CheckRequiredParameterValues();
10225 }
10226 else if (info == XLOG_FPW_CHANGE)
10227 {
10228 bool fpw;
10229
10230 memcpy(&fpw, XLogRecGetData(record), sizeof(bool));
10231
10232 /*
10233 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
10234 * do_pg_start_backup() and do_pg_stop_backup() can check whether
10235 * full_page_writes has been disabled during online backup.
10236 */
10237 if (!fpw)
10238 {
10239 SpinLockAcquire(&XLogCtl->info_lck);
10240 if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
10241 XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
10242 SpinLockRelease(&XLogCtl->info_lck);
10243 }
10244
10245 /* Keep track of full_page_writes */
10246 lastFullPageWrites = fpw;
10247 }
10248 }
10249
10250 /*
10251 * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record.
10252 */
10253 static void
VerifyOverwriteContrecord(xl_overwrite_contrecord * xlrec,XLogReaderState * state)10254 VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
10255 {
10256 if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
10257 elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
10258 (uint32) (xlrec->overwritten_lsn >> 32),
10259 (uint32) xlrec->overwritten_lsn,
10260 (uint32) (state->overwrittenRecPtr >> 32),
10261 (uint32) state->overwrittenRecPtr);
10262
10263 ereport(LOG,
10264 (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
10265 (uint32) (xlrec->overwritten_lsn >> 32),
10266 (uint32) xlrec->overwritten_lsn,
10267 timestamptz_to_str(xlrec->overwrite_time))));
10268
10269 /* Verifying the record should only happen once */
10270 state->overwrittenRecPtr = InvalidXLogRecPtr;
10271 }
10272
10273 #ifdef WAL_DEBUG
10274
10275 static void
xlog_outrec(StringInfo buf,XLogReaderState * record)10276 xlog_outrec(StringInfo buf, XLogReaderState *record)
10277 {
10278 int block_id;
10279
10280 appendStringInfo(buf, "prev %X/%X; xid %u",
10281 (uint32) (XLogRecGetPrev(record) >> 32),
10282 (uint32) XLogRecGetPrev(record),
10283 XLogRecGetXid(record));
10284
10285 appendStringInfo(buf, "; len %u",
10286 XLogRecGetDataLen(record));
10287
10288 /* decode block references */
10289 for (block_id = 0; block_id <= record->max_block_id; block_id++)
10290 {
10291 RelFileNode rnode;
10292 ForkNumber forknum;
10293 BlockNumber blk;
10294
10295 if (!XLogRecHasBlockRef(record, block_id))
10296 continue;
10297
10298 XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
10299 if (forknum != MAIN_FORKNUM)
10300 appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
10301 block_id,
10302 rnode.spcNode, rnode.dbNode, rnode.relNode,
10303 forknum,
10304 blk);
10305 else
10306 appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
10307 block_id,
10308 rnode.spcNode, rnode.dbNode, rnode.relNode,
10309 blk);
10310 if (XLogRecHasBlockImage(record, block_id))
10311 appendStringInfoString(buf, " FPW");
10312 }
10313 }
10314 #endif /* WAL_DEBUG */
10315
10316 /*
10317 * Returns a string describing an XLogRecord, consisting of its identity
10318 * optionally followed by a colon, a space, and a further description.
10319 */
10320 static void
xlog_outdesc(StringInfo buf,XLogReaderState * record)10321 xlog_outdesc(StringInfo buf, XLogReaderState *record)
10322 {
10323 RmgrId rmid = XLogRecGetRmid(record);
10324 uint8 info = XLogRecGetInfo(record);
10325 const char *id;
10326
10327 appendStringInfoString(buf, RmgrTable[rmid].rm_name);
10328 appendStringInfoChar(buf, '/');
10329
10330 id = RmgrTable[rmid].rm_identify(info);
10331 if (id == NULL)
10332 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
10333 else
10334 appendStringInfo(buf, "%s: ", id);
10335
10336 RmgrTable[rmid].rm_desc(buf, record);
10337 }
10338
10339
10340 /*
10341 * Return the (possible) sync flag used for opening a file, depending on the
10342 * value of the GUC wal_sync_method.
10343 */
10344 static int
get_sync_bit(int method)10345 get_sync_bit(int method)
10346 {
10347 int o_direct_flag = 0;
10348
10349 /* If fsync is disabled, never open in sync mode */
10350 if (!enableFsync)
10351 return 0;
10352
10353 /*
10354 * Optimize writes by bypassing kernel cache with O_DIRECT when using
10355 * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
10356 * disabled, otherwise the archive command or walsender process will read
10357 * the WAL soon after writing it, which is guaranteed to cause a physical
10358 * read if we bypassed the kernel cache. We also skip the
10359 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
10360 * reason.
10361 *
10362 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
10363 * written by walreceiver is normally read by the startup process soon
10364 * after its written. Also, walreceiver performs unaligned writes, which
10365 * don't work with O_DIRECT, so it is required for correctness too.
10366 */
10367 if (!XLogIsNeeded() && !AmWalReceiverProcess())
10368 o_direct_flag = PG_O_DIRECT;
10369
10370 switch (method)
10371 {
10372 /*
10373 * enum values for all sync options are defined even if they are
10374 * not supported on the current platform. But if not, they are
10375 * not included in the enum option array, and therefore will never
10376 * be seen here.
10377 */
10378 case SYNC_METHOD_FSYNC:
10379 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10380 case SYNC_METHOD_FDATASYNC:
10381 return 0;
10382 #ifdef OPEN_SYNC_FLAG
10383 case SYNC_METHOD_OPEN:
10384 return OPEN_SYNC_FLAG | o_direct_flag;
10385 #endif
10386 #ifdef OPEN_DATASYNC_FLAG
10387 case SYNC_METHOD_OPEN_DSYNC:
10388 return OPEN_DATASYNC_FLAG | o_direct_flag;
10389 #endif
10390 default:
10391 /* can't happen (unless we are out of sync with option array) */
10392 elog(ERROR, "unrecognized wal_sync_method: %d", method);
10393 return 0; /* silence warning */
10394 }
10395 }
10396
10397 /*
10398 * GUC support
10399 */
10400 void
assign_xlog_sync_method(int new_sync_method,void * extra)10401 assign_xlog_sync_method(int new_sync_method, void *extra)
10402 {
10403 if (sync_method != new_sync_method)
10404 {
10405 /*
10406 * To ensure that no blocks escape unsynced, force an fsync on the
10407 * currently open log segment (if any). Also, if the open flag is
10408 * changing, close the log file so it will be reopened (with new flag
10409 * bit) at next use.
10410 */
10411 if (openLogFile >= 0)
10412 {
10413 pgstat_report_wait_start(WAIT_EVENT_WAL_SYNC_METHOD_ASSIGN);
10414 if (pg_fsync(openLogFile) != 0)
10415 ereport(PANIC,
10416 (errcode_for_file_access(),
10417 errmsg("could not fsync log segment %s: %m",
10418 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
10419 pgstat_report_wait_end();
10420 if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10421 XLogFileClose();
10422 }
10423 }
10424 }
10425
10426
10427 /*
10428 * Issue appropriate kind of fsync (if any) for an XLOG output file.
10429 *
10430 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10431 * 'log' and 'seg' are for error reporting purposes.
10432 */
10433 void
issue_xlog_fsync(int fd,XLogSegNo segno)10434 issue_xlog_fsync(int fd, XLogSegNo segno)
10435 {
10436 switch (sync_method)
10437 {
10438 case SYNC_METHOD_FSYNC:
10439 if (pg_fsync_no_writethrough(fd) != 0)
10440 ereport(PANIC,
10441 (errcode_for_file_access(),
10442 errmsg("could not fsync log file %s: %m",
10443 XLogFileNameP(ThisTimeLineID, segno))));
10444 break;
10445 #ifdef HAVE_FSYNC_WRITETHROUGH
10446 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10447 if (pg_fsync_writethrough(fd) != 0)
10448 ereport(PANIC,
10449 (errcode_for_file_access(),
10450 errmsg("could not fsync write-through log file %s: %m",
10451 XLogFileNameP(ThisTimeLineID, segno))));
10452 break;
10453 #endif
10454 #ifdef HAVE_FDATASYNC
10455 case SYNC_METHOD_FDATASYNC:
10456 if (pg_fdatasync(fd) != 0)
10457 ereport(PANIC,
10458 (errcode_for_file_access(),
10459 errmsg("could not fdatasync log file %s: %m",
10460 XLogFileNameP(ThisTimeLineID, segno))));
10461 break;
10462 #endif
10463 case SYNC_METHOD_OPEN:
10464 case SYNC_METHOD_OPEN_DSYNC:
10465 /* write synced it already */
10466 break;
10467 default:
10468 elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
10469 break;
10470 }
10471 }
10472
10473 /*
10474 * Return the filename of given log segment, as a palloc'd string.
10475 */
10476 char *
XLogFileNameP(TimeLineID tli,XLogSegNo segno)10477 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
10478 {
10479 char *result = palloc(MAXFNAMELEN);
10480
10481 XLogFileName(result, tli, segno);
10482 return result;
10483 }
10484
10485 /*
10486 * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
10487 * function. It creates the necessary starting checkpoint and constructs the
10488 * backup label file.
10489 *
10490 * There are two kind of backups: exclusive and non-exclusive. An exclusive
10491 * backup is started with pg_start_backup(), and there can be only one active
10492 * at a time. The backup and tablespace map files of an exclusive backup are
10493 * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10494 * removed by pg_stop_backup().
10495 *
10496 * A non-exclusive backup is used for the streaming base backups (see
10497 * src/backend/replication/basebackup.c). The difference to exclusive backups
10498 * is that the backup label and tablespace map files are not written to disk.
10499 * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10500 * and the caller is responsible for including them in the backup archive as
10501 * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10502 * active at the same time, and they don't conflict with an exclusive backup
10503 * either.
10504 *
10505 * tblspcmapfile is required mainly for tar format in windows as native windows
10506 * utilities are not able to create symlinks while extracting files from tar.
10507 * However for consistency, the same is used for all platforms.
10508 *
10509 * needtblspcmapfile is true for the cases (exclusive backup and for
10510 * non-exclusive backup only when tar format is used for taking backup)
10511 * when backup needs to generate tablespace_map file, it is used to
10512 * embed escape character before newline character in tablespace path.
10513 *
10514 * Returns the minimum WAL location that must be present to restore from this
10515 * backup, and the corresponding timeline ID in *starttli_p.
10516 *
10517 * Every successfully started non-exclusive backup must be stopped by calling
10518 * do_pg_stop_backup() or do_pg_abort_backup().
10519 *
10520 * It is the responsibility of the caller of this function to verify the
10521 * permissions of the calling user!
10522 */
10523 XLogRecPtr
do_pg_start_backup(const char * backupidstr,bool fast,TimeLineID * starttli_p,StringInfo labelfile,DIR * tblspcdir,List ** tablespaces,StringInfo tblspcmapfile,bool infotbssize,bool needtblspcmapfile)10524 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
10525 StringInfo labelfile, DIR *tblspcdir, List **tablespaces,
10526 StringInfo tblspcmapfile, bool infotbssize,
10527 bool needtblspcmapfile)
10528 {
10529 bool exclusive = (labelfile == NULL);
10530 bool backup_started_in_recovery = false;
10531 XLogRecPtr checkpointloc;
10532 XLogRecPtr startpoint;
10533 TimeLineID starttli;
10534 pg_time_t stamp_time;
10535 char strfbuf[128];
10536 char xlogfilename[MAXFNAMELEN];
10537 XLogSegNo _logSegNo;
10538 struct stat stat_buf;
10539 FILE *fp;
10540
10541 backup_started_in_recovery = RecoveryInProgress();
10542
10543 /*
10544 * Currently only non-exclusive backup can be taken during recovery.
10545 */
10546 if (backup_started_in_recovery && exclusive)
10547 ereport(ERROR,
10548 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10549 errmsg("recovery is in progress"),
10550 errhint("WAL control functions cannot be executed during recovery.")));
10551
10552 /*
10553 * During recovery, we don't need to check WAL level. Because, if WAL
10554 * level is not sufficient, it's impossible to get here during recovery.
10555 */
10556 if (!backup_started_in_recovery && !XLogIsNeeded())
10557 ereport(ERROR,
10558 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10559 errmsg("WAL level not sufficient for making an online backup"),
10560 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10561
10562 if (strlen(backupidstr) > MAXPGPATH)
10563 ereport(ERROR,
10564 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10565 errmsg("backup label too long (max %d bytes)",
10566 MAXPGPATH)));
10567
10568 /*
10569 * Mark backup active in shared memory. We must do full-page WAL writes
10570 * during an on-line backup even if not doing so at other times, because
10571 * it's quite possible for the backup dump to obtain a "torn" (partially
10572 * written) copy of a database page if it reads the page concurrently with
10573 * our write to the same page. This can be fixed as long as the first
10574 * write to the page in the WAL sequence is a full-page write. Hence, we
10575 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10576 * are no dirty pages in shared memory that might get dumped while the
10577 * backup is in progress without having a corresponding WAL record. (Once
10578 * the backup is complete, we need not force full-page writes anymore,
10579 * since we expect that any pages not modified during the backup interval
10580 * must have been correctly captured by the backup.)
10581 *
10582 * Note that forcePageWrites has no effect during an online backup from
10583 * the standby.
10584 *
10585 * We must hold all the insertion locks to change the value of
10586 * forcePageWrites, to ensure adequate interlocking against
10587 * XLogInsertRecord().
10588 */
10589 WALInsertLockAcquireExclusive();
10590 if (exclusive)
10591 {
10592 /*
10593 * At first, mark that we're now starting an exclusive backup, to
10594 * ensure that there are no other sessions currently running
10595 * pg_start_backup() or pg_stop_backup().
10596 */
10597 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10598 {
10599 WALInsertLockRelease();
10600 ereport(ERROR,
10601 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10602 errmsg("a backup is already in progress"),
10603 errhint("Run pg_stop_backup() and try again.")));
10604 }
10605 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10606 }
10607 else
10608 XLogCtl->Insert.nonExclusiveBackups++;
10609 XLogCtl->Insert.forcePageWrites = true;
10610 WALInsertLockRelease();
10611
10612 /* Ensure we release forcePageWrites if fail below */
10613 PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10614 {
10615 bool gotUniqueStartpoint = false;
10616 struct dirent *de;
10617 tablespaceinfo *ti;
10618 int datadirpathlen;
10619
10620 /*
10621 * Force an XLOG file switch before the checkpoint, to ensure that the
10622 * WAL segment the checkpoint is written to doesn't contain pages with
10623 * old timeline IDs. That would otherwise happen if you called
10624 * pg_start_backup() right after restoring from a PITR archive: the
10625 * first WAL segment containing the startup checkpoint has pages in
10626 * the beginning with the old timeline ID. That can cause trouble at
10627 * recovery: we won't have a history file covering the old timeline if
10628 * pg_wal directory was not included in the base backup and the WAL
10629 * archive was cleared too before starting the backup.
10630 *
10631 * This also ensures that we have emitted a WAL page header that has
10632 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10633 * Therefore, if a WAL archiver (such as pglesslog) is trying to
10634 * compress out removable backup blocks, it won't remove any that
10635 * occur after this point.
10636 *
10637 * During recovery, we skip forcing XLOG file switch, which means that
10638 * the backup taken during recovery is not available for the special
10639 * recovery case described above.
10640 */
10641 if (!backup_started_in_recovery)
10642 RequestXLogSwitch(false);
10643
10644 do
10645 {
10646 bool checkpointfpw;
10647
10648 /*
10649 * Force a CHECKPOINT. Aside from being necessary to prevent torn
10650 * page problems, this guarantees that two successive backup runs
10651 * will have different checkpoint positions and hence different
10652 * history file names, even if nothing happened in between.
10653 *
10654 * During recovery, establish a restartpoint if possible. We use
10655 * the last restartpoint as the backup starting checkpoint. This
10656 * means that two successive backup runs can have same checkpoint
10657 * positions.
10658 *
10659 * Since the fact that we are executing do_pg_start_backup()
10660 * during recovery means that checkpointer is running, we can use
10661 * RequestCheckpoint() to establish a restartpoint.
10662 *
10663 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10664 * passing fast = true). Otherwise this can take awhile.
10665 */
10666 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10667 (fast ? CHECKPOINT_IMMEDIATE : 0));
10668
10669 /*
10670 * Now we need to fetch the checkpoint record location, and also
10671 * its REDO pointer. The oldest point in WAL that would be needed
10672 * to restore starting from the checkpoint is precisely the REDO
10673 * pointer.
10674 */
10675 LWLockAcquire(ControlFileLock, LW_SHARED);
10676 checkpointloc = ControlFile->checkPoint;
10677 startpoint = ControlFile->checkPointCopy.redo;
10678 starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10679 checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10680 LWLockRelease(ControlFileLock);
10681
10682 if (backup_started_in_recovery)
10683 {
10684 XLogRecPtr recptr;
10685
10686 /*
10687 * Check to see if all WAL replayed during online backup
10688 * (i.e., since last restartpoint used as backup starting
10689 * checkpoint) contain full-page writes.
10690 */
10691 SpinLockAcquire(&XLogCtl->info_lck);
10692 recptr = XLogCtl->lastFpwDisableRecPtr;
10693 SpinLockRelease(&XLogCtl->info_lck);
10694
10695 if (!checkpointfpw || startpoint <= recptr)
10696 ereport(ERROR,
10697 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10698 errmsg("WAL generated with full_page_writes=off was replayed "
10699 "since last restartpoint"),
10700 errhint("This means that the backup being taken on the standby "
10701 "is corrupt and should not be used. "
10702 "Enable full_page_writes and run CHECKPOINT on the master, "
10703 "and then try an online backup again.")));
10704
10705 /*
10706 * During recovery, since we don't use the end-of-backup WAL
10707 * record and don't write the backup history file, the
10708 * starting WAL location doesn't need to be unique. This means
10709 * that two base backups started at the same time might use
10710 * the same checkpoint as starting locations.
10711 */
10712 gotUniqueStartpoint = true;
10713 }
10714
10715 /*
10716 * If two base backups are started at the same time (in WAL sender
10717 * processes), we need to make sure that they use different
10718 * checkpoints as starting locations, because we use the starting
10719 * WAL location as a unique identifier for the base backup in the
10720 * end-of-backup WAL record and when we write the backup history
10721 * file. Perhaps it would be better generate a separate unique ID
10722 * for each backup instead of forcing another checkpoint, but
10723 * taking a checkpoint right after another is not that expensive
10724 * either because only few buffers have been dirtied yet.
10725 */
10726 WALInsertLockAcquireExclusive();
10727 if (XLogCtl->Insert.lastBackupStart < startpoint)
10728 {
10729 XLogCtl->Insert.lastBackupStart = startpoint;
10730 gotUniqueStartpoint = true;
10731 }
10732 WALInsertLockRelease();
10733 } while (!gotUniqueStartpoint);
10734
10735 XLByteToSeg(startpoint, _logSegNo);
10736 XLogFileName(xlogfilename, starttli, _logSegNo);
10737
10738 /*
10739 * Construct tablespace_map file
10740 */
10741 if (exclusive)
10742 tblspcmapfile = makeStringInfo();
10743
10744 datadirpathlen = strlen(DataDir);
10745
10746 /* Collect information about all tablespaces */
10747 while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
10748 {
10749 char fullpath[MAXPGPATH + 10];
10750 char linkpath[MAXPGPATH];
10751 char *relpath = NULL;
10752 int rllen;
10753 StringInfoData buflinkpath;
10754 char *s = linkpath;
10755
10756 /* Skip special stuff */
10757 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
10758 continue;
10759
10760 snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
10761
10762 #if defined(HAVE_READLINK) || defined(WIN32)
10763 rllen = readlink(fullpath, linkpath, sizeof(linkpath));
10764 if (rllen < 0)
10765 {
10766 ereport(WARNING,
10767 (errmsg("could not read symbolic link \"%s\": %m",
10768 fullpath)));
10769 continue;
10770 }
10771 else if (rllen >= sizeof(linkpath))
10772 {
10773 ereport(WARNING,
10774 (errmsg("symbolic link \"%s\" target is too long",
10775 fullpath)));
10776 continue;
10777 }
10778 linkpath[rllen] = '\0';
10779
10780 /*
10781 * Add the escape character '\\' before newline in a string to
10782 * ensure that we can distinguish between the newline in the
10783 * tablespace path and end of line while reading tablespace_map
10784 * file during archive recovery.
10785 */
10786 initStringInfo(&buflinkpath);
10787
10788 while (*s)
10789 {
10790 if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
10791 appendStringInfoChar(&buflinkpath, '\\');
10792 appendStringInfoChar(&buflinkpath, *s++);
10793 }
10794
10795
10796 /*
10797 * Relpath holds the relative path of the tablespace directory
10798 * when it's located within PGDATA, or NULL if it's located
10799 * elsewhere.
10800 */
10801 if (rllen > datadirpathlen &&
10802 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
10803 IS_DIR_SEP(linkpath[datadirpathlen]))
10804 relpath = linkpath + datadirpathlen + 1;
10805
10806 ti = palloc(sizeof(tablespaceinfo));
10807 ti->oid = pstrdup(de->d_name);
10808 ti->path = pstrdup(buflinkpath.data);
10809 ti->rpath = relpath ? pstrdup(relpath) : NULL;
10810 ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
10811
10812 if (tablespaces)
10813 *tablespaces = lappend(*tablespaces, ti);
10814
10815 appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
10816
10817 pfree(buflinkpath.data);
10818 #else
10819
10820 /*
10821 * If the platform does not have symbolic links, it should not be
10822 * possible to have tablespaces - clearly somebody else created
10823 * them. Warn about it and ignore.
10824 */
10825 ereport(WARNING,
10826 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
10827 errmsg("tablespaces are not supported on this platform")));
10828 #endif
10829 }
10830
10831 /*
10832 * Construct backup label file
10833 */
10834 if (exclusive)
10835 labelfile = makeStringInfo();
10836
10837 /* Use the log timezone here, not the session timezone */
10838 stamp_time = (pg_time_t) time(NULL);
10839 pg_strftime(strfbuf, sizeof(strfbuf),
10840 "%Y-%m-%d %H:%M:%S %Z",
10841 pg_localtime(&stamp_time, log_timezone));
10842 appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
10843 (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10844 appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
10845 (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10846 appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
10847 exclusive ? "pg_start_backup" : "streamed");
10848 appendStringInfo(labelfile, "BACKUP FROM: %s\n",
10849 backup_started_in_recovery ? "standby" : "master");
10850 appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
10851 appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
10852
10853 /*
10854 * Okay, write the file, or return its contents to caller.
10855 */
10856 if (exclusive)
10857 {
10858 /*
10859 * Check for existing backup label --- implies a backup is already
10860 * running. (XXX given that we checked exclusiveBackupState
10861 * above, maybe it would be OK to just unlink any such label
10862 * file?)
10863 */
10864 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10865 {
10866 if (errno != ENOENT)
10867 ereport(ERROR,
10868 (errcode_for_file_access(),
10869 errmsg("could not stat file \"%s\": %m",
10870 BACKUP_LABEL_FILE)));
10871 }
10872 else
10873 ereport(ERROR,
10874 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10875 errmsg("a backup is already in progress"),
10876 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10877 BACKUP_LABEL_FILE)));
10878
10879 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10880
10881 if (!fp)
10882 ereport(ERROR,
10883 (errcode_for_file_access(),
10884 errmsg("could not create file \"%s\": %m",
10885 BACKUP_LABEL_FILE)));
10886 if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
10887 fflush(fp) != 0 ||
10888 pg_fsync(fileno(fp)) != 0 ||
10889 ferror(fp) ||
10890 FreeFile(fp))
10891 ereport(ERROR,
10892 (errcode_for_file_access(),
10893 errmsg("could not write file \"%s\": %m",
10894 BACKUP_LABEL_FILE)));
10895 /* Allocated locally for exclusive backups, so free separately */
10896 pfree(labelfile->data);
10897 pfree(labelfile);
10898
10899 /* Write backup tablespace_map file. */
10900 if (tblspcmapfile->len > 0)
10901 {
10902 if (stat(TABLESPACE_MAP, &stat_buf) != 0)
10903 {
10904 if (errno != ENOENT)
10905 ereport(ERROR,
10906 (errcode_for_file_access(),
10907 errmsg("could not stat file \"%s\": %m",
10908 TABLESPACE_MAP)));
10909 }
10910 else
10911 ereport(ERROR,
10912 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10913 errmsg("a backup is already in progress"),
10914 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10915 TABLESPACE_MAP)));
10916
10917 fp = AllocateFile(TABLESPACE_MAP, "w");
10918
10919 if (!fp)
10920 ereport(ERROR,
10921 (errcode_for_file_access(),
10922 errmsg("could not create file \"%s\": %m",
10923 TABLESPACE_MAP)));
10924 if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
10925 fflush(fp) != 0 ||
10926 pg_fsync(fileno(fp)) != 0 ||
10927 ferror(fp) ||
10928 FreeFile(fp))
10929 ereport(ERROR,
10930 (errcode_for_file_access(),
10931 errmsg("could not write file \"%s\": %m",
10932 TABLESPACE_MAP)));
10933 }
10934
10935 /* Allocated locally for exclusive backups, so free separately */
10936 pfree(tblspcmapfile->data);
10937 pfree(tblspcmapfile);
10938 }
10939 }
10940 PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10941
10942 /*
10943 * Mark that start phase has correctly finished for an exclusive backup.
10944 * Session-level locks are updated as well to reflect that state.
10945 *
10946 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating
10947 * backup counters and session-level lock. Otherwise they can be
10948 * updated inconsistently, and which might cause do_pg_abort_backup()
10949 * to fail.
10950 */
10951 if (exclusive)
10952 {
10953 WALInsertLockAcquireExclusive();
10954 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10955
10956 /* Set session-level lock */
10957 sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
10958 WALInsertLockRelease();
10959 }
10960 else
10961 sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
10962
10963 /*
10964 * We're done. As a convenience, return the starting WAL location.
10965 */
10966 if (starttli_p)
10967 *starttli_p = starttli;
10968 return startpoint;
10969 }
10970
10971 /* Error cleanup callback for pg_start_backup */
10972 static void
pg_start_backup_callback(int code,Datum arg)10973 pg_start_backup_callback(int code, Datum arg)
10974 {
10975 bool exclusive = DatumGetBool(arg);
10976
10977 /* Update backup counters and forcePageWrites on failure */
10978 WALInsertLockAcquireExclusive();
10979 if (exclusive)
10980 {
10981 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10982 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10983 }
10984 else
10985 {
10986 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10987 XLogCtl->Insert.nonExclusiveBackups--;
10988 }
10989
10990 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10991 XLogCtl->Insert.nonExclusiveBackups == 0)
10992 {
10993 XLogCtl->Insert.forcePageWrites = false;
10994 }
10995 WALInsertLockRelease();
10996 }
10997
10998 /*
10999 * Error cleanup callback for pg_stop_backup
11000 */
11001 static void
pg_stop_backup_callback(int code,Datum arg)11002 pg_stop_backup_callback(int code, Datum arg)
11003 {
11004 bool exclusive = DatumGetBool(arg);
11005
11006 /* Update backup status on failure */
11007 WALInsertLockAcquireExclusive();
11008 if (exclusive)
11009 {
11010 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
11011 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
11012 }
11013 WALInsertLockRelease();
11014 }
11015
/*
 * Utility routine to fetch the session-level status of a backup running.
 *
 * Returns this backend's sessionBackupState (none / exclusive /
 * non-exclusive).  The variable is backend-local, so no locking is needed.
 */
SessionBackupState
get_backup_status(void)
{
	return sessionBackupState;
}
11024
/*
 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
 * function.
 *
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
 * the non-exclusive backup specified by 'labelfile'.
 *
 * Returns the last WAL location that must be present to restore from this
 * backup, and the corresponding timeline ID in *stoptli_p.
 *
 * It is the responsibility of the caller of this function to verify the
 * permissions of the calling user!
 */
XLogRecPtr
do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
{
	bool		exclusive = (labelfile == NULL);
	bool		backup_started_in_recovery = false;
	XLogRecPtr	startpoint;
	XLogRecPtr	stoppoint;
	TimeLineID	stoptli;
	pg_time_t	stamp_time;
	char		strfbuf[128];
	char		histfilepath[MAXPGPATH];
	char		startxlogfilename[MAXFNAMELEN];
	char		stopxlogfilename[MAXFNAMELEN];
	char		lastxlogfilename[MAXFNAMELEN];
	char		histfilename[MAXFNAMELEN];
	char		backupfrom[20];
	XLogSegNo	_logSegNo;
	FILE	   *lfp;
	FILE	   *fp;
	char		ch;
	int			seconds_before_warning;
	int			waits = 0;
	bool		reported_waiting = false;
	char	   *remaining;
	char	   *ptr;
	uint32		hi,
				lo;

	backup_started_in_recovery = RecoveryInProgress();

	/*
	 * Currently only non-exclusive backup can be taken during recovery.
	 */
	if (backup_started_in_recovery && exclusive)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("recovery is in progress"),
				 errhint("WAL control functions cannot be executed during recovery.")));

	/*
	 * During recovery, we don't need to check WAL level. Because, if WAL
	 * level is not sufficient, it's impossible to get here during recovery.
	 */
	if (!backup_started_in_recovery && !XLogIsNeeded())
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("WAL level not sufficient for making an online backup"),
				 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));

	if (exclusive)
	{
		/*
		 * At first, mark that we're now stopping an exclusive backup, to
		 * ensure that there are no other sessions currently running
		 * pg_start_backup() or pg_stop_backup().
		 */
		WALInsertLockAcquireExclusive();
		if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
		{
			WALInsertLockRelease();
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("exclusive backup not in progress")));
		}
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
		WALInsertLockRelease();

		/*
		 * Remove backup_label. In case of failure, the state for an exclusive
		 * backup is switched back to in-progress.
		 */
		PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
		{
			/*
			 * Read the existing label file into memory.
			 */
			struct stat statbuf;
			int			r;

			if (stat(BACKUP_LABEL_FILE, &statbuf))
			{
				/* should not happen per the upper checks */
				if (errno != ENOENT)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat file \"%s\": %m",
									BACKUP_LABEL_FILE)));
				ereport(ERROR,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("a backup is not in progress")));
			}

			lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
			if (!lfp)
			{
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			}
			/* slurp the whole file; +1 for the terminating NUL */
			labelfile = palloc(statbuf.st_size + 1);
			r = fread(labelfile, statbuf.st_size, 1, lfp);
			labelfile[statbuf.st_size] = '\0';

			/*
			 * Close and remove the backup label file
			 */
			if (r != 1 || ferror(lfp) || FreeFile(lfp))
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not read file \"%s\": %m",
								BACKUP_LABEL_FILE)));
			durable_unlink(BACKUP_LABEL_FILE, ERROR);

			/*
			 * Remove tablespace_map file if present, it is created only if
			 * there are tablespaces.
			 */
			durable_unlink(TABLESPACE_MAP, DEBUG1);
		}
		PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
	}

	/*
	 * OK to update backup counters, forcePageWrites and session-level lock.
	 *
	 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
	 * Otherwise they can be updated inconsistently, and which might cause
	 * do_pg_abort_backup() to fail.
	 */
	WALInsertLockAcquireExclusive();
	if (exclusive)
	{
		XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
	}
	else
	{
		/*
		 * The user-visible pg_start/stop_backup() functions that operate on
		 * exclusive backups can be called at any time, but for non-exclusive
		 * backups, it is expected that each do_pg_start_backup() call is
		 * matched by exactly one do_pg_stop_backup() call.
		 */
		Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
		XLogCtl->Insert.nonExclusiveBackups--;
	}

	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}

	/*
	 * Clean up session-level lock.
	 *
	 * You might think that WALInsertLockRelease() can be called
	 * before cleaning up session-level lock because session-level
	 * lock doesn't need to be protected with WAL insertion lock.
	 * But since CHECK_FOR_INTERRUPTS() can occur in it,
	 * session-level lock must be cleaned up before it.
	 */
	sessionBackupState = SESSION_BACKUP_NONE;

	WALInsertLockRelease();

	/*
	 * Read and parse the START WAL LOCATION line (this code is pretty crude,
	 * but we are not expecting any variability in the file format).
	 */
	if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
			   &hi, &lo, startxlogfilename,
			   &ch) != 4 || ch != '\n')
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	startpoint = ((uint64) hi) << 32 | lo;
	remaining = strchr(labelfile, '\n') + 1;	/* %n is not portable enough */

	/*
	 * Parse the BACKUP FROM line. If we are taking an online backup from the
	 * standby, we confirm that the standby has not been promoted during the
	 * backup.
	 */
	ptr = strstr(remaining, "BACKUP FROM:");
	if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
		ereport(ERROR,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("the standby was promoted during online backup"),
				 errhint("This means that the backup being taken is corrupt "
						 "and should not be used. "
						 "Try taking another online backup.")));

	/*
	 * During recovery, we don't write an end-of-backup record. We assume that
	 * pg_control was backed up last and its minimum recovery point can be
	 * available as the backup end location. Since we don't have an
	 * end-of-backup record, we use the pg_control value to check whether
	 * we've reached the end of backup when starting recovery from this
	 * backup. We have no way of checking if pg_control wasn't backed up last
	 * however.
	 *
	 * We don't force a switch to new WAL file but it is still possible to
	 * wait for all the required files to be archived if waitforarchive is
	 * true. This is okay if we use the backup to start a standby and fetch
	 * the missing WAL using streaming replication. But in the case of an
	 * archive recovery, a user should set waitforarchive to true and wait for
	 * them to be archived to ensure that all the required files are
	 * available.
	 *
	 * We return the current minimum recovery point as the backup end
	 * location. Note that it can be greater than the exact backup end
	 * location if the minimum recovery point is updated after the backup of
	 * pg_control. This is harmless for current uses.
	 *
	 * XXX currently a backup history file is for informational and debug
	 * purposes only. It's not essential for an online backup. Furthermore,
	 * even if it's created, it will not be archived during recovery because
	 * an archiver is not invoked. So it doesn't seem worthwhile to write a
	 * backup history file during recovery.
	 */
	if (backup_started_in_recovery)
	{
		XLogRecPtr	recptr;

		/*
		 * Check to see if all WAL replayed during online backup contain
		 * full-page writes.
		 */
		SpinLockAcquire(&XLogCtl->info_lck);
		recptr = XLogCtl->lastFpwDisableRecPtr;
		SpinLockRelease(&XLogCtl->info_lck);

		if (startpoint <= recptr)
			ereport(ERROR,
					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
					 errmsg("WAL generated with full_page_writes=off was replayed "
							"during online backup"),
					 errhint("This means that the backup being taken on the standby "
							 "is corrupt and should not be used. "
							 "Enable full_page_writes and run CHECKPOINT on the master, "
							 "and then try an online backup again.")));


		LWLockAcquire(ControlFileLock, LW_SHARED);
		stoppoint = ControlFile->minRecoveryPoint;
		stoptli = ControlFile->minRecoveryPointTLI;
		LWLockRelease(ControlFileLock);
	}
	else
	{
		/*
		 * Write the backup-end xlog record
		 */
		XLogBeginInsert();
		XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
		stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
		stoptli = ThisTimeLineID;

		/*
		 * Force a switch to a new xlog segment file, so that the backup is
		 * valid as soon as archiver moves out the current segment file.
		 */
		RequestXLogSwitch(false);

		XLByteToPrevSeg(stoppoint, _logSegNo);
		XLogFileName(stopxlogfilename, stoptli, _logSegNo);

		/* Use the log timezone here, not the session timezone */
		stamp_time = (pg_time_t) time(NULL);
		pg_strftime(strfbuf, sizeof(strfbuf),
					"%Y-%m-%d %H:%M:%S %Z",
					pg_localtime(&stamp_time, log_timezone));

		/*
		 * Write the backup history file
		 */
		XLByteToSeg(startpoint, _logSegNo);
		BackupHistoryFilePath(histfilepath, stoptli, _logSegNo,
							  (uint32) (startpoint % XLogSegSize));
		fp = AllocateFile(histfilepath, "w");
		if (!fp)
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not create file \"%s\": %m",
							histfilepath)));
		fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
				(uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
		fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
				(uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
		/* transfer remaining lines from label to history file */
		fprintf(fp, "%s", remaining);
		fprintf(fp, "STOP TIME: %s\n", strfbuf);
		if (fflush(fp) || ferror(fp) || FreeFile(fp))
			ereport(ERROR,
					(errcode_for_file_access(),
					 errmsg("could not write file \"%s\": %m",
							histfilepath)));

		/*
		 * Clean out any no-longer-needed history files. As a side effect,
		 * this will post a .ready file for the newly created history file,
		 * notifying the archiver that history file may be archived
		 * immediately.
		 */
		CleanupBackupHistory();
	}

	/*
	 * If archiving is enabled, wait for all the required WAL files to be
	 * archived before returning. If archiving isn't enabled, the required WAL
	 * needs to be transported via streaming replication (hopefully with
	 * wal_keep_segments set high enough), or some more exotic mechanism like
	 * polling and copying files from pg_wal with script. We have no knowledge
	 * of those mechanisms, so it's up to the user to ensure that he gets all
	 * the required WAL.
	 *
	 * We wait until both the last WAL file filled during backup and the
	 * history file have been archived, and assume that the alphabetic sorting
	 * property of the WAL files ensures any earlier WAL files are safely
	 * archived as well.
	 *
	 * We wait forever, since archive_command is supposed to work and we
	 * assume the admin wanted his backup to work completely. If you don't
	 * wish to wait, then either waitforarchive should be passed in as false,
	 * or you can set statement_timeout. Also, some notices are issued to
	 * clue in anyone who might be doing this interactively.
	 */

	if (waitforarchive &&
		((!backup_started_in_recovery && XLogArchivingActive()) ||
		 (backup_started_in_recovery && XLogArchivingAlways())))
	{
		XLByteToPrevSeg(stoppoint, _logSegNo);
		XLogFileName(lastxlogfilename, stoptli, _logSegNo);

		XLByteToSeg(startpoint, _logSegNo);
		BackupHistoryFileName(histfilename, stoptli, _logSegNo,
							  (uint32) (startpoint % XLogSegSize));

		/* first warning after 60s, then doubling intervals thereafter */
		seconds_before_warning = 60;
		waits = 0;

		while (XLogArchiveIsBusy(lastxlogfilename) ||
			   XLogArchiveIsBusy(histfilename))
		{
			CHECK_FOR_INTERRUPTS();

			if (!reported_waiting && waits > 5)
			{
				ereport(NOTICE,
						(errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
				reported_waiting = true;
			}

			/* poll once per second */
			pg_usleep(1000000L);

			if (++waits >= seconds_before_warning)
			{
				seconds_before_warning *= 2;	/* This wraps in >10 years... */
				ereport(WARNING,
						(errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
								waits),
						 errhint("Check that your archive_command is executing properly. "
								 "pg_stop_backup can be canceled safely, "
								 "but the database backup will not be usable without all the WAL segments.")));
			}
		}

		ereport(NOTICE,
				(errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
	}
	else if (waitforarchive)
		ereport(NOTICE,
				(errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));

	/*
	 * We're done. As a convenience, return the ending WAL location.
	 */
	if (stoptli_p)
		*stoptli_p = stoptli;
	return stoppoint;
}
11425
11426
11427 /*
11428 * do_pg_abort_backup: abort a running backup
11429 *
11430 * This does just the most basic steps of do_pg_stop_backup(), by taking the
11431 * system out of backup mode, thus making it a lot more safe to call from
11432 * an error handler.
11433 *
11434 * NB: This is only for aborting a non-exclusive backup that doesn't write
11435 * backup_label. A backup started with pg_start_backup() needs to be finished
11436 * with pg_stop_backup().
11437 */
11438 void
do_pg_abort_backup(void)11439 do_pg_abort_backup(void)
11440 {
11441 /*
11442 * Quick exit if session is not keeping around a non-exclusive backup
11443 * already started.
11444 */
11445 if (sessionBackupState == SESSION_BACKUP_NONE)
11446 return;
11447
11448 WALInsertLockAcquireExclusive();
11449 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
11450 Assert(sessionBackupState == SESSION_BACKUP_NON_EXCLUSIVE);
11451 XLogCtl->Insert.nonExclusiveBackups--;
11452
11453 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
11454 XLogCtl->Insert.nonExclusiveBackups == 0)
11455 {
11456 XLogCtl->Insert.forcePageWrites = false;
11457 }
11458 WALInsertLockRelease();
11459 }
11460
11461 /*
11462 * Get latest redo apply position.
11463 *
11464 * Exported to allow WALReceiver to read the pointer directly.
11465 */
11466 XLogRecPtr
GetXLogReplayRecPtr(TimeLineID * replayTLI)11467 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11468 {
11469 XLogRecPtr recptr;
11470 TimeLineID tli;
11471
11472 SpinLockAcquire(&XLogCtl->info_lck);
11473 recptr = XLogCtl->lastReplayedEndRecPtr;
11474 tli = XLogCtl->lastReplayedTLI;
11475 SpinLockRelease(&XLogCtl->info_lck);
11476
11477 if (replayTLI)
11478 *replayTLI = tli;
11479 return recptr;
11480 }
11481
11482 /*
11483 * Get latest WAL insert pointer
11484 */
11485 XLogRecPtr
GetXLogInsertRecPtr(void)11486 GetXLogInsertRecPtr(void)
11487 {
11488 XLogCtlInsert *Insert = &XLogCtl->Insert;
11489 uint64 current_bytepos;
11490
11491 SpinLockAcquire(&Insert->insertpos_lck);
11492 current_bytepos = Insert->CurrBytePos;
11493 SpinLockRelease(&Insert->insertpos_lck);
11494
11495 return XLogBytePosToRecPtr(current_bytepos);
11496 }
11497
/*
 * Get latest WAL write pointer
 *
 * Note the side effect: this refreshes the backend-local LogwrtResult
 * cache from the shared copy before reporting the Write position.
 */
XLogRecPtr
GetXLogWriteRecPtr(void)
{
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Write;
}
11510
/*
 * Returns the redo pointer of the last checkpoint or restartpoint. This is
 * the oldest point in WAL that we still need, if we have to restart recovery.
 *
 * Both outputs are filled from the control file's checkpoint copy, read
 * under ControlFileLock so they are mutually consistent.
 */
void
GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
{
	LWLockAcquire(ControlFileLock, LW_SHARED);
	*oldrecptr = ControlFile->checkPointCopy.redo;
	*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
	LWLockRelease(ControlFileLock);
}
11523
/*
 * read_backup_label: check to see if a backup_label file is present
 *
 * If we see a backup_label during recovery, we assume that we are recovering
 * from a backup dump file, and we therefore roll forward from the checkpoint
 * identified by the label file, NOT what pg_control says. This avoids the
 * problem that pg_control might have been archived one or more checkpoints
 * later than the start of the dump, and so if we rely on it as the start
 * point, we will fail to restore a consistent database state.
 *
 * Returns TRUE if a backup_label was found (and fills the checkpoint
 * location and its REDO location into *checkPointLoc and RedoStartLSN,
 * respectively); returns FALSE if not. If this backup_label came from a
 * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
 * was created during recovery, *backupFromStandby is set to TRUE.
 */
static bool
read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
				  bool *backupFromStandby)
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli;
	FILE	   *lfp;
	char		ch;
	char		backuptype[20];
	char		backupfrom[20];
	uint32		hi,
				lo;

	/* Default outputs for the optional fields parsed below */
	*backupEndRequired = false;
	*backupFromStandby = false;

	/*
	 * See if label file is present
	 */
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
	 * is pretty crude, but we are not expecting any variability in the file
	 * format).
	 *
	 * The WAL file name is split as %08X (the timeline) plus the remaining
	 * 16 hex digits captured into startxlogfilename; ch must be the exact
	 * trailing newline or the file is malformed.
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
			   &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	RedoStartLSN = ((uint64) hi) << 32 | lo;
	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &hi, &lo, &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	*checkPointLoc = ((uint64) hi) << 32 | lo;

	/*
	 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
	 * from an older backup anyway, but since the information on it is not
	 * strictly required, don't error out if it's missing for some reason.
	 */
	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
	{
		if (strcmp(backuptype, "streamed") == 0)
			*backupEndRequired = true;
	}

	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
	{
		if (strcmp(backupfrom, "standby") == 0)
			*backupFromStandby = true;
	}

	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						BACKUP_LABEL_FILE)));

	return true;
}
11613
/*
 * read_tablespace_map: check to see if a tablespace_map file is present
 *
 * If we see a tablespace_map file during recovery, we assume that we are
 * recovering from a backup dump file, and we therefore need to create symlinks
 * as per the information present in tablespace_map file.
 *
 * Returns TRUE if a tablespace_map file was found (and fills the link
 * information for all the tablespace links present in file); returns FALSE
 * if not.
 */
static bool
read_tablespace_map(List **tablespaces)
{
	tablespaceinfo *ti;
	FILE	   *lfp;
	char		tbsoid[MAXPGPATH];
	char	   *tbslinkpath;
	char		str[MAXPGPATH];
	int			ch,
				prev_ch = -1,	/* previous character, for escape detection */
				i = 0,			/* fill position in str[] */
				n;

	/*
	 * See if tablespace_map file is present
	 */
	lfp = AllocateFile(TABLESPACE_MAP, "r");
	if (!lfp)
	{
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							TABLESPACE_MAP)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the link name and path lines from tablespace_map file
	 * (this code is pretty crude, but we are not expecting any variability in
	 * the file format). While taking backup we embed escape character '\\'
	 * before newline in tablespace path, so that during reading of
	 * tablespace_map file, we could distinguish newline in tablespace path
	 * and end of line. Now while reading tablespace_map file, remove the
	 * escape character that has been added in tablespace path during backup.
	 */
	while ((ch = fgetc(lfp)) != EOF)
	{
		if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
		{
			/* unescaped newline: str[] now holds one "oid path" entry */
			str[i] = '\0';
			if (sscanf(str, "%s %n", tbsoid, &n) != 1)
				ereport(FATAL,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
			/* the rest of the line after the OID is the link path */
			tbslinkpath = str + n;
			i = 0;

			ti = palloc(sizeof(tablespaceinfo));
			ti->oid = pstrdup(tbsoid);
			ti->path = pstrdup(tbslinkpath);

			*tablespaces = lappend(*tablespaces, ti);
			continue;
		}
		else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
			/* escaped newline: replace the backslash with the literal char */
			str[i - 1] = ch;
		else if (i < sizeof(str) - 1)
			str[i++] = ch;
		prev_ch = ch;
	}

	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						TABLESPACE_MAP)));

	return true;
}
11695
11696 /*
11697 * Error context callback for errors occurring during rm_redo().
11698 */
11699 static void
rm_redo_error_callback(void * arg)11700 rm_redo_error_callback(void *arg)
11701 {
11702 XLogReaderState *record = (XLogReaderState *) arg;
11703 StringInfoData buf;
11704
11705 initStringInfo(&buf);
11706 xlog_outdesc(&buf, record);
11707
11708 /* translator: %s is a WAL record description */
11709 errcontext("WAL redo at %X/%X for %s",
11710 (uint32) (record->ReadRecPtr >> 32),
11711 (uint32) record->ReadRecPtr,
11712 buf.data);
11713
11714 pfree(buf.data);
11715 }
11716
11717 /*
11718 * BackupInProgress: check if online backup mode is active
11719 *
11720 * This is done by checking for existence of the "backup_label" file.
11721 */
11722 bool
BackupInProgress(void)11723 BackupInProgress(void)
11724 {
11725 struct stat stat_buf;
11726
11727 return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11728 }
11729
11730 /*
11731 * CancelBackup: rename the "backup_label" and "tablespace_map"
11732 * files to cancel backup mode
11733 *
11734 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11735 * Similarly, if the "tablespace_map" file exists, it will be renamed to
11736 * "tablespace_map.old".
11737 *
11738 * Note that this will render an online backup in progress
11739 * useless. To correctly finish an online backup, pg_stop_backup must be
11740 * called.
11741 */
11742 void
CancelBackup(void)11743 CancelBackup(void)
11744 {
11745 struct stat stat_buf;
11746
11747 /* if the backup_label file is not there, return */
11748 if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
11749 return;
11750
11751 /* remove leftover file from previously canceled backup if it exists */
11752 unlink(BACKUP_LABEL_OLD);
11753
11754 if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
11755 {
11756 ereport(WARNING,
11757 (errcode_for_file_access(),
11758 errmsg("online backup mode was not canceled"),
11759 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
11760 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11761 return;
11762 }
11763
11764 /* if the tablespace_map file is not there, return */
11765 if (stat(TABLESPACE_MAP, &stat_buf) < 0)
11766 {
11767 ereport(LOG,
11768 (errmsg("online backup mode canceled"),
11769 errdetail("File \"%s\" was renamed to \"%s\".",
11770 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
11771 return;
11772 }
11773
11774 /* remove leftover file from previously canceled backup if it exists */
11775 unlink(TABLESPACE_MAP_OLD);
11776
11777 if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
11778 {
11779 ereport(LOG,
11780 (errmsg("online backup mode canceled"),
11781 errdetail("Files \"%s\" and \"%s\" were renamed to "
11782 "\"%s\" and \"%s\", respectively.",
11783 BACKUP_LABEL_FILE, TABLESPACE_MAP,
11784 BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
11785 }
11786 else
11787 {
11788 ereport(WARNING,
11789 (errcode_for_file_access(),
11790 errmsg("online backup mode canceled"),
11791 errdetail("File \"%s\" was renamed to \"%s\", but "
11792 "file \"%s\" could not be renamed to \"%s\": %m.",
11793 BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
11794 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
11795 }
11796 }
11797
11798 /*
11799 * Read the XLOG page containing RecPtr into readBuf (if not read already).
11800 * Returns number of bytes read, if the page is read successfully, or -1
11801 * in case of errors. When errors occur, they are ereport'ed, but only
11802 * if they have not been previously reported.
11803 *
11804 * This is responsible for restoring files from archive as needed, as well
11805 * as for waiting for the requested WAL record to arrive in standby mode.
11806 *
11807 * 'emode' specifies the log level used for reporting "file not found" or
11808 * "end of WAL" situations in archive recovery, or in standby mode when a
11809 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
11810 * false in those situations, on higher log levels the ereport() won't
11811 * return.
11812 *
11813 * In standby mode, if after a successful return of XLogPageRead() the
11814 * caller finds the record it's interested in to be broken, it should
11815 * ereport the error with the level determined by
11816 * emode_for_corrupt_record(), and then set lastSourceFailed
11817 * and call XLogPageRead() again with the same arguments. This lets
11818 * XLogPageRead() to try fetching the record from another source, or to
11819 * sleep and retry.
11820 */
11821 static int
XLogPageRead(XLogReaderState * xlogreader,XLogRecPtr targetPagePtr,int reqLen,XLogRecPtr targetRecPtr,char * readBuf,TimeLineID * readTLI)11822 XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
11823 XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
11824 {
11825 XLogPageReadPrivate *private =
11826 (XLogPageReadPrivate *) xlogreader->private_data;
11827 int emode = private->emode;
11828 uint32 targetPageOff;
11829 XLogSegNo targetSegNo PG_USED_FOR_ASSERTS_ONLY;
11830
11831 XLByteToSeg(targetPagePtr, targetSegNo);
11832 targetPageOff = targetPagePtr % XLogSegSize;
11833
11834 /*
11835 * See if we need to switch to a new segment because the requested record
11836 * is not in the currently open one.
11837 */
11838 if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
11839 {
11840 /*
11841 * Request a restartpoint if we've replayed too much xlog since the
11842 * last one.
11843 */
11844 if (bgwriterLaunched)
11845 {
11846 if (XLogCheckpointNeeded(readSegNo))
11847 {
11848 (void) GetRedoRecPtr();
11849 if (XLogCheckpointNeeded(readSegNo))
11850 RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
11851 }
11852 }
11853
11854 close(readFile);
11855 readFile = -1;
11856 readSource = 0;
11857 }
11858
11859 XLByteToSeg(targetPagePtr, readSegNo);
11860
11861 retry:
11862 /* See if we need to retrieve more data */
11863 if (readFile < 0 ||
11864 (readSource == XLOG_FROM_STREAM &&
11865 receivedUpto < targetPagePtr + reqLen))
11866 {
11867 if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
11868 private->randAccess,
11869 private->fetching_ckpt,
11870 targetRecPtr))
11871 {
11872 if (readFile >= 0)
11873 close(readFile);
11874 readFile = -1;
11875 readLen = 0;
11876 readSource = 0;
11877
11878 return -1;
11879 }
11880 }
11881
11882 /*
11883 * At this point, we have the right segment open and if we're streaming we
11884 * know the requested record is in it.
11885 */
11886 Assert(readFile != -1);
11887
11888 /*
11889 * If the current segment is being streamed from master, calculate how
11890 * much of the current page we have received already. We know the
11891 * requested record has been received, but this is for the benefit of
11892 * future calls, to allow quick exit at the top of this function.
11893 */
11894 if (readSource == XLOG_FROM_STREAM)
11895 {
11896 if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
11897 readLen = XLOG_BLCKSZ;
11898 else
11899 readLen = receivedUpto % XLogSegSize - targetPageOff;
11900 }
11901 else
11902 readLen = XLOG_BLCKSZ;
11903
11904 /* Read the requested page */
11905 readOff = targetPageOff;
11906 if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
11907 {
11908 char fname[MAXFNAMELEN];
11909 int save_errno = errno;
11910
11911 XLogFileName(fname, curFileTLI, readSegNo);
11912 errno = save_errno;
11913 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11914 (errcode_for_file_access(),
11915 errmsg("could not seek in log segment %s to offset %u: %m",
11916 fname, readOff)));
11917 goto next_record_is_invalid;
11918 }
11919
11920 pgstat_report_wait_start(WAIT_EVENT_WAL_READ);
11921 if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
11922 {
11923 char fname[MAXFNAMELEN];
11924 int save_errno = errno;
11925
11926 pgstat_report_wait_end();
11927 XLogFileName(fname, curFileTLI, readSegNo);
11928 errno = save_errno;
11929 ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
11930 (errcode_for_file_access(),
11931 errmsg("could not read from log segment %s, offset %u: %m",
11932 fname, readOff)));
11933 goto next_record_is_invalid;
11934 }
11935 pgstat_report_wait_end();
11936
11937 Assert(targetSegNo == readSegNo);
11938 Assert(targetPageOff == readOff);
11939 Assert(reqLen <= readLen);
11940
11941 *readTLI = curFileTLI;
11942
11943 /*
11944 * Check the page header immediately, so that we can retry immediately if
11945 * it's not valid. This may seem unnecessary, because XLogReadRecord()
11946 * validates the page header anyway, and would propagate the failure up to
11947 * ReadRecord(), which would retry. However, there's a corner case with
11948 * continuation records, if a record is split across two pages such that
11949 * we would need to read the two pages from different sources. For
11950 * example, imagine a scenario where a streaming replica is started up,
11951 * and replay reaches a record that's split across two WAL segments. The
11952 * first page is only available locally, in pg_wal, because it's already
11953 * been recycled in the master. The second page, however, is not present
11954 * in pg_wal, and we should stream it from the master. There is a recycled
11955 * WAL segment present in pg_wal, with garbage contents, however. We would
11956 * read the first page from the local WAL segment, but when reading the
11957 * second page, we would read the bogus, recycled, WAL segment. If we
11958 * didn't catch that case here, we would never recover, because
11959 * ReadRecord() would retry reading the whole record from the beginning.
11960 *
11961 * Of course, this only catches errors in the page header, which is what
11962 * happens in the case of a recycled WAL segment. Other kinds of errors or
11963 * corruption still has the same problem. But this at least fixes the
11964 * common case, which can happen as part of normal operation.
11965 *
11966 * Validating the page header is cheap enough that doing it twice
11967 * shouldn't be a big deal from a performance point of view.
11968 */
11969 if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
11970 {
11971 /* reset any error XLogReaderValidatePageHeader() might have set */
11972 xlogreader->errormsg_buf[0] = '\0';
11973 goto next_record_is_invalid;
11974 }
11975
11976 return readLen;
11977
11978 next_record_is_invalid:
11979 lastSourceFailed = true;
11980
11981 if (readFile >= 0)
11982 close(readFile);
11983 readFile = -1;
11984 readLen = 0;
11985 readSource = 0;
11986
11987 /* In standby-mode, keep trying */
11988 if (StandbyMode)
11989 goto retry;
11990 else
11991 return -1;
11992 }
11993
11994 /*
11995 * Open the WAL segment containing WAL location 'RecPtr'.
11996 *
11997 * The segment can be fetched via restore_command, or via walreceiver having
11998 * streamed the record, or it can already be present in pg_wal. Checking
11999 * pg_wal is mainly for crash recovery, but it will be polled in standby mode
12000 * too, in case someone copies a new segment directly to pg_wal. That is not
12001 * documented or recommended, though.
12002 *
12003 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
12004 * prepare to read WAL starting from RedoStartLSN after this.
12005 *
12006 * 'RecPtr' might not point to the beginning of the record we're interested
12007 * in, it might also point to the page or segment header. In that case,
12008 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
12009 * used to decide which timeline to stream the requested WAL from.
12010 *
12011 * If the record is not immediately available, the function returns false
12012 * if we're not in standby mode. In standby mode, waits for it to become
12013 * available.
12014 *
12015 * When the requested record becomes available, the function opens the file
12016 * containing it (if not open already), and returns true. When end of standby
12017 * mode is triggered by the user, and there is no more WAL available, returns
12018 * false.
12019 */
12020 static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr,bool randAccess,bool fetching_ckpt,XLogRecPtr tliRecPtr)12021 WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
12022 bool fetching_ckpt, XLogRecPtr tliRecPtr)
12023 {
12024 static TimestampTz last_fail_time = 0;
12025 TimestampTz now;
12026 bool streaming_reply_sent = false;
12027
12028 /*-------
12029 * Standby mode is implemented by a state machine:
12030 *
12031 * 1. Read from either archive or pg_wal (XLOG_FROM_ARCHIVE), or just
12032 * pg_wal (XLOG_FROM_PG_WAL)
12033 * 2. Check trigger file
12034 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
12035 * 4. Rescan timelines
12036 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
12037 *
12038 * Failure to read from the current source advances the state machine to
12039 * the next state.
12040 *
12041 * 'currentSource' indicates the current state. There are no currentSource
12042 * values for "check trigger", "rescan timelines", and "sleep" states,
12043 * those actions are taken when reading from the previous source fails, as
12044 * part of advancing to the next state.
12045 *
12046 * If standby mode is turned off while reading WAL from stream, we move
12047 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
12048 * the files (which would be required at end of recovery, e.g., timeline
12049 * history file) from archive or pg_wal. We don't need to kill WAL receiver
12050 * here because it's already stopped when standby mode is turned off at
12051 * the end of recovery.
12052 *-------
12053 */
12054 if (!InArchiveRecovery)
12055 currentSource = XLOG_FROM_PG_WAL;
12056 else if (currentSource == 0 ||
12057 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
12058 {
12059 lastSourceFailed = false;
12060 currentSource = XLOG_FROM_ARCHIVE;
12061 }
12062
12063 for (;;)
12064 {
12065 int oldSource = currentSource;
12066
12067 /*
12068 * First check if we failed to read from the current source, and
12069 * advance the state machine if so. The failure to read might've
12070 * happened outside this function, e.g when a CRC check fails on a
12071 * record, or within this loop.
12072 */
12073 if (lastSourceFailed)
12074 {
12075 switch (currentSource)
12076 {
12077 case XLOG_FROM_ARCHIVE:
12078 case XLOG_FROM_PG_WAL:
12079
12080 /*
12081 * Check to see if the trigger file exists. Note that we
12082 * do this only after failure, so when you create the
12083 * trigger file, we still finish replaying as much as we
12084 * can from archive and pg_wal before failover.
12085 */
12086 if (StandbyMode && CheckForStandbyTrigger())
12087 {
12088 ShutdownWalRcv();
12089 return false;
12090 }
12091
12092 /*
12093 * Not in standby mode, and we've now tried the archive
12094 * and pg_wal.
12095 */
12096 if (!StandbyMode)
12097 return false;
12098
12099 /*
12100 * If primary_conninfo is set, launch walreceiver to try
12101 * to stream the missing WAL.
12102 *
12103 * If fetching_ckpt is TRUE, RecPtr points to the initial
12104 * checkpoint location. In that case, we use RedoStartLSN
12105 * as the streaming start position instead of RecPtr, so
12106 * that when we later jump backwards to start redo at
12107 * RedoStartLSN, we will have the logs streamed already.
12108 */
12109 if (PrimaryConnInfo)
12110 {
12111 XLogRecPtr ptr;
12112 TimeLineID tli;
12113
12114 if (fetching_ckpt)
12115 {
12116 ptr = RedoStartLSN;
12117 tli = ControlFile->checkPointCopy.ThisTimeLineID;
12118 }
12119 else
12120 {
12121 ptr = RecPtr;
12122
12123 /*
12124 * Use the record begin position to determine the
12125 * TLI, rather than the position we're reading.
12126 */
12127 tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);
12128
12129 if (curFileTLI > 0 && tli < curFileTLI)
12130 elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
12131 (uint32) (tliRecPtr >> 32),
12132 (uint32) tliRecPtr,
12133 tli, curFileTLI);
12134 }
12135 curFileTLI = tli;
12136 RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
12137 PrimarySlotName);
12138 receivedUpto = 0;
12139 }
12140
12141 /*
12142 * Move to XLOG_FROM_STREAM state in either case. We'll
12143 * get immediate failure if we didn't launch walreceiver,
12144 * and move on to the next state.
12145 */
12146 currentSource = XLOG_FROM_STREAM;
12147 break;
12148
12149 case XLOG_FROM_STREAM:
12150
12151 /*
12152 * Failure while streaming. Most likely, we got here
12153 * because streaming replication was terminated, or
12154 * promotion was triggered. But we also get here if we
12155 * find an invalid record in the WAL streamed from master,
12156 * in which case something is seriously wrong. There's
12157 * little chance that the problem will just go away, but
12158 * PANIC is not good for availability either, especially
12159 * in hot standby mode. So, we treat that the same as
12160 * disconnection, and retry from archive/pg_wal again. The
12161 * WAL in the archive should be identical to what was
12162 * streamed, so it's unlikely that it helps, but one can
12163 * hope...
12164 */
12165
12166 /*
12167 * We should be able to move to XLOG_FROM_STREAM
12168 * only in standby mode.
12169 */
12170 Assert(StandbyMode);
12171
12172 /*
12173 * Before we leave XLOG_FROM_STREAM state, make sure that
12174 * walreceiver is not active, so that it won't overwrite
12175 * WAL that we restore from archive.
12176 */
12177 if (WalRcvStreaming())
12178 ShutdownWalRcv();
12179
12180 /*
12181 * Before we sleep, re-scan for possible new timelines if
12182 * we were requested to recover to the latest timeline.
12183 */
12184 if (recoveryTargetIsLatest)
12185 {
12186 if (rescanLatestTimeLine())
12187 {
12188 currentSource = XLOG_FROM_ARCHIVE;
12189 break;
12190 }
12191 }
12192
12193 /*
12194 * XLOG_FROM_STREAM is the last state in our state
12195 * machine, so we've exhausted all the options for
12196 * obtaining the requested WAL. We're going to loop back
12197 * and retry from the archive, but if it hasn't been long
12198 * since last attempt, sleep wal_retrieve_retry_interval
12199 * milliseconds to avoid busy-waiting.
12200 */
12201 now = GetCurrentTimestamp();
12202 if (!TimestampDifferenceExceeds(last_fail_time, now,
12203 wal_retrieve_retry_interval))
12204 {
12205 long wait_time;
12206
12207 wait_time = wal_retrieve_retry_interval -
12208 TimestampDifferenceMilliseconds(last_fail_time, now);
12209
12210 WaitLatch(&XLogCtl->recoveryWakeupLatch,
12211 WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
12212 wait_time, WAIT_EVENT_RECOVERY_WAL_STREAM);
12213 ResetLatch(&XLogCtl->recoveryWakeupLatch);
12214 now = GetCurrentTimestamp();
12215
12216 /* Handle interrupt signals of startup process */
12217 HandleStartupProcInterrupts();
12218 }
12219 last_fail_time = now;
12220 currentSource = XLOG_FROM_ARCHIVE;
12221 break;
12222
12223 default:
12224 elog(ERROR, "unexpected WAL source %d", currentSource);
12225 }
12226 }
12227 else if (currentSource == XLOG_FROM_PG_WAL)
12228 {
12229 /*
12230 * We just successfully read a file in pg_wal. We prefer files in
12231 * the archive over ones in pg_wal, so try the next file again
12232 * from the archive first.
12233 */
12234 if (InArchiveRecovery)
12235 currentSource = XLOG_FROM_ARCHIVE;
12236 }
12237
12238 if (currentSource != oldSource)
12239 elog(DEBUG2, "switched WAL source from %s to %s after %s",
12240 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
12241 lastSourceFailed ? "failure" : "success");
12242
12243 /*
12244 * We've now handled possible failure. Try to read from the chosen
12245 * source.
12246 */
12247 lastSourceFailed = false;
12248
12249 switch (currentSource)
12250 {
12251 case XLOG_FROM_ARCHIVE:
12252 case XLOG_FROM_PG_WAL:
12253 /*
12254 * WAL receiver must not be running when reading WAL from
12255 * archive or pg_wal.
12256 */
12257 Assert(!WalRcvStreaming());
12258
12259 /* Close any old file we might have open. */
12260 if (readFile >= 0)
12261 {
12262 close(readFile);
12263 readFile = -1;
12264 }
12265 /* Reset curFileTLI if random fetch. */
12266 if (randAccess)
12267 curFileTLI = 0;
12268
12269 /*
12270 * Try to restore the file from archive, or read an existing
12271 * file from pg_wal.
12272 */
12273 readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
12274 currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
12275 currentSource);
12276 if (readFile >= 0)
12277 return true; /* success! */
12278
12279 /*
12280 * Nope, not found in archive or pg_wal.
12281 */
12282 lastSourceFailed = true;
12283 break;
12284
12285 case XLOG_FROM_STREAM:
12286 {
12287 bool havedata;
12288
12289 /*
12290 * We should be able to move to XLOG_FROM_STREAM
12291 * only in standby mode.
12292 */
12293 Assert(StandbyMode);
12294
12295 /*
12296 * Check if WAL receiver is still active.
12297 */
12298 if (!WalRcvStreaming())
12299 {
12300 lastSourceFailed = true;
12301 break;
12302 }
12303
12304 /*
12305 * Walreceiver is active, so see if new data has arrived.
12306 *
12307 * We only advance XLogReceiptTime when we obtain fresh
12308 * WAL from walreceiver and observe that we had already
12309 * processed everything before the most recent "chunk"
12310 * that it flushed to disk. In steady state where we are
12311 * keeping up with the incoming data, XLogReceiptTime will
12312 * be updated on each cycle. When we are behind,
12313 * XLogReceiptTime will not advance, so the grace time
12314 * allotted to conflicting queries will decrease.
12315 */
12316 if (RecPtr < receivedUpto)
12317 havedata = true;
12318 else
12319 {
12320 XLogRecPtr latestChunkStart;
12321
12322 receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
12323 if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
12324 {
12325 havedata = true;
12326 if (latestChunkStart <= RecPtr)
12327 {
12328 XLogReceiptTime = GetCurrentTimestamp();
12329 SetCurrentChunkStartTime(XLogReceiptTime);
12330 }
12331 }
12332 else
12333 havedata = false;
12334 }
12335 if (havedata)
12336 {
12337 /*
12338 * Great, streamed far enough. Open the file if it's
12339 * not open already. Also read the timeline history
12340 * file if we haven't initialized timeline history
12341 * yet; it should be streamed over and present in
12342 * pg_wal by now. Use XLOG_FROM_STREAM so that source
12343 * info is set correctly and XLogReceiptTime isn't
12344 * changed.
12345 *
12346 * NB: We must set readTimeLineHistory based on
12347 * recoveryTargetTLI, not receiveTLI. Normally they'll
12348 * be the same, but if recovery_target_timeline is
12349 * 'latest' and archiving is configured, then it's
12350 * possible that we managed to retrieve one or more
12351 * new timeline history files from the archive,
12352 * updating recoveryTargetTLI.
12353 */
12354 if (readFile < 0)
12355 {
12356 if (!expectedTLEs)
12357 expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
12358 readFile = XLogFileRead(readSegNo, PANIC,
12359 receiveTLI,
12360 XLOG_FROM_STREAM, false);
12361 Assert(readFile >= 0);
12362 }
12363 else
12364 {
12365 /* just make sure source info is correct... */
12366 readSource = XLOG_FROM_STREAM;
12367 XLogReceiptSource = XLOG_FROM_STREAM;
12368 return true;
12369 }
12370 break;
12371 }
12372
12373 /*
12374 * Data not here yet. Check for trigger, then wait for
12375 * walreceiver to wake us up when new WAL arrives.
12376 */
12377 if (CheckForStandbyTrigger())
12378 {
12379 /*
12380 * Note that we don't "return false" immediately here.
12381 * After being triggered, we still want to replay all
12382 * the WAL that was already streamed. It's in pg_wal
12383 * now, so we just treat this as a failure, and the
12384 * state machine will move on to replay the streamed
12385 * WAL from pg_wal, and then recheck the trigger and
12386 * exit replay.
12387 */
12388 lastSourceFailed = true;
12389 break;
12390 }
12391
12392 /*
12393 * Since we have replayed everything we have received so
12394 * far and are about to start waiting for more WAL, let's
12395 * tell the upstream server our replay location now so
12396 * that pg_stat_replication doesn't show stale
12397 * information.
12398 */
12399 if (!streaming_reply_sent)
12400 {
12401 WalRcvForceReply();
12402 streaming_reply_sent = true;
12403 }
12404
12405 /*
12406 * Wait for more WAL to arrive. Time out after 5 seconds
12407 * to react to a trigger file promptly.
12408 */
12409 WaitLatch(&XLogCtl->recoveryWakeupLatch,
12410 WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
12411 5000L, WAIT_EVENT_RECOVERY_WAL_ALL);
12412 ResetLatch(&XLogCtl->recoveryWakeupLatch);
12413 break;
12414 }
12415
12416 default:
12417 elog(ERROR, "unexpected WAL source %d", currentSource);
12418 }
12419
12420 /*
12421 * This possibly-long loop needs to handle interrupts of startup
12422 * process.
12423 */
12424 HandleStartupProcInterrupts();
12425 }
12426
12427 return false; /* not reached */
12428 }
12429
12430 /*
12431 * Determine what log level should be used to report a corrupt WAL record
12432 * in the current WAL page, previously read by XLogPageRead().
12433 *
12434 * 'emode' is the error mode that would be used to report a file-not-found
12435 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
12436 * we're retrying the exact same record that we've tried previously, only
12437 * complain the first time to keep the noise down. However, we only do when
12438 * reading from pg_wal, because we don't expect any invalid records in archive
12439 * or in records streamed from master. Files in the archive should be complete,
12440 * and we should never hit the end of WAL because we stop and wait for more WAL
12441 * to arrive before replaying it.
12442 *
12443 * NOTE: This function remembers the RecPtr value it was last called with,
12444 * to suppress repeated messages about the same record. Only call this when
12445 * you are about to ereport(), or you might cause a later message to be
12446 * erroneously suppressed.
12447 */
12448 static int
emode_for_corrupt_record(int emode,XLogRecPtr RecPtr)12449 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12450 {
12451 static XLogRecPtr lastComplaint = 0;
12452
12453 if (readSource == XLOG_FROM_PG_WAL && emode == LOG)
12454 {
12455 if (RecPtr == lastComplaint)
12456 emode = DEBUG1;
12457 else
12458 lastComplaint = RecPtr;
12459 }
12460 return emode;
12461 }
12462
12463 /*
12464 * Check to see whether the user-specified trigger file exists and whether a
12465 * promote request has arrived. If either condition holds, return true.
12466 */
12467 static bool
CheckForStandbyTrigger(void)12468 CheckForStandbyTrigger(void)
12469 {
12470 struct stat stat_buf;
12471 static bool triggered = false;
12472
12473 if (triggered)
12474 return true;
12475
12476 if (IsPromoteTriggered())
12477 {
12478 /*
12479 * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
12480 * signal handler. It now leaves the file in place and lets the
12481 * Startup process do the unlink. This allows Startup to know whether
12482 * it should create a full checkpoint before starting up (fallback
12483 * mode). Fast promotion takes precedence.
12484 */
12485 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12486 {
12487 unlink(PROMOTE_SIGNAL_FILE);
12488 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12489 fast_promote = true;
12490 }
12491 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12492 {
12493 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12494 fast_promote = false;
12495 }
12496
12497 ereport(LOG, (errmsg("received promote request")));
12498
12499 ResetPromoteTriggered();
12500 triggered = true;
12501 return true;
12502 }
12503
12504 if (TriggerFile == NULL)
12505 return false;
12506
12507 if (stat(TriggerFile, &stat_buf) == 0)
12508 {
12509 ereport(LOG,
12510 (errmsg("trigger file found: %s", TriggerFile)));
12511 unlink(TriggerFile);
12512 triggered = true;
12513 fast_promote = true;
12514 return true;
12515 }
12516 else if (errno != ENOENT)
12517 ereport(ERROR,
12518 (errcode_for_file_access(),
12519 errmsg("could not stat trigger file \"%s\": %m",
12520 TriggerFile)));
12521
12522 return false;
12523 }
12524
12525 /*
12526 * Remove the files signaling a standby promotion request.
12527 */
12528 void
RemovePromoteSignalFiles(void)12529 RemovePromoteSignalFiles(void)
12530 {
12531 unlink(PROMOTE_SIGNAL_FILE);
12532 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12533 }
12534
12535 /*
12536 * Check to see if a promote request has arrived. Should be
12537 * called by postmaster after receiving SIGUSR1.
12538 */
12539 bool
CheckPromoteSignal(void)12540 CheckPromoteSignal(void)
12541 {
12542 struct stat stat_buf;
12543
12544 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
12545 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12546 return true;
12547
12548 return false;
12549 }
12550
12551 /*
12552 * Wake up startup process to replay newly arrived WAL, or to notice that
12553 * failover has been requested.
12554 */
12555 void
WakeupRecovery(void)12556 WakeupRecovery(void)
12557 {
12558 SetLatch(&XLogCtl->recoveryWakeupLatch);
12559 }
12560
12561 /*
12562 * Update the WalWriterSleeping flag.
12563 */
12564 void
SetWalWriterSleeping(bool sleeping)12565 SetWalWriterSleeping(bool sleeping)
12566 {
12567 SpinLockAcquire(&XLogCtl->info_lck);
12568 XLogCtl->WalWriterSleeping = sleeping;
12569 SpinLockRelease(&XLogCtl->info_lck);
12570 }
12571
12572 /*
12573 * Schedule a walreceiver wakeup in the main recovery loop.
12574 */
12575 void
XLogRequestWalReceiverReply(void)12576 XLogRequestWalReceiverReply(void)
12577 {
12578 doRequestWalReceiverReply = true;
12579 }
12580