1 /*-------------------------------------------------------------------------
2 *
3 * xlog.c
4 * PostgreSQL transaction log manager
5 *
6 *
7 * Portions Copyright (c) 1996-2016, PostgreSQL Global Development Group
8 * Portions Copyright (c) 1994, Regents of the University of California
9 *
10 * src/backend/access/transam/xlog.c
11 *
12 *-------------------------------------------------------------------------
13 */
14
15 #include "postgres.h"
16
17 #include <ctype.h>
18 #include <time.h>
19 #include <fcntl.h>
20 #include <sys/stat.h>
21 #include <sys/time.h>
22 #include <unistd.h>
23
24 #include "access/clog.h"
25 #include "access/commit_ts.h"
26 #include "access/multixact.h"
27 #include "access/rewriteheap.h"
28 #include "access/subtrans.h"
29 #include "access/timeline.h"
30 #include "access/transam.h"
31 #include "access/tuptoaster.h"
32 #include "access/twophase.h"
33 #include "access/xact.h"
34 #include "access/xlog_internal.h"
35 #include "access/xloginsert.h"
36 #include "access/xlogreader.h"
37 #include "access/xlogutils.h"
38 #include "catalog/catversion.h"
39 #include "catalog/pg_control.h"
40 #include "catalog/pg_database.h"
41 #include "commands/tablespace.h"
42 #include "miscadmin.h"
43 #include "pgstat.h"
44 #include "postmaster/bgwriter.h"
45 #include "postmaster/walwriter.h"
46 #include "postmaster/startup.h"
47 #include "replication/basebackup.h"
48 #include "replication/logical.h"
49 #include "replication/slot.h"
50 #include "replication/origin.h"
51 #include "replication/snapbuild.h"
52 #include "replication/walreceiver.h"
53 #include "replication/walsender.h"
54 #include "storage/barrier.h"
55 #include "storage/bufmgr.h"
56 #include "storage/fd.h"
57 #include "storage/ipc.h"
58 #include "storage/large_object.h"
59 #include "storage/latch.h"
60 #include "storage/pmsignal.h"
61 #include "storage/predicate.h"
62 #include "storage/proc.h"
63 #include "storage/procarray.h"
64 #include "storage/reinit.h"
65 #include "storage/smgr.h"
66 #include "storage/spin.h"
67 #include "utils/builtins.h"
68 #include "utils/guc.h"
69 #include "utils/memutils.h"
70 #include "utils/ps_status.h"
71 #include "utils/relmapper.h"
72 #include "utils/snapmgr.h"
73 #include "utils/timestamp.h"
74 #include "pg_trace.h"
75
76 extern uint32 bootstrap_data_checksum_version;
77
78 /* File path names (all relative to $PGDATA) */
79 #define RECOVERY_COMMAND_FILE "recovery.conf"
80 #define RECOVERY_COMMAND_DONE "recovery.done"
81 #define PROMOTE_SIGNAL_FILE "promote"
82 #define FALLBACK_PROMOTE_SIGNAL_FILE "fallback_promote"
83
84
/*
 * User-settable parameters (GUCs).
 *
 * max_wal_size and min_wal_size are expressed in 16 MB WAL segments
 * (64 segments = 1 GB, 5 segments = 80 MB, per the comments below).
 */
int			max_wal_size = 64;	/* 1 GB */
int			min_wal_size = 5;	/* 80 MB */
int			wal_keep_segments = 0;
int			XLOGbuffers = -1;	/* -1: choose automatically */
int			XLogArchiveTimeout = 0;
int			XLogArchiveMode = ARCHIVE_MODE_OFF;
char	   *XLogArchiveCommand = NULL;
bool		EnableHotStandby = false;
bool		fullPageWrites = true;
bool		wal_log_hints = false;
bool		wal_compression = false;
bool		log_checkpoints = false;
int			sync_method = DEFAULT_SYNC_METHOD;
int			wal_level = WAL_LEVEL_MINIMAL;
int			CommitDelay = 0;	/* precommit delay in microseconds */
int			CommitSiblings = 5; /* # concurrent xacts needed to sleep */
int			wal_retrieve_retry_interval = 5000; /* presumably milliseconds --
												 * confirm against guc.c */
103
104 #ifdef WAL_DEBUG
105 bool XLOG_DEBUG = false;
106 #endif
107
108 /*
109 * Number of WAL insertion locks to use. A higher value allows more insertions
110 * to happen concurrently, but adds some CPU overhead to flushing the WAL,
111 * which needs to iterate all the locks.
112 */
113 #define NUM_XLOGINSERT_LOCKS 8
114
115 /*
116 * Max distance from last checkpoint, before triggering a new xlog-based
117 * checkpoint.
118 */
119 int CheckPointSegments;
120
121 /* Estimated distance between checkpoints, in bytes */
122 static double CheckPointDistanceEstimate = 0;
123 static double PrevCheckPointDistance = 0;
124
125 /*
126 * GUC support
127 */
128 const struct config_enum_entry sync_method_options[] = {
129 {"fsync", SYNC_METHOD_FSYNC, false},
130 #ifdef HAVE_FSYNC_WRITETHROUGH
131 {"fsync_writethrough", SYNC_METHOD_FSYNC_WRITETHROUGH, false},
132 #endif
133 #ifdef HAVE_FDATASYNC
134 {"fdatasync", SYNC_METHOD_FDATASYNC, false},
135 #endif
136 #ifdef OPEN_SYNC_FLAG
137 {"open_sync", SYNC_METHOD_OPEN, false},
138 #endif
139 #ifdef OPEN_DATASYNC_FLAG
140 {"open_datasync", SYNC_METHOD_OPEN_DSYNC, false},
141 #endif
142 {NULL, 0, false}
143 };
144
145
146 /*
147 * Although only "on", "off", and "always" are documented,
148 * we accept all the likely variants of "on" and "off".
149 */
150 const struct config_enum_entry archive_mode_options[] = {
151 {"always", ARCHIVE_MODE_ALWAYS, false},
152 {"on", ARCHIVE_MODE_ON, false},
153 {"off", ARCHIVE_MODE_OFF, false},
154 {"true", ARCHIVE_MODE_ON, true},
155 {"false", ARCHIVE_MODE_OFF, true},
156 {"yes", ARCHIVE_MODE_ON, true},
157 {"no", ARCHIVE_MODE_OFF, true},
158 {"1", ARCHIVE_MODE_ON, true},
159 {"0", ARCHIVE_MODE_OFF, true},
160 {NULL, 0, false}
161 };
162
163 /*
164 * Statistics for current checkpoint are collected in this global struct.
165 * Because only the checkpointer or a stand-alone backend can perform
166 * checkpoints, this will be unused in normal backends.
167 */
168 CheckpointStatsData CheckpointStats;
169
170 /*
171 * ThisTimeLineID will be same in all backends --- it identifies current
172 * WAL timeline for the database system.
173 */
174 TimeLineID ThisTimeLineID = 0;
175
176 /*
177 * Are we doing recovery from XLOG?
178 *
179 * This is only ever true in the startup process; it should be read as meaning
180 * "this process is replaying WAL records", rather than "the system is in
181 * recovery mode". It should be examined primarily by functions that need
182 * to act differently when called from a WAL redo function (e.g., to skip WAL
183 * logging). To check whether the system is in recovery regardless of which
184 * process you're running in, use RecoveryInProgress() but only after shared
185 * memory startup and lock initialization.
186 */
187 bool InRecovery = false;
188
189 /* Are we in Hot Standby mode? Only valid in startup process, see xlog.h */
190 HotStandbyState standbyState = STANDBY_DISABLED;
191
192 static XLogRecPtr LastRec;
193
194 /* Local copy of WalRcv->receivedUpto */
195 static XLogRecPtr receivedUpto = 0;
196 static TimeLineID receiveTLI = 0;
197
198 /*
199 * abortedRecPtr is the start pointer of a broken record at end of WAL when
200 * recovery completes; missingContrecPtr is the location of the first
201 * contrecord that went missing. See CreateOverwriteContrecordRecord for
202 * details.
203 */
204 static XLogRecPtr abortedRecPtr;
205 static XLogRecPtr missingContrecPtr;
206
207 /*
208 * During recovery, lastFullPageWrites keeps track of full_page_writes that
209 * the replayed WAL records indicate. It's initialized with full_page_writes
210 * that the recovery starting checkpoint record indicates, and then updated
211 * each time XLOG_FPW_CHANGE record is replayed.
212 */
213 static bool lastFullPageWrites;
214
215 /*
216 * Local copy of the state tracked by SharedRecoveryState in shared memory,
217 * It is false if SharedRecoveryState is RECOVERY_STATE_DONE. True actually
218 * means "not known, need to check the shared state".
219 */
220 static bool LocalRecoveryInProgress = true;
221
222 /*
223 * Local copy of SharedHotStandbyActive variable. False actually means "not
224 * known, need to check the shared state".
225 */
226 static bool LocalHotStandbyActive = false;
227
228 /*
229 * Local state for XLogInsertAllowed():
230 * 1: unconditionally allowed to insert XLOG
231 * 0: unconditionally not allowed to insert XLOG
232 * -1: must check RecoveryInProgress(); disallow until it is false
233 * Most processes start with -1 and transition to 1 after seeing that recovery
234 * is not in progress. But we can also force the value for special cases.
235 * The coding in XLogInsertAllowed() depends on the first two of these states
236 * being numerically the same as bool true and false.
237 */
238 static int LocalXLogInsertAllowed = -1;
239
240 /*
241 * When ArchiveRecoveryRequested is set, archive recovery was requested,
242 * ie. recovery.conf file was present. When InArchiveRecovery is set, we are
243 * currently recovering using offline XLOG archives. These variables are only
244 * valid in the startup process.
245 *
246 * When ArchiveRecoveryRequested is true, but InArchiveRecovery is false, we're
247 * currently performing crash recovery using only XLOG files in pg_xlog, but
248 * will switch to using offline XLOG archives as soon as we reach the end of
249 * WAL in pg_xlog.
250 */
251 bool ArchiveRecoveryRequested = false;
252 bool InArchiveRecovery = false;
253
254 /* Was the last xlog file restored from archive, or local? */
255 static bool restoredFromArchive = false;
256
257 /* options taken from recovery.conf for archive recovery */
258 char *recoveryRestoreCommand = NULL;
259 static char *recoveryEndCommand = NULL;
260 static char *archiveCleanupCommand = NULL;
261 static RecoveryTargetType recoveryTarget = RECOVERY_TARGET_UNSET;
262 static bool recoveryTargetInclusive = true;
263 static RecoveryTargetAction recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
264 static TransactionId recoveryTargetXid;
265 static TimestampTz recoveryTargetTime;
266 static char *recoveryTargetName;
267 static int recovery_min_apply_delay = 0;
268 static TimestampTz recoveryDelayUntilTime;
269
270 /* options taken from recovery.conf for XLOG streaming */
271 static bool StandbyModeRequested = false;
272 static char *PrimaryConnInfo = NULL;
273 static char *PrimarySlotName = NULL;
274 static char *TriggerFile = NULL;
275
276 /* are we currently in standby mode? */
277 bool StandbyMode = false;
278
279 /* whether request for fast promotion has been made yet */
280 static bool fast_promote = false;
281
282 /*
283 * if recoveryStopsBefore/After returns true, it saves information of the stop
284 * point here
285 */
286 static TransactionId recoveryStopXid;
287 static TimestampTz recoveryStopTime;
288 static char recoveryStopName[MAXFNAMELEN];
289 static bool recoveryStopAfter;
290
291 /*
292 * During normal operation, the only timeline we care about is ThisTimeLineID.
293 * During recovery, however, things are more complicated. To simplify life
294 * for rmgr code, we keep ThisTimeLineID set to the "current" timeline as we
295 * scan through the WAL history (that is, it is the line that was active when
296 * the currently-scanned WAL record was generated). We also need these
297 * timeline values:
298 *
299 * recoveryTargetTLI: the desired timeline that we want to end in.
300 *
301 * recoveryTargetIsLatest: was the requested target timeline 'latest'?
302 *
303 * expectedTLEs: a list of TimeLineHistoryEntries for recoveryTargetTLI and the timelines of
304 * its known parents, newest first (so recoveryTargetTLI is always the
305 * first list member). Only these TLIs are expected to be seen in the WAL
306 * segments we read, and indeed only these TLIs will be considered as
307 * candidate WAL files to open at all.
308 *
309 * curFileTLI: the TLI appearing in the name of the current input WAL file.
310 * (This is not necessarily the same as ThisTimeLineID, because we could
311 * be scanning data that was copied from an ancestor timeline when the current
312 * file was created.) During a sequential scan we do not allow this value
313 * to decrease.
314 */
315 static TimeLineID recoveryTargetTLI;
316 static bool recoveryTargetIsLatest = false;
317 static List *expectedTLEs;
318 static TimeLineID curFileTLI;
319
320 /*
321 * ProcLastRecPtr points to the start of the last XLOG record inserted by the
322 * current backend. It is updated for all inserts. XactLastRecEnd points to
323 * end+1 of the last record, and is reset when we end a top-level transaction,
324 * or start a new one; so it can be used to tell if the current transaction has
325 * created any XLOG records.
326 *
327 * While in parallel mode, this may not be fully up to date. When committing,
328 * a transaction can assume this covers all xlog records written either by the
329 * user backend or by any parallel worker which was present at any point during
330 * the transaction. But when aborting, or when still in parallel mode, other
331 * parallel backends may have written WAL records at later LSNs than the value
332 * stored here. The parallel leader advances its own copy, when necessary,
333 * in WaitForParallelWorkersToFinish.
334 */
335 XLogRecPtr ProcLastRecPtr = InvalidXLogRecPtr;
336 XLogRecPtr XactLastRecEnd = InvalidXLogRecPtr;
337 XLogRecPtr XactLastCommitEnd = InvalidXLogRecPtr;
338
339 /*
340 * RedoRecPtr is this backend's local copy of the REDO record pointer
341 * (which is almost but not quite the same as a pointer to the most recent
342 * CHECKPOINT record). We update this from the shared-memory copy,
343 * XLogCtl->Insert.RedoRecPtr, whenever we can safely do so (ie, when we
344 * hold an insertion lock). See XLogInsertRecord for details. We are also
345 * allowed to update from XLogCtl->RedoRecPtr if we hold the info_lck;
346 * see GetRedoRecPtr. A freshly spawned backend obtains the value during
347 * InitXLOGAccess.
348 */
349 static XLogRecPtr RedoRecPtr;
350
351 /*
352 * doPageWrites is this backend's local copy of (forcePageWrites ||
353 * fullPageWrites). It is used together with RedoRecPtr to decide whether
354 * a full-page image of a page need to be taken.
355 */
356 static bool doPageWrites;
357
358 /* Has the recovery code requested a walreceiver wakeup? */
359 static bool doRequestWalReceiverReply;
360
361 /*
362 * RedoStartLSN points to the checkpoint's REDO location which is specified
363 * in a backup label file, backup history file or control file. In standby
364 * mode, XLOG streaming usually starts from the position where an invalid
365 * record was found. But if we fail to read even the initial checkpoint
366 * record, we use the REDO location instead of the checkpoint location as
367 * the start position of XLOG streaming. Otherwise we would have to jump
368 * backwards to the REDO location after reading the checkpoint record,
369 * because the REDO record can precede the checkpoint record.
370 */
371 static XLogRecPtr RedoStartLSN = InvalidXLogRecPtr;
372
373 /*----------
374 * Shared-memory data structures for XLOG control
375 *
376 * LogwrtRqst indicates a byte position that we need to write and/or fsync
377 * the log up to (all records before that point must be written or fsynced).
378 * LogwrtResult indicates the byte positions we have already written/fsynced.
379 * These structs are identical but are declared separately to indicate their
380 * slightly different functions.
381 *
382 * To read XLogCtl->LogwrtResult, you must hold either info_lck or
383 * WALWriteLock. To update it, you need to hold both locks. The point of
384 * this arrangement is that the value can be examined by code that already
385 * holds WALWriteLock without needing to grab info_lck as well. In addition
386 * to the shared variable, each backend has a private copy of LogwrtResult,
387 * which is updated when convenient.
388 *
389 * The request bookkeeping is simpler: there is a shared XLogCtl->LogwrtRqst
390 * (protected by info_lck), but we don't need to cache any copies of it.
391 *
392 * info_lck is only held long enough to read/update the protected variables,
393 * so it's a plain spinlock. The other locks are held longer (potentially
394 * over I/O operations), so we use LWLocks for them. These locks are:
395 *
396 * WALBufMappingLock: must be held to replace a page in the WAL buffer cache.
397 * It is only held while initializing and changing the mapping. If the
398 * contents of the buffer being replaced haven't been written yet, the mapping
399 * lock is released while the write is done, and reacquired afterwards.
400 *
401 * WALWriteLock: must be held to write WAL buffers to disk (XLogWrite or
402 * XLogFlush).
403 *
404 * ControlFileLock: must be held to read/update control file or create
405 * new log file.
406 *
407 * CheckpointLock: must be held to do a checkpoint or restartpoint (ensures
408 * only one checkpointer at a time; currently, with all checkpoints done by
409 * the checkpointer, this is just pro forma).
410 *
411 *----------
412 */
413
/*
 * Requested and completed WAL write/flush positions.  The two structs are
 * identical but declared separately to indicate their slightly different
 * roles (see the "Shared-memory data structures" comment above for the
 * locking rules covering XLogCtl->LogwrtRqst and XLogCtl->LogwrtResult).
 */
typedef struct XLogwrtRqst
{
	XLogRecPtr	Write;			/* last byte + 1 to write out */
	XLogRecPtr	Flush;			/* last byte + 1 to flush */
} XLogwrtRqst;

typedef struct XLogwrtResult
{
	XLogRecPtr	Write;			/* last byte + 1 written out */
	XLogRecPtr	Flush;			/* last byte + 1 flushed */
} XLogwrtResult;
425
426 /*
427 * Inserting to WAL is protected by a small fixed number of WAL insertion
428 * locks. To insert to the WAL, you must hold one of the locks - it doesn't
 * matter which one. To lock out other concurrent insertions, you must hold
 * ALL of them. Each WAL insertion lock consists of a lightweight lock, plus an
431 * indicator of how far the insertion has progressed (insertingAt).
432 *
433 * The insertingAt values are read when a process wants to flush WAL from
434 * the in-memory buffers to disk, to check that all the insertions to the
435 * region the process is about to write out have finished. You could simply
436 * wait for all currently in-progress insertions to finish, but the
437 * insertingAt indicator allows you to ignore insertions to later in the WAL,
438 * so that you only wait for the insertions that are modifying the buffers
439 * you're about to write out.
440 *
441 * This isn't just an optimization. If all the WAL buffers are dirty, an
442 * inserter that's holding a WAL insert lock might need to evict an old WAL
443 * buffer, which requires flushing the WAL. If it's possible for an inserter
444 * to block on another inserter unnecessarily, deadlock can arise when two
445 * inserters holding a WAL insert lock wait for each other to finish their
446 * insertion.
447 *
448 * Small WAL records that don't cross a page boundary never update the value,
449 * the WAL record is just copied to the page and the lock is released. But
450 * to avoid the deadlock-scenario explained above, the indicator is always
451 * updated before sleeping while holding an insertion lock.
452 */
typedef struct
{
	LWLock		lock;			/* the lock proper */
	XLogRecPtr	insertingAt;	/* how far the current holder's insertion
								 * has progressed; see the comment above for
								 * why this matters for deadlock avoidance */
} WALInsertLock;

/*
 * All the WAL insertion locks are allocated as an array in shared memory. We
 * force the array stride to be a power of 2, which saves a few cycles in
 * indexing, but more importantly also ensures that individual slots don't
 * cross cache line boundaries. (Of course, we have to also ensure that the
 * array start address is suitably aligned.)
 */
typedef union WALInsertLockPadded
{
	WALInsertLock l;			/* the actual lock state */
	char		pad[PG_CACHE_LINE_SIZE];	/* pads the slot out to a full
											 * cache line */
} WALInsertLockPadded;
471
472 /*
473 * State of an exclusive backup, necessary to control concurrent activities
474 * across sessions when working on exclusive backups.
475 *
476 * EXCLUSIVE_BACKUP_NONE means that there is no exclusive backup actually
477 * running, to be more precise pg_start_backup() is not being executed for
478 * an exclusive backup and there is no exclusive backup in progress.
479 * EXCLUSIVE_BACKUP_STARTING means that pg_start_backup() is starting an
480 * exclusive backup.
481 * EXCLUSIVE_BACKUP_IN_PROGRESS means that pg_start_backup() has finished
482 * running and an exclusive backup is in progress. pg_stop_backup() is
483 * needed to finish it.
484 * EXCLUSIVE_BACKUP_STOPPING means that pg_stop_backup() is stopping an
485 * exclusive backup.
486 */
typedef enum ExclusiveBackupState
{
	EXCLUSIVE_BACKUP_NONE = 0,	/* no exclusive backup running or starting */
	EXCLUSIVE_BACKUP_STARTING,	/* pg_start_backup() is in progress */
	EXCLUSIVE_BACKUP_IN_PROGRESS,	/* backup started; pg_stop_backup()
									 * needed to finish it */
	EXCLUSIVE_BACKUP_STOPPING	/* pg_stop_backup() is in progress */
} ExclusiveBackupState;
494
495 /*
496 * Session status of running backup, used for sanity checks in SQL-callable
497 * functions to start and stop backups.
498 */
499 static SessionBackupState sessionBackupState = SESSION_BACKUP_NONE;
500
501 /*
502 * Shared state data for WAL insertion.
503 */
/*
 * Shared state data for WAL insertion.  Lives inside XLogCtlData; member
 * order matters here because of the cache-line padding below, so do not
 * reorder fields casually.
 */
typedef struct XLogCtlInsert
{
	slock_t		insertpos_lck;	/* protects CurrBytePos and PrevBytePos */

	/*
	 * CurrBytePos is the end of reserved WAL. The next record will be
	 * inserted at that position. PrevBytePos is the start position of the
	 * previously inserted (or rather, reserved) record - it is copied to the
	 * prev-link of the next record. These are stored as "usable byte
	 * positions" rather than XLogRecPtrs (see XLogBytePosToRecPtr()).
	 */
	uint64		CurrBytePos;
	uint64		PrevBytePos;

	/*
	 * Make sure the above heavily-contended spinlock and byte positions are
	 * on their own cache line. In particular, the RedoRecPtr and full page
	 * write variables below should be on a different cache line. They are
	 * read on every WAL insertion, but updated rarely, and we don't want
	 * those reads to steal the cache line containing Curr/PrevBytePos.
	 */
	char		pad[PG_CACHE_LINE_SIZE];

	/*
	 * fullPageWrites is the master copy used by all backends to determine
	 * whether to write full-page to WAL, instead of using process-local one.
	 * This is required because, when full_page_writes is changed by SIGHUP,
	 * we must WAL-log it before it actually affects WAL-logging by backends.
	 * Checkpointer sets at startup or after SIGHUP.
	 *
	 * To read these fields, you must hold an insertion lock. To modify them,
	 * you must hold ALL the insertion locks.
	 */
	XLogRecPtr	RedoRecPtr;		/* current redo point for insertions */
	bool		forcePageWrites;	/* forcing full-page writes for PITR? */
	bool		fullPageWrites;

	/*
	 * exclusiveBackupState indicates the state of an exclusive backup (see
	 * comments of ExclusiveBackupState for more details).
	 * nonExclusiveBackups is a counter indicating the number of streaming
	 * base backups currently in progress. forcePageWrites is set to true
	 * when either of these is non-zero. lastBackupStart is the latest
	 * checkpoint redo location used as a starting point for an online
	 * backup.
	 */
	ExclusiveBackupState exclusiveBackupState;
	int			nonExclusiveBackups;
	XLogRecPtr	lastBackupStart;

	/*
	 * WAL insertion locks.
	 */
	WALInsertLockPadded *WALInsertLocks;
	LWLockTranche WALInsertLockTranche;
} XLogCtlInsert;
560
561 /*
562 * Total shared-memory state for XLOG.
563 */
/*
 * Total shared-memory state for XLOG.  See the long "Shared-memory data
 * structures" comment above for the locking rules; per-field protection is
 * noted inline.
 */
typedef struct XLogCtlData
{
	XLogCtlInsert Insert;

	/* Protected by info_lck: */
	XLogwrtRqst LogwrtRqst;
	XLogRecPtr	RedoRecPtr;		/* a recent copy of Insert->RedoRecPtr */
	uint32		ckptXidEpoch;	/* nextXID & epoch of latest checkpoint */
	TransactionId ckptXid;
	XLogRecPtr	asyncXactLSN;	/* LSN of newest async commit/abort */
	XLogRecPtr	replicationSlotMinLSN;	/* oldest LSN needed by any slot */

	XLogSegNo	lastRemovedSegNo;	/* latest removed/recycled XLOG
									 * segment */

	/* Fake LSN counter, for unlogged relations. Protected by ulsn_lck. */
	XLogRecPtr	unloggedLSN;
	slock_t		ulsn_lck;

	/* Time of last xlog segment switch. Protected by WALWriteLock. */
	pg_time_t	lastSegSwitchTime;

	/*
	 * Protected by info_lck and WALWriteLock (you must hold either lock to
	 * read it, but both to update)
	 */
	XLogwrtResult LogwrtResult;

	/*
	 * Latest initialized page in the cache (last byte position + 1).
	 *
	 * To change the identity of a buffer (and InitializedUpTo), you need to
	 * hold WALBufMappingLock. To change the identity of a buffer that's
	 * still dirty, the old page needs to be written out first, and for that
	 * you need WALWriteLock, and you need to ensure that there are no
	 * in-progress insertions to the page by calling
	 * WaitXLogInsertionsToFinish().
	 */
	XLogRecPtr	InitializedUpTo;

	/*
	 * These values do not change after startup, although the pointed-to
	 * pages and xlblocks values certainly do. xlblock values are protected
	 * by WALBufMappingLock.
	 */
	char	   *pages;			/* buffers for unwritten XLOG pages */
	XLogRecPtr *xlblocks;		/* 1st byte ptr-s + XLOG_BLCKSZ */
	int			XLogCacheBlck;	/* highest allocated xlog buffer index */

	/*
	 * Shared copy of ThisTimeLineID. Does not change after end-of-recovery.
	 * If we created a new timeline when the system was started up,
	 * PrevTimeLineID is the old timeline's ID that we forked off from.
	 * Otherwise it's equal to ThisTimeLineID.
	 */
	TimeLineID	ThisTimeLineID;
	TimeLineID	PrevTimeLineID;

	/*
	 * archiveCleanupCommand is read from recovery.conf but needs to be in
	 * shared memory so that the checkpointer process can access it.
	 */
	char		archiveCleanupCommand[MAXPGPATH];

	/*
	 * SharedRecoveryState indicates if we're still in crash or archive
	 * recovery. Protected by info_lck.
	 */
	RecoveryState SharedRecoveryState;

	/*
	 * SharedHotStandbyActive indicates if we allow hot standby queries to be
	 * run. Protected by info_lck.  (The previous comment here was a
	 * copy-paste of the SharedRecoveryState one.)
	 */
	bool		SharedHotStandbyActive;

	/*
	 * WalWriterSleeping indicates whether the WAL writer is currently in
	 * low-power mode (and hence should be nudged if an async commit occurs).
	 * Protected by info_lck.
	 */
	bool		WalWriterSleeping;

	/*
	 * recoveryWakeupLatch is used to wake up the startup process to continue
	 * WAL replay, if it is waiting for WAL to arrive or failover trigger
	 * file to appear.
	 */
	Latch		recoveryWakeupLatch;

	/*
	 * During recovery, we keep a copy of the latest checkpoint record here.
	 * lastCheckPointRecPtr points to start of checkpoint record and
	 * lastCheckPointEndPtr points to end+1 of checkpoint record. Used by the
	 * background writer when it wants to create a restartpoint.
	 *
	 * Protected by info_lck.
	 */
	XLogRecPtr	lastCheckPointRecPtr;
	XLogRecPtr	lastCheckPointEndPtr;
	CheckPoint	lastCheckPoint;

	/*
	 * lastReplayedEndRecPtr points to end+1 of the last record successfully
	 * replayed. When we're currently replaying a record, ie. in a redo
	 * function, replayEndRecPtr points to the end+1 of the record being
	 * replayed, otherwise it's equal to lastReplayedEndRecPtr.
	 */
	XLogRecPtr	lastReplayedEndRecPtr;
	TimeLineID	lastReplayedTLI;
	XLogRecPtr	replayEndRecPtr;
	TimeLineID	replayEndTLI;
	/* timestamp of last COMMIT/ABORT record replayed (or being replayed) */
	TimestampTz recoveryLastXTime;

	/*
	 * timestamp of when we started replaying the current chunk of WAL data,
	 * only relevant for replication or archive recovery
	 */
	TimestampTz currentChunkStartTime;
	/* Are we requested to pause recovery? */
	bool		recoveryPause;

	/*
	 * lastFpwDisableRecPtr points to the start of the last replayed
	 * XLOG_FPW_CHANGE record that instructs full_page_writes is disabled.
	 */
	XLogRecPtr	lastFpwDisableRecPtr;

	slock_t		info_lck;		/* locks shared variables shown above */
} XLogCtlData;
695
696 static XLogCtlData *XLogCtl = NULL;
697
698 /* a private copy of XLogCtl->Insert.WALInsertLocks, for convenience */
699 static WALInsertLockPadded *WALInsertLocks = NULL;
700
701 /*
702 * We maintain an image of pg_control in shared memory.
703 */
704 static ControlFileData *ControlFile = NULL;
705
706 /*
707 * Calculate the amount of space left on the page after 'endptr'. Beware
708 * multiple evaluation!
709 */
710 #define INSERT_FREESPACE(endptr) \
711 (((endptr) % XLOG_BLCKSZ == 0) ? 0 : (XLOG_BLCKSZ - (endptr) % XLOG_BLCKSZ))
712
713 /* Macro to advance to next buffer index. */
714 #define NextBufIdx(idx) \
715 (((idx) == XLogCtl->XLogCacheBlck) ? 0 : ((idx) + 1))
716
717 /*
718 * XLogRecPtrToBufIdx returns the index of the WAL buffer that holds, or
719 * would hold if it was in cache, the page containing 'recptr'.
720 */
721 #define XLogRecPtrToBufIdx(recptr) \
722 (((recptr) / XLOG_BLCKSZ) % (XLogCtl->XLogCacheBlck + 1))
723
724 /*
725 * These are the number of bytes in a WAL page and segment usable for WAL data.
726 */
727 #define UsableBytesInPage (XLOG_BLCKSZ - SizeOfXLogShortPHD)
728 #define UsableBytesInSegment ((XLOG_SEG_SIZE / XLOG_BLCKSZ) * UsableBytesInPage - (SizeOfXLogLongPHD - SizeOfXLogShortPHD))
729
730 /*
731 * Private, possibly out-of-date copy of shared LogwrtResult.
732 * See discussion above.
733 */
734 static XLogwrtResult LogwrtResult = {0, 0};
735
736 /*
737 * Codes indicating where we got a WAL file from during recovery, or where
738 * to attempt to get one.
739 */
740 typedef enum
741 {
742 XLOG_FROM_ANY = 0, /* request to read WAL from any source */
743 XLOG_FROM_ARCHIVE, /* restored using restore_command */
744 XLOG_FROM_PG_XLOG, /* existing file in pg_xlog */
745 XLOG_FROM_STREAM /* streamed from master */
746 } XLogSource;
747
748 /* human-readable names for XLogSources, for debugging output */
749 static const char *xlogSourceNames[] = {"any", "archive", "pg_xlog", "stream"};
750
751 /*
752 * openLogFile is -1 or a kernel FD for an open log file segment.
753 * When it's open, openLogOff is the current seek offset in the file.
754 * openLogSegNo identifies the segment. These variables are only
755 * used to write the XLOG, and so will normally refer to the active segment.
756 */
757 static int openLogFile = -1;
758 static XLogSegNo openLogSegNo = 0;
759 static uint32 openLogOff = 0;
760
761 /*
762 * These variables are used similarly to the ones above, but for reading
763 * the XLOG. Note, however, that readOff generally represents the offset
764 * of the page just read, not the seek position of the FD itself, which
765 * will be just past that page. readLen indicates how much of the current
766 * page has been read into readBuf, and readSource indicates where we got
767 * the currently open file from.
768 */
769 static int readFile = -1;
770 static XLogSegNo readSegNo = 0;
771 static uint32 readOff = 0;
772 static uint32 readLen = 0;
773 static XLogSource readSource = 0; /* XLOG_FROM_* code */
774
775 /*
776 * Keeps track of which source we're currently reading from. This is
777 * different from readSource in that this is always set, even when we don't
778 * currently have a WAL file open. If lastSourceFailed is set, our last
779 * attempt to read from currentSource failed, and we should try another source
780 * next.
781 */
782 static XLogSource currentSource = 0; /* XLOG_FROM_* code */
783 static bool lastSourceFailed = false;
784
/*
 * Private state for reading WAL pages during recovery (presumably passed to
 * the page-read callback via XLogReaderState's private data -- NOTE(review):
 * confirm against the XLogReaderAllocate call site).
 */
typedef struct XLogPageReadPrivate
{
	int			emode;			/* error-report mode (elog level?) to use on
								 * read failure -- TODO confirm */
	bool		fetching_ckpt;	/* are we fetching a checkpoint record? */
	bool		randAccess;		/* random rather than sequential access?
								 * -- TODO confirm */
} XLogPageReadPrivate;
791
792 /*
793 * These variables track when we last obtained some WAL data to process,
794 * and where we got it from. (XLogReceiptSource is initially the same as
795 * readSource, but readSource gets reset to zero when we don't have data
796 * to process right now. It is also different from currentSource, which
797 * also changes when we try to read from a source and fail, while
798 * XLogReceiptSource tracks where we last successfully read some WAL.)
799 */
800 static TimestampTz XLogReceiptTime = 0;
801 static XLogSource XLogReceiptSource = 0; /* XLOG_FROM_* code */
802
803 /* State information for XLOG reading */
804 static XLogRecPtr ReadRecPtr; /* start of last record read */
805 static XLogRecPtr EndRecPtr; /* end+1 of last record read */
806
807 /*
808 * Local copies of equivalent fields in the control file. When running
809 * crash recovery, minRecoveryPoint is set to InvalidXLogRecPtr as we
810 * expect to replay all the WAL available, and updateMinRecoveryPoint is
811 * switched to false to prevent any updates while replaying records.
812 * Those values are kept consistent as long as crash recovery runs.
813 */
814 static XLogRecPtr minRecoveryPoint;
815 static TimeLineID minRecoveryPointTLI;
816 static bool updateMinRecoveryPoint = true;
817
818 /*
819 * Have we reached a consistent database state? In crash recovery, we have
820 * to replay all the WAL, so reachedConsistency is never set. During archive
821 * recovery, the database is consistent once minRecoveryPoint is reached.
822 */
823 bool reachedConsistency = false;
824
825 static bool InRedo = false;
826
827 /* Have we launched bgwriter during recovery? */
828 static bool bgwriterLaunched = false;
829
830 /* For WALInsertLockAcquire/Release functions */
831 static int MyLockNo = 0;
832 static bool holdingAllLocks = false;
833
834 #ifdef WAL_DEBUG
835 static MemoryContext walDebugCxt = NULL;
836 #endif
837
838 static void readRecoveryCommandFile(void);
839 static void exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog);
840 static bool recoveryStopsBefore(XLogReaderState *record);
841 static bool recoveryStopsAfter(XLogReaderState *record);
842 static void recoveryPausesHere(void);
843 static bool recoveryApplyDelay(XLogReaderState *record);
844 static void SetLatestXTime(TimestampTz xtime);
845 static void SetCurrentChunkStartTime(TimestampTz xtime);
846 static void CheckRequiredParameterValues(void);
847 static void XLogReportParameters(void);
848 static void checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI,
849 TimeLineID prevTLI);
850 static void VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec,
851 XLogReaderState *state);
852 static void LocalSetXLogInsertAllowed(void);
853 static void CreateEndOfRecoveryRecord(void);
854 static XLogRecPtr CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn);
855 static void CheckPointGuts(XLogRecPtr checkPointRedo, int flags);
856 static void KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo);
857 static XLogRecPtr XLogGetReplicationSlotMinimumLSN(void);
858
859 static void AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic);
860 static bool XLogCheckpointNeeded(XLogSegNo new_segno);
861 static void XLogWrite(XLogwrtRqst WriteRqst, bool flexible);
862 static bool InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
863 bool find_free, XLogSegNo max_segno,
864 bool use_lock);
865 static int XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
866 int source, bool notfoundOk);
867 static int XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source);
868 static int XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr,
869 int reqLen, XLogRecPtr targetRecPtr, char *readBuf,
870 TimeLineID *readTLI);
871 static bool WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
872 bool fetching_ckpt, XLogRecPtr tliRecPtr);
873 static int emode_for_corrupt_record(int emode, XLogRecPtr RecPtr);
874 static void XLogFileClose(void);
875 static void PreallocXlogFiles(XLogRecPtr endptr);
876 static void RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
877 static void RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr);
878 static void UpdateLastRemovedPtr(char *filename);
879 static void ValidateXLOGDirectoryStructure(void);
880 static void CleanupBackupHistory(void);
881 static void UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force);
882 static XLogRecord *ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
883 int emode, bool fetching_ckpt);
884 static void CheckRecoveryConsistency(void);
885 static XLogRecord *ReadCheckpointRecord(XLogReaderState *xlogreader,
886 XLogRecPtr RecPtr, int whichChkpti, bool report);
887 static bool rescanLatestTimeLine(void);
888 static void WriteControlFile(void);
889 static void ReadControlFile(void);
890 static char *str_time(pg_time_t tnow);
891 static bool CheckForStandbyTrigger(void);
892
893 #ifdef WAL_DEBUG
894 static void xlog_outrec(StringInfo buf, XLogReaderState *record);
895 #endif
896 static void xlog_outdesc(StringInfo buf, XLogReaderState *record);
897 static void pg_start_backup_callback(int code, Datum arg);
898 static void pg_stop_backup_callback(int code, Datum arg);
899 static bool read_backup_label(XLogRecPtr *checkPointLoc,
900 bool *backupEndRequired, bool *backupFromStandby);
901 static bool read_tablespace_map(List **tablespaces);
902
903 static void rm_redo_error_callback(void *arg);
904 static int get_sync_bit(int method);
905
906 static void CopyXLogRecordToWAL(int write_len, bool isLogSwitch,
907 XLogRecData *rdata,
908 XLogRecPtr StartPos, XLogRecPtr EndPos);
909 static void ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos,
910 XLogRecPtr *EndPos, XLogRecPtr *PrevPtr);
911 static bool ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos,
912 XLogRecPtr *PrevPtr);
913 static XLogRecPtr WaitXLogInsertionsToFinish(XLogRecPtr upto);
914 static char *GetXLogBuffer(XLogRecPtr ptr);
915 static XLogRecPtr XLogBytePosToRecPtr(uint64 bytepos);
916 static XLogRecPtr XLogBytePosToEndRecPtr(uint64 bytepos);
917 static uint64 XLogRecPtrToBytePos(XLogRecPtr ptr);
918
919 static void WALInsertLockAcquire(void);
920 static void WALInsertLockAcquireExclusive(void);
921 static void WALInsertLockRelease(void);
922 static void WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt);
923
/*
 * Insert an XLOG record represented by an already-constructed chain of data
 * chunks.  This is a low-level routine; to construct the WAL record header
 * and data, use the higher-level routines in xloginsert.c.
 *
 * If 'fpw_lsn' is valid, it is the oldest LSN among the pages that this
 * WAL record applies to, that were not included in the record as full page
 * images.  If fpw_lsn <= RedoRecPtr, the function does not perform the
 * insertion and returns InvalidXLogRecPtr.  The caller can then recalculate
 * which pages need a full-page image, and retry.  If fpw_lsn is invalid, the
 * record is always inserted.
 *
 * The first XLogRecData in the chain must be for the record header, and its
 * data must be MAXALIGNed.  XLogInsertRecord fills in the xl_prev and
 * xl_crc fields in the header, the rest of the header must already be filled
 * by the caller.
 *
 * Returns XLOG pointer to end of record (beginning of next record).
 * This can be used as LSN for data pages affected by the logged action.
 * (LSN is the XLOG point up to which the XLOG must be flushed to disk
 * before the data page can be written out.  This implements the basic
 * WAL rule "write the log before the data".)
 */
XLogRecPtr
XLogInsertRecord(XLogRecData *rdata, XLogRecPtr fpw_lsn)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	pg_crc32c	rdata_crc;
	bool		inserted;
	XLogRecord *rechdr = (XLogRecord *) rdata->data;
	/* an xlog-switch record gets special treatment throughout this routine */
	bool		isLogSwitch = (rechdr->xl_rmid == RM_XLOG_ID &&
							   rechdr->xl_info == XLOG_SWITCH);
	XLogRecPtr	StartPos;
	XLogRecPtr	EndPos;
	/* remember the pre-lock value, to detect a fresh enabling below */
	bool		prevDoPageWrites = doPageWrites;

	/* we assume that all of the record header is in the first chunk */
	Assert(rdata->len >= SizeOfXLogRecord);

	/* cross-check on whether we should be here or not */
	if (!XLogInsertAllowed())
		elog(ERROR, "cannot make new WAL entries during recovery");

	/*----------
	 *
	 * We have now done all the preparatory work we can without holding a
	 * lock or modifying shared state. From here on, inserting the new WAL
	 * record to the shared WAL buffer cache is a two-step process:
	 *
	 * 1. Reserve the right amount of space from the WAL. The current head of
	 *	  reserved space is kept in Insert->CurrBytePos, and is protected by
	 *	  insertpos_lck.
	 *
	 * 2. Copy the record to the reserved WAL space. This involves finding the
	 *	  correct WAL buffer containing the reserved space, and copying the
	 *	  record in place. This can be done concurrently in multiple processes.
	 *
	 * To keep track of which insertions are still in-progress, each concurrent
	 * inserter acquires an insertion lock. In addition to just indicating that
	 * an insertion is in progress, the lock tells others how far the inserter
	 * has progressed. There is a small fixed number of insertion locks,
	 * determined by NUM_XLOGINSERT_LOCKS. When an inserter crosses a page
	 * boundary, it updates the value stored in the lock to the how far it has
	 * inserted, to allow the previous buffer to be flushed.
	 *
	 * Holding onto an insertion lock also protects RedoRecPtr and
	 * fullPageWrites from changing until the insertion is finished.
	 *
	 * Step 2 can usually be done completely in parallel. If the required WAL
	 * page is not initialized yet, you have to grab WALBufMappingLock to
	 * initialize it, but the WAL writer tries to do that ahead of insertions
	 * to avoid that from happening in the critical path.
	 *
	 *----------
	 */
	START_CRIT_SECTION();
	/* a switch record must exclude all concurrent inserters */
	if (isLogSwitch)
		WALInsertLockAcquireExclusive();
	else
		WALInsertLockAcquire();

	/*
	 * Check to see if my copy of RedoRecPtr is out of date. If so, may have
	 * to go back and have the caller recompute everything. This can only
	 * happen just after a checkpoint, so it's better to be slow in this case
	 * and fast otherwise.
	 *
	 * Also check to see if fullPageWrites or forcePageWrites was just turned
	 * on; if we weren't already doing full-page writes then go back and
	 * recompute.
	 *
	 * If we aren't doing full-page writes then RedoRecPtr doesn't actually
	 * affect the contents of the XLOG record, so we'll update our local copy
	 * but not force a recomputation.  (If doPageWrites was just turned off,
	 * we could recompute the record without full pages, but we choose not to
	 * bother.)
	 */
	if (RedoRecPtr != Insert->RedoRecPtr)
	{
		Assert(RedoRecPtr < Insert->RedoRecPtr);
		RedoRecPtr = Insert->RedoRecPtr;
	}
	doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);

	if (doPageWrites &&
		(!prevDoPageWrites ||
		 (fpw_lsn != InvalidXLogRecPtr && fpw_lsn <= RedoRecPtr)))
	{
		/*
		 * Oops, some buffer now needs to be backed up that the caller didn't
		 * back up.  Start over.
		 */
		WALInsertLockRelease();
		END_CRIT_SECTION();
		return InvalidXLogRecPtr;
	}

	/*
	 * Reserve space for the record in the WAL. This also sets the xl_prev
	 * pointer.
	 */
	if (isLogSwitch)
		inserted = ReserveXLogSwitch(&StartPos, &EndPos, &rechdr->xl_prev);
	else
	{
		ReserveXLogInsertLocation(rechdr->xl_tot_len, &StartPos, &EndPos,
								  &rechdr->xl_prev);
		inserted = true;
	}

	if (inserted)
	{
		/*
		 * Now that xl_prev has been filled in, calculate CRC of the record
		 * header.  (The CRC covers the header up to, but not including, the
		 * xl_crc field itself.)
		 */
		rdata_crc = rechdr->xl_crc;
		COMP_CRC32C(rdata_crc, rechdr, offsetof(XLogRecord, xl_crc));
		FIN_CRC32C(rdata_crc);
		rechdr->xl_crc = rdata_crc;

		/*
		 * All the record data, including the header, is now ready to be
		 * inserted. Copy the record in the space reserved.
		 */
		CopyXLogRecordToWAL(rechdr->xl_tot_len, isLogSwitch, rdata,
							StartPos, EndPos);
	}
	else
	{
		/*
		 * This was an xlog-switch record, but the current insert location was
		 * already exactly at the beginning of a segment, so there was no need
		 * to do anything.
		 */
	}

	/*
	 * Done! Let others know that we're finished.
	 */
	WALInsertLockRelease();

	MarkCurrentTransactionIdLoggedIfAny();

	END_CRIT_SECTION();

	/*
	 * Update shared LogwrtRqst.Write, if we crossed page boundary.
	 */
	if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		/* advance global request to include new block(s) */
		if (XLogCtl->LogwrtRqst.Write < EndPos)
			XLogCtl->LogwrtRqst.Write = EndPos;
		/* update local result copy while I have the chance */
		LogwrtResult = XLogCtl->LogwrtResult;
		SpinLockRelease(&XLogCtl->info_lck);
	}

	/*
	 * If this was an XLOG_SWITCH record, flush the record and the empty
	 * padding space that fills the rest of the segment, and perform
	 * end-of-segment actions (eg, notifying archiver).
	 */
	if (isLogSwitch)
	{
		TRACE_POSTGRESQL_XLOG_SWITCH();
		XLogFlush(EndPos);

		/*
		 * Even though we reserved the rest of the segment for us, which is
		 * reflected in EndPos, we return a pointer to just the end of the
		 * xlog-switch record.
		 */
		if (inserted)
		{
			EndPos = StartPos + SizeOfXLogRecord;
			if (StartPos / XLOG_BLCKSZ != EndPos / XLOG_BLCKSZ)
			{
				/*
				 * The record ended exactly at a page boundary, so the true
				 * end is past that page's header: a long header if the page
				 * starts a segment, a short one otherwise.
				 */
				if (EndPos % XLOG_SEG_SIZE == EndPos % XLOG_BLCKSZ)
					EndPos += SizeOfXLogLongPHD;
				else
					EndPos += SizeOfXLogShortPHD;
			}
		}
	}

#ifdef WAL_DEBUG
	if (XLOG_DEBUG)
	{
		static XLogReaderState *debug_reader = NULL;
		StringInfoData buf;
		StringInfoData recordBuf;
		char	   *errormsg = NULL;
		MemoryContext oldCxt;

		/* use the dedicated context so debug allocations don't leak here */
		oldCxt = MemoryContextSwitchTo(walDebugCxt);

		initStringInfo(&buf);
		appendStringInfo(&buf, "INSERT @ %X/%X: ",
						 (uint32) (EndPos >> 32), (uint32) EndPos);

		/*
		 * We have to piece together the WAL record data from the XLogRecData
		 * entries, so that we can pass it to the rm_desc function as one
		 * contiguous chunk.
		 */
		initStringInfo(&recordBuf);
		for (; rdata != NULL; rdata = rdata->next)
			appendBinaryStringInfo(&recordBuf, rdata->data, rdata->len);

		/* the reader is allocated lazily, once per backend */
		if (!debug_reader)
			debug_reader = XLogReaderAllocate(NULL, NULL);

		if (!debug_reader)
		{
			appendStringInfoString(&buf, "error decoding record: out of memory");
		}
		else if (!DecodeXLogRecord(debug_reader, (XLogRecord *) recordBuf.data,
								   &errormsg))
		{
			appendStringInfo(&buf, "error decoding record: %s",
							 errormsg ? errormsg : "no error message");
		}
		else
		{
			appendStringInfoString(&buf, " - ");
			xlog_outdesc(&buf, debug_reader);
		}
		elog(LOG, "%s", buf.data);

		pfree(buf.data);
		pfree(recordBuf.data);
		MemoryContextSwitchTo(oldCxt);
	}
#endif

	/*
	 * Update our global variables
	 */
	ProcLastRecPtr = StartPos;
	XactLastRecEnd = EndPos;

	return EndPos;
}
1190
1191 /*
1192 * Reserves the right amount of space for a record of given size from the WAL.
1193 * *StartPos is set to the beginning of the reserved section, *EndPos to
1194 * its end+1. *PrevPtr is set to the beginning of the previous record; it is
1195 * used to set the xl_prev of this record.
1196 *
1197 * This is the performance critical part of XLogInsert that must be serialized
1198 * across backends. The rest can happen mostly in parallel. Try to keep this
1199 * section as short as possible, insertpos_lck can be heavily contended on a
1200 * busy system.
1201 *
1202 * NB: The space calculation here must match the code in CopyXLogRecordToWAL,
1203 * where we actually copy the record to the reserved space.
1204 */
1205 static void
ReserveXLogInsertLocation(int size,XLogRecPtr * StartPos,XLogRecPtr * EndPos,XLogRecPtr * PrevPtr)1206 ReserveXLogInsertLocation(int size, XLogRecPtr *StartPos, XLogRecPtr *EndPos,
1207 XLogRecPtr *PrevPtr)
1208 {
1209 XLogCtlInsert *Insert = &XLogCtl->Insert;
1210 uint64 startbytepos;
1211 uint64 endbytepos;
1212 uint64 prevbytepos;
1213
1214 size = MAXALIGN(size);
1215
1216 /* All (non xlog-switch) records should contain data. */
1217 Assert(size > SizeOfXLogRecord);
1218
1219 /*
1220 * The duration the spinlock needs to be held is minimized by minimizing
1221 * the calculations that have to be done while holding the lock. The
1222 * current tip of reserved WAL is kept in CurrBytePos, as a byte position
1223 * that only counts "usable" bytes in WAL, that is, it excludes all WAL
1224 * page headers. The mapping between "usable" byte positions and physical
1225 * positions (XLogRecPtrs) can be done outside the locked region, and
1226 * because the usable byte position doesn't include any headers, reserving
1227 * X bytes from WAL is almost as simple as "CurrBytePos += X".
1228 */
1229 SpinLockAcquire(&Insert->insertpos_lck);
1230
1231 startbytepos = Insert->CurrBytePos;
1232 endbytepos = startbytepos + size;
1233 prevbytepos = Insert->PrevBytePos;
1234 Insert->CurrBytePos = endbytepos;
1235 Insert->PrevBytePos = startbytepos;
1236
1237 SpinLockRelease(&Insert->insertpos_lck);
1238
1239 *StartPos = XLogBytePosToRecPtr(startbytepos);
1240 *EndPos = XLogBytePosToEndRecPtr(endbytepos);
1241 *PrevPtr = XLogBytePosToRecPtr(prevbytepos);
1242
1243 /*
1244 * Check that the conversions between "usable byte positions" and
1245 * XLogRecPtrs work consistently in both directions.
1246 */
1247 Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
1248 Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
1249 Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);
1250 }
1251
/*
 * Like ReserveXLogInsertLocation(), but for an xlog-switch record.
 *
 * A log-switch record is handled slightly differently. The rest of the
 * segment will be reserved for this insertion, as indicated by the returned
 * *EndPos value. However, if we are already at the beginning of the current
 * segment, *StartPos and *EndPos are set to the current location without
 * reserving any space, and the function returns false.
 */
static bool
ReserveXLogSwitch(XLogRecPtr *StartPos, XLogRecPtr *EndPos, XLogRecPtr *PrevPtr)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint64		startbytepos;
	uint64		endbytepos;
	uint64		prevbytepos;
	uint32		size = MAXALIGN(SizeOfXLogRecord);
	XLogRecPtr	ptr;
	uint32		segleft;

	/*
	 * These calculations are a bit heavy-weight to be done while holding a
	 * spinlock, but since we're holding all the WAL insertion locks, there
	 * are no other inserters competing for it. GetXLogInsertRecPtr() does
	 * compete for it, but that's not called very frequently.
	 */
	SpinLockAcquire(&Insert->insertpos_lck);

	startbytepos = Insert->CurrBytePos;

	/*
	 * If the insert position is already exactly at a segment boundary,
	 * there is nothing to do: release the lock early and tell the caller
	 * (via the false return) that no record needs to be written.
	 */
	ptr = XLogBytePosToEndRecPtr(startbytepos);
	if (ptr % XLOG_SEG_SIZE == 0)
	{
		SpinLockRelease(&Insert->insertpos_lck);
		*EndPos = *StartPos = ptr;
		return false;
	}

	endbytepos = startbytepos + size;
	prevbytepos = Insert->PrevBytePos;

	*StartPos = XLogBytePosToRecPtr(startbytepos);
	*EndPos = XLogBytePosToEndRecPtr(endbytepos);

	/* Extend the reservation to cover the remainder of the segment. */
	segleft = XLOG_SEG_SIZE - ((*EndPos) % XLOG_SEG_SIZE);
	if (segleft != XLOG_SEG_SIZE)
	{
		/* consume the rest of the segment */
		*EndPos += segleft;
		endbytepos = XLogRecPtrToBytePos(*EndPos);
	}
	Insert->CurrBytePos = endbytepos;
	Insert->PrevBytePos = startbytepos;

	SpinLockRelease(&Insert->insertpos_lck);

	/* prevbytepos was captured under the lock; converting it needn't be */
	*PrevPtr = XLogBytePosToRecPtr(prevbytepos);

	Assert((*EndPos) % XLOG_SEG_SIZE == 0);
	Assert(XLogRecPtrToBytePos(*EndPos) == endbytepos);
	Assert(XLogRecPtrToBytePos(*StartPos) == startbytepos);
	Assert(XLogRecPtrToBytePos(*PrevPtr) == prevbytepos);

	return true;
}
1317
/*
 * Subroutine of XLogInsertRecord.  Copies a WAL record to an already-reserved
 * area in the WAL.
 *
 * write_len is the total record length; rdata is the chain of data chunks;
 * StartPos/EndPos delimit the reserved space (EndPos is checked at the end
 * to verify the reservation matched what was written).
 */
static void
CopyXLogRecordToWAL(int write_len, bool isLogSwitch, XLogRecData *rdata,
					XLogRecPtr StartPos, XLogRecPtr EndPos)
{
	char	   *currpos;
	int			freespace;
	int			written;
	XLogRecPtr	CurrPos;
	XLogPageHeader pagehdr;

	/*
	 * Get a pointer to the right place in the right WAL buffer to start
	 * inserting to.
	 */
	CurrPos = StartPos;
	currpos = GetXLogBuffer(CurrPos);
	freespace = INSERT_FREESPACE(CurrPos);

	/*
	 * there should be enough space for at least the first field (xl_tot_len)
	 * on this page.
	 */
	Assert(freespace >= sizeof(uint32));

	/* Copy record data */
	written = 0;
	while (rdata != NULL)
	{
		char	   *rdata_data = rdata->data;
		int			rdata_len = rdata->len;

		/* This chunk may span page boundaries; handle each crossing. */
		while (rdata_len > freespace)
		{
			/*
			 * Write what fits on this page, and continue on the next page.
			 */
			Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || freespace == 0);
			memcpy(currpos, rdata_data, freespace);
			rdata_data += freespace;
			rdata_len -= freespace;
			written += freespace;
			CurrPos += freespace;

			/*
			 * Get pointer to beginning of next page, and set the xlp_rem_len
			 * in the page header. Set XLP_FIRST_IS_CONTRECORD.
			 *
			 * It's safe to set the contrecord flag and xlp_rem_len without a
			 * lock on the page. All the other flags were already set when the
			 * page was initialized, in AdvanceXLInsertBuffer, and we're the
			 * only backend that needs to set the contrecord flag.
			 */
			currpos = GetXLogBuffer(CurrPos);
			pagehdr = (XLogPageHeader) currpos;
			pagehdr->xlp_rem_len = write_len - written;
			pagehdr->xlp_info |= XLP_FIRST_IS_CONTRECORD;

			/* skip over the page header (long header at segment start) */
			if (CurrPos % XLogSegSize == 0)
			{
				CurrPos += SizeOfXLogLongPHD;
				currpos += SizeOfXLogLongPHD;
			}
			else
			{
				CurrPos += SizeOfXLogShortPHD;
				currpos += SizeOfXLogShortPHD;
			}
			freespace = INSERT_FREESPACE(CurrPos);
		}

		/* The remainder of this chunk fits on the current page. */
		Assert(CurrPos % XLOG_BLCKSZ >= SizeOfXLogShortPHD || rdata_len == 0);
		memcpy(currpos, rdata_data, rdata_len);
		currpos += rdata_len;
		CurrPos += rdata_len;
		freespace -= rdata_len;
		written += rdata_len;

		rdata = rdata->next;
	}
	Assert(written == write_len);

	/*
	 * If this was an xlog-switch, it's not enough to write the switch record,
	 * we also have to consume all the remaining space in the WAL segment. We
	 * have already reserved it for us, but we still need to make sure it's
	 * allocated and zeroed in the WAL buffers so that when the caller (or
	 * someone else) does XLogWrite(), it can really write out all the zeros.
	 */
	if (isLogSwitch && CurrPos % XLOG_SEG_SIZE != 0)
	{
		/* An xlog-switch record doesn't contain any data besides the header */
		Assert(write_len == SizeOfXLogRecord);

		/*
		 * We do this one page at a time, to make sure we don't deadlock
		 * against ourselves if wal_buffers < XLOG_SEG_SIZE.
		 */
		Assert(EndPos % XLogSegSize == 0);

		/* Use up all the remaining space on the first page */
		CurrPos += freespace;

		while (CurrPos < EndPos)
		{
			/* initialize the next page (if not initialized already) */
			WALInsertLockUpdateInsertingAt(CurrPos);
			AdvanceXLInsertBuffer(CurrPos, false);
			CurrPos += XLOG_BLCKSZ;
		}
	}
	else
	{
		/* Align the end position, so that the next record starts aligned */
		CurrPos = MAXALIGN64(CurrPos);
	}

	if (CurrPos != EndPos)
		elog(PANIC, "space reserved for WAL record does not match what was written");
}
1442
1443 /*
1444 * Acquire a WAL insertion lock, for inserting to WAL.
1445 */
1446 static void
WALInsertLockAcquire(void)1447 WALInsertLockAcquire(void)
1448 {
1449 bool immed;
1450
1451 /*
1452 * It doesn't matter which of the WAL insertion locks we acquire, so try
1453 * the one we used last time. If the system isn't particularly busy, it's
1454 * a good bet that it's still available, and it's good to have some
1455 * affinity to a particular lock so that you don't unnecessarily bounce
1456 * cache lines between processes when there's no contention.
1457 *
1458 * If this is the first time through in this backend, pick a lock
1459 * (semi-)randomly. This allows the locks to be used evenly if you have a
1460 * lot of very short connections.
1461 */
1462 static int lockToTry = -1;
1463
1464 if (lockToTry == -1)
1465 lockToTry = MyProc->pgprocno % NUM_XLOGINSERT_LOCKS;
1466 MyLockNo = lockToTry;
1467
1468 /*
1469 * The insertingAt value is initially set to 0, as we don't know our
1470 * insert location yet.
1471 */
1472 immed = LWLockAcquire(&WALInsertLocks[MyLockNo].l.lock, LW_EXCLUSIVE);
1473 if (!immed)
1474 {
1475 /*
1476 * If we couldn't get the lock immediately, try another lock next
1477 * time. On a system with more insertion locks than concurrent
1478 * inserters, this causes all the inserters to eventually migrate to a
1479 * lock that no-one else is using. On a system with more inserters
1480 * than locks, it still helps to distribute the inserters evenly
1481 * across the locks.
1482 */
1483 lockToTry = (lockToTry + 1) % NUM_XLOGINSERT_LOCKS;
1484 }
1485 }
1486
1487 /*
1488 * Acquire all WAL insertion locks, to prevent other backends from inserting
1489 * to WAL.
1490 */
1491 static void
WALInsertLockAcquireExclusive(void)1492 WALInsertLockAcquireExclusive(void)
1493 {
1494 int i;
1495
1496 /*
1497 * When holding all the locks, all but the last lock's insertingAt
1498 * indicator is set to 0xFFFFFFFFFFFFFFFF, which is higher than any real
1499 * XLogRecPtr value, to make sure that no-one blocks waiting on those.
1500 */
1501 for (i = 0; i < NUM_XLOGINSERT_LOCKS - 1; i++)
1502 {
1503 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1504 LWLockUpdateVar(&WALInsertLocks[i].l.lock,
1505 &WALInsertLocks[i].l.insertingAt,
1506 PG_UINT64_MAX);
1507 }
1508 /* Variable value reset to 0 at release */
1509 LWLockAcquire(&WALInsertLocks[i].l.lock, LW_EXCLUSIVE);
1510
1511 holdingAllLocks = true;
1512 }
1513
1514 /*
1515 * Release our insertion lock (or locks, if we're holding them all).
1516 *
1517 * NB: Reset all variables to 0, so they cause LWLockWaitForVar to block the
1518 * next time the lock is acquired.
1519 */
1520 static void
WALInsertLockRelease(void)1521 WALInsertLockRelease(void)
1522 {
1523 if (holdingAllLocks)
1524 {
1525 int i;
1526
1527 for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
1528 LWLockReleaseClearVar(&WALInsertLocks[i].l.lock,
1529 &WALInsertLocks[i].l.insertingAt,
1530 0);
1531
1532 holdingAllLocks = false;
1533 }
1534 else
1535 {
1536 LWLockReleaseClearVar(&WALInsertLocks[MyLockNo].l.lock,
1537 &WALInsertLocks[MyLockNo].l.insertingAt,
1538 0);
1539 }
1540 }
1541
1542 /*
1543 * Update our insertingAt value, to let others know that we've finished
1544 * inserting up to that point.
1545 */
1546 static void
WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)1547 WALInsertLockUpdateInsertingAt(XLogRecPtr insertingAt)
1548 {
1549 if (holdingAllLocks)
1550 {
1551 /*
1552 * We use the last lock to mark our actual position, see comments in
1553 * WALInsertLockAcquireExclusive.
1554 */
1555 LWLockUpdateVar(&WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.lock,
1556 &WALInsertLocks[NUM_XLOGINSERT_LOCKS - 1].l.insertingAt,
1557 insertingAt);
1558 }
1559 else
1560 LWLockUpdateVar(&WALInsertLocks[MyLockNo].l.lock,
1561 &WALInsertLocks[MyLockNo].l.insertingAt,
1562 insertingAt);
1563 }
1564
/*
 * Wait for any WAL insertions < upto to finish.
 *
 * Returns the location of the oldest insertion that is still in-progress.
 * Any WAL prior to that point has been fully copied into WAL buffers, and
 * can be flushed out to disk. Because this waits for any insertions older
 * than 'upto' to finish, the return value is always >= 'upto'.
 *
 * Note: When you are about to write out WAL, you must call this function
 * *before* acquiring WALWriteLock, to avoid deadlocks. This function might
 * need to wait for an insertion to finish (or at least advance to next
 * uninitialized page), and the inserter might need to evict an old WAL buffer
 * to make room for a new one, which in turn requires WALWriteLock.
 */
static XLogRecPtr
WaitXLogInsertionsToFinish(XLogRecPtr upto)
{
	uint64		bytepos;
	XLogRecPtr	reservedUpto;
	XLogRecPtr	finishedUpto;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			i;

	/* LWLockWaitForVar below needs a PGPROC to sleep on */
	if (MyProc == NULL)
		elog(PANIC, "cannot wait without a PGPROC structure");

	/* Read the current insert position */
	SpinLockAcquire(&Insert->insertpos_lck);
	bytepos = Insert->CurrBytePos;
	SpinLockRelease(&Insert->insertpos_lck);
	reservedUpto = XLogBytePosToEndRecPtr(bytepos);

	/*
	 * No-one should request to flush a piece of WAL that hasn't even been
	 * reserved yet. However, it can happen if there is a block with a bogus
	 * LSN on disk, for example. XLogFlush checks for that situation and
	 * complains, but only after the flush. Here we just assume that to mean
	 * that all WAL that has been reserved needs to be finished. In this
	 * corner-case, the return value can be smaller than 'upto' argument.
	 */
	if (upto > reservedUpto)
	{
		elog(LOG, "request to flush past end of generated WAL; request %X/%X, currpos %X/%X",
			 (uint32) (upto >> 32), (uint32) upto,
			 (uint32) (reservedUpto >> 32), (uint32) reservedUpto);
		upto = reservedUpto;
	}

	/*
	 * Loop through all the locks, sleeping on any in-progress insert older
	 * than 'upto'.
	 *
	 * finishedUpto is our return value, indicating the point upto which all
	 * the WAL insertions have been finished. Initialize it to the head of
	 * reserved WAL, and as we iterate through the insertion locks, back it
	 * out for any insertion that's still in progress.
	 */
	finishedUpto = reservedUpto;
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		XLogRecPtr	insertingat = InvalidXLogRecPtr;

		do
		{
			/*
			 * See if this insertion is in progress. LWLockWait will wait for
			 * the lock to be released, or for the 'value' to be set by a
			 * LWLockUpdateVar call.  When a lock is initially acquired, its
			 * value is 0 (InvalidXLogRecPtr), which means that we don't know
			 * where it's inserting yet.  We will have to wait for it.  If
			 * it's a small insertion, the record will most likely fit on the
			 * same page and the inserter will release the lock without ever
			 * calling LWLockUpdateVar.  But if it has to sleep, it will
			 * advertise the insertion point with LWLockUpdateVar before
			 * sleeping.
			 */
			if (LWLockWaitForVar(&WALInsertLocks[i].l.lock,
								 &WALInsertLocks[i].l.insertingAt,
								 insertingat, &insertingat))
			{
				/* the lock was free, so no insertion in progress */
				insertingat = InvalidXLogRecPtr;
				break;
			}

			/*
			 * This insertion is still in progress. Have to wait, unless the
			 * inserter has proceeded past 'upto'.
			 */
		} while (insertingat < upto);

		/* only an in-progress insertion (valid insertingat) can hold us back */
		if (insertingat != InvalidXLogRecPtr && insertingat < finishedUpto)
			finishedUpto = insertingat;
	}
	return finishedUpto;
}
1661
1662 /*
1663 * Get a pointer to the right location in the WAL buffer containing the
1664 * given XLogRecPtr.
1665 *
1666 * If the page is not initialized yet, it is initialized. That might require
1667 * evicting an old dirty buffer from the buffer cache, which means I/O.
1668 *
1669 * The caller must ensure that the page containing the requested location
1670 * isn't evicted yet, and won't be evicted. The way to ensure that is to
1671 * hold onto a WAL insertion lock with the insertingAt position set to
1672 * something <= ptr. GetXLogBuffer() will update insertingAt if it needs
1673 * to evict an old page from the buffer. (This means that once you call
1674 * GetXLogBuffer() with a given 'ptr', you must not access anything before
1675 * that point anymore, and must not call GetXLogBuffer() with an older 'ptr'
1676 * later, because older buffers might be recycled already)
1677 */
static char *
GetXLogBuffer(XLogRecPtr ptr)
{
	int			idx;
	XLogRecPtr	endptr;

	/*
	 * One-entry cache of the last page we returned. These are per-backend
	 * (static, not shared), so no locking is needed to read or update them.
	 */
	static uint64 cachedPage = 0;
	static char *cachedPos = NULL;
	XLogRecPtr	expectedEndPtr;

	/*
	 * Fast path for the common case that we need to access again the same
	 * page as last time.
	 */
	if (ptr / XLOG_BLCKSZ == cachedPage)
	{
		Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
		Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));
		return cachedPos + ptr % XLOG_BLCKSZ;
	}

	/*
	 * The XLog buffer cache is organized so that a page is always loaded to a
	 * particular buffer.  That way we can easily calculate the buffer a given
	 * page must be loaded into, from the XLogRecPtr alone.
	 */
	idx = XLogRecPtrToBufIdx(ptr);

	/*
	 * See what page is loaded in the buffer at the moment. It could be the
	 * page we're looking for, or something older. It can't be anything newer
	 * - that would imply the page we're looking for has already been written
	 * out to disk and evicted, and the caller is responsible for making sure
	 * that doesn't happen.
	 *
	 * However, we don't hold a lock while we read the value. If someone has
	 * just initialized the page, it's possible that we get a "torn read" of
	 * the XLogRecPtr if 64-bit fetches are not atomic on this platform. In
	 * that case we will see a bogus value. That's ok, we'll grab the mapping
	 * lock (in AdvanceXLInsertBuffer) and retry if we see anything else than
	 * the page we're looking for. But it means that when we do this unlocked
	 * read, we might see a value that appears to be ahead of the page we're
	 * looking for. Don't PANIC on that, until we've verified the value while
	 * holding the lock.
	 */
	/* xlblocks[idx] holds the end+1 address of the page loaded in slot idx */
	expectedEndPtr = ptr;
	expectedEndPtr += XLOG_BLCKSZ - ptr % XLOG_BLCKSZ;

	endptr = XLogCtl->xlblocks[idx];
	if (expectedEndPtr != endptr)
	{
		XLogRecPtr	initializedUpto;

		/*
		 * Before calling AdvanceXLInsertBuffer(), which can block, let others
		 * know how far we're finished with inserting the record.
		 *
		 * NB: If 'ptr' points to just after the page header, advertise a
		 * position at the beginning of the page rather than 'ptr' itself. If
		 * there are no other insertions running, someone might try to flush
		 * up to our advertised location. If we advertised a position after
		 * the page header, someone might try to flush the page header, even
		 * though page might actually not be initialized yet. As the first
		 * inserter on the page, we are effectively responsible for making
		 * sure that it's initialized, before we let insertingAt to move past
		 * the page header.
		 */
		/* short header on non-first pages of a segment... */
		if (ptr % XLOG_BLCKSZ == SizeOfXLogShortPHD &&
			ptr % XLOG_SEG_SIZE > XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogShortPHD;
		/* ...long header only on the first page of a segment */
		else if (ptr % XLOG_BLCKSZ == SizeOfXLogLongPHD &&
				 ptr % XLOG_SEG_SIZE < XLOG_BLCKSZ)
			initializedUpto = ptr - SizeOfXLogLongPHD;
		else
			initializedUpto = ptr;

		WALInsertLockUpdateInsertingAt(initializedUpto);

		AdvanceXLInsertBuffer(ptr, false);
		/* Re-read under the assumption AdvanceXLInsertBuffer installed it */
		endptr = XLogCtl->xlblocks[idx];

		if (expectedEndPtr != endptr)
			elog(PANIC, "could not find WAL buffer for %X/%X",
				 (uint32) (ptr >> 32), (uint32) ptr);
	}
	else
	{
		/*
		 * Make sure the initialization of the page is visible to us, and
		 * won't arrive later to overwrite the WAL data we write on the page.
		 */
		pg_memory_barrier();
	}

	/*
	 * Found the buffer holding this page. Return a pointer to the right
	 * offset within the page.
	 */
	/* Remember this page in the one-entry cache for next time. */
	cachedPage = ptr / XLOG_BLCKSZ;
	cachedPos = XLogCtl->pages + idx * (Size) XLOG_BLCKSZ;

	Assert(((XLogPageHeader) cachedPos)->xlp_magic == XLOG_PAGE_MAGIC);
	Assert(((XLogPageHeader) cachedPos)->xlp_pageaddr == ptr - (ptr % XLOG_BLCKSZ));

	return cachedPos + ptr % XLOG_BLCKSZ;
}
1783
1784 /*
1785 * Converts a "usable byte position" to XLogRecPtr. A usable byte position
1786 * is the position starting from the beginning of WAL, excluding all WAL
1787 * page headers.
1788 */
1789 static XLogRecPtr
XLogBytePosToRecPtr(uint64 bytepos)1790 XLogBytePosToRecPtr(uint64 bytepos)
1791 {
1792 uint64 fullsegs;
1793 uint64 fullpages;
1794 uint64 bytesleft;
1795 uint32 seg_offset;
1796 XLogRecPtr result;
1797
1798 fullsegs = bytepos / UsableBytesInSegment;
1799 bytesleft = bytepos % UsableBytesInSegment;
1800
1801 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1802 {
1803 /* fits on first page of segment */
1804 seg_offset = bytesleft + SizeOfXLogLongPHD;
1805 }
1806 else
1807 {
1808 /* account for the first page on segment with long header */
1809 seg_offset = XLOG_BLCKSZ;
1810 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1811
1812 fullpages = bytesleft / UsableBytesInPage;
1813 bytesleft = bytesleft % UsableBytesInPage;
1814
1815 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1816 }
1817
1818 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1819
1820 return result;
1821 }
1822
1823 /*
1824 * Like XLogBytePosToRecPtr, but if the position is at a page boundary,
1825 * returns a pointer to the beginning of the page (ie. before page header),
1826 * not to where the first xlog record on that page would go to. This is used
1827 * when converting a pointer to the end of a record.
1828 */
1829 static XLogRecPtr
XLogBytePosToEndRecPtr(uint64 bytepos)1830 XLogBytePosToEndRecPtr(uint64 bytepos)
1831 {
1832 uint64 fullsegs;
1833 uint64 fullpages;
1834 uint64 bytesleft;
1835 uint32 seg_offset;
1836 XLogRecPtr result;
1837
1838 fullsegs = bytepos / UsableBytesInSegment;
1839 bytesleft = bytepos % UsableBytesInSegment;
1840
1841 if (bytesleft < XLOG_BLCKSZ - SizeOfXLogLongPHD)
1842 {
1843 /* fits on first page of segment */
1844 if (bytesleft == 0)
1845 seg_offset = 0;
1846 else
1847 seg_offset = bytesleft + SizeOfXLogLongPHD;
1848 }
1849 else
1850 {
1851 /* account for the first page on segment with long header */
1852 seg_offset = XLOG_BLCKSZ;
1853 bytesleft -= XLOG_BLCKSZ - SizeOfXLogLongPHD;
1854
1855 fullpages = bytesleft / UsableBytesInPage;
1856 bytesleft = bytesleft % UsableBytesInPage;
1857
1858 if (bytesleft == 0)
1859 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft;
1860 else
1861 seg_offset += fullpages * XLOG_BLCKSZ + bytesleft + SizeOfXLogShortPHD;
1862 }
1863
1864 XLogSegNoOffsetToRecPtr(fullsegs, seg_offset, result);
1865
1866 return result;
1867 }
1868
1869 /*
1870 * Convert an XLogRecPtr to a "usable byte position".
1871 */
1872 static uint64
XLogRecPtrToBytePos(XLogRecPtr ptr)1873 XLogRecPtrToBytePos(XLogRecPtr ptr)
1874 {
1875 uint64 fullsegs;
1876 uint32 fullpages;
1877 uint32 offset;
1878 uint64 result;
1879
1880 XLByteToSeg(ptr, fullsegs);
1881
1882 fullpages = (ptr % XLOG_SEG_SIZE) / XLOG_BLCKSZ;
1883 offset = ptr % XLOG_BLCKSZ;
1884
1885 if (fullpages == 0)
1886 {
1887 result = fullsegs * UsableBytesInSegment;
1888 if (offset > 0)
1889 {
1890 Assert(offset >= SizeOfXLogLongPHD);
1891 result += offset - SizeOfXLogLongPHD;
1892 }
1893 }
1894 else
1895 {
1896 result = fullsegs * UsableBytesInSegment +
1897 (XLOG_BLCKSZ - SizeOfXLogLongPHD) + /* account for first page */
1898 (fullpages - 1) * UsableBytesInPage; /* full pages */
1899 if (offset > 0)
1900 {
1901 Assert(offset >= SizeOfXLogShortPHD);
1902 result += offset - SizeOfXLogShortPHD;
1903 }
1904 }
1905
1906 return result;
1907 }
1908
1909 /*
1910 * Initialize XLOG buffers, writing out old buffers if they still contain
1911 * unwritten data, upto the page containing 'upto'. Or if 'opportunistic' is
1912 * true, initialize as many pages as we can without having to write out
1913 * unwritten data. Any new pages are initialized to zeros, with pages headers
1914 * initialized properly.
1915 */
static void
AdvanceXLInsertBuffer(XLogRecPtr upto, bool opportunistic)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	int			nextidx;
	XLogRecPtr	OldPageRqstPtr;
	XLogwrtRqst WriteRqst;
	XLogRecPtr	NewPageEndPtr = InvalidXLogRecPtr;
	XLogRecPtr	NewPageBeginPtr;
	XLogPageHeader NewPage;
	int			npages = 0;

	LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);

	/*
	 * Now that we have the lock, check if someone initialized the page
	 * already.
	 */
	while (upto >= XLogCtl->InitializedUpTo || opportunistic)
	{
		/* Pages are initialized strictly in order; this is the next slot. */
		nextidx = XLogRecPtrToBufIdx(XLogCtl->InitializedUpTo);

		/*
		 * Get ending-offset of the buffer page we need to replace (this may
		 * be zero if the buffer hasn't been used yet).  Fall through if it's
		 * already written out.
		 */
		OldPageRqstPtr = XLogCtl->xlblocks[nextidx];
		if (LogwrtResult.Write < OldPageRqstPtr)
		{
			/*
			 * Nope, got work to do. If we just want to pre-initialize as much
			 * as we can without flushing, give up now.
			 */
			if (opportunistic)
				break;

			/* Before waiting, get info_lck and update LogwrtResult */
			SpinLockAcquire(&XLogCtl->info_lck);
			if (XLogCtl->LogwrtRqst.Write < OldPageRqstPtr)
				XLogCtl->LogwrtRqst.Write = OldPageRqstPtr;
			LogwrtResult = XLogCtl->LogwrtResult;
			SpinLockRelease(&XLogCtl->info_lck);

			/*
			 * Now that we have an up-to-date LogwrtResult value, see if we
			 * still need to write it or if someone else already did.
			 */
			if (LogwrtResult.Write < OldPageRqstPtr)
			{
				/*
				 * Must acquire write lock. Release WALBufMappingLock first,
				 * to make sure that all insertions that we need to wait for
				 * can finish (up to this same position). Otherwise we risk
				 * deadlock.
				 */
				LWLockRelease(WALBufMappingLock);

				WaitXLogInsertionsToFinish(OldPageRqstPtr);

				LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);

				LogwrtResult = XLogCtl->LogwrtResult;
				if (LogwrtResult.Write >= OldPageRqstPtr)
				{
					/* OK, someone wrote it already */
					LWLockRelease(WALWriteLock);
				}
				else
				{
					/* Have to write it ourselves */
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_START();
					WriteRqst.Write = OldPageRqstPtr;
					/* write only; no fsync needed to recycle the buffer */
					WriteRqst.Flush = 0;
					XLogWrite(WriteRqst, false);
					LWLockRelease(WALWriteLock);
					TRACE_POSTGRESQL_WAL_BUFFER_WRITE_DIRTY_DONE();
				}
				/* Re-acquire WALBufMappingLock and retry */
				LWLockAcquire(WALBufMappingLock, LW_EXCLUSIVE);
				continue;
			}
		}

		/*
		 * Now the next buffer slot is free and we can set it up to be the
		 * next output page.
		 */
		NewPageBeginPtr = XLogCtl->InitializedUpTo;
		NewPageEndPtr = NewPageBeginPtr + XLOG_BLCKSZ;

		Assert(XLogRecPtrToBufIdx(NewPageBeginPtr) == nextidx);

		NewPage = (XLogPageHeader) (XLogCtl->pages + nextidx * (Size) XLOG_BLCKSZ);

		/*
		 * Be sure to re-zero the buffer so that bytes beyond what we've
		 * written will look like zeroes and not valid XLOG records...
		 */
		MemSet((char *) NewPage, 0, XLOG_BLCKSZ);

		/*
		 * Fill the new page's header
		 */
		NewPage->xlp_magic = XLOG_PAGE_MAGIC;

		/* NewPage->xlp_info = 0; */	/* done by memset */
		NewPage->xlp_tli = ThisTimeLineID;
		NewPage->xlp_pageaddr = NewPageBeginPtr;

		/* NewPage->xlp_rem_len = 0; */	/* done by memset */

		/*
		 * If online backup is not in progress, mark the header to indicate
		 * that WAL records beginning in this page have removable backup
		 * blocks.  This allows the WAL archiver to know whether it is safe to
		 * compress archived WAL data by transforming full-block records into
		 * the non-full-block format.  It is sufficient to record this at the
		 * page level because we force a page switch (in fact a segment
		 * switch) when starting a backup, so the flag will be off before any
		 * records can be written during the backup.  At the end of a backup,
		 * the last page will be marked as all unsafe when perhaps only part
		 * is unsafe, but at worst the archiver would miss the opportunity to
		 * compress a few records.
		 */
		if (!Insert->forcePageWrites)
			NewPage->xlp_info |= XLP_BKP_REMOVABLE;

		/*
		 * If a record was found to be broken at the end of recovery, and
		 * we're going to write on the page where its first contrecord was
		 * lost, set the XLP_FIRST_IS_OVERWRITE_CONTRECORD flag on the page
		 * header.  See CreateOverwriteContrecordRecord().
		 */
		if (missingContrecPtr == NewPageBeginPtr)
		{
			NewPage->xlp_info |= XLP_FIRST_IS_OVERWRITE_CONTRECORD;
			missingContrecPtr = InvalidXLogRecPtr;
		}

		/*
		 * If first page of an XLOG segment file, make it a long header.
		 */
		if ((NewPage->xlp_pageaddr % XLogSegSize) == 0)
		{
			XLogLongPageHeader NewLongPage = (XLogLongPageHeader) NewPage;

			NewLongPage->xlp_sysid = ControlFile->system_identifier;
			NewLongPage->xlp_seg_size = XLogSegSize;
			NewLongPage->xlp_xlog_blcksz = XLOG_BLCKSZ;
			NewPage->xlp_info |= XLP_LONG_HEADER;
		}

		/*
		 * Make sure the initialization of the page becomes visible to others
		 * before the xlblocks update. GetXLogBuffer() reads xlblocks without
		 * holding a lock.
		 */
		pg_write_barrier();

		*((volatile XLogRecPtr *) &XLogCtl->xlblocks[nextidx]) = NewPageEndPtr;

		XLogCtl->InitializedUpTo = NewPageEndPtr;

		npages++;
	}
	LWLockRelease(WALBufMappingLock);

#ifdef WAL_DEBUG
	if (XLOG_DEBUG && npages > 0)
	{
		elog(DEBUG1, "initialized %d pages, up to %X/%X",
			 npages, (uint32) (NewPageEndPtr >> 32), (uint32) NewPageEndPtr);
	}
#endif
}
2092
2093 /*
2094 * Calculate CheckPointSegments based on max_wal_size and
2095 * checkpoint_completion_target.
2096 */
2097 static void
CalculateCheckpointSegments(void)2098 CalculateCheckpointSegments(void)
2099 {
2100 double target;
2101
2102 /*-------
2103 * Calculate the distance at which to trigger a checkpoint, to avoid
2104 * exceeding max_wal_size. This is based on two assumptions:
2105 *
2106 * a) we keep WAL for two checkpoint cycles, back to the "prev" checkpoint.
2107 * b) during checkpoint, we consume checkpoint_completion_target *
2108 * number of segments consumed between checkpoints.
2109 *-------
2110 */
2111 target = (double) max_wal_size / (2.0 + CheckPointCompletionTarget);
2112
2113 /* round down */
2114 CheckPointSegments = (int) target;
2115
2116 if (CheckPointSegments < 1)
2117 CheckPointSegments = 1;
2118 }
2119
/*
 * GUC assign hook for max_wal_size: store the new value and re-derive
 * CheckPointSegments, which depends on it.
 */
void
assign_max_wal_size(int newval, void *extra)
{
	max_wal_size = newval;
	CalculateCheckpointSegments();
}
2126
/*
 * GUC assign hook for checkpoint_completion_target: store the new value and
 * re-derive CheckPointSegments, which depends on it.
 */
void
assign_checkpoint_completion_target(double newval, void *extra)
{
	CheckPointCompletionTarget = newval;
	CalculateCheckpointSegments();
}
2133
2134 /*
2135 * At a checkpoint, how many WAL segments to recycle as preallocated future
2136 * XLOG segments? Returns the highest segment that should be preallocated.
2137 */
2138 static XLogSegNo
XLOGfileslop(XLogRecPtr PriorRedoPtr)2139 XLOGfileslop(XLogRecPtr PriorRedoPtr)
2140 {
2141 XLogSegNo minSegNo;
2142 XLogSegNo maxSegNo;
2143 double distance;
2144 XLogSegNo recycleSegNo;
2145
2146 /*
2147 * Calculate the segment numbers that min_wal_size and max_wal_size
2148 * correspond to. Always recycle enough segments to meet the minimum, and
2149 * remove enough segments to stay below the maximum.
2150 */
2151 minSegNo = PriorRedoPtr / XLOG_SEG_SIZE + min_wal_size - 1;
2152 maxSegNo = PriorRedoPtr / XLOG_SEG_SIZE + max_wal_size - 1;
2153
2154 /*
2155 * Between those limits, recycle enough segments to get us through to the
2156 * estimated end of next checkpoint.
2157 *
2158 * To estimate where the next checkpoint will finish, assume that the
2159 * system runs steadily consuming CheckPointDistanceEstimate bytes between
2160 * every checkpoint.
2161 *
2162 * The reason this calculation is done from the prior checkpoint, not the
2163 * one that just finished, is that this behaves better if some checkpoint
2164 * cycles are abnormally short, like if you perform a manual checkpoint
2165 * right after a timed one. The manual checkpoint will make almost a full
2166 * cycle's worth of WAL segments available for recycling, because the
2167 * segments from the prior's prior, fully-sized checkpoint cycle are no
2168 * longer needed. However, the next checkpoint will make only few segments
2169 * available for recycling, the ones generated between the timed
2170 * checkpoint and the manual one right after that. If at the manual
2171 * checkpoint we only retained enough segments to get us to the next timed
2172 * one, and removed the rest, then at the next checkpoint we would not
2173 * have enough segments around for recycling, to get us to the checkpoint
2174 * after that. Basing the calculations on the distance from the prior redo
2175 * pointer largely fixes that problem.
2176 */
2177 distance = (2.0 + CheckPointCompletionTarget) * CheckPointDistanceEstimate;
2178 /* add 10% for good measure. */
2179 distance *= 1.10;
2180
2181 recycleSegNo = (XLogSegNo) ceil(((double) PriorRedoPtr + distance) / XLOG_SEG_SIZE);
2182
2183 if (recycleSegNo < minSegNo)
2184 recycleSegNo = minSegNo;
2185 if (recycleSegNo > maxSegNo)
2186 recycleSegNo = maxSegNo;
2187
2188 return recycleSegNo;
2189 }
2190
2191 /*
2192 * Check whether we've consumed enough xlog space that a checkpoint is needed.
2193 *
2194 * new_segno indicates a log file that has just been filled up (or read
2195 * during recovery). We measure the distance from RedoRecPtr to new_segno
2196 * and see if that exceeds CheckPointSegments.
2197 *
2198 * Note: it is caller's responsibility that RedoRecPtr is up-to-date.
2199 */
2200 static bool
XLogCheckpointNeeded(XLogSegNo new_segno)2201 XLogCheckpointNeeded(XLogSegNo new_segno)
2202 {
2203 XLogSegNo old_segno;
2204
2205 XLByteToSeg(RedoRecPtr, old_segno);
2206
2207 if (new_segno >= old_segno + (uint64) (CheckPointSegments - 1))
2208 return true;
2209 return false;
2210 }
2211
2212 /*
2213 * Write and/or fsync the log at least as far as WriteRqst indicates.
2214 *
2215 * If flexible == TRUE, we don't have to write as far as WriteRqst, but
2216 * may stop at any convenient boundary (such as a cache or logfile boundary).
2217 * This option allows us to avoid uselessly issuing multiple writes when a
2218 * single one would do.
2219 *
2220 * Must be called with WALWriteLock held. WaitXLogInsertionsToFinish(WriteRqst)
2221 * must be called before grabbing the lock, to make sure the data is ready to
2222 * write.
2223 */
static void
XLogWrite(XLogwrtRqst WriteRqst, bool flexible)
{
	bool		ispartialpage;
	bool		last_iteration;
	bool		finishing_seg;
	bool		use_existent;
	int			curridx;
	int			npages;
	int			startidx;
	uint32		startoffset;

	/* We should always be inside a critical section here */
	Assert(CritSectionCount > 0);

	/*
	 * Update local LogwrtResult (caller probably did this already, but...)
	 */
	LogwrtResult = XLogCtl->LogwrtResult;

	/*
	 * Since successive pages in the xlog cache are consecutively allocated,
	 * we can usually gather multiple pages together and issue just one
	 * write() call.  npages is the number of pages we have determined can be
	 * written together; startidx is the cache block index of the first one,
	 * and startoffset is the file offset at which it should go. The latter
	 * two variables are only valid when npages > 0, but we must initialize
	 * all of them to keep the compiler quiet.
	 */
	npages = 0;
	startidx = 0;
	startoffset = 0;

	/*
	 * Within the loop, curridx is the cache block index of the page to
	 * consider writing.  Begin at the buffer containing the next unwritten
	 * page, or last partially written page.
	 */
	curridx = XLogRecPtrToBufIdx(LogwrtResult.Write);

	while (LogwrtResult.Write < WriteRqst.Write)
	{
		/*
		 * Make sure we're not ahead of the insert process.  This could happen
		 * if we're passed a bogus WriteRqst.Write that is past the end of the
		 * last page that's been initialized by AdvanceXLInsertBuffer.
		 */
		XLogRecPtr	EndPtr = XLogCtl->xlblocks[curridx];

		if (LogwrtResult.Write >= EndPtr)
			elog(PANIC, "xlog write request %X/%X is past end of log %X/%X",
				 (uint32) (LogwrtResult.Write >> 32),
				 (uint32) LogwrtResult.Write,
				 (uint32) (EndPtr >> 32), (uint32) EndPtr);

		/* Advance LogwrtResult.Write to end of current buffer page */
		LogwrtResult.Write = EndPtr;
		ispartialpage = WriteRqst.Write < LogwrtResult.Write;

		if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
		{
			/*
			 * Switch to new logfile segment.  We cannot have any pending
			 * pages here (since we dump what we have at segment end).
			 */
			Assert(npages == 0);
			if (openLogFile >= 0)
				XLogFileClose();
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);

			/* create/use new log file */
			use_existent = true;
			openLogFile = XLogFileInit(openLogSegNo, &use_existent, true);
			openLogOff = 0;
		}

		/* Make sure we have the current logfile open */
		if (openLogFile < 0)
		{
			XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
			openLogFile = XLogFileOpen(openLogSegNo);
			openLogOff = 0;
		}

		/* Add current page to the set of pending pages-to-dump */
		if (npages == 0)
		{
			/* first of group */
			startidx = curridx;
			startoffset = (LogwrtResult.Write - XLOG_BLCKSZ) % XLogSegSize;
		}
		npages++;

		/*
		 * Dump the set if this will be the last loop iteration, or if we are
		 * at the last page of the cache area (since the next page won't be
		 * contiguous in memory), or if we are at the end of the logfile
		 * segment.
		 */
		last_iteration = WriteRqst.Write <= LogwrtResult.Write;

		finishing_seg = !ispartialpage &&
			(startoffset + npages * XLOG_BLCKSZ) >= XLogSegSize;

		if (last_iteration ||
			curridx == XLogCtl->XLogCacheBlck ||
			finishing_seg)
		{
			char	   *from;
			Size		nbytes;
			Size		nleft;
			int			written;

			/* Need to seek in the file? */
			if (openLogOff != startoffset)
			{
				if (lseek(openLogFile, (off_t) startoffset, SEEK_SET) < 0)
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not seek in log file %s to offset %u: %m",
									XLogFileNameP(ThisTimeLineID, openLogSegNo),
									startoffset)));
				openLogOff = startoffset;
			}

			/* OK to write the page(s) */
			from = XLogCtl->pages + startidx * (Size) XLOG_BLCKSZ;
			nbytes = npages * (Size) XLOG_BLCKSZ;
			nleft = nbytes;
			do
			{
				/* clear errno so a zero-byte write isn't misreported */
				errno = 0;
				written = write(openLogFile, from, nleft);
				if (written <= 0)
				{
					/* retry the write if interrupted by a signal */
					if (errno == EINTR)
						continue;
					ereport(PANIC,
							(errcode_for_file_access(),
							 errmsg("could not write to log file %s "
									"at offset %u, length %zu: %m",
									XLogFileNameP(ThisTimeLineID, openLogSegNo),
									openLogOff, nbytes)));
				}
				/* handle short writes by looping on the remainder */
				nleft -= written;
				from += written;
			} while (nleft > 0);

			/* Update state for write */
			openLogOff += nbytes;
			npages = 0;

			/*
			 * If we just wrote the whole last page of a logfile segment,
			 * fsync the segment immediately.  This avoids having to go back
			 * and re-open prior segments when an fsync request comes along
			 * later. Doing it here ensures that one and only one backend will
			 * perform this fsync.
			 *
			 * This is also the right place to notify the Archiver that the
			 * segment is ready to copy to archival storage, and to update the
			 * timer for archive_timeout, and to signal for a checkpoint if
			 * too many logfile segments have been used since the last
			 * checkpoint.
			 */
			if (finishing_seg)
			{
				issue_xlog_fsync(openLogFile, openLogSegNo);

				/* signal that we need to wakeup walsenders later */
				WalSndWakeupRequest();

				LogwrtResult.Flush = LogwrtResult.Write;		/* end of page */

				if (XLogArchivingActive())
					XLogArchiveNotifySeg(openLogSegNo);

				XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);

				/*
				 * Request a checkpoint if we've consumed too much xlog since
				 * the last one.  For speed, we first check using the local
				 * copy of RedoRecPtr, which might be out of date; if it looks
				 * like a checkpoint is needed, forcibly update RedoRecPtr and
				 * recheck.
				 */
				if (IsUnderPostmaster && XLogCheckpointNeeded(openLogSegNo))
				{
					(void) GetRedoRecPtr();
					if (XLogCheckpointNeeded(openLogSegNo))
						RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
				}
			}
		}

		if (ispartialpage)
		{
			/* Only asked to write a partial page */
			LogwrtResult.Write = WriteRqst.Write;
			break;
		}
		curridx = NextBufIdx(curridx);

		/* If flexible, break out of loop as soon as we wrote something */
		if (flexible && npages == 0)
			break;
	}

	Assert(npages == 0);

	/*
	 * If asked to flush, do so
	 */
	if (LogwrtResult.Flush < WriteRqst.Flush &&
		LogwrtResult.Flush < LogwrtResult.Write)

	{
		/*
		 * Could get here without iterating above loop, in which case we might
		 * have no open file or the wrong one.  However, we do not need to
		 * fsync more than one file.
		 */
		if (sync_method != SYNC_METHOD_OPEN &&
			sync_method != SYNC_METHOD_OPEN_DSYNC)
		{
			if (openLogFile >= 0 &&
				!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
				XLogFileClose();
			if (openLogFile < 0)
			{
				XLByteToPrevSeg(LogwrtResult.Write, openLogSegNo);
				openLogFile = XLogFileOpen(openLogSegNo);
				openLogOff = 0;
			}

			issue_xlog_fsync(openLogFile, openLogSegNo);
		}

		/* signal that we need to wakeup walsenders later */
		WalSndWakeupRequest();

		LogwrtResult.Flush = LogwrtResult.Write;
	}

	/*
	 * Update shared-memory status
	 *
	 * We make sure that the shared 'request' values do not fall behind the
	 * 'result' values.  This is not absolutely essential, but it saves some
	 * code in a couple of places.
	 */
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->LogwrtResult = LogwrtResult;
		if (XLogCtl->LogwrtRqst.Write < LogwrtResult.Write)
			XLogCtl->LogwrtRqst.Write = LogwrtResult.Write;
		if (XLogCtl->LogwrtRqst.Flush < LogwrtResult.Flush)
			XLogCtl->LogwrtRqst.Flush = LogwrtResult.Flush;
		SpinLockRelease(&XLogCtl->info_lck);
	}
}
2485
2486 /*
2487 * Record the LSN for an asynchronous transaction commit/abort
2488 * and nudge the WALWriter if there is work for it to do.
2489 * (This should not be called for synchronous commits.)
2490 */
void
XLogSetAsyncXactLSN(XLogRecPtr asyncXactLSN)
{
	XLogRecPtr	WriteRqstPtr = asyncXactLSN;
	bool		sleeping;

	/* Under info_lck: refresh local LogwrtResult and advance asyncXactLSN. */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	sleeping = XLogCtl->WalWriterSleeping;
	/* only move the shared async LSN forward, never backward */
	if (XLogCtl->asyncXactLSN < asyncXactLSN)
		XLogCtl->asyncXactLSN = asyncXactLSN;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * If the WALWriter is sleeping, we should kick it to make it come out of
	 * low-power mode.  Otherwise, determine whether there's a full page of
	 * WAL available to write.
	 */
	if (!sleeping)
	{
		/* back off to last completed page boundary */
		WriteRqstPtr -= WriteRqstPtr % XLOG_BLCKSZ;

		/* if we have already flushed that far, we're done */
		if (WriteRqstPtr <= LogwrtResult.Flush)
			return;
	}

	/*
	 * Nudge the WALWriter: it has a full page of WAL to write, or we want it
	 * to come out of low-power mode so that this async commit will reach disk
	 * within the expected amount of time.
	 */
	/* latch may not exist yet if the walwriter hasn't started */
	if (ProcGlobal->walwriterLatch)
		SetLatch(ProcGlobal->walwriterLatch);
}
2527
2528 /*
2529 * Record the LSN up to which we can remove WAL because it's not required by
2530 * any replication slot.
2531 */
void
XLogSetReplicationSlotMinimumLSN(XLogRecPtr lsn)
{
	/* replicationSlotMinLSN is protected by info_lck */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->replicationSlotMinLSN = lsn;
	SpinLockRelease(&XLogCtl->info_lck);
}
2539
2540
2541 /*
2542 * Return the oldest LSN we must retain to satisfy the needs of some
2543 * replication slot.
2544 */
2545 static XLogRecPtr
XLogGetReplicationSlotMinimumLSN(void)2546 XLogGetReplicationSlotMinimumLSN(void)
2547 {
2548 XLogRecPtr retval;
2549
2550 SpinLockAcquire(&XLogCtl->info_lck);
2551 retval = XLogCtl->replicationSlotMinLSN;
2552 SpinLockRelease(&XLogCtl->info_lck);
2553
2554 return retval;
2555 }
2556
2557 /*
2558 * Advance minRecoveryPoint in control file.
2559 *
2560 * If we crash during recovery, we must reach this point again before the
2561 * database is consistent.
2562 *
2563 * If 'force' is true, 'lsn' argument is ignored. Otherwise, minRecoveryPoint
2564 * is only updated if it's not already greater than or equal to 'lsn'.
2565 */
static void
UpdateMinRecoveryPoint(XLogRecPtr lsn, bool force)
{
	/* Quick check using our local copy of the variable */
	if (!updateMinRecoveryPoint || (!force && lsn <= minRecoveryPoint))
		return;

	/*
	 * An invalid minRecoveryPoint means that we need to recover all the WAL,
	 * i.e., we're doing crash recovery.  We never modify the control file's
	 * value in that case, so we can short-circuit future checks here too. The
	 * local values of minRecoveryPoint and minRecoveryPointTLI should not be
	 * updated until crash recovery finishes.  We only do this for the startup
	 * process as it should not update its own reference of minRecoveryPoint
	 * until it has finished crash recovery to make sure that all WAL
	 * available is replayed in this case.  This also saves from extra locks
	 * taken on the control file from the startup process.
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
	{
		updateMinRecoveryPoint = false;
		return;
	}

	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

	/* update local copy */
	minRecoveryPoint = ControlFile->minRecoveryPoint;
	minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		updateMinRecoveryPoint = false;
	else if (force || minRecoveryPoint < lsn)
	{
		XLogRecPtr	newMinRecoveryPoint;
		TimeLineID	newMinRecoveryPointTLI;

		/*
		 * To avoid having to update the control file too often, we update it
		 * all the way to the last record being replayed, even though 'lsn'
		 * would suffice for correctness.  This also allows the 'force' case
		 * to not need a valid 'lsn' value.
		 *
		 * Another important reason for doing it this way is that the passed
		 * 'lsn' value could be bogus, i.e., past the end of available WAL, if
		 * the caller got it from a corrupted heap page.  Accepting such a
		 * value as the min recovery point would prevent us from coming up at
		 * all.  Instead, we just log a warning and continue with recovery.
		 * (See also the comments about corrupt LSNs in XLogFlush.)
		 */
		SpinLockAcquire(&XLogCtl->info_lck);
		newMinRecoveryPoint = XLogCtl->replayEndRecPtr;
		newMinRecoveryPointTLI = XLogCtl->replayEndTLI;
		SpinLockRelease(&XLogCtl->info_lck);

		if (!force && newMinRecoveryPoint < lsn)
			elog(WARNING,
				 "xlog min recovery request %X/%X is past current point %X/%X",
				 (uint32) (lsn >> 32), (uint32) lsn,
				 (uint32) (newMinRecoveryPoint >> 32),
				 (uint32) newMinRecoveryPoint);

		/* update control file */
		if (ControlFile->minRecoveryPoint < newMinRecoveryPoint)
		{
			ControlFile->minRecoveryPoint = newMinRecoveryPoint;
			ControlFile->minRecoveryPointTLI = newMinRecoveryPointTLI;
			UpdateControlFile();
			/* keep the local copies in sync with what we just persisted */
			minRecoveryPoint = newMinRecoveryPoint;
			minRecoveryPointTLI = newMinRecoveryPointTLI;

			ereport(DEBUG2,
					(errmsg("updated min recovery point to %X/%X on timeline %u",
							(uint32) (minRecoveryPoint >> 32),
							(uint32) minRecoveryPoint,
							newMinRecoveryPointTLI)));
		}
	}
	LWLockRelease(ControlFileLock);
}
2646
2647 /*
2648 * Ensure that all XLOG data through the given position is flushed to disk.
2649 *
2650 * NOTE: this differs from XLogWrite mainly in that the WALWriteLock is not
2651 * already held, and we try to avoid acquiring it if possible.
2652 */
2653 void
XLogFlush(XLogRecPtr record)2654 XLogFlush(XLogRecPtr record)
2655 {
2656 XLogRecPtr WriteRqstPtr;
2657 XLogwrtRqst WriteRqst;
2658
2659 /*
2660 * During REDO, we are reading not writing WAL. Therefore, instead of
2661 * trying to flush the WAL, we should update minRecoveryPoint instead. We
2662 * test XLogInsertAllowed(), not InRecovery, because we need checkpointer
2663 * to act this way too, and because when it tries to write the
2664 * end-of-recovery checkpoint, it should indeed flush.
2665 */
2666 if (!XLogInsertAllowed())
2667 {
2668 UpdateMinRecoveryPoint(record, false);
2669 return;
2670 }
2671
2672 /* Quick exit if already known flushed */
2673 if (record <= LogwrtResult.Flush)
2674 return;
2675
2676 #ifdef WAL_DEBUG
2677 if (XLOG_DEBUG)
2678 elog(LOG, "xlog flush request %X/%X; write %X/%X; flush %X/%X",
2679 (uint32) (record >> 32), (uint32) record,
2680 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2681 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2682 #endif
2683
2684 START_CRIT_SECTION();
2685
2686 /*
2687 * Since fsync is usually a horribly expensive operation, we try to
2688 * piggyback as much data as we can on each fsync: if we see any more data
2689 * entered into the xlog buffer, we'll write and fsync that too, so that
2690 * the final value of LogwrtResult.Flush is as large as possible. This
2691 * gives us some chance of avoiding another fsync immediately after.
2692 */
2693
2694 /* initialize to given target; may increase below */
2695 WriteRqstPtr = record;
2696
2697 /*
2698 * Now wait until we get the write lock, or someone else does the flush
2699 * for us.
2700 */
2701 for (;;)
2702 {
2703 XLogRecPtr insertpos;
2704
2705 /* read LogwrtResult and update local state */
2706 SpinLockAcquire(&XLogCtl->info_lck);
2707 if (WriteRqstPtr < XLogCtl->LogwrtRqst.Write)
2708 WriteRqstPtr = XLogCtl->LogwrtRqst.Write;
2709 LogwrtResult = XLogCtl->LogwrtResult;
2710 SpinLockRelease(&XLogCtl->info_lck);
2711
2712 /* done already? */
2713 if (record <= LogwrtResult.Flush)
2714 break;
2715
2716 /*
2717 * Before actually performing the write, wait for all in-flight
2718 * insertions to the pages we're about to write to finish.
2719 */
2720 insertpos = WaitXLogInsertionsToFinish(WriteRqstPtr);
2721
2722 /*
2723 * Try to get the write lock. If we can't get it immediately, wait
2724 * until it's released, and recheck if we still need to do the flush
2725 * or if the backend that held the lock did it for us already. This
2726 * helps to maintain a good rate of group committing when the system
2727 * is bottlenecked by the speed of fsyncing.
2728 */
2729 if (!LWLockAcquireOrWait(WALWriteLock, LW_EXCLUSIVE))
2730 {
2731 /*
2732 * The lock is now free, but we didn't acquire it yet. Before we
2733 * do, loop back to check if someone else flushed the record for
2734 * us already.
2735 */
2736 continue;
2737 }
2738
2739 /* Got the lock; recheck whether request is satisfied */
2740 LogwrtResult = XLogCtl->LogwrtResult;
2741 if (record <= LogwrtResult.Flush)
2742 {
2743 LWLockRelease(WALWriteLock);
2744 break;
2745 }
2746
2747 /*
2748 * Sleep before flush! By adding a delay here, we may give further
2749 * backends the opportunity to join the backlog of group commit
2750 * followers; this can significantly improve transaction throughput,
2751 * at the risk of increasing transaction latency.
2752 *
2753 * We do not sleep if enableFsync is not turned on, nor if there are
2754 * fewer than CommitSiblings other backends with active transactions.
2755 */
2756 if (CommitDelay > 0 && enableFsync &&
2757 MinimumActiveBackends(CommitSiblings))
2758 {
2759 pg_usleep(CommitDelay);
2760
2761 /*
2762 * Re-check how far we can now flush the WAL. It's generally not
2763 * safe to call WaitXLogInsertionsToFinish while holding
2764 * WALWriteLock, because an in-progress insertion might need to
2765 * also grab WALWriteLock to make progress. But we know that all
2766 * the insertions up to insertpos have already finished, because
2767 * that's what the earlier WaitXLogInsertionsToFinish() returned.
2768 * We're only calling it again to allow insertpos to be moved
2769 * further forward, not to actually wait for anyone.
2770 */
2771 insertpos = WaitXLogInsertionsToFinish(insertpos);
2772 }
2773
2774 /* try to write/flush later additions to XLOG as well */
2775 WriteRqst.Write = insertpos;
2776 WriteRqst.Flush = insertpos;
2777
2778 XLogWrite(WriteRqst, false);
2779
2780 LWLockRelease(WALWriteLock);
2781 /* done */
2782 break;
2783 }
2784
2785 END_CRIT_SECTION();
2786
2787 /* wake up walsenders now that we've released heavily contended locks */
2788 WalSndWakeupProcessRequests();
2789
2790 /*
2791 * If we still haven't flushed to the request point then we have a
2792 * problem; most likely, the requested flush point is past end of XLOG.
2793 * This has been seen to occur when a disk page has a corrupted LSN.
2794 *
2795 * Formerly we treated this as a PANIC condition, but that hurts the
2796 * system's robustness rather than helping it: we do not want to take down
2797 * the whole system due to corruption on one data page. In particular, if
2798 * the bad page is encountered again during recovery then we would be
2799 * unable to restart the database at all! (This scenario actually
2800 * happened in the field several times with 7.1 releases.) As of 8.4, bad
2801 * LSNs encountered during recovery are UpdateMinRecoveryPoint's problem;
2802 * the only time we can reach here during recovery is while flushing the
2803 * end-of-recovery checkpoint record, and we don't expect that to have a
2804 * bad LSN.
2805 *
2806 * Note that for calls from xact.c, the ERROR will be promoted to PANIC
2807 * since xact.c calls this routine inside a critical section. However,
2808 * calls from bufmgr.c are not within critical sections and so we will not
2809 * force a restart for a bad LSN on a data page.
2810 */
2811 if (LogwrtResult.Flush < record)
2812 elog(ERROR,
2813 "xlog flush request %X/%X is not satisfied --- flushed only to %X/%X",
2814 (uint32) (record >> 32), (uint32) record,
2815 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2816 }
2817
2818 /*
2819 * Write & flush xlog, but without specifying exactly where to.
2820 *
2821 * We normally write only completed blocks; but if there is nothing to do on
2822 * that basis, we check for unwritten async commits in the current incomplete
2823 * block, and write through the latest one of those. Thus, if async commits
2824 * are not being used, we will write complete blocks only.
2825 *
2826 * If, based on the above, there's anything to write we do so immediately. But
2827 * to avoid calling fsync, fdatasync et. al. at a rate that'd impact
2828 * concurrent IO, we only flush WAL every wal_writer_delay ms, or if there's
2829 * more than wal_writer_flush_after unflushed blocks.
2830 *
2831 * We can guarantee that async commits reach disk after at most three
2832 * wal_writer_delay cycles. (When flushing complete blocks, we allow XLogWrite
2833 * to write "flexibly", meaning it can stop at the end of the buffer ring;
2834 * this makes a difference only with very high load or long wal_writer_delay,
2835 * but imposes one extra cycle for the worst case for async commits.)
2836 *
2837 * This routine is invoked periodically by the background walwriter process.
2838 *
2839 * Returns TRUE if there was any work to do, even if we skipped flushing due
2840 * to wal_writer_delay/wal_writer_flush_after.
2841 */
2842 bool
XLogBackgroundFlush(void)2843 XLogBackgroundFlush(void)
2844 {
2845 XLogwrtRqst WriteRqst;
2846 bool flexible = true;
2847 static TimestampTz lastflush;
2848 TimestampTz now;
2849 int flushbytes;
2850
2851 /* XLOG doesn't need flushing during recovery */
2852 if (RecoveryInProgress())
2853 return false;
2854
2855 /* read LogwrtResult and update local state */
2856 SpinLockAcquire(&XLogCtl->info_lck);
2857 LogwrtResult = XLogCtl->LogwrtResult;
2858 WriteRqst = XLogCtl->LogwrtRqst;
2859 SpinLockRelease(&XLogCtl->info_lck);
2860
2861 /* back off to last completed page boundary */
2862 WriteRqst.Write -= WriteRqst.Write % XLOG_BLCKSZ;
2863
2864 /* if we have already flushed that far, consider async commit records */
2865 if (WriteRqst.Write <= LogwrtResult.Flush)
2866 {
2867 SpinLockAcquire(&XLogCtl->info_lck);
2868 WriteRqst.Write = XLogCtl->asyncXactLSN;
2869 SpinLockRelease(&XLogCtl->info_lck);
2870 flexible = false; /* ensure it all gets written */
2871 }
2872
2873 /*
2874 * If already known flushed, we're done. Just need to check if we are
2875 * holding an open file handle to a logfile that's no longer in use,
2876 * preventing the file from being deleted.
2877 */
2878 if (WriteRqst.Write <= LogwrtResult.Flush)
2879 {
2880 if (openLogFile >= 0)
2881 {
2882 if (!XLByteInPrevSeg(LogwrtResult.Write, openLogSegNo))
2883 {
2884 XLogFileClose();
2885 }
2886 }
2887 return false;
2888 }
2889
2890 /*
2891 * Determine how far to flush WAL, based on the wal_writer_delay and
2892 * wal_writer_flush_after GUCs.
2893 */
2894 now = GetCurrentTimestamp();
2895 flushbytes =
2896 WriteRqst.Write / XLOG_BLCKSZ - LogwrtResult.Flush / XLOG_BLCKSZ;
2897
2898 if (WalWriterFlushAfter == 0 || lastflush == 0)
2899 {
2900 /* first call, or block based limits disabled */
2901 WriteRqst.Flush = WriteRqst.Write;
2902 lastflush = now;
2903 }
2904 else if (TimestampDifferenceExceeds(lastflush, now, WalWriterDelay))
2905 {
2906 /*
2907 * Flush the writes at least every WalWriteDelay ms. This is important
2908 * to bound the amount of time it takes for an asynchronous commit to
2909 * hit disk.
2910 */
2911 WriteRqst.Flush = WriteRqst.Write;
2912 lastflush = now;
2913 }
2914 else if (flushbytes >= WalWriterFlushAfter)
2915 {
2916 /* exceeded wal_writer_flush_after blocks, flush */
2917 WriteRqst.Flush = WriteRqst.Write;
2918 lastflush = now;
2919 }
2920 else
2921 {
2922 /* no flushing, this time round */
2923 WriteRqst.Flush = 0;
2924 }
2925
2926 #ifdef WAL_DEBUG
2927 if (XLOG_DEBUG)
2928 elog(LOG, "xlog bg flush request write %X/%X; flush: %X/%X, current is write %X/%X; flush %X/%X",
2929 (uint32) (WriteRqst.Write >> 32), (uint32) WriteRqst.Write,
2930 (uint32) (WriteRqst.Flush >> 32), (uint32) WriteRqst.Flush,
2931 (uint32) (LogwrtResult.Write >> 32), (uint32) LogwrtResult.Write,
2932 (uint32) (LogwrtResult.Flush >> 32), (uint32) LogwrtResult.Flush);
2933 #endif
2934
2935 START_CRIT_SECTION();
2936
2937 /* now wait for any in-progress insertions to finish and get write lock */
2938 WaitXLogInsertionsToFinish(WriteRqst.Write);
2939 LWLockAcquire(WALWriteLock, LW_EXCLUSIVE);
2940 LogwrtResult = XLogCtl->LogwrtResult;
2941 if (WriteRqst.Write > LogwrtResult.Write ||
2942 WriteRqst.Flush > LogwrtResult.Flush)
2943 {
2944 XLogWrite(WriteRqst, flexible);
2945 }
2946 LWLockRelease(WALWriteLock);
2947
2948 END_CRIT_SECTION();
2949
2950 /* wake up walsenders now that we've released heavily contended locks */
2951 WalSndWakeupProcessRequests();
2952
2953 /*
2954 * Great, done. To take some work off the critical path, try to initialize
2955 * as many of the no-longer-needed WAL buffers for future use as we can.
2956 */
2957 AdvanceXLInsertBuffer(InvalidXLogRecPtr, true);
2958
2959 /*
2960 * If we determined that we need to write data, but somebody else
2961 * wrote/flushed already, it should be considered as being active, to
2962 * avoid hibernating too early.
2963 */
2964 return true;
2965 }
2966
2967 /*
2968 * Test whether XLOG data has been flushed up to (at least) the given position.
2969 *
2970 * Returns true if a flush is still needed. (It may be that someone else
2971 * is already in process of flushing that far, however.)
2972 */
2973 bool
XLogNeedsFlush(XLogRecPtr record)2974 XLogNeedsFlush(XLogRecPtr record)
2975 {
2976 /*
2977 * During recovery, we don't flush WAL but update minRecoveryPoint
2978 * instead. So "needs flush" is taken to mean whether minRecoveryPoint
2979 * would need to be updated.
2980 */
2981 if (RecoveryInProgress())
2982 {
2983 /*
2984 * An invalid minRecoveryPoint means that we need to recover all the
2985 * WAL, i.e., we're doing crash recovery. We never modify the control
2986 * file's value in that case, so we can short-circuit future checks
2987 * here too. This triggers a quick exit path for the startup process,
2988 * which cannot update its local copy of minRecoveryPoint as long as
2989 * it has not replayed all WAL available when doing crash recovery.
2990 */
2991 if (XLogRecPtrIsInvalid(minRecoveryPoint) && InRecovery)
2992 updateMinRecoveryPoint = false;
2993
2994 /* Quick exit if already known to be updated or cannot be updated */
2995 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
2996 return false;
2997
2998 /*
2999 * Update local copy of minRecoveryPoint. But if the lock is busy,
3000 * just return a conservative guess.
3001 */
3002 if (!LWLockConditionalAcquire(ControlFileLock, LW_SHARED))
3003 return true;
3004 minRecoveryPoint = ControlFile->minRecoveryPoint;
3005 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
3006 LWLockRelease(ControlFileLock);
3007
3008 /*
3009 * Check minRecoveryPoint for any other process than the startup
3010 * process doing crash recovery, which should not update the control
3011 * file value if crash recovery is still running.
3012 */
3013 if (XLogRecPtrIsInvalid(minRecoveryPoint))
3014 updateMinRecoveryPoint = false;
3015
3016 /* check again */
3017 if (record <= minRecoveryPoint || !updateMinRecoveryPoint)
3018 return false;
3019 else
3020 return true;
3021 }
3022
3023 /* Quick exit if already known flushed */
3024 if (record <= LogwrtResult.Flush)
3025 return false;
3026
3027 /* read LogwrtResult and update local state */
3028 SpinLockAcquire(&XLogCtl->info_lck);
3029 LogwrtResult = XLogCtl->LogwrtResult;
3030 SpinLockRelease(&XLogCtl->info_lck);
3031
3032 /* check again */
3033 if (record <= LogwrtResult.Flush)
3034 return false;
3035
3036 return true;
3037 }
3038
3039 /*
3040 * Create a new XLOG file segment, or open a pre-existing one.
3041 *
3042 * log, seg: identify segment to be created/opened.
3043 *
3044 * *use_existent: if TRUE, OK to use a pre-existing file (else, any
3045 * pre-existing file will be deleted). On return, TRUE if a pre-existing
3046 * file was used.
3047 *
3048 * use_lock: if TRUE, acquire ControlFileLock while moving file into
3049 * place. This should be TRUE except during bootstrap log creation. The
3050 * caller must *not* hold the lock at call.
3051 *
3052 * Returns FD of opened file.
3053 *
3054 * Note: errors here are ERROR not PANIC because we might or might not be
3055 * inside a critical section (eg, during checkpoint there is no reason to
3056 * take down the system on failure). They will promote to PANIC if we are
3057 * in a critical section.
3058 */
3059 int
XLogFileInit(XLogSegNo logsegno,bool * use_existent,bool use_lock)3060 XLogFileInit(XLogSegNo logsegno, bool *use_existent, bool use_lock)
3061 {
3062 char path[MAXPGPATH];
3063 char tmppath[MAXPGPATH];
3064 PGAlignedXLogBlock zbuffer;
3065 XLogSegNo installed_segno;
3066 XLogSegNo max_segno;
3067 int fd;
3068 int nbytes;
3069
3070 XLogFilePath(path, ThisTimeLineID, logsegno);
3071
3072 /*
3073 * Try to use existent file (checkpoint maker may have created it already)
3074 */
3075 if (*use_existent)
3076 {
3077 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3078 S_IRUSR | S_IWUSR);
3079 if (fd < 0)
3080 {
3081 if (errno != ENOENT)
3082 ereport(ERROR,
3083 (errcode_for_file_access(),
3084 errmsg("could not open file \"%s\": %m", path)));
3085 }
3086 else
3087 return fd;
3088 }
3089
3090 /*
3091 * Initialize an empty (all zeroes) segment. NOTE: it is possible that
3092 * another process is doing the same thing. If so, we will end up
3093 * pre-creating an extra log segment. That seems OK, and better than
3094 * holding the lock throughout this lengthy process.
3095 */
3096 elog(DEBUG2, "creating and filling new WAL file");
3097
3098 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3099
3100 unlink(tmppath);
3101
3102 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3103 fd = BasicOpenFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3104 S_IRUSR | S_IWUSR);
3105 if (fd < 0)
3106 ereport(ERROR,
3107 (errcode_for_file_access(),
3108 errmsg("could not create file \"%s\": %m", tmppath)));
3109
3110 /*
3111 * Zero-fill the file. We have to do this the hard way to ensure that all
3112 * the file space has really been allocated --- on platforms that allow
3113 * "holes" in files, just seeking to the end doesn't allocate intermediate
3114 * space. This way, we know that we have all the space and (after the
3115 * fsync below) that all the indirect blocks are down on disk. Therefore,
3116 * fdatasync(2) or O_DSYNC will be sufficient to sync future writes to the
3117 * log file.
3118 */
3119 memset(zbuffer.data, 0, XLOG_BLCKSZ);
3120 for (nbytes = 0; nbytes < XLogSegSize; nbytes += XLOG_BLCKSZ)
3121 {
3122 errno = 0;
3123 if ((int) write(fd, zbuffer.data, XLOG_BLCKSZ) != (int) XLOG_BLCKSZ)
3124 {
3125 int save_errno = errno;
3126
3127 /*
3128 * If we fail to make the file, delete it to release disk space
3129 */
3130 unlink(tmppath);
3131
3132 close(fd);
3133
3134 /* if write didn't set errno, assume problem is no disk space */
3135 errno = save_errno ? save_errno : ENOSPC;
3136
3137 ereport(ERROR,
3138 (errcode_for_file_access(),
3139 errmsg("could not write to file \"%s\": %m", tmppath)));
3140 }
3141 }
3142
3143 if (pg_fsync(fd) != 0)
3144 {
3145 int save_errno = errno;
3146
3147 close(fd);
3148 errno = save_errno;
3149 ereport(ERROR,
3150 (errcode_for_file_access(),
3151 errmsg("could not fsync file \"%s\": %m", tmppath)));
3152 }
3153
3154 if (close(fd))
3155 ereport(ERROR,
3156 (errcode_for_file_access(),
3157 errmsg("could not close file \"%s\": %m", tmppath)));
3158
3159 /*
3160 * Now move the segment into place with its final name.
3161 *
3162 * If caller didn't want to use a pre-existing file, get rid of any
3163 * pre-existing file. Otherwise, cope with possibility that someone else
3164 * has created the file while we were filling ours: if so, use ours to
3165 * pre-create a future log segment.
3166 */
3167 installed_segno = logsegno;
3168
3169 /*
3170 * XXX: What should we use as max_segno? We used to use XLOGfileslop when
3171 * that was a constant, but that was always a bit dubious: normally, at a
3172 * checkpoint, XLOGfileslop was the offset from the checkpoint record, but
3173 * here, it was the offset from the insert location. We can't do the
3174 * normal XLOGfileslop calculation here because we don't have access to
3175 * the prior checkpoint's redo location. So somewhat arbitrarily, just use
3176 * CheckPointSegments.
3177 */
3178 max_segno = logsegno + CheckPointSegments;
3179 if (!InstallXLogFileSegment(&installed_segno, tmppath,
3180 *use_existent, max_segno,
3181 use_lock))
3182 {
3183 /*
3184 * No need for any more future segments, or InstallXLogFileSegment()
3185 * failed to rename the file into place. If the rename failed, opening
3186 * the file below will fail.
3187 */
3188 unlink(tmppath);
3189 }
3190
3191 /* Set flag to tell caller there was no existent file */
3192 *use_existent = false;
3193
3194 /* Now open original target segment (might not be file I just made) */
3195 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3196 S_IRUSR | S_IWUSR);
3197 if (fd < 0)
3198 ereport(ERROR,
3199 (errcode_for_file_access(),
3200 errmsg("could not open file \"%s\": %m", path)));
3201
3202 elog(DEBUG2, "done creating and filling new WAL file");
3203
3204 return fd;
3205 }
3206
3207 /*
3208 * Create a new XLOG file segment by copying a pre-existing one.
3209 *
3210 * destsegno: identify segment to be created.
3211 *
3212 * srcTLI, srcsegno: identify segment to be copied (could be from
3213 * a different timeline)
3214 *
3215 * upto: how much of the source file to copy (the rest is filled with
3216 * zeros)
3217 *
3218 * Currently this is only used during recovery, and so there are no locking
3219 * considerations. But we should be just as tense as XLogFileInit to avoid
3220 * emplacing a bogus file.
3221 */
3222 static void
XLogFileCopy(XLogSegNo destsegno,TimeLineID srcTLI,XLogSegNo srcsegno,int upto)3223 XLogFileCopy(XLogSegNo destsegno, TimeLineID srcTLI, XLogSegNo srcsegno,
3224 int upto)
3225 {
3226 char path[MAXPGPATH];
3227 char tmppath[MAXPGPATH];
3228 PGAlignedXLogBlock buffer;
3229 int srcfd;
3230 int fd;
3231 int nbytes;
3232
3233 /*
3234 * Open the source file
3235 */
3236 XLogFilePath(path, srcTLI, srcsegno);
3237 srcfd = OpenTransientFile(path, O_RDONLY | PG_BINARY, 0);
3238 if (srcfd < 0)
3239 ereport(ERROR,
3240 (errcode_for_file_access(),
3241 errmsg("could not open file \"%s\": %m", path)));
3242
3243 /*
3244 * Copy into a temp file name.
3245 */
3246 snprintf(tmppath, MAXPGPATH, XLOGDIR "/xlogtemp.%d", (int) getpid());
3247
3248 unlink(tmppath);
3249
3250 /* do not use get_sync_bit() here --- want to fsync only at end of fill */
3251 fd = OpenTransientFile(tmppath, O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
3252 S_IRUSR | S_IWUSR);
3253 if (fd < 0)
3254 ereport(ERROR,
3255 (errcode_for_file_access(),
3256 errmsg("could not create file \"%s\": %m", tmppath)));
3257
3258 /*
3259 * Do the data copying.
3260 */
3261 for (nbytes = 0; nbytes < XLogSegSize; nbytes += sizeof(buffer))
3262 {
3263 int nread;
3264
3265 nread = upto - nbytes;
3266
3267 /*
3268 * The part that is not read from the source file is filled with
3269 * zeros.
3270 */
3271 if (nread < sizeof(buffer))
3272 memset(buffer.data, 0, sizeof(buffer));
3273
3274 if (nread > 0)
3275 {
3276 if (nread > sizeof(buffer))
3277 nread = sizeof(buffer);
3278 errno = 0;
3279 if (read(srcfd, buffer.data, nread) != nread)
3280 {
3281 if (errno != 0)
3282 ereport(ERROR,
3283 (errcode_for_file_access(),
3284 errmsg("could not read file \"%s\": %m",
3285 path)));
3286 else
3287 ereport(ERROR,
3288 (errmsg("not enough data in file \"%s\"",
3289 path)));
3290 }
3291 }
3292 errno = 0;
3293 if ((int) write(fd, buffer.data, sizeof(buffer)) != (int) sizeof(buffer))
3294 {
3295 int save_errno = errno;
3296
3297 /*
3298 * If we fail to make the file, delete it to release disk space
3299 */
3300 unlink(tmppath);
3301 /* if write didn't set errno, assume problem is no disk space */
3302 errno = save_errno ? save_errno : ENOSPC;
3303
3304 ereport(ERROR,
3305 (errcode_for_file_access(),
3306 errmsg("could not write to file \"%s\": %m", tmppath)));
3307 }
3308 }
3309
3310 if (pg_fsync(fd) != 0)
3311 ereport(data_sync_elevel(ERROR),
3312 (errcode_for_file_access(),
3313 errmsg("could not fsync file \"%s\": %m", tmppath)));
3314
3315 if (CloseTransientFile(fd))
3316 ereport(ERROR,
3317 (errcode_for_file_access(),
3318 errmsg("could not close file \"%s\": %m", tmppath)));
3319
3320 CloseTransientFile(srcfd);
3321
3322 /*
3323 * Now move the segment into place with its final name.
3324 */
3325 if (!InstallXLogFileSegment(&destsegno, tmppath, false, 0, false))
3326 elog(ERROR, "InstallXLogFileSegment should not have failed");
3327 }
3328
3329 /*
3330 * Install a new XLOG segment file as a current or future log segment.
3331 *
3332 * This is used both to install a newly-created segment (which has a temp
3333 * filename while it's being created) and to recycle an old segment.
3334 *
3335 * *segno: identify segment to install as (or first possible target).
3336 * When find_free is TRUE, this is modified on return to indicate the
3337 * actual installation location or last segment searched.
3338 *
3339 * tmppath: initial name of file to install. It will be renamed into place.
3340 *
3341 * find_free: if TRUE, install the new segment at the first empty segno
3342 * number at or after the passed numbers. If FALSE, install the new segment
3343 * exactly where specified, deleting any existing segment file there.
3344 *
3345 * max_segno: maximum segment number to install the new file as. Fail if no
3346 * free slot is found between *segno and max_segno. (Ignored when find_free
3347 * is FALSE.)
3348 *
3349 * use_lock: if TRUE, acquire ControlFileLock while moving file into
3350 * place. This should be TRUE except during bootstrap log creation. The
3351 * caller must *not* hold the lock at call.
3352 *
3353 * Returns TRUE if the file was installed successfully. FALSE indicates that
3354 * max_segno limit was exceeded, or an error occurred while renaming the
3355 * file into place.
3356 */
3357 static bool
InstallXLogFileSegment(XLogSegNo * segno,char * tmppath,bool find_free,XLogSegNo max_segno,bool use_lock)3358 InstallXLogFileSegment(XLogSegNo *segno, char *tmppath,
3359 bool find_free, XLogSegNo max_segno,
3360 bool use_lock)
3361 {
3362 char path[MAXPGPATH];
3363 struct stat stat_buf;
3364
3365 XLogFilePath(path, ThisTimeLineID, *segno);
3366
3367 /*
3368 * We want to be sure that only one process does this at a time.
3369 */
3370 if (use_lock)
3371 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
3372
3373 if (!find_free)
3374 {
3375 /* Force installation: get rid of any pre-existing segment file */
3376 unlink(path);
3377 }
3378 else
3379 {
3380 /* Find a free slot to put it in */
3381 while (stat(path, &stat_buf) == 0)
3382 {
3383 if ((*segno) >= max_segno)
3384 {
3385 /* Failed to find a free slot within specified range */
3386 if (use_lock)
3387 LWLockRelease(ControlFileLock);
3388 return false;
3389 }
3390 (*segno)++;
3391 XLogFilePath(path, ThisTimeLineID, *segno);
3392 }
3393 }
3394
3395 /*
3396 * Perform the rename using link if available, paranoidly trying to avoid
3397 * overwriting an existing file (there shouldn't be one).
3398 */
3399 if (durable_link_or_rename(tmppath, path, LOG) != 0)
3400 {
3401 if (use_lock)
3402 LWLockRelease(ControlFileLock);
3403 /* durable_link_or_rename already emitted log message */
3404 return false;
3405 }
3406
3407 if (use_lock)
3408 LWLockRelease(ControlFileLock);
3409
3410 return true;
3411 }
3412
3413 /*
3414 * Open a pre-existing logfile segment for writing.
3415 */
3416 int
XLogFileOpen(XLogSegNo segno)3417 XLogFileOpen(XLogSegNo segno)
3418 {
3419 char path[MAXPGPATH];
3420 int fd;
3421
3422 XLogFilePath(path, ThisTimeLineID, segno);
3423
3424 fd = BasicOpenFile(path, O_RDWR | PG_BINARY | get_sync_bit(sync_method),
3425 S_IRUSR | S_IWUSR);
3426 if (fd < 0)
3427 ereport(PANIC,
3428 (errcode_for_file_access(),
3429 errmsg("could not open transaction log file \"%s\": %m", path)));
3430
3431 return fd;
3432 }
3433
3434 /*
3435 * Open a logfile segment for reading (during recovery).
3436 *
3437 * If source == XLOG_FROM_ARCHIVE, the segment is retrieved from archive.
3438 * Otherwise, it's assumed to be already available in pg_xlog.
3439 */
3440 static int
XLogFileRead(XLogSegNo segno,int emode,TimeLineID tli,int source,bool notfoundOk)3441 XLogFileRead(XLogSegNo segno, int emode, TimeLineID tli,
3442 int source, bool notfoundOk)
3443 {
3444 char xlogfname[MAXFNAMELEN];
3445 char activitymsg[MAXFNAMELEN + 16];
3446 char path[MAXPGPATH];
3447 int fd;
3448
3449 XLogFileName(xlogfname, tli, segno);
3450
3451 switch (source)
3452 {
3453 case XLOG_FROM_ARCHIVE:
3454 /* Report recovery progress in PS display */
3455 snprintf(activitymsg, sizeof(activitymsg), "waiting for %s",
3456 xlogfname);
3457 set_ps_display(activitymsg, false);
3458
3459 restoredFromArchive = RestoreArchivedFile(path, xlogfname,
3460 "RECOVERYXLOG",
3461 XLogSegSize,
3462 InRedo);
3463 if (!restoredFromArchive)
3464 return -1;
3465 break;
3466
3467 case XLOG_FROM_PG_XLOG:
3468 case XLOG_FROM_STREAM:
3469 XLogFilePath(path, tli, segno);
3470 restoredFromArchive = false;
3471 break;
3472
3473 default:
3474 elog(ERROR, "invalid XLogFileRead source %d", source);
3475 }
3476
3477 /*
3478 * If the segment was fetched from archival storage, replace the existing
3479 * xlog segment (if any) with the archival version.
3480 */
3481 if (source == XLOG_FROM_ARCHIVE)
3482 {
3483 KeepFileRestoredFromArchive(path, xlogfname);
3484
3485 /*
3486 * Set path to point at the new file in pg_xlog.
3487 */
3488 snprintf(path, MAXPGPATH, XLOGDIR "/%s", xlogfname);
3489 }
3490
3491 fd = BasicOpenFile(path, O_RDONLY | PG_BINARY, 0);
3492 if (fd >= 0)
3493 {
3494 /* Success! */
3495 curFileTLI = tli;
3496
3497 /* Report recovery progress in PS display */
3498 snprintf(activitymsg, sizeof(activitymsg), "recovering %s",
3499 xlogfname);
3500 set_ps_display(activitymsg, false);
3501
3502 /* Track source of data in assorted state variables */
3503 readSource = source;
3504 XLogReceiptSource = source;
3505 /* In FROM_STREAM case, caller tracks receipt time, not me */
3506 if (source != XLOG_FROM_STREAM)
3507 XLogReceiptTime = GetCurrentTimestamp();
3508
3509 return fd;
3510 }
3511 if (errno != ENOENT || !notfoundOk) /* unexpected failure? */
3512 ereport(PANIC,
3513 (errcode_for_file_access(),
3514 errmsg("could not open file \"%s\": %m", path)));
3515 return -1;
3516 }
3517
3518 /*
3519 * Open a logfile segment for reading (during recovery).
3520 *
3521 * This version searches for the segment with any TLI listed in expectedTLEs.
3522 */
3523 static int
XLogFileReadAnyTLI(XLogSegNo segno,int emode,int source)3524 XLogFileReadAnyTLI(XLogSegNo segno, int emode, int source)
3525 {
3526 char path[MAXPGPATH];
3527 ListCell *cell;
3528 int fd;
3529 List *tles;
3530
3531 /*
3532 * Loop looking for a suitable timeline ID: we might need to read any of
3533 * the timelines listed in expectedTLEs.
3534 *
3535 * We expect curFileTLI on entry to be the TLI of the preceding file in
3536 * sequence, or 0 if there was no predecessor. We do not allow curFileTLI
3537 * to go backwards; this prevents us from picking up the wrong file when a
3538 * parent timeline extends to higher segment numbers than the child we
3539 * want to read.
3540 *
3541 * If we haven't read the timeline history file yet, read it now, so that
3542 * we know which TLIs to scan. We don't save the list in expectedTLEs,
3543 * however, unless we actually find a valid segment. That way if there is
3544 * neither a timeline history file nor a WAL segment in the archive, and
3545 * streaming replication is set up, we'll read the timeline history file
3546 * streamed from the master when we start streaming, instead of recovering
3547 * with a dummy history generated here.
3548 */
3549 if (expectedTLEs)
3550 tles = expectedTLEs;
3551 else
3552 tles = readTimeLineHistory(recoveryTargetTLI);
3553
3554 foreach(cell, tles)
3555 {
3556 TimeLineHistoryEntry *hent = (TimeLineHistoryEntry *) lfirst(cell);
3557 TimeLineID tli = hent->tli;
3558
3559 if (tli < curFileTLI)
3560 break; /* don't bother looking at too-old TLIs */
3561
3562 /*
3563 * Skip scanning the timeline ID that the logfile segment to read
3564 * doesn't belong to
3565 */
3566 if (hent->begin != InvalidXLogRecPtr)
3567 {
3568 XLogSegNo beginseg = 0;
3569
3570 XLByteToSeg(hent->begin, beginseg);
3571
3572 /*
3573 * The logfile segment that doesn't belong to the timeline is
3574 * older or newer than the segment that the timeline started or
3575 * ended at, respectively. It's sufficient to check only the
3576 * starting segment of the timeline here. Since the timelines are
3577 * scanned in descending order in this loop, any segments newer
3578 * than the ending segment should belong to newer timeline and
3579 * have already been read before. So it's not necessary to check
3580 * the ending segment of the timeline here.
3581 */
3582 if (segno < beginseg)
3583 continue;
3584 }
3585
3586 if (source == XLOG_FROM_ANY || source == XLOG_FROM_ARCHIVE)
3587 {
3588 fd = XLogFileRead(segno, emode, tli,
3589 XLOG_FROM_ARCHIVE, true);
3590 if (fd != -1)
3591 {
3592 elog(DEBUG1, "got WAL segment from archive");
3593 if (!expectedTLEs)
3594 expectedTLEs = tles;
3595 return fd;
3596 }
3597 }
3598
3599 if (source == XLOG_FROM_ANY || source == XLOG_FROM_PG_XLOG)
3600 {
3601 fd = XLogFileRead(segno, emode, tli,
3602 XLOG_FROM_PG_XLOG, true);
3603 if (fd != -1)
3604 {
3605 if (!expectedTLEs)
3606 expectedTLEs = tles;
3607 return fd;
3608 }
3609 }
3610 }
3611
3612 /* Couldn't find it. For simplicity, complain about front timeline */
3613 XLogFilePath(path, recoveryTargetTLI, segno);
3614 errno = ENOENT;
3615 ereport(emode,
3616 (errcode_for_file_access(),
3617 errmsg("could not open file \"%s\": %m", path)));
3618 return -1;
3619 }
3620
3621 /*
3622 * Close the current logfile segment for writing.
3623 */
3624 static void
XLogFileClose(void)3625 XLogFileClose(void)
3626 {
3627 Assert(openLogFile >= 0);
3628
3629 /*
3630 * WAL segment files will not be re-read in normal operation, so we advise
3631 * the OS to release any cached pages. But do not do so if WAL archiving
3632 * or streaming is active, because archiver and walsender process could
3633 * use the cache to read the WAL segment.
3634 */
3635 #if defined(USE_POSIX_FADVISE) && defined(POSIX_FADV_DONTNEED)
3636 if (!XLogIsNeeded())
3637 (void) posix_fadvise(openLogFile, 0, 0, POSIX_FADV_DONTNEED);
3638 #endif
3639
3640 if (close(openLogFile))
3641 ereport(PANIC,
3642 (errcode_for_file_access(),
3643 errmsg("could not close log file %s: %m",
3644 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
3645 openLogFile = -1;
3646 }
3647
3648 /*
3649 * Preallocate log files beyond the specified log endpoint.
3650 *
3651 * XXX this is currently extremely conservative, since it forces only one
3652 * future log segment to exist, and even that only if we are 75% done with
3653 * the current one. This is only appropriate for very low-WAL-volume systems.
3654 * High-volume systems will be OK once they've built up a sufficient set of
3655 * recycled log segments, but the startup transient is likely to include
3656 * a lot of segment creations by foreground processes, which is not so good.
3657 */
3658 static void
PreallocXlogFiles(XLogRecPtr endptr)3659 PreallocXlogFiles(XLogRecPtr endptr)
3660 {
3661 XLogSegNo _logSegNo;
3662 int lf;
3663 bool use_existent;
3664
3665 XLByteToPrevSeg(endptr, _logSegNo);
3666 if ((endptr - 1) % XLogSegSize >= (uint32) (0.75 * XLogSegSize))
3667 {
3668 _logSegNo++;
3669 use_existent = true;
3670 lf = XLogFileInit(_logSegNo, &use_existent, true);
3671 close(lf);
3672 if (!use_existent)
3673 CheckpointStats.ckpt_segs_added++;
3674 }
3675 }
3676
3677 /*
3678 * Throws an error if the given log segment has already been removed or
3679 * recycled. The caller should only pass a segment that it knows to have
3680 * existed while the server has been running, as this function always
3681 * succeeds if no WAL segments have been removed since startup.
3682 * 'tli' is only used in the error message.
3683 *
3684 * Note: this function guarantees to keep errno unchanged on return.
3685 * This supports callers that use this to possibly deliver a better
3686 * error message about a missing file, while still being able to throw
3687 * a normal file-access error afterwards, if this does return.
3688 */
3689 void
CheckXLogRemoved(XLogSegNo segno,TimeLineID tli)3690 CheckXLogRemoved(XLogSegNo segno, TimeLineID tli)
3691 {
3692 int save_errno = errno;
3693 XLogSegNo lastRemovedSegNo;
3694
3695 SpinLockAcquire(&XLogCtl->info_lck);
3696 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3697 SpinLockRelease(&XLogCtl->info_lck);
3698
3699 if (segno <= lastRemovedSegNo)
3700 {
3701 char filename[MAXFNAMELEN];
3702
3703 XLogFileName(filename, tli, segno);
3704 errno = save_errno;
3705 ereport(ERROR,
3706 (errcode_for_file_access(),
3707 errmsg("requested WAL segment %s has already been removed",
3708 filename)));
3709 }
3710 errno = save_errno;
3711 }
3712
3713 /*
3714 * Return the last WAL segment removed, or 0 if no segment has been removed
3715 * since startup.
3716 *
3717 * NB: the result can be out of date arbitrarily fast, the caller has to deal
3718 * with that.
3719 */
3720 XLogSegNo
XLogGetLastRemovedSegno(void)3721 XLogGetLastRemovedSegno(void)
3722 {
3723 XLogSegNo lastRemovedSegNo;
3724
3725 SpinLockAcquire(&XLogCtl->info_lck);
3726 lastRemovedSegNo = XLogCtl->lastRemovedSegNo;
3727 SpinLockRelease(&XLogCtl->info_lck);
3728
3729 return lastRemovedSegNo;
3730 }
3731
3732 /*
3733 * Update the last removed segno pointer in shared memory, to reflect
3734 * that the given XLOG file has been removed.
3735 */
3736 static void
UpdateLastRemovedPtr(char * filename)3737 UpdateLastRemovedPtr(char *filename)
3738 {
3739 uint32 tli;
3740 XLogSegNo segno;
3741
3742 XLogFromFileName(filename, &tli, &segno);
3743
3744 SpinLockAcquire(&XLogCtl->info_lck);
3745 if (segno > XLogCtl->lastRemovedSegNo)
3746 XLogCtl->lastRemovedSegNo = segno;
3747 SpinLockRelease(&XLogCtl->info_lck);
3748 }
3749
3750 /*
3751 * Recycle or remove all log files older or equal to passed segno.
3752 *
3753 * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
3754 * redo pointer of the previous checkpoint. These are used to determine
3755 * whether we want to recycle rather than delete no-longer-wanted log files.
3756 */
3757 static void
RemoveOldXlogFiles(XLogSegNo segno,XLogRecPtr PriorRedoPtr,XLogRecPtr endptr)3758 RemoveOldXlogFiles(XLogSegNo segno, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
3759 {
3760 DIR *xldir;
3761 struct dirent *xlde;
3762 char lastoff[MAXFNAMELEN];
3763
3764 xldir = AllocateDir(XLOGDIR);
3765 if (xldir == NULL)
3766 ereport(ERROR,
3767 (errcode_for_file_access(),
3768 errmsg("could not open transaction log directory \"%s\": %m",
3769 XLOGDIR)));
3770
3771 /*
3772 * Construct a filename of the last segment to be kept. The timeline ID
3773 * doesn't matter, we ignore that in the comparison. (During recovery,
3774 * ThisTimeLineID isn't set, so we can't use that.)
3775 */
3776 XLogFileName(lastoff, 0, segno);
3777
3778 elog(DEBUG2, "attempting to remove WAL segments older than log file %s",
3779 lastoff);
3780
3781 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3782 {
3783 /* Ignore files that are not XLOG segments */
3784 if (!IsXLogFileName(xlde->d_name) &&
3785 !IsPartialXLogFileName(xlde->d_name))
3786 continue;
3787
3788 /*
3789 * We ignore the timeline part of the XLOG segment identifiers in
3790 * deciding whether a segment is still needed. This ensures that we
3791 * won't prematurely remove a segment from a parent timeline. We could
3792 * probably be a little more proactive about removing segments of
3793 * non-parent timelines, but that would be a whole lot more
3794 * complicated.
3795 *
3796 * We use the alphanumeric sorting property of the filenames to decide
3797 * which ones are earlier than the lastoff segment.
3798 */
3799 if (strcmp(xlde->d_name + 8, lastoff + 8) <= 0)
3800 {
3801 if (XLogArchiveCheckDone(xlde->d_name))
3802 {
3803 /* Update the last removed location in shared memory first */
3804 UpdateLastRemovedPtr(xlde->d_name);
3805
3806 RemoveXlogFile(xlde->d_name, PriorRedoPtr, endptr);
3807 }
3808 }
3809 }
3810
3811 FreeDir(xldir);
3812 }
3813
3814 /*
3815 * Remove WAL files that are not part of the given timeline's history.
3816 *
3817 * This is called during recovery, whenever we switch to follow a new
3818 * timeline, and at the end of recovery when we create a new timeline. We
3819 * wouldn't otherwise care about extra WAL files lying in pg_xlog, but they
3820 * might be leftover pre-allocated or recycled WAL segments on the old timeline
3821 * that we haven't used yet, and contain garbage. If we just leave them in
3822 * pg_xlog, they will eventually be archived, and we can't let that happen.
3823 * Files that belong to our timeline history are valid, because we have
3824 * successfully replayed them, but from others we can't be sure.
3825 *
3826 * 'switchpoint' is the current point in WAL where we switch to new timeline,
3827 * and 'newTLI' is the new timeline we switch to.
3828 */
3829 static void
RemoveNonParentXlogFiles(XLogRecPtr switchpoint,TimeLineID newTLI)3830 RemoveNonParentXlogFiles(XLogRecPtr switchpoint, TimeLineID newTLI)
3831 {
3832 DIR *xldir;
3833 struct dirent *xlde;
3834 char switchseg[MAXFNAMELEN];
3835 XLogSegNo endLogSegNo;
3836
3837 XLByteToPrevSeg(switchpoint, endLogSegNo);
3838
3839 xldir = AllocateDir(XLOGDIR);
3840 if (xldir == NULL)
3841 ereport(ERROR,
3842 (errcode_for_file_access(),
3843 errmsg("could not open transaction log directory \"%s\": %m",
3844 XLOGDIR)));
3845
3846 /*
3847 * Construct a filename of the last segment to be kept.
3848 */
3849 XLogFileName(switchseg, newTLI, endLogSegNo);
3850
3851 elog(DEBUG2, "attempting to remove WAL segments newer than log file %s",
3852 switchseg);
3853
3854 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
3855 {
3856 /* Ignore files that are not XLOG segments */
3857 if (!IsXLogFileName(xlde->d_name))
3858 continue;
3859
3860 /*
3861 * Remove files that are on a timeline older than the new one we're
3862 * switching to, but with a segment number >= the first segment on the
3863 * new timeline.
3864 */
3865 if (strncmp(xlde->d_name, switchseg, 8) < 0 &&
3866 strcmp(xlde->d_name + 8, switchseg + 8) > 0)
3867 {
3868 /*
3869 * If the file has already been marked as .ready, however, don't
3870 * remove it yet. It should be OK to remove it - files that are
3871 * not part of our timeline history are not required for recovery
3872 * - but seems safer to let them be archived and removed later.
3873 */
3874 if (!XLogArchiveIsReady(xlde->d_name))
3875 RemoveXlogFile(xlde->d_name, InvalidXLogRecPtr, switchpoint);
3876 }
3877 }
3878
3879 FreeDir(xldir);
3880 }
3881
/*
 * Recycle or remove a log file that's no longer needed.
 *
 * endptr is current (or recent) end of xlog, and PriorRedoRecPtr is the
 * redo pointer of the previous checkpoint.  These are used to determine
 * whether we want to recycle rather than delete no-longer-wanted log files.
 * If PriorRedoRecPtr is not known, pass invalid, and the function will
 * recycle, somewhat arbitrarily, 10 future segments.
 *
 * 'segname' is the bare file name (no directory part); the path under
 * XLOGDIR is built here.  On success, archive-status bookkeeping for the
 * segment is cleaned up via XLogArchiveCleanup().
 */
static void
RemoveXlogFile(const char *segname, XLogRecPtr PriorRedoPtr, XLogRecPtr endptr)
{
	char		path[MAXPGPATH];
#ifdef WIN32
	char		newpath[MAXPGPATH];
#endif
	struct stat statbuf;
	XLogSegNo	endlogSegNo;
	XLogSegNo	recycleSegNo;

	/*
	 * Initialize info about where to try to recycle to: endlogSegNo is the
	 * next slot to fill, recycleSegNo the highest segment number we're
	 * willing to recycle into.
	 */
	XLByteToSeg(endptr, endlogSegNo);
	if (PriorRedoPtr == InvalidXLogRecPtr)
		recycleSegNo = endlogSegNo + 10;
	else
		recycleSegNo = XLOGfileslop(PriorRedoPtr);

	snprintf(path, MAXPGPATH, XLOGDIR "/%s", segname);

	/*
	 * Before deleting the file, see if it can be recycled as a future log
	 * segment.  Only recycle normal files, pg_standby for example can create
	 * symbolic links pointing to a separate archive directory.
	 */
	if (endlogSegNo <= recycleSegNo &&
		lstat(path, &statbuf) == 0 && S_ISREG(statbuf.st_mode) &&
		InstallXLogFileSegment(&endlogSegNo, path,
							   true, recycleSegNo, true))
	{
		ereport(DEBUG2,
				(errmsg("recycled transaction log file \"%s\"",
						segname)));
		CheckpointStats.ckpt_segs_recycled++;
		/* Needn't recheck that slot on future iterations */
		endlogSegNo++;
	}
	else
	{
		/* No need for any more future segments... */
		int			rc;

		ereport(DEBUG2,
				(errmsg("removing transaction log file \"%s\"",
						segname)));

#ifdef WIN32

		/*
		 * On Windows, if another process (e.g another backend) holds the file
		 * open in FILE_SHARE_DELETE mode, unlink will succeed, but the file
		 * will still show up in directory listing until the last handle is
		 * closed.  To avoid confusing the lingering deleted file for a live
		 * WAL file that needs to be archived, rename it before deleting it.
		 *
		 * If another process holds the file open without FILE_SHARE_DELETE
		 * flag, rename will fail.  We'll try again at the next checkpoint.
		 */
		snprintf(newpath, MAXPGPATH, "%s.deleted", path);
		if (rename(path, newpath) != 0)
		{
			/* Non-fatal: just skip this file and retry next checkpoint */
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not rename old transaction log file \"%s\": %m",
							path)));
			return;
		}
		rc = unlink(newpath);
#else
		rc = unlink(path);
#endif
		if (rc != 0)
		{
			/* Non-fatal as well; leave the file for a later attempt */
			ereport(LOG,
					(errcode_for_file_access(),
					 errmsg("could not remove old transaction log file \"%s\": %m",
							path)));
			return;
		}
		CheckpointStats.ckpt_segs_removed++;
	}

	/* Drop any .ready/.done archive-status entries for the segment */
	XLogArchiveCleanup(segname);
}
3977
3978 /*
3979 * Verify whether pg_xlog and pg_xlog/archive_status exist.
3980 * If the latter does not exist, recreate it.
3981 *
3982 * It is not the goal of this function to verify the contents of these
3983 * directories, but to help in cases where someone has performed a cluster
3984 * copy for PITR purposes but omitted pg_xlog from the copy.
3985 *
3986 * We could also recreate pg_xlog if it doesn't exist, but a deliberate
3987 * policy decision was made not to. It is fairly common for pg_xlog to be
3988 * a symlink, and if that was the DBA's intent then automatically making a
3989 * plain directory would result in degraded performance with no notice.
3990 */
3991 static void
ValidateXLOGDirectoryStructure(void)3992 ValidateXLOGDirectoryStructure(void)
3993 {
3994 char path[MAXPGPATH];
3995 struct stat stat_buf;
3996
3997 /* Check for pg_xlog; if it doesn't exist, error out */
3998 if (stat(XLOGDIR, &stat_buf) != 0 ||
3999 !S_ISDIR(stat_buf.st_mode))
4000 ereport(FATAL,
4001 (errmsg("required WAL directory \"%s\" does not exist",
4002 XLOGDIR)));
4003
4004 /* Check for archive_status */
4005 snprintf(path, MAXPGPATH, XLOGDIR "/archive_status");
4006 if (stat(path, &stat_buf) == 0)
4007 {
4008 /* Check for weird cases where it exists but isn't a directory */
4009 if (!S_ISDIR(stat_buf.st_mode))
4010 ereport(FATAL,
4011 (errmsg("required WAL directory \"%s\" does not exist",
4012 path)));
4013 }
4014 else
4015 {
4016 ereport(LOG,
4017 (errmsg("creating missing WAL directory \"%s\"", path)));
4018 if (mkdir(path, S_IRWXU) < 0)
4019 ereport(FATAL,
4020 (errmsg("could not create missing directory \"%s\": %m",
4021 path)));
4022 }
4023 }
4024
4025 /*
4026 * Remove previous backup history files. This also retries creation of
4027 * .ready files for any backup history files for which XLogArchiveNotify
4028 * failed earlier.
4029 */
4030 static void
CleanupBackupHistory(void)4031 CleanupBackupHistory(void)
4032 {
4033 DIR *xldir;
4034 struct dirent *xlde;
4035 char path[MAXPGPATH + sizeof(XLOGDIR)];
4036
4037 xldir = AllocateDir(XLOGDIR);
4038 if (xldir == NULL)
4039 ereport(ERROR,
4040 (errcode_for_file_access(),
4041 errmsg("could not open transaction log directory \"%s\": %m",
4042 XLOGDIR)));
4043
4044 while ((xlde = ReadDir(xldir, XLOGDIR)) != NULL)
4045 {
4046 if (IsBackupHistoryFileName(xlde->d_name))
4047 {
4048 if (XLogArchiveCheckDone(xlde->d_name))
4049 {
4050 ereport(DEBUG2,
4051 (errmsg("removing transaction log backup history file \"%s\"",
4052 xlde->d_name)));
4053 snprintf(path, sizeof(path), XLOGDIR "/%s", xlde->d_name);
4054 unlink(path);
4055 XLogArchiveCleanup(xlde->d_name);
4056 }
4057 }
4058 }
4059
4060 FreeDir(xldir);
4061 }
4062
/*
 * Attempt to read an XLOG record.
 *
 * If RecPtr is valid, try to read a record at that position.  Otherwise
 * try to read a record just after the last one previously read.
 *
 * If no valid record is available, returns NULL, or fails if emode is PANIC.
 * (emode must be either PANIC, LOG).  In standby mode, retries until a valid
 * record is available.
 *
 * The record is copied into readRecordBuf, so that on successful return,
 * the returned record pointer always points there.
 *
 * 'fetching_ckpt' is passed down to XLogPageRead via the reader's private
 * data; it also blocks the crash-recovery-to-archive-recovery switchover
 * below, since in that case we could not tell how far we'd need to replay
 * to reach consistency.
 */
static XLogRecord *
ReadRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr, int emode,
		   bool fetching_ckpt)
{
	XLogRecord *record;
	XLogPageReadPrivate *private = (XLogPageReadPrivate *) xlogreader->private_data;

	/* Pass through parameters to XLogPageRead */
	private->fetching_ckpt = fetching_ckpt;
	private->emode = emode;
	private->randAccess = (RecPtr != InvalidXLogRecPtr);

	/* This is the first attempt to read this page. */
	lastSourceFailed = false;

	for (;;)
	{
		char	   *errormsg;

		record = XLogReadRecord(xlogreader, RecPtr, &errormsg);
		/* Keep the global read/end pointers in sync with the reader */
		ReadRecPtr = xlogreader->ReadRecPtr;
		EndRecPtr = xlogreader->EndRecPtr;
		if (record == NULL)
		{
			/*
			 * When not in standby mode, if we find that WAL ends in an
			 * incomplete record, keep track of that record.  After recovery
			 * is done, we'll write a record to indicate to downstream WAL
			 * readers that that portion is to be ignored.
			 */
			if (!StandbyMode &&
				!XLogRecPtrIsInvalid(xlogreader->abortedRecPtr))
			{
				abortedRecPtr = xlogreader->abortedRecPtr;
				missingContrecPtr = xlogreader->missingContrecPtr;
			}

			/* Close the segment we were reading, if any */
			if (readFile >= 0)
			{
				close(readFile);
				readFile = -1;
			}

			/*
			 * We only end up here without a message when XLogPageRead()
			 * failed - in that case we already logged something.  In
			 * StandbyMode that only happens if we have been triggered, so we
			 * shouldn't loop anymore in that case.
			 */
			if (errormsg)
				ereport(emode_for_corrupt_record(emode,
												 RecPtr ? RecPtr : EndRecPtr),
						(errmsg_internal("%s", errormsg) /* already translated */ ));
		}

		/*
		 * Check page TLI is one of the expected values.
		 */
		else if (!tliInHistory(xlogreader->latestPageTLI, expectedTLEs))
		{
			char		fname[MAXFNAMELEN];
			XLogSegNo	segno;
			int32		offset;

			XLByteToSeg(xlogreader->latestPagePtr, segno);
			offset = xlogreader->latestPagePtr % XLogSegSize;
			XLogFileName(fname, xlogreader->readPageTLI, segno);
			ereport(emode_for_corrupt_record(emode,
											 RecPtr ? RecPtr : EndRecPtr),
					(errmsg("unexpected timeline ID %u in log segment %s, offset %u",
							xlogreader->latestPageTLI,
							fname,
							offset)));
			/* Treat the record as unreadable and fall into retry logic */
			record = NULL;
		}

		if (record)
		{
			/* Great, got a record */
			return record;
		}
		else
		{
			/* No valid record available from this source */
			lastSourceFailed = true;

			/*
			 * If archive recovery was requested, but we were still doing
			 * crash recovery, switch to archive recovery and retry using the
			 * offline archive.  We have now replayed all the valid WAL in
			 * pg_xlog, so we are presumably now consistent.
			 *
			 * We require that there's at least some valid WAL present in
			 * pg_xlog, however (!fetch_ckpt).  We could recover using the WAL
			 * from the archive, even if pg_xlog is completely empty, but we'd
			 * have no idea how far we'd have to replay to reach consistency.
			 * So err on the safe side and give up.
			 */
			if (!InArchiveRecovery && ArchiveRecoveryRequested &&
				!fetching_ckpt)
			{
				ereport(DEBUG1,
						(errmsg_internal("reached end of WAL in pg_xlog, entering archive recovery")));
				InArchiveRecovery = true;
				if (StandbyModeRequested)
					StandbyMode = true;

				/* initialize minRecoveryPoint to this record */
				LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
				ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
				if (ControlFile->minRecoveryPoint < EndRecPtr)
				{
					ControlFile->minRecoveryPoint = EndRecPtr;
					ControlFile->minRecoveryPointTLI = ThisTimeLineID;
				}
				/* update local copy */
				minRecoveryPoint = ControlFile->minRecoveryPoint;
				minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;

				/*
				 * The startup process can update its local copy of
				 * minRecoveryPoint from this point.
				 */
				updateMinRecoveryPoint = true;

				UpdateControlFile();

				/*
				 * We update SharedRecoveryState while holding the lock on
				 * ControlFileLock so both states are consistent in shared
				 * memory.
				 */
				SpinLockAcquire(&XLogCtl->info_lck);
				XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
				SpinLockRelease(&XLogCtl->info_lck);

				LWLockRelease(ControlFileLock);

				CheckRecoveryConsistency();

				/*
				 * Before we retry, reset lastSourceFailed and currentSource
				 * so that we will check the archive next.
				 */
				lastSourceFailed = false;
				currentSource = 0;

				continue;
			}

			/* In standby mode, loop back to retry. Otherwise, give up. */
			if (StandbyMode && !CheckForStandbyTrigger())
				continue;
			else
				return NULL;
		}
	}
}
4234
/*
 * Scan for new timelines that might have appeared in the archive since we
 * started recovery.
 *
 * If there are any, the function changes recovery target TLI to the latest
 * one and returns 'true'.  Returns 'false' (leaving all recovery-target
 * state untouched) when no newer timeline exists, when the newest timeline
 * is not a descendant of the current one, or when it forked off before the
 * current replay position.
 */
static bool
rescanLatestTimeLine(void)
{
	List	   *newExpectedTLEs;
	bool		found;
	ListCell   *cell;
	TimeLineID	newtarget;
	TimeLineID	oldtarget = recoveryTargetTLI;
	TimeLineHistoryEntry *currentTle = NULL;

	newtarget = findNewestTimeLine(recoveryTargetTLI);
	if (newtarget == recoveryTargetTLI)
	{
		/* No new timelines found */
		return false;
	}

	/*
	 * Determine the list of expected TLIs for the new TLI
	 */

	newExpectedTLEs = readTimeLineHistory(newtarget);

	/*
	 * If the current timeline is not part of the history of the new timeline,
	 * we cannot proceed to it.
	 */
	found = false;
	foreach(cell, newExpectedTLEs)
	{
		currentTle = (TimeLineHistoryEntry *) lfirst(cell);

		if (currentTle->tli == recoveryTargetTLI)
		{
			found = true;
			break;
		}
	}
	if (!found)
	{
		ereport(LOG,
				(errmsg("new timeline %u is not a child of database system timeline %u",
						newtarget,
						ThisTimeLineID)));
		return false;
	}

	/*
	 * The current timeline was found in the history file, but check that the
	 * next timeline was forked off from it *after* the current recovery
	 * location.  (currentTle still points at our timeline's history entry.)
	 */
	if (currentTle->end < EndRecPtr)
	{
		ereport(LOG,
				(errmsg("new timeline %u forked off current database system timeline %u before current recovery point %X/%X",
						newtarget,
						ThisTimeLineID,
						(uint32) (EndRecPtr >> 32), (uint32) EndRecPtr)));
		return false;
	}

	/* The new timeline history seems valid. Switch target */
	recoveryTargetTLI = newtarget;
	/* Replace the global TLE list; the old one is no longer needed */
	list_free_deep(expectedTLEs);
	expectedTLEs = newExpectedTLEs;

	/*
	 * As in StartupXLOG(), try to ensure we have all the history files
	 * between the old target and new target in pg_xlog.
	 */
	restoreTimeLineHistoryFiles(oldtarget + 1, newtarget);

	ereport(LOG,
			(errmsg("new target timeline is %u",
					recoveryTargetTLI)));

	return true;
}
4321
/*
 * I/O routines for pg_control
 *
 * *ControlFile is a buffer in shared memory that holds an image of the
 * contents of pg_control.  WriteControlFile() initializes pg_control
 * given a preloaded buffer, ReadControlFile() loads the buffer from
 * the pg_control file (during postmaster or standalone-backend startup),
 * and UpdateControlFile() rewrites pg_control after we modify xlog state.
 *
 * For simplicity, WriteControlFile() initializes the fields of pg_control
 * that are related to checking backend/database compatibility, and
 * ReadControlFile() verifies they are correct.  We could split out the
 * I/O and compatibility-check functions, but there seems no need currently.
 */
static void
WriteControlFile(void)
{
	int			fd;
	char		buffer[PG_CONTROL_SIZE];	/* need not be aligned */

	/*
	 * Initialize version and compatibility-check fields.  These record the
	 * compile-time configuration of this server so that ReadControlFile()
	 * can later detect a mismatched binary or cluster.
	 */
	ControlFile->pg_control_version = PG_CONTROL_VERSION;
	ControlFile->catalog_version_no = CATALOG_VERSION_NO;

	ControlFile->maxAlign = MAXIMUM_ALIGNOF;
	ControlFile->floatFormat = FLOATFORMAT_VALUE;

	ControlFile->blcksz = BLCKSZ;
	ControlFile->relseg_size = RELSEG_SIZE;
	ControlFile->xlog_blcksz = XLOG_BLCKSZ;
	ControlFile->xlog_seg_size = XLOG_SEG_SIZE;

	ControlFile->nameDataLen = NAMEDATALEN;
	ControlFile->indexMaxKeys = INDEX_MAX_KEYS;

	ControlFile->toast_max_chunk_size = TOAST_MAX_CHUNK_SIZE;
	ControlFile->loblksize = LOBLKSIZE;

#ifdef HAVE_INT64_TIMESTAMP
	ControlFile->enableIntTimes = true;
#else
	ControlFile->enableIntTimes = false;
#endif
	ControlFile->float4ByVal = FLOAT4PASSBYVAL;
	ControlFile->float8ByVal = FLOAT8PASSBYVAL;

	/*
	 * Contents are protected with a CRC, computed over everything up to
	 * (but not including) the crc field itself.
	 */
	INIT_CRC32C(ControlFile->crc);
	COMP_CRC32C(ControlFile->crc,
				(char *) ControlFile,
				offsetof(ControlFileData, crc));
	FIN_CRC32C(ControlFile->crc);

	/*
	 * We write out PG_CONTROL_SIZE bytes into pg_control, zero-padding the
	 * excess over sizeof(ControlFileData).  This reduces the odds of
	 * premature-EOF errors when reading pg_control.  We'll still fail when we
	 * check the contents of the file, but hopefully with a more specific
	 * error than "couldn't read pg_control".
	 */
	if (sizeof(ControlFileData) > PG_CONTROL_SIZE)
		elog(PANIC, "sizeof(ControlFileData) is larger than PG_CONTROL_SIZE; fix either one");

	memset(buffer, 0, PG_CONTROL_SIZE);
	memcpy(buffer, ControlFile, sizeof(ControlFileData));

	/* O_EXCL: creating an initial pg_control; it must not already exist */
	fd = BasicOpenFile(XLOG_CONTROL_FILE,
					   O_RDWR | O_CREAT | O_EXCL | PG_BINARY,
					   S_IRUSR | S_IWUSR);
	if (fd < 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not create control file \"%s\": %m",
						XLOG_CONTROL_FILE)));

	errno = 0;
	if (write(fd, buffer, PG_CONTROL_SIZE) != PG_CONTROL_SIZE)
	{
		/* if write didn't set errno, assume problem is no disk space */
		if (errno == 0)
			errno = ENOSPC;
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not write to control file: %m")));
	}

	if (pg_fsync(fd) != 0)
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not fsync control file: %m")));

	if (close(fd))
		ereport(PANIC,
				(errcode_for_file_access(),
				 errmsg("could not close control file: %m")));
}
4420
4421 static void
ReadControlFile(void)4422 ReadControlFile(void)
4423 {
4424 pg_crc32c crc;
4425 int fd;
4426 int r;
4427
4428 /*
4429 * Read data...
4430 */
4431 fd = BasicOpenFile(XLOG_CONTROL_FILE,
4432 O_RDWR | PG_BINARY,
4433 S_IRUSR | S_IWUSR);
4434 if (fd < 0)
4435 ereport(PANIC,
4436 (errcode_for_file_access(),
4437 errmsg("could not open control file \"%s\": %m",
4438 XLOG_CONTROL_FILE)));
4439
4440 r = read(fd, ControlFile, sizeof(ControlFileData));
4441 if (r != sizeof(ControlFileData))
4442 {
4443 if (r < 0)
4444 ereport(PANIC,
4445 (errcode_for_file_access(),
4446 errmsg("could not read from control file: %m")));
4447 else
4448 ereport(PANIC,
4449 (errmsg("could not read from control file: read %d bytes, expected %d", r, (int) sizeof(ControlFileData))));
4450 }
4451
4452 close(fd);
4453
4454 /*
4455 * Check for expected pg_control format version. If this is wrong, the
4456 * CRC check will likely fail because we'll be checking the wrong number
4457 * of bytes. Complaining about wrong version will probably be more
4458 * enlightening than complaining about wrong CRC.
4459 */
4460
4461 if (ControlFile->pg_control_version != PG_CONTROL_VERSION && ControlFile->pg_control_version % 65536 == 0 && ControlFile->pg_control_version / 65536 != 0)
4462 ereport(FATAL,
4463 (errmsg("database files are incompatible with server"),
4464 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d (0x%08x),"
4465 " but the server was compiled with PG_CONTROL_VERSION %d (0x%08x).",
4466 ControlFile->pg_control_version, ControlFile->pg_control_version,
4467 PG_CONTROL_VERSION, PG_CONTROL_VERSION),
4468 errhint("This could be a problem of mismatched byte ordering. It looks like you need to initdb.")));
4469
4470 if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
4471 ereport(FATAL,
4472 (errmsg("database files are incompatible with server"),
4473 errdetail("The database cluster was initialized with PG_CONTROL_VERSION %d,"
4474 " but the server was compiled with PG_CONTROL_VERSION %d.",
4475 ControlFile->pg_control_version, PG_CONTROL_VERSION),
4476 errhint("It looks like you need to initdb.")));
4477
4478 /* Now check the CRC. */
4479 INIT_CRC32C(crc);
4480 COMP_CRC32C(crc,
4481 (char *) ControlFile,
4482 offsetof(ControlFileData, crc));
4483 FIN_CRC32C(crc);
4484
4485 if (!EQ_CRC32C(crc, ControlFile->crc))
4486 ereport(FATAL,
4487 (errmsg("incorrect checksum in control file")));
4488
4489 /*
4490 * Do compatibility checking immediately. If the database isn't
4491 * compatible with the backend executable, we want to abort before we can
4492 * possibly do any damage.
4493 */
4494 if (ControlFile->catalog_version_no != CATALOG_VERSION_NO)
4495 ereport(FATAL,
4496 (errmsg("database files are incompatible with server"),
4497 errdetail("The database cluster was initialized with CATALOG_VERSION_NO %d,"
4498 " but the server was compiled with CATALOG_VERSION_NO %d.",
4499 ControlFile->catalog_version_no, CATALOG_VERSION_NO),
4500 errhint("It looks like you need to initdb.")));
4501 if (ControlFile->maxAlign != MAXIMUM_ALIGNOF)
4502 ereport(FATAL,
4503 (errmsg("database files are incompatible with server"),
4504 errdetail("The database cluster was initialized with MAXALIGN %d,"
4505 " but the server was compiled with MAXALIGN %d.",
4506 ControlFile->maxAlign, MAXIMUM_ALIGNOF),
4507 errhint("It looks like you need to initdb.")));
4508 if (ControlFile->floatFormat != FLOATFORMAT_VALUE)
4509 ereport(FATAL,
4510 (errmsg("database files are incompatible with server"),
4511 errdetail("The database cluster appears to use a different floating-point number format than the server executable."),
4512 errhint("It looks like you need to initdb.")));
4513 if (ControlFile->blcksz != BLCKSZ)
4514 ereport(FATAL,
4515 (errmsg("database files are incompatible with server"),
4516 errdetail("The database cluster was initialized with BLCKSZ %d,"
4517 " but the server was compiled with BLCKSZ %d.",
4518 ControlFile->blcksz, BLCKSZ),
4519 errhint("It looks like you need to recompile or initdb.")));
4520 if (ControlFile->relseg_size != RELSEG_SIZE)
4521 ereport(FATAL,
4522 (errmsg("database files are incompatible with server"),
4523 errdetail("The database cluster was initialized with RELSEG_SIZE %d,"
4524 " but the server was compiled with RELSEG_SIZE %d.",
4525 ControlFile->relseg_size, RELSEG_SIZE),
4526 errhint("It looks like you need to recompile or initdb.")));
4527 if (ControlFile->xlog_blcksz != XLOG_BLCKSZ)
4528 ereport(FATAL,
4529 (errmsg("database files are incompatible with server"),
4530 errdetail("The database cluster was initialized with XLOG_BLCKSZ %d,"
4531 " but the server was compiled with XLOG_BLCKSZ %d.",
4532 ControlFile->xlog_blcksz, XLOG_BLCKSZ),
4533 errhint("It looks like you need to recompile or initdb.")));
4534 if (ControlFile->xlog_seg_size != XLOG_SEG_SIZE)
4535 ereport(FATAL,
4536 (errmsg("database files are incompatible with server"),
4537 errdetail("The database cluster was initialized with XLOG_SEG_SIZE %d,"
4538 " but the server was compiled with XLOG_SEG_SIZE %d.",
4539 ControlFile->xlog_seg_size, XLOG_SEG_SIZE),
4540 errhint("It looks like you need to recompile or initdb.")));
4541 if (ControlFile->nameDataLen != NAMEDATALEN)
4542 ereport(FATAL,
4543 (errmsg("database files are incompatible with server"),
4544 errdetail("The database cluster was initialized with NAMEDATALEN %d,"
4545 " but the server was compiled with NAMEDATALEN %d.",
4546 ControlFile->nameDataLen, NAMEDATALEN),
4547 errhint("It looks like you need to recompile or initdb.")));
4548 if (ControlFile->indexMaxKeys != INDEX_MAX_KEYS)
4549 ereport(FATAL,
4550 (errmsg("database files are incompatible with server"),
4551 errdetail("The database cluster was initialized with INDEX_MAX_KEYS %d,"
4552 " but the server was compiled with INDEX_MAX_KEYS %d.",
4553 ControlFile->indexMaxKeys, INDEX_MAX_KEYS),
4554 errhint("It looks like you need to recompile or initdb.")));
4555 if (ControlFile->toast_max_chunk_size != TOAST_MAX_CHUNK_SIZE)
4556 ereport(FATAL,
4557 (errmsg("database files are incompatible with server"),
4558 errdetail("The database cluster was initialized with TOAST_MAX_CHUNK_SIZE %d,"
4559 " but the server was compiled with TOAST_MAX_CHUNK_SIZE %d.",
4560 ControlFile->toast_max_chunk_size, (int) TOAST_MAX_CHUNK_SIZE),
4561 errhint("It looks like you need to recompile or initdb.")));
4562 if (ControlFile->loblksize != LOBLKSIZE)
4563 ereport(FATAL,
4564 (errmsg("database files are incompatible with server"),
4565 errdetail("The database cluster was initialized with LOBLKSIZE %d,"
4566 " but the server was compiled with LOBLKSIZE %d.",
4567 ControlFile->loblksize, (int) LOBLKSIZE),
4568 errhint("It looks like you need to recompile or initdb.")));
4569
4570 #ifdef HAVE_INT64_TIMESTAMP
4571 if (ControlFile->enableIntTimes != true)
4572 ereport(FATAL,
4573 (errmsg("database files are incompatible with server"),
4574 errdetail("The database cluster was initialized without HAVE_INT64_TIMESTAMP"
4575 " but the server was compiled with HAVE_INT64_TIMESTAMP."),
4576 errhint("It looks like you need to recompile or initdb.")));
4577 #else
4578 if (ControlFile->enableIntTimes != false)
4579 ereport(FATAL,
4580 (errmsg("database files are incompatible with server"),
4581 errdetail("The database cluster was initialized with HAVE_INT64_TIMESTAMP"
4582 " but the server was compiled without HAVE_INT64_TIMESTAMP."),
4583 errhint("It looks like you need to recompile or initdb.")));
4584 #endif
4585
4586 #ifdef USE_FLOAT4_BYVAL
4587 if (ControlFile->float4ByVal != true)
4588 ereport(FATAL,
4589 (errmsg("database files are incompatible with server"),
4590 errdetail("The database cluster was initialized without USE_FLOAT4_BYVAL"
4591 " but the server was compiled with USE_FLOAT4_BYVAL."),
4592 errhint("It looks like you need to recompile or initdb.")));
4593 #else
4594 if (ControlFile->float4ByVal != false)
4595 ereport(FATAL,
4596 (errmsg("database files are incompatible with server"),
4597 errdetail("The database cluster was initialized with USE_FLOAT4_BYVAL"
4598 " but the server was compiled without USE_FLOAT4_BYVAL."),
4599 errhint("It looks like you need to recompile or initdb.")));
4600 #endif
4601
4602 #ifdef USE_FLOAT8_BYVAL
4603 if (ControlFile->float8ByVal != true)
4604 ereport(FATAL,
4605 (errmsg("database files are incompatible with server"),
4606 errdetail("The database cluster was initialized without USE_FLOAT8_BYVAL"
4607 " but the server was compiled with USE_FLOAT8_BYVAL."),
4608 errhint("It looks like you need to recompile or initdb.")));
4609 #else
4610 if (ControlFile->float8ByVal != false)
4611 ereport(FATAL,
4612 (errmsg("database files are incompatible with server"),
4613 errdetail("The database cluster was initialized with USE_FLOAT8_BYVAL"
4614 " but the server was compiled without USE_FLOAT8_BYVAL."),
4615 errhint("It looks like you need to recompile or initdb.")));
4616 #endif
4617
4618 /* Make the initdb settings visible as GUC variables, too */
4619 SetConfigOption("data_checksums", DataChecksumsEnabled() ? "yes" : "no",
4620 PGC_INTERNAL, PGC_S_OVERRIDE);
4621 }
4622
4623 void
UpdateControlFile(void)4624 UpdateControlFile(void)
4625 {
4626 int fd;
4627
4628 INIT_CRC32C(ControlFile->crc);
4629 COMP_CRC32C(ControlFile->crc,
4630 (char *) ControlFile,
4631 offsetof(ControlFileData, crc));
4632 FIN_CRC32C(ControlFile->crc);
4633
4634 fd = BasicOpenFile(XLOG_CONTROL_FILE,
4635 O_RDWR | PG_BINARY,
4636 S_IRUSR | S_IWUSR);
4637 if (fd < 0)
4638 ereport(PANIC,
4639 (errcode_for_file_access(),
4640 errmsg("could not open control file \"%s\": %m",
4641 XLOG_CONTROL_FILE)));
4642
4643 errno = 0;
4644 if (write(fd, ControlFile, sizeof(ControlFileData)) != sizeof(ControlFileData))
4645 {
4646 /* if write didn't set errno, assume problem is no disk space */
4647 if (errno == 0)
4648 errno = ENOSPC;
4649 ereport(PANIC,
4650 (errcode_for_file_access(),
4651 errmsg("could not write to control file: %m")));
4652 }
4653
4654 if (pg_fsync(fd) != 0)
4655 ereport(PANIC,
4656 (errcode_for_file_access(),
4657 errmsg("could not fsync control file: %m")));
4658
4659 if (close(fd))
4660 ereport(PANIC,
4661 (errcode_for_file_access(),
4662 errmsg("could not close control file: %m")));
4663 }
4664
4665 /*
4666 * Returns the unique system identifier from control file.
4667 */
4668 uint64
GetSystemIdentifier(void)4669 GetSystemIdentifier(void)
4670 {
4671 Assert(ControlFile != NULL);
4672 return ControlFile->system_identifier;
4673 }
4674
4675 /*
4676 * Are checksums enabled for data pages?
4677 */
4678 bool
DataChecksumsEnabled(void)4679 DataChecksumsEnabled(void)
4680 {
4681 Assert(ControlFile != NULL);
4682 return (ControlFile->data_checksum_version > 0);
4683 }
4684
4685 /*
4686 * Returns a fake LSN for unlogged relations.
4687 *
4688 * Each call generates an LSN that is greater than any previous value
4689 * returned. The current counter value is saved and restored across clean
4690 * shutdowns, but like unlogged relations, does not survive a crash. This can
4691 * be used in lieu of real LSN values returned by XLogInsert, if you need an
4692 * LSN-like increasing sequence of numbers without writing any WAL.
4693 */
4694 XLogRecPtr
GetFakeLSNForUnloggedRel(void)4695 GetFakeLSNForUnloggedRel(void)
4696 {
4697 XLogRecPtr nextUnloggedLSN;
4698
4699 /* increment the unloggedLSN counter, need SpinLock */
4700 SpinLockAcquire(&XLogCtl->ulsn_lck);
4701 nextUnloggedLSN = XLogCtl->unloggedLSN++;
4702 SpinLockRelease(&XLogCtl->ulsn_lck);
4703
4704 return nextUnloggedLSN;
4705 }
4706
4707 /*
4708 * Auto-tune the number of XLOG buffers.
4709 *
4710 * The preferred setting for wal_buffers is about 3% of shared_buffers, with
4711 * a maximum of one XLOG segment (there is little reason to think that more
4712 * is helpful, at least so long as we force an fsync when switching log files)
4713 * and a minimum of 8 blocks (which was the default value prior to PostgreSQL
4714 * 9.1, when auto-tuning was added).
4715 *
4716 * This should not be called until NBuffers has received its final value.
4717 */
4718 static int
XLOGChooseNumBuffers(void)4719 XLOGChooseNumBuffers(void)
4720 {
4721 int xbuffers;
4722
4723 xbuffers = NBuffers / 32;
4724 if (xbuffers > XLOG_SEG_SIZE / XLOG_BLCKSZ)
4725 xbuffers = XLOG_SEG_SIZE / XLOG_BLCKSZ;
4726 if (xbuffers < 8)
4727 xbuffers = 8;
4728 return xbuffers;
4729 }
4730
4731 /*
4732 * GUC check_hook for wal_buffers
4733 */
4734 bool
check_wal_buffers(int * newval,void ** extra,GucSource source)4735 check_wal_buffers(int *newval, void **extra, GucSource source)
4736 {
4737 /*
4738 * -1 indicates a request for auto-tune.
4739 */
4740 if (*newval == -1)
4741 {
4742 /*
4743 * If we haven't yet changed the boot_val default of -1, just let it
4744 * be. We'll fix it when XLOGShmemSize is called.
4745 */
4746 if (XLOGbuffers == -1)
4747 return true;
4748
4749 /* Otherwise, substitute the auto-tune value */
4750 *newval = XLOGChooseNumBuffers();
4751 }
4752
4753 /*
4754 * We clamp manually-set values to at least 4 blocks. Prior to PostgreSQL
4755 * 9.1, a minimum of 4 was enforced by guc.c, but since that is no longer
4756 * the case, we just silently treat such values as a request for the
4757 * minimum. (We could throw an error instead, but that doesn't seem very
4758 * helpful.)
4759 */
4760 if (*newval < 4)
4761 *newval = 4;
4762
4763 return true;
4764 }
4765
4766 /*
4767 * Initialization of shared memory for XLOG
4768 */
4769 Size
XLOGShmemSize(void)4770 XLOGShmemSize(void)
4771 {
4772 Size size;
4773
4774 /*
4775 * If the value of wal_buffers is -1, use the preferred auto-tune value.
4776 * This isn't an amazingly clean place to do this, but we must wait till
4777 * NBuffers has received its final value, and must do it before using the
4778 * value of XLOGbuffers to do anything important.
4779 */
4780 if (XLOGbuffers == -1)
4781 {
4782 char buf[32];
4783
4784 snprintf(buf, sizeof(buf), "%d", XLOGChooseNumBuffers());
4785 SetConfigOption("wal_buffers", buf, PGC_POSTMASTER, PGC_S_OVERRIDE);
4786 }
4787 Assert(XLOGbuffers > 0);
4788
4789 /* XLogCtl */
4790 size = sizeof(XLogCtlData);
4791
4792 /* WAL insertion locks, plus alignment */
4793 size = add_size(size, mul_size(sizeof(WALInsertLockPadded), NUM_XLOGINSERT_LOCKS + 1));
4794 /* xlblocks array */
4795 size = add_size(size, mul_size(sizeof(XLogRecPtr), XLOGbuffers));
4796 /* extra alignment padding for XLOG I/O buffers */
4797 size = add_size(size, XLOG_BLCKSZ);
4798 /* and the buffers themselves */
4799 size = add_size(size, mul_size(XLOG_BLCKSZ, XLOGbuffers));
4800
4801 /*
4802 * Note: we don't count ControlFileData, it comes out of the "slop factor"
4803 * added by CreateSharedMemoryAndSemaphores. This lets us use this
4804 * routine again below to compute the actual allocation size.
4805 */
4806
4807 return size;
4808 }
4809
/*
 * Create, or attach to, the XLOG-related shared memory structures: the
 * shared pg_control image and XLogCtlData with its trailing arrays.  The
 * carving-up of the XLogCtl allocation done here must match the sizing in
 * XLOGShmemSize().
 */
void
XLOGShmemInit(void)
{
	bool		foundCFile,
				foundXLog;
	char	   *allocptr;
	int			i;

#ifdef WAL_DEBUG

	/*
	 * Create a memory context for WAL debugging that's exempt from the normal
	 * "no pallocs in critical section" rule. Yes, that can lead to a PANIC if
	 * an allocation fails, but wal_debug is not for production use anyway.
	 */
	if (walDebugCxt == NULL)
	{
		walDebugCxt = AllocSetContextCreate(TopMemoryContext,
											"WAL Debug",
											ALLOCSET_DEFAULT_SIZES);
		MemoryContextAllowInCriticalSection(walDebugCxt, true);
	}
#endif

	ControlFile = (ControlFileData *)
		ShmemInitStruct("Control File", sizeof(ControlFileData), &foundCFile);
	XLogCtl = (XLogCtlData *)
		ShmemInitStruct("XLOG Ctl", XLOGShmemSize(), &foundXLog);

	if (foundCFile || foundXLog)
	{
		/*
		 * The structures already exist, so we are merely attaching to them
		 * in this process; both should be present or neither.
		 */
		Assert(foundCFile && foundXLog);

		/* Initialize local copy of WALInsertLocks and register the tranche */
		WALInsertLocks = XLogCtl->Insert.WALInsertLocks;
		LWLockRegisterTranche(LWTRANCHE_WAL_INSERT,
							  &XLogCtl->Insert.WALInsertLockTranche);
		return;
	}
	memset(XLogCtl, 0, sizeof(XLogCtlData));

	/*
	 * Since XLogCtlData contains XLogRecPtr fields, its sizeof should be a
	 * multiple of the alignment for same, so no extra alignment padding is
	 * needed here.
	 */
	allocptr = ((char *) XLogCtl) + sizeof(XLogCtlData);
	XLogCtl->xlblocks = (XLogRecPtr *) allocptr;
	memset(XLogCtl->xlblocks, 0, sizeof(XLogRecPtr) * XLOGbuffers);
	allocptr += sizeof(XLogRecPtr) * XLOGbuffers;


	/*
	 * WAL insertion locks.  Ensure they're aligned to the full padded size;
	 * XLOGShmemSize() reserved one extra lock's worth of space for this
	 * round-up.
	 */
	allocptr += sizeof(WALInsertLockPadded) -
		((uintptr_t) allocptr) % sizeof(WALInsertLockPadded);
	WALInsertLocks = XLogCtl->Insert.WALInsertLocks =
		(WALInsertLockPadded *) allocptr;
	allocptr += sizeof(WALInsertLockPadded) * NUM_XLOGINSERT_LOCKS;

	XLogCtl->Insert.WALInsertLockTranche.name = "wal_insert";
	XLogCtl->Insert.WALInsertLockTranche.array_base = WALInsertLocks;
	XLogCtl->Insert.WALInsertLockTranche.array_stride = sizeof(WALInsertLockPadded);

	LWLockRegisterTranche(LWTRANCHE_WAL_INSERT, &XLogCtl->Insert.WALInsertLockTranche);
	for (i = 0; i < NUM_XLOGINSERT_LOCKS; i++)
	{
		LWLockInitialize(&WALInsertLocks[i].l.lock, LWTRANCHE_WAL_INSERT);
		WALInsertLocks[i].l.insertingAt = InvalidXLogRecPtr;
	}

	/*
	 * Align the start of the page buffers to a full xlog block size boundary.
	 * This simplifies some calculations in XLOG insertion. It is also
	 * required for O_DIRECT.
	 */
	allocptr = (char *) TYPEALIGN(XLOG_BLCKSZ, allocptr);
	XLogCtl->pages = allocptr;
	memset(XLogCtl->pages, 0, (Size) XLOG_BLCKSZ * XLOGbuffers);

	/*
	 * Do basic initialization of XLogCtl shared data. (StartupXLOG will fill
	 * in additional info.)
	 */
	XLogCtl->XLogCacheBlck = XLOGbuffers - 1;
	XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
	XLogCtl->SharedHotStandbyActive = false;
	XLogCtl->WalWriterSleeping = false;

	SpinLockInit(&XLogCtl->Insert.insertpos_lck);
	SpinLockInit(&XLogCtl->info_lck);
	SpinLockInit(&XLogCtl->ulsn_lck);
	InitSharedLatch(&XLogCtl->recoveryWakeupLatch);

	/*
	 * If we are not in bootstrap mode, pg_control should already exist. Read
	 * and validate it immediately (see comments in ReadControlFile() for the
	 * reasons why).
	 */
	if (!IsBootstrapProcessingMode())
		ReadControlFile();
}
4912
4913 /*
4914 * This func must be called ONCE on system install. It creates pg_control
4915 * and the initial XLOG segment.
4916 */
4917 void
BootStrapXLOG(void)4918 BootStrapXLOG(void)
4919 {
4920 CheckPoint checkPoint;
4921 char *buffer;
4922 XLogPageHeader page;
4923 XLogLongPageHeader longpage;
4924 XLogRecord *record;
4925 char *recptr;
4926 bool use_existent;
4927 uint64 sysidentifier;
4928 struct timeval tv;
4929 pg_crc32c crc;
4930
4931 /*
4932 * Select a hopefully-unique system identifier code for this installation.
4933 * We use the result of gettimeofday(), including the fractional seconds
4934 * field, as being about as unique as we can easily get. (Think not to
4935 * use random(), since it hasn't been seeded and there's no portable way
4936 * to seed it other than the system clock value...) The upper half of the
4937 * uint64 value is just the tv_sec part, while the lower half contains the
4938 * tv_usec part (which must fit in 20 bits), plus 12 bits from our current
4939 * PID for a little extra uniqueness. A person knowing this encoding can
4940 * determine the initialization time of the installation, which could
4941 * perhaps be useful sometimes.
4942 */
4943 gettimeofday(&tv, NULL);
4944 sysidentifier = ((uint64) tv.tv_sec) << 32;
4945 sysidentifier |= ((uint64) tv.tv_usec) << 12;
4946 sysidentifier |= getpid() & 0xFFF;
4947
4948 /* First timeline ID is always 1 */
4949 ThisTimeLineID = 1;
4950
4951 /* page buffer must be aligned suitably for O_DIRECT */
4952 buffer = (char *) palloc(XLOG_BLCKSZ + XLOG_BLCKSZ);
4953 page = (XLogPageHeader) TYPEALIGN(XLOG_BLCKSZ, buffer);
4954 memset(page, 0, XLOG_BLCKSZ);
4955
4956 /*
4957 * Set up information for the initial checkpoint record
4958 *
4959 * The initial checkpoint record is written to the beginning of the WAL
4960 * segment with logid=0 logseg=1. The very first WAL segment, 0/0, is not
4961 * used, so that we can use 0/0 to mean "before any valid WAL segment".
4962 */
4963 checkPoint.redo = XLogSegSize + SizeOfXLogLongPHD;
4964 checkPoint.ThisTimeLineID = ThisTimeLineID;
4965 checkPoint.PrevTimeLineID = ThisTimeLineID;
4966 checkPoint.fullPageWrites = fullPageWrites;
4967 checkPoint.nextXidEpoch = 0;
4968 checkPoint.nextXid = FirstNormalTransactionId;
4969 checkPoint.nextOid = FirstBootstrapObjectId;
4970 checkPoint.nextMulti = FirstMultiXactId;
4971 checkPoint.nextMultiOffset = 0;
4972 checkPoint.oldestXid = FirstNormalTransactionId;
4973 checkPoint.oldestXidDB = TemplateDbOid;
4974 checkPoint.oldestMulti = FirstMultiXactId;
4975 checkPoint.oldestMultiDB = TemplateDbOid;
4976 checkPoint.oldestCommitTsXid = InvalidTransactionId;
4977 checkPoint.newestCommitTsXid = InvalidTransactionId;
4978 checkPoint.time = (pg_time_t) time(NULL);
4979 checkPoint.oldestActiveXid = InvalidTransactionId;
4980
4981 ShmemVariableCache->nextXid = checkPoint.nextXid;
4982 ShmemVariableCache->nextOid = checkPoint.nextOid;
4983 ShmemVariableCache->oidCount = 0;
4984 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
4985 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
4986 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
4987 SetCommitTsLimit(InvalidTransactionId, InvalidTransactionId);
4988
4989 /* Set up the XLOG page header */
4990 page->xlp_magic = XLOG_PAGE_MAGIC;
4991 page->xlp_info = XLP_LONG_HEADER;
4992 page->xlp_tli = ThisTimeLineID;
4993 page->xlp_pageaddr = XLogSegSize;
4994 longpage = (XLogLongPageHeader) page;
4995 longpage->xlp_sysid = sysidentifier;
4996 longpage->xlp_seg_size = XLogSegSize;
4997 longpage->xlp_xlog_blcksz = XLOG_BLCKSZ;
4998
4999 /* Insert the initial checkpoint record */
5000 recptr = ((char *) page + SizeOfXLogLongPHD);
5001 record = (XLogRecord *) recptr;
5002 record->xl_prev = 0;
5003 record->xl_xid = InvalidTransactionId;
5004 record->xl_tot_len = SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(checkPoint);
5005 record->xl_info = XLOG_CHECKPOINT_SHUTDOWN;
5006 record->xl_rmid = RM_XLOG_ID;
5007 recptr += SizeOfXLogRecord;
5008 /* fill the XLogRecordDataHeaderShort struct */
5009 *(recptr++) = (char) XLR_BLOCK_ID_DATA_SHORT;
5010 *(recptr++) = sizeof(checkPoint);
5011 memcpy(recptr, &checkPoint, sizeof(checkPoint));
5012 recptr += sizeof(checkPoint);
5013 Assert(recptr - (char *) record == record->xl_tot_len);
5014
5015 INIT_CRC32C(crc);
5016 COMP_CRC32C(crc, ((char *) record) + SizeOfXLogRecord, record->xl_tot_len - SizeOfXLogRecord);
5017 COMP_CRC32C(crc, (char *) record, offsetof(XLogRecord, xl_crc));
5018 FIN_CRC32C(crc);
5019 record->xl_crc = crc;
5020
5021 /* Create first XLOG segment file */
5022 use_existent = false;
5023 openLogFile = XLogFileInit(1, &use_existent, false);
5024
5025 /* Write the first page with the initial record */
5026 errno = 0;
5027 if (write(openLogFile, page, XLOG_BLCKSZ) != XLOG_BLCKSZ)
5028 {
5029 /* if write didn't set errno, assume problem is no disk space */
5030 if (errno == 0)
5031 errno = ENOSPC;
5032 ereport(PANIC,
5033 (errcode_for_file_access(),
5034 errmsg("could not write bootstrap transaction log file: %m")));
5035 }
5036
5037 if (pg_fsync(openLogFile) != 0)
5038 ereport(PANIC,
5039 (errcode_for_file_access(),
5040 errmsg("could not fsync bootstrap transaction log file: %m")));
5041
5042 if (close(openLogFile))
5043 ereport(PANIC,
5044 (errcode_for_file_access(),
5045 errmsg("could not close bootstrap transaction log file: %m")));
5046
5047 openLogFile = -1;
5048
5049 /* Now create pg_control */
5050
5051 memset(ControlFile, 0, sizeof(ControlFileData));
5052 /* Initialize pg_control status fields */
5053 ControlFile->system_identifier = sysidentifier;
5054 ControlFile->state = DB_SHUTDOWNED;
5055 ControlFile->time = checkPoint.time;
5056 ControlFile->checkPoint = checkPoint.redo;
5057 ControlFile->checkPointCopy = checkPoint;
5058 ControlFile->unloggedLSN = 1;
5059
5060 /* Set important parameter values for use when replaying WAL */
5061 ControlFile->MaxConnections = MaxConnections;
5062 ControlFile->max_worker_processes = max_worker_processes;
5063 ControlFile->max_prepared_xacts = max_prepared_xacts;
5064 ControlFile->max_locks_per_xact = max_locks_per_xact;
5065 ControlFile->wal_level = wal_level;
5066 ControlFile->wal_log_hints = wal_log_hints;
5067 ControlFile->track_commit_timestamp = track_commit_timestamp;
5068 ControlFile->data_checksum_version = bootstrap_data_checksum_version;
5069
5070 /* some additional ControlFile fields are set in WriteControlFile() */
5071
5072 WriteControlFile();
5073
5074 /* Bootstrap the commit log, too */
5075 BootStrapCLOG();
5076 BootStrapCommitTs();
5077 BootStrapSUBTRANS();
5078 BootStrapMultiXact();
5079
5080 pfree(buffer);
5081 }
5082
/*
 * Format a pg_time_t as "YYYY-MM-DD HH:MM:SS TZ" in the log timezone.
 *
 * Returns a pointer to a static buffer, so the result is only valid until
 * the next call; callers must copy it if they need it to persist.
 */
static char *
str_time(pg_time_t tnow)
{
	static char buf[128];

	pg_strftime(buf, sizeof(buf),
				"%Y-%m-%d %H:%M:%S %Z",
				pg_localtime(&tnow, log_timezone));

	return buf;
}
5094
5095 /*
5096 * See if there is a recovery command file (recovery.conf), and if so
5097 * read in parameters for archive recovery and XLOG streaming.
5098 *
5099 * The file is parsed using the main configuration parser.
5100 */
5101 static void
readRecoveryCommandFile(void)5102 readRecoveryCommandFile(void)
5103 {
5104 FILE *fd;
5105 TimeLineID rtli = 0;
5106 bool rtliGiven = false;
5107 ConfigVariable *item,
5108 *head = NULL,
5109 *tail = NULL;
5110 bool recoveryTargetActionSet = false;
5111
5112
5113 fd = AllocateFile(RECOVERY_COMMAND_FILE, "r");
5114 if (fd == NULL)
5115 {
5116 if (errno == ENOENT)
5117 return; /* not there, so no archive recovery */
5118 ereport(FATAL,
5119 (errcode_for_file_access(),
5120 errmsg("could not open recovery command file \"%s\": %m",
5121 RECOVERY_COMMAND_FILE)));
5122 }
5123
5124 /*
5125 * Since we're asking ParseConfigFp() to report errors as FATAL, there's
5126 * no need to check the return value.
5127 */
5128 (void) ParseConfigFp(fd, RECOVERY_COMMAND_FILE, 0, FATAL, &head, &tail);
5129
5130 FreeFile(fd);
5131
5132 for (item = head; item; item = item->next)
5133 {
5134 if (strcmp(item->name, "restore_command") == 0)
5135 {
5136 recoveryRestoreCommand = pstrdup(item->value);
5137 ereport(DEBUG2,
5138 (errmsg_internal("restore_command = '%s'",
5139 recoveryRestoreCommand)));
5140 }
5141 else if (strcmp(item->name, "recovery_end_command") == 0)
5142 {
5143 recoveryEndCommand = pstrdup(item->value);
5144 ereport(DEBUG2,
5145 (errmsg_internal("recovery_end_command = '%s'",
5146 recoveryEndCommand)));
5147 }
5148 else if (strcmp(item->name, "archive_cleanup_command") == 0)
5149 {
5150 archiveCleanupCommand = pstrdup(item->value);
5151 ereport(DEBUG2,
5152 (errmsg_internal("archive_cleanup_command = '%s'",
5153 archiveCleanupCommand)));
5154 }
5155 else if (strcmp(item->name, "recovery_target_action") == 0)
5156 {
5157 if (strcmp(item->value, "pause") == 0)
5158 recoveryTargetAction = RECOVERY_TARGET_ACTION_PAUSE;
5159 else if (strcmp(item->value, "promote") == 0)
5160 recoveryTargetAction = RECOVERY_TARGET_ACTION_PROMOTE;
5161 else if (strcmp(item->value, "shutdown") == 0)
5162 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5163 else
5164 ereport(ERROR,
5165 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5166 errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5167 "recovery_target_action",
5168 item->value),
5169 errhint("Valid values are \"pause\", \"promote\", and \"shutdown\".")));
5170
5171 ereport(DEBUG2,
5172 (errmsg_internal("recovery_target_action = '%s'",
5173 item->value)));
5174
5175 recoveryTargetActionSet = true;
5176 }
5177 else if (strcmp(item->name, "recovery_target_timeline") == 0)
5178 {
5179 rtliGiven = true;
5180 if (strcmp(item->value, "latest") == 0)
5181 rtli = 0;
5182 else
5183 {
5184 errno = 0;
5185 rtli = (TimeLineID) strtoul(item->value, NULL, 0);
5186 if (errno == EINVAL || errno == ERANGE)
5187 ereport(FATAL,
5188 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5189 errmsg("recovery_target_timeline is not a valid number: \"%s\"",
5190 item->value)));
5191 }
5192 if (rtli)
5193 ereport(DEBUG2,
5194 (errmsg_internal("recovery_target_timeline = %u", rtli)));
5195 else
5196 ereport(DEBUG2,
5197 (errmsg_internal("recovery_target_timeline = latest")));
5198 }
5199 else if (strcmp(item->name, "recovery_target_xid") == 0)
5200 {
5201 errno = 0;
5202 recoveryTargetXid = (TransactionId) pg_strtouint64(item->value, NULL, 0);
5203 if (errno == EINVAL || errno == ERANGE)
5204 ereport(FATAL,
5205 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5206 errmsg("recovery_target_xid is not a valid number: \"%s\"",
5207 item->value)));
5208 ereport(DEBUG2,
5209 (errmsg_internal("recovery_target_xid = %u",
5210 recoveryTargetXid)));
5211 recoveryTarget = RECOVERY_TARGET_XID;
5212 }
5213 else if (strcmp(item->name, "recovery_target_time") == 0)
5214 {
5215 recoveryTarget = RECOVERY_TARGET_TIME;
5216
5217 /*
5218 * Convert the time string given by the user to TimestampTz form.
5219 */
5220 recoveryTargetTime =
5221 DatumGetTimestampTz(DirectFunctionCall3(timestamptz_in,
5222 CStringGetDatum(item->value),
5223 ObjectIdGetDatum(InvalidOid),
5224 Int32GetDatum(-1)));
5225 ereport(DEBUG2,
5226 (errmsg_internal("recovery_target_time = '%s'",
5227 timestamptz_to_str(recoveryTargetTime))));
5228 }
5229 else if (strcmp(item->name, "recovery_target_name") == 0)
5230 {
5231 recoveryTarget = RECOVERY_TARGET_NAME;
5232
5233 recoveryTargetName = pstrdup(item->value);
5234 if (strlen(recoveryTargetName) >= MAXFNAMELEN)
5235 ereport(FATAL,
5236 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5237 errmsg("recovery_target_name is too long (maximum %d characters)",
5238 MAXFNAMELEN - 1)));
5239
5240 ereport(DEBUG2,
5241 (errmsg_internal("recovery_target_name = '%s'",
5242 recoveryTargetName)));
5243 }
5244 else if (strcmp(item->name, "recovery_target") == 0)
5245 {
5246 if (strcmp(item->value, "immediate") == 0)
5247 recoveryTarget = RECOVERY_TARGET_IMMEDIATE;
5248 else
5249 ereport(ERROR,
5250 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5251 errmsg("invalid value for recovery parameter \"%s\": \"%s\"",
5252 "recovery_target",
5253 item->value),
5254 errhint("The only allowed value is \"immediate\".")));
5255 ereport(DEBUG2,
5256 (errmsg_internal("recovery_target = '%s'",
5257 item->value)));
5258 }
5259 else if (strcmp(item->name, "recovery_target_inclusive") == 0)
5260 {
5261 /*
5262 * does nothing if a recovery_target is not also set
5263 */
5264 if (!parse_bool(item->value, &recoveryTargetInclusive))
5265 ereport(ERROR,
5266 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5267 errmsg("parameter \"%s\" requires a Boolean value",
5268 "recovery_target_inclusive")));
5269 ereport(DEBUG2,
5270 (errmsg_internal("recovery_target_inclusive = %s",
5271 item->value)));
5272 }
5273 else if (strcmp(item->name, "standby_mode") == 0)
5274 {
5275 if (!parse_bool(item->value, &StandbyModeRequested))
5276 ereport(ERROR,
5277 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5278 errmsg("parameter \"%s\" requires a Boolean value",
5279 "standby_mode")));
5280 ereport(DEBUG2,
5281 (errmsg_internal("standby_mode = '%s'", item->value)));
5282 }
5283 else if (strcmp(item->name, "primary_conninfo") == 0)
5284 {
5285 PrimaryConnInfo = pstrdup(item->value);
5286 ereport(DEBUG2,
5287 (errmsg_internal("primary_conninfo = '%s'",
5288 PrimaryConnInfo)));
5289 }
5290 else if (strcmp(item->name, "primary_slot_name") == 0)
5291 {
5292 ReplicationSlotValidateName(item->value, ERROR);
5293 PrimarySlotName = pstrdup(item->value);
5294 ereport(DEBUG2,
5295 (errmsg_internal("primary_slot_name = '%s'",
5296 PrimarySlotName)));
5297 }
5298 else if (strcmp(item->name, "trigger_file") == 0)
5299 {
5300 TriggerFile = pstrdup(item->value);
5301 ereport(DEBUG2,
5302 (errmsg_internal("trigger_file = '%s'",
5303 TriggerFile)));
5304 }
5305 else if (strcmp(item->name, "recovery_min_apply_delay") == 0)
5306 {
5307 const char *hintmsg;
5308
5309 if (!parse_int(item->value, &recovery_min_apply_delay, GUC_UNIT_MS,
5310 &hintmsg))
5311 ereport(ERROR,
5312 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5313 errmsg("parameter \"%s\" requires a temporal value",
5314 "recovery_min_apply_delay"),
5315 hintmsg ? errhint("%s", _(hintmsg)) : 0));
5316 ereport(DEBUG2,
5317 (errmsg_internal("recovery_min_apply_delay = '%s'", item->value)));
5318 }
5319 else
5320 ereport(FATAL,
5321 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5322 errmsg("unrecognized recovery parameter \"%s\"",
5323 item->name)));
5324 }
5325
5326 /*
5327 * Check for compulsory parameters
5328 */
5329 if (StandbyModeRequested)
5330 {
5331 if (PrimaryConnInfo == NULL && recoveryRestoreCommand == NULL)
5332 ereport(WARNING,
5333 (errmsg("recovery command file \"%s\" specified neither primary_conninfo nor restore_command",
5334 RECOVERY_COMMAND_FILE),
5335 errhint("The database server will regularly poll the pg_xlog subdirectory to check for files placed there.")));
5336 }
5337 else
5338 {
5339 if (recoveryRestoreCommand == NULL)
5340 ereport(FATAL,
5341 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5342 errmsg("recovery command file \"%s\" must specify restore_command when standby mode is not enabled",
5343 RECOVERY_COMMAND_FILE)));
5344 }
5345
5346 /*
5347 * Override any inconsistent requests. Not that this is a change of
5348 * behaviour in 9.5; prior to this we simply ignored a request to pause if
5349 * hot_standby = off, which was surprising behaviour.
5350 */
5351 if (recoveryTargetAction == RECOVERY_TARGET_ACTION_PAUSE &&
5352 recoveryTargetActionSet &&
5353 !EnableHotStandby)
5354 recoveryTargetAction = RECOVERY_TARGET_ACTION_SHUTDOWN;
5355
5356 /*
5357 * We don't support standby_mode in standalone backends; that requires
5358 * other processes such as the WAL receiver to be alive.
5359 */
5360 if (StandbyModeRequested && !IsUnderPostmaster)
5361 ereport(FATAL,
5362 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
5363 errmsg("standby mode is not supported by single-user servers")));
5364
5365 /* Enable fetching from archive recovery area */
5366 ArchiveRecoveryRequested = true;
5367
5368 /*
5369 * If user specified recovery_target_timeline, validate it or compute the
5370 * "latest" value. We can't do this until after we've gotten the restore
5371 * command and set InArchiveRecovery, because we need to fetch timeline
5372 * history files from the archive.
5373 */
5374 if (rtliGiven)
5375 {
5376 if (rtli)
5377 {
5378 /* Timeline 1 does not have a history file, all else should */
5379 if (rtli != 1 && !existsTimeLineHistory(rtli))
5380 ereport(FATAL,
5381 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
5382 errmsg("recovery target timeline %u does not exist",
5383 rtli)));
5384 recoveryTargetTLI = rtli;
5385 recoveryTargetIsLatest = false;
5386 }
5387 else
5388 {
5389 /* We start the "latest" search from pg_control's timeline */
5390 recoveryTargetTLI = findNewestTimeLine(recoveryTargetTLI);
5391 recoveryTargetIsLatest = true;
5392 }
5393 }
5394
5395 FreeConfigVariables(head);
5396 }
5397
/*
 * Exit archive-recovery state
 *
 * Called once at the end of archive recovery, after we have chosen the new
 * timeline (ThisTimeLineID).  endTLI/endOfLog identify the timeline and WAL
 * location where replay of the old timeline ended.  Prepares the first WAL
 * segment of the new timeline and renames recovery.conf out of the way.
 */
static void
exitArchiveRecovery(TimeLineID endTLI, XLogRecPtr endOfLog)
{
    char        xlogfname[MAXFNAMELEN];
    XLogSegNo   endLogSegNo;
    XLogSegNo   startLogSegNo;

    /* we always switch to a new timeline after archive recovery */
    Assert(endTLI != ThisTimeLineID);

    /*
     * We are no longer in archive recovery state.
     */
    InArchiveRecovery = false;

    /*
     * Update min recovery point one last time.
     */
    UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);

    /*
     * If the ending log segment is still open, close it (to avoid problems on
     * Windows with trying to rename or delete an open file).
     */
    if (readFile >= 0)
    {
        close(readFile);
        readFile = -1;
    }

    /*
     * Calculate the last segment on the old timeline, and the first segment
     * on the new timeline. If the switch happens in the middle of a segment,
     * they are the same, but if the switch happens exactly at a segment
     * boundary, startLogSegNo will be endLogSegNo + 1.
     */
    XLByteToPrevSeg(endOfLog, endLogSegNo);
    XLByteToSeg(endOfLog, startLogSegNo);

    /*
     * Initialize the starting WAL segment for the new timeline. If the switch
     * happens in the middle of a segment, copy data from the last WAL segment
     * of the old timeline up to the switch point, to the starting WAL segment
     * on the new timeline.
     */
    if (endLogSegNo == startLogSegNo)
    {
        /*
         * Make a copy of the file on the new timeline.
         *
         * Writing WAL isn't allowed yet, so there are no locking
         * considerations. But we should be just as tense as XLogFileInit to
         * avoid emplacing a bogus file.
         */
        XLogFileCopy(endLogSegNo, endTLI, endLogSegNo,
                     endOfLog % XLOG_SEG_SIZE);
    }
    else
    {
        /*
         * The switch happened at a segment boundary, so just create the next
         * segment on the new timeline.
         */
        bool        use_existent = true;
        int         fd;

        fd = XLogFileInit(startLogSegNo, &use_existent, true);

        if (close(fd))
            ereport(ERROR,
                    (errcode_for_file_access(),
                     errmsg("could not close log file %s: %m",
                            XLogFileNameP(ThisTimeLineID, startLogSegNo))));
    }

    /*
     * Let's just make real sure there are not .ready or .done flags posted
     * for the new segment.
     */
    XLogFileName(xlogfname, ThisTimeLineID, startLogSegNo);
    XLogArchiveCleanup(xlogfname);

    /*
     * Rename the config file out of the way, so that we don't accidentally
     * re-enter archive recovery mode in a subsequent crash.
     */
    unlink(RECOVERY_COMMAND_DONE);
    durable_rename(RECOVERY_COMMAND_FILE, RECOVERY_COMMAND_DONE, FATAL);

    ereport(LOG,
            (errmsg("archive recovery complete")));
}
5493
5494 /*
5495 * Extract timestamp from WAL record.
5496 *
5497 * If the record contains a timestamp, returns true, and saves the timestamp
5498 * in *recordXtime. If the record type has no timestamp, returns false.
5499 * Currently, only transaction commit/abort records and restore points contain
5500 * timestamps.
5501 */
5502 static bool
getRecordTimestamp(XLogReaderState * record,TimestampTz * recordXtime)5503 getRecordTimestamp(XLogReaderState *record, TimestampTz *recordXtime)
5504 {
5505 uint8 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5506 uint8 xact_info = info & XLOG_XACT_OPMASK;
5507 uint8 rmid = XLogRecGetRmid(record);
5508
5509 if (rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5510 {
5511 *recordXtime = ((xl_restore_point *) XLogRecGetData(record))->rp_time;
5512 return true;
5513 }
5514 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_COMMIT ||
5515 xact_info == XLOG_XACT_COMMIT_PREPARED))
5516 {
5517 *recordXtime = ((xl_xact_commit *) XLogRecGetData(record))->xact_time;
5518 return true;
5519 }
5520 if (rmid == RM_XACT_ID && (xact_info == XLOG_XACT_ABORT ||
5521 xact_info == XLOG_XACT_ABORT_PREPARED))
5522 {
5523 *recordXtime = ((xl_xact_abort *) XLogRecGetData(record))->xact_time;
5524 return true;
5525 }
5526 return false;
5527 }
5528
/*
 * For point-in-time recovery, this function decides whether we want to
 * stop applying the XLOG before the current record.
 *
 * Returns TRUE if we are stopping, FALSE otherwise. If stopping, some
 * information is saved in recoveryStopXid et al for use in annotating the
 * new timeline's history file.
 */
static bool
recoveryStopsBefore(XLogReaderState *record)
{
    bool        stopsHere = false;
    uint8       xact_info;
    bool        isCommit;
    TimestampTz recordXtime = 0;
    TransactionId recordXid;

    /* Check if we should stop as soon as reaching consistency */
    if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
    {
        ereport(LOG,
                (errmsg("recovery stopping after reaching consistency")));

        /* Record why we stopped, for the new timeline's history file */
        recoveryStopAfter = false;
        recoveryStopXid = InvalidTransactionId;
        recoveryStopTime = 0;
        recoveryStopName[0] = '\0';
        return true;
    }

    /* Otherwise we only consider stopping before COMMIT or ABORT records. */
    if (XLogRecGetRmid(record) != RM_XACT_ID)
        return false;

    xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

    /*
     * Determine whether this is a commit or abort, and extract the XID that
     * ended.  For prepared transactions the XID of interest is the prepared
     * transaction's, carried in the record body, not the record header's XID.
     */
    if (xact_info == XLOG_XACT_COMMIT)
    {
        isCommit = true;
        recordXid = XLogRecGetXid(record);
    }
    else if (xact_info == XLOG_XACT_COMMIT_PREPARED)
    {
        xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
        xl_xact_parsed_commit parsed;

        isCommit = true;
        ParseCommitRecord(XLogRecGetInfo(record),
                          xlrec,
                          &parsed);
        recordXid = parsed.twophase_xid;
    }
    else if (xact_info == XLOG_XACT_ABORT)
    {
        isCommit = false;
        recordXid = XLogRecGetXid(record);
    }
    else if (xact_info == XLOG_XACT_ABORT_PREPARED)
    {
        xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
        xl_xact_parsed_abort parsed;

        isCommit = false;
        ParseAbortRecord(XLogRecGetInfo(record),
                         xlrec,
                         &parsed);
        recordXid = parsed.twophase_xid;
    }
    else
        return false;           /* not a transaction-end record */

    if (recoveryTarget == RECOVERY_TARGET_XID && !recoveryTargetInclusive)
    {
        /*
         * There can be only one transaction end record with this exact
         * transactionid
         *
         * when testing for an xid, we MUST test for equality only, since
         * transactions are numbered in the order they start, not the order
         * they complete. A higher numbered xid will complete before you about
         * 50% of the time...
         */
        stopsHere = (recordXid == recoveryTargetXid);
    }

    if (recoveryTarget == RECOVERY_TARGET_TIME &&
        getRecordTimestamp(record, &recordXtime))
    {
        /*
         * There can be many transactions that share the same commit time, so
         * we stop after the last one, if we are inclusive, or stop at the
         * first one if we are exclusive
         */
        if (recoveryTargetInclusive)
            stopsHere = (recordXtime > recoveryTargetTime);
        else
            stopsHere = (recordXtime >= recoveryTargetTime);
    }

    if (stopsHere)
    {
        /* Save stop position info for the new timeline's history file */
        recoveryStopAfter = false;
        recoveryStopXid = recordXid;
        recoveryStopTime = recordXtime;
        recoveryStopName[0] = '\0';

        if (isCommit)
        {
            ereport(LOG,
                    (errmsg("recovery stopping before commit of transaction %u, time %s",
                            recoveryStopXid,
                            timestamptz_to_str(recoveryStopTime))));
        }
        else
        {
            ereport(LOG,
                    (errmsg("recovery stopping before abort of transaction %u, time %s",
                            recoveryStopXid,
                            timestamptz_to_str(recoveryStopTime))));
        }
    }

    return stopsHere;
}
5653
5654 /*
5655 * Same as recoveryStopsBefore, but called after applying the record.
5656 *
5657 * We also track the timestamp of the latest applied COMMIT/ABORT
5658 * record in XLogCtl->recoveryLastXTime.
5659 */
5660 static bool
recoveryStopsAfter(XLogReaderState * record)5661 recoveryStopsAfter(XLogReaderState *record)
5662 {
5663 uint8 info;
5664 uint8 xact_info;
5665 uint8 rmid;
5666 TimestampTz recordXtime;
5667
5668 info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
5669 rmid = XLogRecGetRmid(record);
5670
5671 /*
5672 * There can be many restore points that share the same name; we stop at
5673 * the first one.
5674 */
5675 if (recoveryTarget == RECOVERY_TARGET_NAME &&
5676 rmid == RM_XLOG_ID && info == XLOG_RESTORE_POINT)
5677 {
5678 xl_restore_point *recordRestorePointData;
5679
5680 recordRestorePointData = (xl_restore_point *) XLogRecGetData(record);
5681
5682 if (strcmp(recordRestorePointData->rp_name, recoveryTargetName) == 0)
5683 {
5684 recoveryStopAfter = true;
5685 recoveryStopXid = InvalidTransactionId;
5686 (void) getRecordTimestamp(record, &recoveryStopTime);
5687 strlcpy(recoveryStopName, recordRestorePointData->rp_name, MAXFNAMELEN);
5688
5689 ereport(LOG,
5690 (errmsg("recovery stopping at restore point \"%s\", time %s",
5691 recoveryStopName,
5692 timestamptz_to_str(recoveryStopTime))));
5693 return true;
5694 }
5695 }
5696
5697 if (rmid != RM_XACT_ID)
5698 return false;
5699
5700 xact_info = info & XLOG_XACT_OPMASK;
5701
5702 if (xact_info == XLOG_XACT_COMMIT ||
5703 xact_info == XLOG_XACT_COMMIT_PREPARED ||
5704 xact_info == XLOG_XACT_ABORT ||
5705 xact_info == XLOG_XACT_ABORT_PREPARED)
5706 {
5707 TransactionId recordXid;
5708
5709 /* Update the last applied transaction timestamp */
5710 if (getRecordTimestamp(record, &recordXtime))
5711 SetLatestXTime(recordXtime);
5712
5713 /* Extract the XID of the committed/aborted transaction */
5714 if (xact_info == XLOG_XACT_COMMIT_PREPARED)
5715 {
5716 xl_xact_commit *xlrec = (xl_xact_commit *) XLogRecGetData(record);
5717 xl_xact_parsed_commit parsed;
5718
5719 ParseCommitRecord(XLogRecGetInfo(record),
5720 xlrec,
5721 &parsed);
5722 recordXid = parsed.twophase_xid;
5723 }
5724 else if (xact_info == XLOG_XACT_ABORT_PREPARED)
5725 {
5726 xl_xact_abort *xlrec = (xl_xact_abort *) XLogRecGetData(record);
5727 xl_xact_parsed_abort parsed;
5728
5729 ParseAbortRecord(XLogRecGetInfo(record),
5730 xlrec,
5731 &parsed);
5732 recordXid = parsed.twophase_xid;
5733 }
5734 else
5735 recordXid = XLogRecGetXid(record);
5736
5737 /*
5738 * There can be only one transaction end record with this exact
5739 * transactionid
5740 *
5741 * when testing for an xid, we MUST test for equality only, since
5742 * transactions are numbered in the order they start, not the order
5743 * they complete. A higher numbered xid will complete before you about
5744 * 50% of the time...
5745 */
5746 if (recoveryTarget == RECOVERY_TARGET_XID && recoveryTargetInclusive &&
5747 recordXid == recoveryTargetXid)
5748 {
5749 recoveryStopAfter = true;
5750 recoveryStopXid = recordXid;
5751 recoveryStopTime = recordXtime;
5752 recoveryStopName[0] = '\0';
5753
5754 if (xact_info == XLOG_XACT_COMMIT ||
5755 xact_info == XLOG_XACT_COMMIT_PREPARED)
5756 {
5757 ereport(LOG,
5758 (errmsg("recovery stopping after commit of transaction %u, time %s",
5759 recoveryStopXid,
5760 timestamptz_to_str(recoveryStopTime))));
5761 }
5762 else if (xact_info == XLOG_XACT_ABORT ||
5763 xact_info == XLOG_XACT_ABORT_PREPARED)
5764 {
5765 ereport(LOG,
5766 (errmsg("recovery stopping after abort of transaction %u, time %s",
5767 recoveryStopXid,
5768 timestamptz_to_str(recoveryStopTime))));
5769 }
5770 return true;
5771 }
5772 }
5773
5774 /* Check if we should stop as soon as reaching consistency */
5775 if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE && reachedConsistency)
5776 {
5777 ereport(LOG,
5778 (errmsg("recovery stopping after reaching consistency")));
5779
5780 recoveryStopAfter = true;
5781 recoveryStopXid = InvalidTransactionId;
5782 recoveryStopTime = 0;
5783 recoveryStopName[0] = '\0';
5784 return true;
5785 }
5786
5787 return false;
5788 }
5789
5790 /*
5791 * Wait until shared recoveryPause flag is cleared.
5792 *
5793 * XXX Could also be done with shared latch, avoiding the pg_usleep loop.
5794 * Probably not worth the trouble though. This state shouldn't be one that
5795 * anyone cares about server power consumption in.
5796 */
5797 static void
recoveryPausesHere(void)5798 recoveryPausesHere(void)
5799 {
5800 /* Don't pause unless users can connect! */
5801 if (!LocalHotStandbyActive)
5802 return;
5803
5804 ereport(LOG,
5805 (errmsg("recovery has paused"),
5806 errhint("Execute pg_xlog_replay_resume() to continue.")));
5807
5808 while (RecoveryIsPaused())
5809 {
5810 pg_usleep(1000000L); /* 1000 ms */
5811 HandleStartupProcInterrupts();
5812 }
5813 }
5814
5815 bool
RecoveryIsPaused(void)5816 RecoveryIsPaused(void)
5817 {
5818 bool recoveryPause;
5819
5820 SpinLockAcquire(&XLogCtl->info_lck);
5821 recoveryPause = XLogCtl->recoveryPause;
5822 SpinLockRelease(&XLogCtl->info_lck);
5823
5824 return recoveryPause;
5825 }
5826
/*
 * Set or clear the shared recoveryPause flag.
 *
 * Protected by XLogCtl->info_lck so readers (RecoveryIsPaused) see a
 * consistent value.
 */
void
SetRecoveryPause(bool recoveryPause)
{
    SpinLockAcquire(&XLogCtl->info_lck);
    XLogCtl->recoveryPause = recoveryPause;
    SpinLockRelease(&XLogCtl->info_lck);
}
5834
/*
 * When recovery_min_apply_delay is set, we wait long enough to make sure
 * certain record types are applied at least that interval behind the master.
 *
 * Returns true if we waited.
 *
 * Note that the delay is calculated between the WAL record log time and
 * the current time on standby. We would prefer to keep track of when this
 * standby received each WAL record, which would allow a more consistent
 * approach and one not affected by time synchronisation issues, but that
 * is significantly more effort and complexity for little actual gain in
 * usability.
 */
static bool
recoveryApplyDelay(XLogReaderState *record)
{
    uint8       xact_info;
    TimestampTz xtime;
    long        msecs;

    /* nothing to do if no delay configured */
    if (recovery_min_apply_delay <= 0)
        return false;

    /* no delay is applied on a database not yet consistent */
    if (!reachedConsistency)
        return false;

    /*
     * Is it a COMMIT record?
     *
     * We deliberately choose not to delay aborts since they have no effect on
     * MVCC. We already allow replay of records that don't have a timestamp,
     * so there is already opportunity for issues caused by early conflicts on
     * standbys.
     */
    if (XLogRecGetRmid(record) != RM_XACT_ID)
        return false;

    xact_info = XLogRecGetInfo(record) & XLOG_XACT_OPMASK;

    if (xact_info != XLOG_XACT_COMMIT &&
        xact_info != XLOG_XACT_COMMIT_PREPARED)
        return false;

    /* Records without a timestamp are applied without delay */
    if (!getRecordTimestamp(record, &xtime))
        return false;

    recoveryDelayUntilTime =
        TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

    /*
     * Exit without arming the latch if it's already past time to apply this
     * record
     */
    msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
                                            recoveryDelayUntilTime);
    if (msecs <= 0)
        return false;

    /* Sleep until the target apply time, waking for interrupts/promotion */
    while (true)
    {
        ResetLatch(&XLogCtl->recoveryWakeupLatch);

        /*
         * This might change recovery_min_apply_delay or the trigger file's
         * location.
         */
        HandleStartupProcInterrupts();

        /* A promotion request ends the delay immediately */
        if (CheckForStandbyTrigger())
            break;

        /*
         * Recalculate recoveryDelayUntilTime as recovery_min_apply_delay
         * could have changed while waiting in this loop.
         */
        recoveryDelayUntilTime =
            TimestampTzPlusMilliseconds(xtime, recovery_min_apply_delay);

        /*
         * Wait for difference between GetCurrentTimestamp() and
         * recoveryDelayUntilTime
         */
        msecs = TimestampDifferenceMilliseconds(GetCurrentTimestamp(),
                                                recoveryDelayUntilTime);

        if (msecs <= 0)
            break;

        elog(DEBUG2, "recovery apply delay %ld milliseconds", msecs);

        (void) WaitLatch(&XLogCtl->recoveryWakeupLatch,
                         WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
                         msecs);
    }
    return true;
}
5933
/*
 * Save timestamp of latest processed commit/abort record.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by processes other than the startup process.  Note in particular
 * that CreateRestartPoint is executed in the checkpointer.
 */
static void
SetLatestXTime(TimestampTz xtime)
{
    SpinLockAcquire(&XLogCtl->info_lck);
    XLogCtl->recoveryLastXTime = xtime;
    SpinLockRelease(&XLogCtl->info_lck);
}
5948
5949 /*
5950 * Fetch timestamp of latest processed commit/abort record.
5951 */
5952 TimestampTz
GetLatestXTime(void)5953 GetLatestXTime(void)
5954 {
5955 TimestampTz xtime;
5956
5957 SpinLockAcquire(&XLogCtl->info_lck);
5958 xtime = XLogCtl->recoveryLastXTime;
5959 SpinLockRelease(&XLogCtl->info_lck);
5960
5961 return xtime;
5962 }
5963
/*
 * Save timestamp of the next chunk of WAL records to apply.
 *
 * We keep this in XLogCtl, not a simple static variable, so that it can be
 * seen by all backends.
 */
static void
SetCurrentChunkStartTime(TimestampTz xtime)
{
    SpinLockAcquire(&XLogCtl->info_lck);
    XLogCtl->currentChunkStartTime = xtime;
    SpinLockRelease(&XLogCtl->info_lck);
}
5977
/*
 * Fetch timestamp of the start of the current chunk of WAL being replayed
 * (XLogCtl->currentChunkStartTime, as set by SetCurrentChunkStartTime).
 *
 * NOTE(review): the previous header comment ("latest processed commit/abort
 * record") described recoveryLastXTime, not the value read here.
 */
TimestampTz
GetCurrentChunkReplayStartTime(void)
{
    TimestampTz xtime;

    SpinLockAcquire(&XLogCtl->info_lck);
    xtime = XLogCtl->currentChunkStartTime;
    SpinLockRelease(&XLogCtl->info_lck);

    return xtime;
}
5993
/*
 * Returns time of receipt of current chunk of XLOG data, as well as
 * whether it was received from streaming replication or from archives.
 *
 * *rtime receives XLogReceiptTime; *fromStream is set true iff the chunk
 * came from streaming replication (XLOG_FROM_STREAM).
 */
void
GetXLogReceiptTime(TimestampTz *rtime, bool *fromStream)
{
    /*
     * This must be executed in the startup process, since we don't export the
     * relevant state to shared memory.
     */
    Assert(InRecovery);

    *rtime = XLogReceiptTime;
    *fromStream = (XLogReceiptSource == XLOG_FROM_STREAM);
}
6010
/*
 * Error out (ERROR) if an integer parameter required for hot standby has a
 * smaller value on this server than it had on the master (minValue, taken
 * from pg_control).
 *
 * Note that text field supplied is a parameter name and does not require
 * translation
 */
#define RecoveryRequiresIntParameter(param_name, currValue, minValue) \
do { \
    if ((currValue) < (minValue)) \
        ereport(ERROR, \
                (errcode(ERRCODE_INVALID_PARAMETER_VALUE), \
                 errmsg("hot standby is not possible because " \
                        "%s = %d is a lower setting than on the master server " \
                        "(its value was %d)", \
                        param_name, \
                        currValue, \
                        minValue))); \
} while(0)
6027
/*
 * Check to see if required parameters are set high enough on this server
 * for various aspects of recovery operation.
 *
 * Note that all the parameters which this function tests need to be
 * listed in Administrator's Overview section in high-availability.sgml.
 * If you change them, don't forget to update the list.
 */
static void
CheckRequiredParameterValues(void)
{
    /*
     * For archive recovery, the WAL must be generated with at least 'replica'
     * wal_level.
     */
    if (ArchiveRecoveryRequested && ControlFile->wal_level == WAL_LEVEL_MINIMAL)
    {
        ereport(WARNING,
                (errmsg("WAL was generated with wal_level=minimal, data may be missing"),
                 errhint("This happens if you temporarily set wal_level=minimal without taking a new base backup.")));
    }

    /*
     * For Hot Standby, the WAL must be generated with 'replica' mode, and we
     * must have at least as many backend slots as the primary.
     */
    if (ArchiveRecoveryRequested && EnableHotStandby)
    {
        if (ControlFile->wal_level < WAL_LEVEL_REPLICA)
            ereport(ERROR,
                    (errmsg("hot standby is not possible because wal_level was not set to \"replica\" or higher on the master server"),
                     errhint("Either set wal_level to \"replica\" on the master, or turn off hot_standby here.")));

        /* We ignore autovacuum_max_workers when we make this test. */
        RecoveryRequiresIntParameter("max_connections",
                                     MaxConnections,
                                     ControlFile->MaxConnections);
        RecoveryRequiresIntParameter("max_worker_processes",
                                     max_worker_processes,
                                     ControlFile->max_worker_processes);
        RecoveryRequiresIntParameter("max_prepared_transactions",
                                     max_prepared_xacts,
                                     ControlFile->max_prepared_xacts);
        RecoveryRequiresIntParameter("max_locks_per_transaction",
                                     max_locks_per_xact,
                                     ControlFile->max_locks_per_xact);
    }
}
6076
6077 /*
6078 * This must be called ONCE during postmaster or standalone-backend startup
6079 */
6080 void
StartupXLOG(void)6081 StartupXLOG(void)
6082 {
6083 XLogCtlInsert *Insert;
6084 CheckPoint checkPoint;
6085 bool wasShutdown;
6086 bool reachedStopPoint = false;
6087 bool haveBackupLabel = false;
6088 bool haveTblspcMap = false;
6089 XLogRecPtr RecPtr,
6090 checkPointLoc,
6091 EndOfLog;
6092 TimeLineID EndOfLogTLI;
6093 TimeLineID PrevTimeLineID;
6094 XLogRecord *record;
6095 TransactionId oldestActiveXID;
6096 bool backupEndRequired = false;
6097 bool backupFromStandby = false;
6098 DBState dbstate_at_startup;
6099 XLogReaderState *xlogreader;
6100 XLogPageReadPrivate private;
6101 bool fast_promoted = false;
6102 struct stat st;
6103
6104 /*
6105 * Read control file and check XLOG status looks valid.
6106 *
6107 * Note: in most control paths, *ControlFile is already valid and we need
6108 * not do ReadControlFile() here, but might as well do it to be sure.
6109 */
6110 ReadControlFile();
6111
6112 if (ControlFile->state < DB_SHUTDOWNED ||
6113 ControlFile->state > DB_IN_PRODUCTION ||
6114 !XRecOffIsValid(ControlFile->checkPoint))
6115 ereport(FATAL,
6116 (errmsg("control file contains invalid data")));
6117
6118 if (ControlFile->state == DB_SHUTDOWNED)
6119 {
6120 /* This is the expected case, so don't be chatty in standalone mode */
6121 ereport(IsPostmasterEnvironment ? LOG : NOTICE,
6122 (errmsg("database system was shut down at %s",
6123 str_time(ControlFile->time))));
6124 }
6125 else if (ControlFile->state == DB_SHUTDOWNED_IN_RECOVERY)
6126 ereport(LOG,
6127 (errmsg("database system was shut down in recovery at %s",
6128 str_time(ControlFile->time))));
6129 else if (ControlFile->state == DB_SHUTDOWNING)
6130 ereport(LOG,
6131 (errmsg("database system shutdown was interrupted; last known up at %s",
6132 str_time(ControlFile->time))));
6133 else if (ControlFile->state == DB_IN_CRASH_RECOVERY)
6134 ereport(LOG,
6135 (errmsg("database system was interrupted while in recovery at %s",
6136 str_time(ControlFile->time)),
6137 errhint("This probably means that some data is corrupted and"
6138 " you will have to use the last backup for recovery.")));
6139 else if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY)
6140 ereport(LOG,
6141 (errmsg("database system was interrupted while in recovery at log time %s",
6142 str_time(ControlFile->checkPointCopy.time)),
6143 errhint("If this has occurred more than once some data might be corrupted"
6144 " and you might need to choose an earlier recovery target.")));
6145 else if (ControlFile->state == DB_IN_PRODUCTION)
6146 ereport(LOG,
6147 (errmsg("database system was interrupted; last known up at %s",
6148 str_time(ControlFile->time))));
6149
6150 /* This is just to allow attaching to startup process with a debugger */
6151 #ifdef XLOG_REPLAY_DELAY
6152 if (ControlFile->state != DB_SHUTDOWNED)
6153 pg_usleep(60000000L);
6154 #endif
6155
6156 /*
6157 * Verify that pg_xlog and pg_xlog/archive_status exist. In cases where
6158 * someone has performed a copy for PITR, these directories may have been
6159 * excluded and need to be re-created.
6160 */
6161 ValidateXLOGDirectoryStructure();
6162
6163 /*
6164 * If we previously crashed, there might be data which we had written,
6165 * intending to fsync it, but which we had not actually fsync'd yet.
6166 * Therefore, a power failure in the near future might cause earlier
6167 * unflushed writes to be lost, even though more recent data written to
6168 * disk from here on would be persisted. To avoid that, fsync the entire
6169 * data directory.
6170 */
6171 if (ControlFile->state != DB_SHUTDOWNED &&
6172 ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
6173 SyncDataDirectory();
6174
6175 /*
6176 * Initialize on the assumption we want to recover to the latest timeline
6177 * that's active according to pg_control.
6178 */
6179 if (ControlFile->minRecoveryPointTLI >
6180 ControlFile->checkPointCopy.ThisTimeLineID)
6181 recoveryTargetTLI = ControlFile->minRecoveryPointTLI;
6182 else
6183 recoveryTargetTLI = ControlFile->checkPointCopy.ThisTimeLineID;
6184
6185 /*
6186 * Check for recovery control file, and if so set up state for offline
6187 * recovery
6188 */
6189 readRecoveryCommandFile();
6190
6191 /*
6192 * Save archive_cleanup_command in shared memory so that other processes
6193 * can see it.
6194 */
6195 strlcpy(XLogCtl->archiveCleanupCommand,
6196 archiveCleanupCommand ? archiveCleanupCommand : "",
6197 sizeof(XLogCtl->archiveCleanupCommand));
6198
6199 if (ArchiveRecoveryRequested)
6200 {
6201 if (StandbyModeRequested)
6202 ereport(LOG,
6203 (errmsg("entering standby mode")));
6204 else if (recoveryTarget == RECOVERY_TARGET_XID)
6205 ereport(LOG,
6206 (errmsg("starting point-in-time recovery to XID %u",
6207 recoveryTargetXid)));
6208 else if (recoveryTarget == RECOVERY_TARGET_TIME)
6209 ereport(LOG,
6210 (errmsg("starting point-in-time recovery to %s",
6211 timestamptz_to_str(recoveryTargetTime))));
6212 else if (recoveryTarget == RECOVERY_TARGET_NAME)
6213 ereport(LOG,
6214 (errmsg("starting point-in-time recovery to \"%s\"",
6215 recoveryTargetName)));
6216 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
6217 ereport(LOG,
6218 (errmsg("starting point-in-time recovery to earliest consistent point")));
6219 else
6220 ereport(LOG,
6221 (errmsg("starting archive recovery")));
6222 }
6223
6224 /*
6225 * Take ownership of the wakeup latch if we're going to sleep during
6226 * recovery.
6227 */
6228 if (ArchiveRecoveryRequested)
6229 OwnLatch(&XLogCtl->recoveryWakeupLatch);
6230
6231 /* Set up XLOG reader facility */
6232 MemSet(&private, 0, sizeof(XLogPageReadPrivate));
6233 xlogreader = XLogReaderAllocate(&XLogPageRead, &private);
6234 if (!xlogreader)
6235 ereport(ERROR,
6236 (errcode(ERRCODE_OUT_OF_MEMORY),
6237 errmsg("out of memory"),
6238 errdetail("Failed while allocating an XLog reading processor.")));
6239 xlogreader->system_identifier = ControlFile->system_identifier;
6240
6241 if (read_backup_label(&checkPointLoc, &backupEndRequired,
6242 &backupFromStandby))
6243 {
6244 List *tablespaces = NIL;
6245
6246 /*
6247 * Archive recovery was requested, and thanks to the backup label
6248 * file, we know how far we need to replay to reach consistency. Enter
6249 * archive recovery directly.
6250 */
6251 InArchiveRecovery = true;
6252 if (StandbyModeRequested)
6253 StandbyMode = true;
6254
6255 /*
6256 * When a backup_label file is present, we want to roll forward from
6257 * the checkpoint it identifies, rather than using pg_control.
6258 */
6259 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 0, true);
6260 if (record != NULL)
6261 {
6262 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6263 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6264 ereport(DEBUG1,
6265 (errmsg("checkpoint record is at %X/%X",
6266 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6267 InRecovery = true; /* force recovery even if SHUTDOWNED */
6268
6269 /*
6270 * Make sure that REDO location exists. This may not be the case
6271 * if there was a crash during an online backup, which left a
6272 * backup_label around that references a WAL segment that's
6273 * already been archived.
6274 */
6275 if (checkPoint.redo < checkPointLoc)
6276 {
6277 if (!ReadRecord(xlogreader, checkPoint.redo, LOG, false))
6278 ereport(FATAL,
6279 (errmsg("could not find redo location referenced by checkpoint record"),
6280 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6281 }
6282 }
6283 else
6284 {
6285 ereport(FATAL,
6286 (errmsg("could not locate required checkpoint record"),
6287 errhint("If you are not restoring from a backup, try removing the file \"%s/backup_label\".", DataDir)));
6288 wasShutdown = false; /* keep compiler quiet */
6289 }
6290
6291 /* read the tablespace_map file if present and create symlinks. */
6292 if (read_tablespace_map(&tablespaces))
6293 {
6294 ListCell *lc;
6295
6296 foreach(lc, tablespaces)
6297 {
6298 tablespaceinfo *ti = lfirst(lc);
6299 char *linkloc;
6300
6301 linkloc = psprintf("pg_tblspc/%s", ti->oid);
6302
6303 /*
6304 * Remove the existing symlink if any and Create the symlink
6305 * under PGDATA.
6306 */
6307 remove_tablespace_symlink(linkloc);
6308
6309 if (symlink(ti->path, linkloc) < 0)
6310 ereport(ERROR,
6311 (errcode_for_file_access(),
6312 errmsg("could not create symbolic link \"%s\": %m",
6313 linkloc)));
6314
6315 pfree(ti->oid);
6316 pfree(ti->path);
6317 pfree(ti);
6318 }
6319
6320 /* set flag to delete it later */
6321 haveTblspcMap = true;
6322 }
6323
6324 /* set flag to delete it later */
6325 haveBackupLabel = true;
6326 }
6327 else
6328 {
6329 /*
6330 * If tablespace_map file is present without backup_label file, there
6331 * is no use of such file. There is no harm in retaining it, but it
6332 * is better to get rid of the map file so that we don't have any
6333 * redundant file in data directory and it will avoid any sort of
6334 * confusion. It seems prudent though to just rename the file out of
6335 * the way rather than delete it completely, also we ignore any error
6336 * that occurs in rename operation as even if map file is present
6337 * without backup_label file, it is harmless.
6338 */
6339 if (stat(TABLESPACE_MAP, &st) == 0)
6340 {
6341 unlink(TABLESPACE_MAP_OLD);
6342 if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
6343 ereport(LOG,
6344 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6345 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6346 errdetail("File \"%s\" was renamed to \"%s\".",
6347 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6348 else
6349 ereport(LOG,
6350 (errmsg("ignoring file \"%s\" because no file \"%s\" exists",
6351 TABLESPACE_MAP, BACKUP_LABEL_FILE),
6352 errdetail("Could not rename file \"%s\" to \"%s\": %m.",
6353 TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
6354 }
6355
6356 /*
6357 * It's possible that archive recovery was requested, but we don't
6358 * know how far we need to replay the WAL before we reach consistency.
6359 * This can happen for example if a base backup is taken from a
6360 * running server using an atomic filesystem snapshot, without calling
6361 * pg_start/stop_backup. Or if you just kill a running master server
6362 * and put it into archive recovery by creating a recovery.conf file.
6363 *
6364 * Our strategy in that case is to perform crash recovery first,
6365 * replaying all the WAL present in pg_xlog, and only enter archive
6366 * recovery after that.
6367 *
6368 * But usually we already know how far we need to replay the WAL (up
6369 * to minRecoveryPoint, up to backupEndPoint, or until we see an
6370 * end-of-backup record), and we can enter archive recovery directly.
6371 */
6372 if (ArchiveRecoveryRequested &&
6373 (ControlFile->minRecoveryPoint != InvalidXLogRecPtr ||
6374 ControlFile->backupEndRequired ||
6375 ControlFile->backupEndPoint != InvalidXLogRecPtr ||
6376 ControlFile->state == DB_SHUTDOWNED))
6377 {
6378 InArchiveRecovery = true;
6379 if (StandbyModeRequested)
6380 StandbyMode = true;
6381 }
6382
6383 /*
6384 * Get the last valid checkpoint record. If the latest one according
6385 * to pg_control is broken, try the next-to-last one.
6386 */
6387 checkPointLoc = ControlFile->checkPoint;
6388 RedoStartLSN = ControlFile->checkPointCopy.redo;
6389 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, true);
6390 if (record != NULL)
6391 {
6392 ereport(DEBUG1,
6393 (errmsg("checkpoint record is at %X/%X",
6394 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6395 }
6396 else if (StandbyMode)
6397 {
6398 /*
6399 * The last valid checkpoint record required for streaming
6400 * recovery exists on neither the standby nor the primary.
6401 */
6402 ereport(PANIC,
6403 (errmsg("could not locate a valid checkpoint record")));
6404 }
6405 else
6406 {
6407 checkPointLoc = ControlFile->prevCheckPoint;
6408 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 2, true);
6409 if (record != NULL)
6410 {
6411 ereport(LOG,
6412 (errmsg("using previous checkpoint record at %X/%X",
6413 (uint32) (checkPointLoc >> 32), (uint32) checkPointLoc)));
6414 InRecovery = true; /* force recovery even if SHUTDOWNED */
6415 }
6416 else
6417 ereport(PANIC,
6418 (errmsg("could not locate a valid checkpoint record")));
6419 }
6420 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
6421 wasShutdown = (record->xl_info == XLOG_CHECKPOINT_SHUTDOWN);
6422 }
6423
6424 /*
6425 * Clear out any old relcache cache files. This is *necessary* if we do
6426 * any WAL replay, since that would probably result in the cache files
6427 * being out of sync with database reality. In theory we could leave them
6428 * in place if the database had been cleanly shut down, but it seems
6429 * safest to just remove them always and let them be rebuilt during the
6430 * first backend startup. These files need to be removed from all
6431 * directories including pg_tblspc; however, the symlinks are created only
6432 * after reading the tablespace_map file in case of archive recovery from
6433 * backup, so we need to clear the old relcache files here after creating
6434 * the symlinks.
6435 */
6436 RelationCacheInitFileRemove();
6437
6438 /*
6439 * If the location of the checkpoint record is not on the expected
6440 * timeline in the history of the requested timeline, we cannot proceed:
6441 * the backup is not part of the history of the requested timeline.
6442 */
6443 Assert(expectedTLEs); /* was initialized by reading checkpoint
6444 * record */
6445 if (tliOfPointInHistory(checkPointLoc, expectedTLEs) !=
6446 checkPoint.ThisTimeLineID)
6447 {
6448 XLogRecPtr switchpoint;
6449
6450 /*
6451 * tliSwitchPoint will throw an error if the checkpoint's timeline is
6452 * not in expectedTLEs at all.
6453 */
6454 switchpoint = tliSwitchPoint(ControlFile->checkPointCopy.ThisTimeLineID, expectedTLEs, NULL);
6455 ereport(FATAL,
6456 (errmsg("requested timeline %u is not a child of this server's history",
6457 recoveryTargetTLI),
6458 errdetail("Latest checkpoint is at %X/%X on timeline %u, but in the history of the requested timeline, the server forked off from that timeline at %X/%X.",
6459 (uint32) (ControlFile->checkPoint >> 32),
6460 (uint32) ControlFile->checkPoint,
6461 ControlFile->checkPointCopy.ThisTimeLineID,
6462 (uint32) (switchpoint >> 32),
6463 (uint32) switchpoint)));
6464 }
6465
6466 /*
6467 * The min recovery point should be part of the requested timeline's
6468 * history, too.
6469 */
6470 if (!XLogRecPtrIsInvalid(ControlFile->minRecoveryPoint) &&
6471 tliOfPointInHistory(ControlFile->minRecoveryPoint - 1, expectedTLEs) !=
6472 ControlFile->minRecoveryPointTLI)
6473 ereport(FATAL,
6474 (errmsg("requested timeline %u does not contain minimum recovery point %X/%X on timeline %u",
6475 recoveryTargetTLI,
6476 (uint32) (ControlFile->minRecoveryPoint >> 32),
6477 (uint32) ControlFile->minRecoveryPoint,
6478 ControlFile->minRecoveryPointTLI)));
6479
6480 LastRec = RecPtr = checkPointLoc;
6481
6482 ereport(DEBUG1,
6483 (errmsg_internal("redo record is at %X/%X; shutdown %s",
6484 (uint32) (checkPoint.redo >> 32), (uint32) checkPoint.redo,
6485 wasShutdown ? "TRUE" : "FALSE")));
6486 ereport(DEBUG1,
6487 (errmsg_internal("next transaction ID: %u:%u; next OID: %u",
6488 checkPoint.nextXidEpoch, checkPoint.nextXid,
6489 checkPoint.nextOid)));
6490 ereport(DEBUG1,
6491 (errmsg_internal("next MultiXactId: %u; next MultiXactOffset: %u",
6492 checkPoint.nextMulti, checkPoint.nextMultiOffset)));
6493 ereport(DEBUG1,
6494 (errmsg_internal("oldest unfrozen transaction ID: %u, in database %u",
6495 checkPoint.oldestXid, checkPoint.oldestXidDB)));
6496 ereport(DEBUG1,
6497 (errmsg_internal("oldest MultiXactId: %u, in database %u",
6498 checkPoint.oldestMulti, checkPoint.oldestMultiDB)));
6499 ereport(DEBUG1,
6500 (errmsg_internal("commit timestamp Xid oldest/newest: %u/%u",
6501 checkPoint.oldestCommitTsXid,
6502 checkPoint.newestCommitTsXid)));
6503 if (!TransactionIdIsNormal(checkPoint.nextXid))
6504 ereport(PANIC,
6505 (errmsg("invalid next transaction ID")));
6506
6507 /* initialize shared memory variables from the checkpoint record */
6508 ShmemVariableCache->nextXid = checkPoint.nextXid;
6509 ShmemVariableCache->nextOid = checkPoint.nextOid;
6510 ShmemVariableCache->oidCount = 0;
6511 MultiXactSetNextMXact(checkPoint.nextMulti, checkPoint.nextMultiOffset);
6512 SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);
6513 SetMultiXactIdLimit(checkPoint.oldestMulti, checkPoint.oldestMultiDB);
6514 SetCommitTsLimit(checkPoint.oldestCommitTsXid,
6515 checkPoint.newestCommitTsXid);
6516 XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
6517 XLogCtl->ckptXid = checkPoint.nextXid;
6518
6519 /*
6520 * Initialize replication slots, before there's a chance to remove
6521 * required resources.
6522 */
6523 StartupReplicationSlots();
6524
6525 /*
6526 * Startup logical state, needs to be setup now so we have proper data
6527 * during crash recovery.
6528 */
6529 StartupReorderBuffer();
6530
6531 /*
6532 * Startup MultiXact. We need to do this early to be able to replay
6533 * truncations.
6534 */
6535 StartupMultiXact();
6536
6537 /*
6538 * Ditto for commit timestamps. Activate the facility if the setting is
6539 * enabled in the control file, as there should be no tracking of commit
6540 * timestamps done when the setting was disabled. This facility can be
6541 * started or stopped when replaying an XLOG_PARAMETER_CHANGE record.
6542 */
6543 if (ControlFile->track_commit_timestamp)
6544 StartupCommitTs();
6545
6546 /*
6547 * Recover knowledge about replay progress of known replication partners.
6548 */
6549 StartupReplicationOrigin();
6550
6551 /*
6552 * Initialize unlogged LSN. On a clean shutdown, it's restored from the
6553 * control file. On recovery, all unlogged relations are blown away, so
6554 * the unlogged LSN counter can be reset too.
6555 */
6556 if (ControlFile->state == DB_SHUTDOWNED)
6557 XLogCtl->unloggedLSN = ControlFile->unloggedLSN;
6558 else
6559 XLogCtl->unloggedLSN = 1;
6560
6561 /*
6562 * We must replay WAL entries using the same TimeLineID they were created
6563 * under, so temporarily adopt the TLI indicated by the checkpoint (see
6564 * also xlog_redo()).
6565 */
6566 ThisTimeLineID = checkPoint.ThisTimeLineID;
6567
6568 /*
6569 * Copy any missing timeline history files between 'now' and the recovery
6570 * target timeline from archive to pg_xlog. While we don't need those
6571 * files ourselves - the history file of the recovery target timeline
6572 * covers all the previous timelines in the history too - a cascading
6573 * standby server might be interested in them. Or, if you archive the WAL
6574 * from this server to a different archive than the master, it'd be good
6575 * for all the history files to get archived there after failover, so that
6576 * you can use one of the old timelines as a PITR target. Timeline history
6577 * files are small, so it's better to copy them unnecessarily than not
6578 * copy them and regret later.
6579 */
6580 restoreTimeLineHistoryFiles(ThisTimeLineID, recoveryTargetTLI);
6581
6582 lastFullPageWrites = checkPoint.fullPageWrites;
6583
6584 RedoRecPtr = XLogCtl->RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;
6585 doPageWrites = lastFullPageWrites;
6586
6587 if (RecPtr < checkPoint.redo)
6588 ereport(PANIC,
6589 (errmsg("invalid redo in checkpoint record")));
6590
6591 /*
6592 * Check whether we need to force recovery from WAL. If it appears to
6593 * have been a clean shutdown and we did not have a recovery.conf file,
6594 * then assume no recovery needed.
6595 */
6596 if (checkPoint.redo < RecPtr)
6597 {
6598 if (wasShutdown)
6599 ereport(PANIC,
6600 (errmsg("invalid redo record in shutdown checkpoint")));
6601 InRecovery = true;
6602 }
6603 else if (ControlFile->state != DB_SHUTDOWNED)
6604 InRecovery = true;
6605 else if (ArchiveRecoveryRequested)
6606 {
6607 /* force recovery due to presence of recovery.conf */
6608 InRecovery = true;
6609 }
6610
6611 /*
6612 * Start recovery assuming that the final record isn't lost.
6613 */
6614 abortedRecPtr = InvalidXLogRecPtr;
6615 missingContrecPtr = InvalidXLogRecPtr;
6616
6617 /* REDO */
6618 if (InRecovery)
6619 {
6620 int rmid;
6621
6622 /*
6623 * Update pg_control to show that we are recovering and to show the
6624 * selected checkpoint as the place we are starting from. We also mark
6625 * pg_control with any minimum recovery stop point obtained from a
6626 * backup history file.
6627 */
6628 dbstate_at_startup = ControlFile->state;
6629 if (InArchiveRecovery)
6630 {
6631 ControlFile->state = DB_IN_ARCHIVE_RECOVERY;
6632
6633 SpinLockAcquire(&XLogCtl->info_lck);
6634 XLogCtl->SharedRecoveryState = RECOVERY_STATE_ARCHIVE;
6635 SpinLockRelease(&XLogCtl->info_lck);
6636 }
6637 else
6638 {
6639 ereport(LOG,
6640 (errmsg("database system was not properly shut down; "
6641 "automatic recovery in progress")));
6642 if (recoveryTargetTLI > ControlFile->checkPointCopy.ThisTimeLineID)
6643 ereport(LOG,
6644 (errmsg("crash recovery starts in timeline %u "
6645 "and has target timeline %u",
6646 ControlFile->checkPointCopy.ThisTimeLineID,
6647 recoveryTargetTLI)));
6648 ControlFile->state = DB_IN_CRASH_RECOVERY;
6649
6650 SpinLockAcquire(&XLogCtl->info_lck);
6651 XLogCtl->SharedRecoveryState = RECOVERY_STATE_CRASH;
6652 SpinLockRelease(&XLogCtl->info_lck);
6653 }
6654 ControlFile->prevCheckPoint = ControlFile->checkPoint;
6655 ControlFile->checkPoint = checkPointLoc;
6656 ControlFile->checkPointCopy = checkPoint;
6657 if (InArchiveRecovery)
6658 {
6659 /* initialize minRecoveryPoint if not set yet */
6660 if (ControlFile->minRecoveryPoint < checkPoint.redo)
6661 {
6662 ControlFile->minRecoveryPoint = checkPoint.redo;
6663 ControlFile->minRecoveryPointTLI = checkPoint.ThisTimeLineID;
6664 }
6665 }
6666
6667 /*
6668 * Set backupStartPoint if we're starting recovery from a base backup.
6669 *
6670 * Also set backupEndPoint and use minRecoveryPoint as the backup end
6671 * location if we're starting recovery from a base backup which was
6672 * taken from a standby. In this case, the database system status in
6673 * pg_control must indicate that the database was already in recovery.
6674 * Usually that will be DB_IN_ARCHIVE_RECOVERY but also can be
6675 * DB_SHUTDOWNED_IN_RECOVERY if recovery previously was interrupted
6676 * before reaching this point; e.g. because restore_command or
6677 * primary_conninfo were faulty.
6678 *
6679 * Any other state indicates that the backup somehow became corrupted
6680 * and we can't sensibly continue with recovery.
6681 */
6682 if (haveBackupLabel)
6683 {
6684 ControlFile->backupStartPoint = checkPoint.redo;
6685 ControlFile->backupEndRequired = backupEndRequired;
6686
6687 if (backupFromStandby)
6688 {
6689 if (dbstate_at_startup != DB_IN_ARCHIVE_RECOVERY &&
6690 dbstate_at_startup != DB_SHUTDOWNED_IN_RECOVERY)
6691 ereport(FATAL,
6692 (errmsg("backup_label contains data inconsistent with control file"),
6693 errhint("This means that the backup is corrupted and you will "
6694 "have to use another backup for recovery.")));
6695 ControlFile->backupEndPoint = ControlFile->minRecoveryPoint;
6696 }
6697 }
6698 ControlFile->time = (pg_time_t) time(NULL);
6699 /* No need to hold ControlFileLock yet, we aren't up far enough */
6700 UpdateControlFile();
6701
6702 /*
6703 * Initialize our local copy of minRecoveryPoint. When doing crash
6704 * recovery we want to replay up to the end of WAL. Particularly, in
6705 * the case of a promoted standby minRecoveryPoint value in the
6706 * control file is only updated after the first checkpoint. However,
6707 * if the instance crashes before the first post-recovery checkpoint
6708 * is completed then recovery will use a stale location causing the
6709 * startup process to think that there are still invalid page
6710 * references when checking for data consistency.
6711 */
6712 if (InArchiveRecovery)
6713 {
6714 minRecoveryPoint = ControlFile->minRecoveryPoint;
6715 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
6716 }
6717 else
6718 {
6719 minRecoveryPoint = InvalidXLogRecPtr;
6720 minRecoveryPointTLI = 0;
6721 }
6722
6723 /*
6724 * Reset pgstat data, because it may be invalid after recovery.
6725 */
6726 pgstat_reset_all();
6727
6728 /*
6729 * If there was a backup label file, it's done its job and the info
6730 * has now been propagated into pg_control. We must get rid of the
6731 * label file so that if we crash during recovery, we'll pick up at
6732 * the latest recovery restartpoint instead of going all the way back
6733 * to the backup start point. It seems prudent though to just rename
6734 * the file out of the way rather than delete it completely.
6735 */
6736 if (haveBackupLabel)
6737 {
6738 unlink(BACKUP_LABEL_OLD);
6739 durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, FATAL);
6740 }
6741
6742 /*
6743 * If there was a tablespace_map file, it's done its job and the
6744 * symlinks have been created. We must get rid of the map file so
6745 * that if we crash during recovery, we don't create symlinks again.
6746 * It seems prudent though to just rename the file out of the way
6747 * rather than delete it completely.
6748 */
6749 if (haveTblspcMap)
6750 {
6751 unlink(TABLESPACE_MAP_OLD);
6752 durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, FATAL);
6753 }
6754
6755 /* Check that the GUCs used to generate the WAL allow recovery */
6756 CheckRequiredParameterValues();
6757
6758 /*
6759 * We're in recovery, so unlogged relations may be trashed and must be
6760 * reset. This should be done BEFORE allowing Hot Standby
6761 * connections, so that read-only backends don't try to read whatever
6762 * garbage is left over from before.
6763 */
6764 ResetUnloggedRelations(UNLOGGED_RELATION_CLEANUP);
6765
6766 /*
6767 * Likewise, delete any saved transaction snapshot files that got left
6768 * behind by crashed backends.
6769 */
6770 DeleteAllExportedSnapshotFiles();
6771
6772 /*
6773 * Initialize for Hot Standby, if enabled. We won't let backends in
6774 * yet, not until we've reached the min recovery point specified in
6775 * control file and we've established a recovery snapshot from a
6776 * running-xacts WAL record.
6777 */
6778 if (ArchiveRecoveryRequested && EnableHotStandby)
6779 {
6780 TransactionId *xids;
6781 int nxids;
6782
6783 ereport(DEBUG1,
6784 (errmsg("initializing for hot standby")));
6785
6786 InitRecoveryTransactionEnvironment();
6787
6788 if (wasShutdown)
6789 oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);
6790 else
6791 oldestActiveXID = checkPoint.oldestActiveXid;
6792 Assert(TransactionIdIsValid(oldestActiveXID));
6793
6794 /* Tell procarray about the range of xids it has to deal with */
6795 ProcArrayInitRecovery(ShmemVariableCache->nextXid);
6796
6797 /*
6798 * Startup commit log and subtrans only. MultiXact and commit
6799 * timestamp have already been started up and other SLRUs are not
6800 * maintained during recovery and need not be started yet.
6801 */
6802 StartupCLOG();
6803 StartupSUBTRANS(oldestActiveXID);
6804
6805 /*
6806 * If we're beginning at a shutdown checkpoint, we know that
6807 * nothing was running on the master at this point. So fake-up an
6808 * empty running-xacts record and use that here and now. Recover
6809 * additional standby state for prepared transactions.
6810 */
6811 if (wasShutdown)
6812 {
6813 RunningTransactionsData running;
6814 TransactionId latestCompletedXid;
6815
6816 /*
6817 * Construct a RunningTransactions snapshot representing a
6818 * shut down server, with only prepared transactions still
6819 * alive. We're never overflowed at this point because all
6820 * subxids are listed with their parent prepared transactions.
6821 */
6822 running.xcnt = nxids;
6823 running.subxcnt = 0;
6824 running.subxid_overflow = false;
6825 running.nextXid = checkPoint.nextXid;
6826 running.oldestRunningXid = oldestActiveXID;
6827 latestCompletedXid = checkPoint.nextXid;
6828 TransactionIdRetreat(latestCompletedXid);
6829 Assert(TransactionIdIsNormal(latestCompletedXid));
6830 running.latestCompletedXid = latestCompletedXid;
6831 running.xids = xids;
6832
6833 ProcArrayApplyRecoveryInfo(&running);
6834
6835 StandbyRecoverPreparedTransactions(false);
6836 }
6837 }
6838
6839 /* Initialize resource managers */
6840 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
6841 {
6842 if (RmgrTable[rmid].rm_startup != NULL)
6843 RmgrTable[rmid].rm_startup();
6844 }
6845
6846 /*
6847 * Initialize shared variables for tracking progress of WAL replay, as
6848 * if we had just replayed the record before the REDO location (or the
6849 * checkpoint record itself, if it's a shutdown checkpoint).
6850 */
6851 SpinLockAcquire(&XLogCtl->info_lck);
6852 if (checkPoint.redo < RecPtr)
6853 XLogCtl->replayEndRecPtr = checkPoint.redo;
6854 else
6855 XLogCtl->replayEndRecPtr = EndRecPtr;
6856 XLogCtl->replayEndTLI = ThisTimeLineID;
6857 XLogCtl->lastReplayedEndRecPtr = XLogCtl->replayEndRecPtr;
6858 XLogCtl->lastReplayedTLI = XLogCtl->replayEndTLI;
6859 XLogCtl->recoveryLastXTime = 0;
6860 XLogCtl->currentChunkStartTime = 0;
6861 XLogCtl->recoveryPause = false;
6862 SpinLockRelease(&XLogCtl->info_lck);
6863
6864 /* Also ensure XLogReceiptTime has a sane value */
6865 XLogReceiptTime = GetCurrentTimestamp();
6866
6867 /*
6868 * Let postmaster know we've started redo now, so that it can launch
6869 * checkpointer to perform restartpoints. We don't bother during
6870 * crash recovery as restartpoints can only be performed during
6871 * archive recovery. And we'd like to keep crash recovery simple, to
6872 * avoid introducing bugs that could affect you when recovering after
6873 * a crash.
6874 *
6875 * After this point, we can no longer assume that we're the only
6876 * process in addition to postmaster! Also, fsync requests are
6877 * subsequently to be handled by the checkpointer, not locally.
6878 */
6879 if (ArchiveRecoveryRequested && IsUnderPostmaster)
6880 {
6881 PublishStartupProcessInformation();
6882 SetForwardFsyncRequests();
6883 SendPostmasterSignal(PMSIGNAL_RECOVERY_STARTED);
6884 bgwriterLaunched = true;
6885 }
6886
6887 /*
6888 * Allow read-only connections immediately if we're consistent
6889 * already.
6890 */
6891 CheckRecoveryConsistency();
6892
6893 /*
6894 * Find the first record that logically follows the checkpoint --- it
6895 * might physically precede it, though.
6896 */
6897 if (checkPoint.redo < RecPtr)
6898 {
6899 /* back up to find the record */
6900 record = ReadRecord(xlogreader, checkPoint.redo, PANIC, false);
6901 }
6902 else
6903 {
6904 /* just have to read next record after CheckPoint */
6905 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
6906 }
6907
6908 if (record != NULL)
6909 {
6910 ErrorContextCallback errcallback;
6911 TimestampTz xtime;
6912
6913 InRedo = true;
6914
6915 ereport(LOG,
6916 (errmsg("redo starts at %X/%X",
6917 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
6918
6919 /*
6920 * main redo apply loop
6921 */
6922 do
6923 {
6924 bool switchedTLI = false;
6925
6926 #ifdef WAL_DEBUG
6927 if (XLOG_DEBUG ||
6928 (rmid == RM_XACT_ID && trace_recovery_messages <= DEBUG2) ||
6929 (rmid != RM_XACT_ID && trace_recovery_messages <= DEBUG3))
6930 {
6931 StringInfoData buf;
6932
6933 initStringInfo(&buf);
6934 appendStringInfo(&buf, "REDO @ %X/%X; LSN %X/%X: ",
6935 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr,
6936 (uint32) (EndRecPtr >> 32), (uint32) EndRecPtr);
6937 xlog_outrec(&buf, xlogreader);
6938 appendStringInfoString(&buf, " - ");
6939 xlog_outdesc(&buf, xlogreader);
6940 elog(LOG, "%s", buf.data);
6941 pfree(buf.data);
6942 }
6943 #endif
6944
6945 /* Handle interrupt signals of startup process */
6946 HandleStartupProcInterrupts();
6947
6948 /*
6949 * Pause WAL replay, if requested by a hot-standby session via
6950 * SetRecoveryPause().
6951 *
6952 * Note that we intentionally don't take the info_lck spinlock
6953 * here. We might therefore read a slightly stale value of
6954 * the recoveryPause flag, but it can't be very stale (no
6955 * worse than the last spinlock we did acquire). Since a
6956 * pause request is a pretty asynchronous thing anyway,
6957 * possibly responding to it one WAL record later than we
6958 * otherwise would is a minor issue, so it doesn't seem worth
6959 * adding another spinlock cycle to prevent that.
6960 */
6961 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
6962 recoveryPausesHere();
6963
6964 /*
6965 * Have we reached our recovery target?
6966 */
6967 if (recoveryStopsBefore(xlogreader))
6968 {
6969 reachedStopPoint = true; /* see below */
6970 break;
6971 }
6972
6973 /*
6974 * If we've been asked to lag the master, wait on latch until
6975 * enough time has passed.
6976 */
6977 if (recoveryApplyDelay(xlogreader))
6978 {
6979 /*
6980 * We test for paused recovery again here. If user sets
6981 * delayed apply, it may be because they expect to pause
6982 * recovery in case of problems, so we must test again
6983 * here otherwise pausing during the delay-wait wouldn't
6984 * work.
6985 */
6986 if (((volatile XLogCtlData *) XLogCtl)->recoveryPause)
6987 recoveryPausesHere();
6988 }
6989
6990 /* Setup error traceback support for ereport() */
6991 errcallback.callback = rm_redo_error_callback;
6992 errcallback.arg = (void *) xlogreader;
6993 errcallback.previous = error_context_stack;
6994 error_context_stack = &errcallback;
6995
6996 /*
6997 * ShmemVariableCache->nextXid must be beyond record's xid.
6998 *
6999 * We don't expect anyone else to modify nextXid, hence we
7000 * don't need to hold a lock while examining it. We still
7001 * acquire the lock to modify it, though.
7002 */
7003 if (TransactionIdFollowsOrEquals(record->xl_xid,
7004 ShmemVariableCache->nextXid))
7005 {
7006 LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
7007 ShmemVariableCache->nextXid = record->xl_xid;
7008 TransactionIdAdvance(ShmemVariableCache->nextXid);
7009 LWLockRelease(XidGenLock);
7010 }
7011
7012 /*
7013 * Before replaying this record, check if this record causes
7014 * the current timeline to change. The record is already
7015 * considered to be part of the new timeline, so we update
7016 * ThisTimeLineID before replaying it. That's important so
7017 * that replayEndTLI, which is recorded as the minimum
7018 * recovery point's TLI if recovery stops after this record,
7019 * is set correctly.
7020 */
7021 if (record->xl_rmid == RM_XLOG_ID)
7022 {
7023 TimeLineID newTLI = ThisTimeLineID;
7024 TimeLineID prevTLI = ThisTimeLineID;
7025 uint8 info = record->xl_info & ~XLR_INFO_MASK;
7026
7027 if (info == XLOG_CHECKPOINT_SHUTDOWN)
7028 {
7029 CheckPoint checkPoint;
7030
7031 memcpy(&checkPoint, XLogRecGetData(xlogreader), sizeof(CheckPoint));
7032 newTLI = checkPoint.ThisTimeLineID;
7033 prevTLI = checkPoint.PrevTimeLineID;
7034 }
7035 else if (info == XLOG_END_OF_RECOVERY)
7036 {
7037 xl_end_of_recovery xlrec;
7038
7039 memcpy(&xlrec, XLogRecGetData(xlogreader), sizeof(xl_end_of_recovery));
7040 newTLI = xlrec.ThisTimeLineID;
7041 prevTLI = xlrec.PrevTimeLineID;
7042 }
7043
7044 if (newTLI != ThisTimeLineID)
7045 {
7046 /* Check that it's OK to switch to this TLI */
7047 checkTimeLineSwitch(EndRecPtr, newTLI, prevTLI);
7048
7049 /* Following WAL records should be run with new TLI */
7050 ThisTimeLineID = newTLI;
7051 switchedTLI = true;
7052 }
7053 }
7054
7055 /*
7056 * Update shared replayEndRecPtr before replaying this record,
7057 * so that XLogFlush will update minRecoveryPoint correctly.
7058 */
7059 SpinLockAcquire(&XLogCtl->info_lck);
7060 XLogCtl->replayEndRecPtr = EndRecPtr;
7061 XLogCtl->replayEndTLI = ThisTimeLineID;
7062 SpinLockRelease(&XLogCtl->info_lck);
7063
7064 /*
7065 * If we are attempting to enter Hot Standby mode, process
7066 * XIDs we see
7067 */
7068 if (standbyState >= STANDBY_INITIALIZED &&
7069 TransactionIdIsValid(record->xl_xid))
7070 RecordKnownAssignedTransactionIds(record->xl_xid);
7071
7072 /* Now apply the WAL record itself */
7073 RmgrTable[record->xl_rmid].rm_redo(xlogreader);
7074
7075 /* Pop the error context stack */
7076 error_context_stack = errcallback.previous;
7077
7078 /*
7079 * Update lastReplayedEndRecPtr after this record has been
7080 * successfully replayed.
7081 */
7082 SpinLockAcquire(&XLogCtl->info_lck);
7083 XLogCtl->lastReplayedEndRecPtr = EndRecPtr;
7084 XLogCtl->lastReplayedTLI = ThisTimeLineID;
7085 SpinLockRelease(&XLogCtl->info_lck);
7086
7087 /*
7088 * If rm_redo called XLogRequestWalReceiverReply, then we wake
7089 * up the receiver so that it notices the updated
7090 * lastReplayedEndRecPtr and sends a reply to the master.
7091 */
7092 if (doRequestWalReceiverReply)
7093 {
7094 doRequestWalReceiverReply = false;
7095 WalRcvForceReply();
7096 }
7097
7098 /* Remember this record as the last-applied one */
7099 LastRec = ReadRecPtr;
7100
7101 /* Allow read-only connections if we're consistent now */
7102 CheckRecoveryConsistency();
7103
7104 /* Is this a timeline switch? */
7105 if (switchedTLI)
7106 {
7107 /*
7108 * Before we continue on the new timeline, clean up any
7109 * (possibly bogus) future WAL segments on the old
7110 * timeline.
7111 */
7112 RemoveNonParentXlogFiles(EndRecPtr, ThisTimeLineID);
7113
7114 /*
7115 * Wake up any walsenders to notice that we are on a new
7116 * timeline.
7117 */
7118 if (switchedTLI && AllowCascadeReplication())
7119 WalSndWakeup();
7120 }
7121
7122 /* Exit loop if we reached inclusive recovery target */
7123 if (recoveryStopsAfter(xlogreader))
7124 {
7125 reachedStopPoint = true;
7126 break;
7127 }
7128
7129 /* Else, try to fetch the next WAL record */
7130 record = ReadRecord(xlogreader, InvalidXLogRecPtr, LOG, false);
7131 } while (record != NULL);
7132
7133 /*
7134 * end of main redo apply loop
7135 */
7136
7137 if (reachedStopPoint)
7138 {
7139 if (!reachedConsistency)
7140 ereport(FATAL,
7141 (errmsg("requested recovery stop point is before consistent recovery point")));
7142
7143 /*
7144 * This is the last point where we can restart recovery with a
7145 * new recovery target, if we shutdown and begin again. After
7146 * this, Resource Managers may choose to do permanent
7147 * corrective actions at end of recovery.
7148 */
7149 switch (recoveryTargetAction)
7150 {
7151 case RECOVERY_TARGET_ACTION_SHUTDOWN:
7152
7153 /*
7154 * exit with special return code to request shutdown
7155 * of postmaster. Log messages issued from
7156 * postmaster.
7157 */
7158 proc_exit(3);
7159
7160 case RECOVERY_TARGET_ACTION_PAUSE:
7161 SetRecoveryPause(true);
7162 recoveryPausesHere();
7163
7164 /* drop into promote */
7165
7166 case RECOVERY_TARGET_ACTION_PROMOTE:
7167 break;
7168 }
7169 }
7170
7171 /* Allow resource managers to do any required cleanup. */
7172 for (rmid = 0; rmid <= RM_MAX_ID; rmid++)
7173 {
7174 if (RmgrTable[rmid].rm_cleanup != NULL)
7175 RmgrTable[rmid].rm_cleanup();
7176 }
7177
7178 ereport(LOG,
7179 (errmsg("redo done at %X/%X",
7180 (uint32) (ReadRecPtr >> 32), (uint32) ReadRecPtr)));
7181 xtime = GetLatestXTime();
7182 if (xtime)
7183 ereport(LOG,
7184 (errmsg("last completed transaction was at log time %s",
7185 timestamptz_to_str(xtime))));
7186
7187 InRedo = false;
7188 }
7189 else
7190 {
7191 /* there are no WAL records following the checkpoint */
7192 ereport(LOG,
7193 (errmsg("redo is not required")));
7194 }
7195 }
7196
7197 /*
7198 * Kill WAL receiver, if it's still running, before we continue to write
7199 * the startup checkpoint and aborted-contrecord records. It will trump
7200 * over these records and subsequent ones if it's still alive when we
7201 * start writing WAL.
7202 */
7203 ShutdownWalRcv();
7204
7205 /*
7206 * Reset unlogged relations to the contents of their INIT fork. This is
7207 * done AFTER recovery is complete so as to include any unlogged relations
7208 * created during recovery, but BEFORE recovery is marked as having
7209 * completed successfully. Otherwise we'd not retry if any of the post
7210 * end-of-recovery steps fail.
7211 */
7212 if (InRecovery)
7213 ResetUnloggedRelations(UNLOGGED_RELATION_INIT);
7214
7215 /*
7216 * We don't need the latch anymore. It's not strictly necessary to disown
7217 * it, but let's do it for the sake of tidiness.
7218 */
7219 if (ArchiveRecoveryRequested)
7220 DisownLatch(&XLogCtl->recoveryWakeupLatch);
7221
7222 /*
7223 * We are now done reading the xlog from stream. Turn off streaming
7224 * recovery to force fetching the files (which would be required at end of
7225 * recovery, e.g., timeline history file) from archive or pg_xlog.
7226 *
7227 * Note that standby mode must be turned off after killing WAL receiver,
7228 * i.e., calling ShutdownWalRcv().
7229 */
7230 Assert(!WalRcvStreaming());
7231 StandbyMode = false;
7232
7233 /*
7234 * Determine where to start writing WAL next.
7235 *
7236 * When recovery ended in an incomplete record, write a WAL record about
7237 * that and continue after it. In all other cases, re-fetch the last
7238 * valid or last applied record, so we can identify the exact endpoint of
7239 * what we consider the valid portion of WAL.
7240 */
7241 record = ReadRecord(xlogreader, LastRec, PANIC, false);
7242 EndOfLog = EndRecPtr;
7243
7244 /*
7245 * EndOfLogTLI is the TLI in the filename of the XLOG segment containing
7246 * the end-of-log. It could be different from the timeline that EndOfLog
7247 * nominally belongs to, if there was a timeline switch in that segment,
7248 * and we were reading the old WAL from a segment belonging to a higher
7249 * timeline.
7250 */
7251 EndOfLogTLI = xlogreader->readPageTLI;
7252
7253 /*
7254 * Complain if we did not roll forward far enough to render the backup
7255 * dump consistent. Note: it is indeed okay to look at the local variable
7256 * minRecoveryPoint here, even though ControlFile->minRecoveryPoint might
7257 * be further ahead --- ControlFile->minRecoveryPoint cannot have been
7258 * advanced beyond the WAL we processed.
7259 */
7260 if (InRecovery &&
7261 (EndOfLog < minRecoveryPoint ||
7262 !XLogRecPtrIsInvalid(ControlFile->backupStartPoint)))
7263 {
7264 /*
7265 * Ran off end of WAL before reaching end-of-backup WAL record, or
7266 * minRecoveryPoint. That's usually a bad sign, indicating that you
7267 * tried to recover from an online backup but never called
7268 * pg_stop_backup(), or you didn't archive all the WAL up to that
7269 * point. However, this also happens in crash recovery, if the system
7270 * crashes while an online backup is in progress. We must not treat
7271 * that as an error, or the database will refuse to start up.
7272 */
7273 if (ArchiveRecoveryRequested || ControlFile->backupEndRequired)
7274 {
7275 if (ControlFile->backupEndRequired)
7276 ereport(FATAL,
7277 (errmsg("WAL ends before end of online backup"),
7278 errhint("All WAL generated while online backup was taken must be available at recovery.")));
7279 else if (!XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
7280 ereport(FATAL,
7281 (errmsg("WAL ends before end of online backup"),
7282 errhint("Online backup started with pg_start_backup() must be ended with pg_stop_backup(), and all WAL up to that point must be available at recovery.")));
7283 else
7284 ereport(FATAL,
7285 (errmsg("WAL ends before consistent recovery point")));
7286 }
7287 }
7288
7289 /*
7290 * Pre-scan prepared transactions to find out the range of XIDs present.
7291 * This information is not quite needed yet, but it is positioned here so
7292 * as potential problems are detected before any on-disk change is done.
7293 */
7294 oldestActiveXID = PrescanPreparedTransactions(NULL, NULL);
7295
7296 /*
7297 * Consider whether we need to assign a new timeline ID.
7298 *
7299 * If we are doing an archive recovery, we always assign a new ID. This
7300 * handles a couple of issues. If we stopped short of the end of WAL
7301 * during recovery, then we are clearly generating a new timeline and must
7302 * assign it a unique new ID. Even if we ran to the end, modifying the
7303 * current last segment is problematic because it may result in trying to
7304 * overwrite an already-archived copy of that segment, and we encourage
7305 * DBAs to make their archive_commands reject that. We can dodge the
7306 * problem by making the new active segment have a new timeline ID.
7307 *
7308 * In a normal crash recovery, we can just extend the timeline we were in.
7309 */
7310 PrevTimeLineID = ThisTimeLineID;
7311 if (ArchiveRecoveryRequested)
7312 {
7313 char reason[200];
7314 char recoveryPath[MAXPGPATH];
7315
7316 Assert(InArchiveRecovery);
7317
7318 ThisTimeLineID = findNewestTimeLine(recoveryTargetTLI) + 1;
7319 ereport(LOG,
7320 (errmsg("selected new timeline ID: %u", ThisTimeLineID)));
7321
7322 /*
7323 * Create a comment for the history file to explain why and where
7324 * timeline changed.
7325 */
7326 if (recoveryTarget == RECOVERY_TARGET_XID)
7327 snprintf(reason, sizeof(reason),
7328 "%s transaction %u",
7329 recoveryStopAfter ? "after" : "before",
7330 recoveryStopXid);
7331 else if (recoveryTarget == RECOVERY_TARGET_TIME)
7332 snprintf(reason, sizeof(reason),
7333 "%s %s\n",
7334 recoveryStopAfter ? "after" : "before",
7335 timestamptz_to_str(recoveryStopTime));
7336 else if (recoveryTarget == RECOVERY_TARGET_NAME)
7337 snprintf(reason, sizeof(reason),
7338 "at restore point \"%s\"",
7339 recoveryStopName);
7340 else if (recoveryTarget == RECOVERY_TARGET_IMMEDIATE)
7341 snprintf(reason, sizeof(reason), "reached consistency");
7342 else
7343 snprintf(reason, sizeof(reason), "no recovery target specified");
7344
7345 /*
7346 * We are now done reading the old WAL. Turn off archive fetching if
7347 * it was active, and make a writable copy of the last WAL segment.
7348 * (Note that we also have a copy of the last block of the old WAL in
7349 * readBuf; we will use that below.)
7350 */
7351 exitArchiveRecovery(EndOfLogTLI, EndOfLog);
7352
7353 /*
7354 * Write the timeline history file, and have it archived. After this
7355 * point (or rather, as soon as the file is archived), the timeline
7356 * will appear as "taken" in the WAL archive and to any standby
7357 * servers. If we crash before actually switching to the new
7358 * timeline, standby servers will nevertheless think that we switched
7359 * to the new timeline, and will try to connect to the new timeline.
7360 * To minimize the window for that, try to do as little as possible
7361 * between here and writing the end-of-recovery record.
7362 */
7363 writeTimeLineHistory(ThisTimeLineID, recoveryTargetTLI,
7364 EndRecPtr, reason);
7365
7366 /*
7367 * Since there might be a partial WAL segment named RECOVERYXLOG, get
7368 * rid of it.
7369 */
7370 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYXLOG");
7371 unlink(recoveryPath); /* ignore any error */
7372
7373 /* Get rid of any remaining recovered timeline-history file, too */
7374 snprintf(recoveryPath, MAXPGPATH, XLOGDIR "/RECOVERYHISTORY");
7375 unlink(recoveryPath); /* ignore any error */
7376 }
7377
7378 /* Save the selected TimeLineID in shared memory, too */
7379 XLogCtl->ThisTimeLineID = ThisTimeLineID;
7380 XLogCtl->PrevTimeLineID = PrevTimeLineID;
7381
7382 /*
7383 * Actually, if WAL ended in an incomplete record, skip the parts that
7384 * made it through and start writing after the portion that persisted.
7385 * (It's critical to first write an OVERWRITE_CONTRECORD message, which
7386 * we'll do as soon as we're open for writing new WAL.)
7387 */
7388 if (!XLogRecPtrIsInvalid(missingContrecPtr))
7389 {
7390 Assert(!XLogRecPtrIsInvalid(abortedRecPtr));
7391 EndOfLog = missingContrecPtr;
7392 }
7393
7394 /*
7395 * Prepare to write WAL starting at EndOfLog location, and init xlog
7396 * buffer cache using the block containing the last record from the
7397 * previous incarnation.
7398 */
7399 Insert = &XLogCtl->Insert;
7400 Insert->PrevBytePos = XLogRecPtrToBytePos(LastRec);
7401 Insert->CurrBytePos = XLogRecPtrToBytePos(EndOfLog);
7402
7403 /*
7404 * Tricky point here: readBuf contains the *last* block that the LastRec
7405 * record spans, not the one it starts in. The last block is indeed the
7406 * one we want to use.
7407 */
7408 if (EndOfLog % XLOG_BLCKSZ != 0)
7409 {
7410 char *page;
7411 int len;
7412 int firstIdx;
7413 XLogRecPtr pageBeginPtr;
7414
7415 pageBeginPtr = EndOfLog - (EndOfLog % XLOG_BLCKSZ);
7416 Assert(readOff == pageBeginPtr % XLogSegSize);
7417
7418 firstIdx = XLogRecPtrToBufIdx(EndOfLog);
7419
7420 /* Copy the valid part of the last block, and zero the rest */
7421 page = &XLogCtl->pages[firstIdx * XLOG_BLCKSZ];
7422 len = EndOfLog % XLOG_BLCKSZ;
7423 memcpy(page, xlogreader->readBuf, len);
7424 memset(page + len, 0, XLOG_BLCKSZ - len);
7425
7426 XLogCtl->xlblocks[firstIdx] = pageBeginPtr + XLOG_BLCKSZ;
7427 XLogCtl->InitializedUpTo = pageBeginPtr + XLOG_BLCKSZ;
7428 }
7429 else
7430 {
7431 /*
7432 * There is no partial block to copy. Just set InitializedUpTo, and
7433 * let the first attempt to insert a log record to initialize the next
7434 * buffer.
7435 */
7436 XLogCtl->InitializedUpTo = EndOfLog;
7437 }
7438
7439 LogwrtResult.Write = LogwrtResult.Flush = EndOfLog;
7440
7441 XLogCtl->LogwrtResult = LogwrtResult;
7442
7443 XLogCtl->LogwrtRqst.Write = EndOfLog;
7444 XLogCtl->LogwrtRqst.Flush = EndOfLog;
7445
7446 LocalSetXLogInsertAllowed();
7447
7448 /* If necessary, write overwrite-contrecord before doing anything else */
7449 if (!XLogRecPtrIsInvalid(abortedRecPtr))
7450 {
7451 Assert(!XLogRecPtrIsInvalid(missingContrecPtr));
7452 CreateOverwriteContrecordRecord(abortedRecPtr);
7453 abortedRecPtr = InvalidXLogRecPtr;
7454 missingContrecPtr = InvalidXLogRecPtr;
7455 }
7456
7457 /*
7458 * Update full_page_writes in shared memory and write an XLOG_FPW_CHANGE
7459 * record before resource manager writes cleanup WAL records or checkpoint
7460 * record is written.
7461 */
7462 Insert->fullPageWrites = lastFullPageWrites;
7463 UpdateFullPageWrites();
7464 LocalXLogInsertAllowed = -1;
7465
7466 if (InRecovery)
7467 {
7468 /*
7469 * Perform a checkpoint to update all our recovery activity to disk.
7470 *
7471 * Note that we write a shutdown checkpoint rather than an on-line
7472 * one. This is not particularly critical, but since we may be
7473 * assigning a new TLI, using a shutdown checkpoint allows us to have
7474 * the rule that TLI only changes in shutdown checkpoints, which
7475 * allows some extra error checking in xlog_redo.
7476 *
7477 * In fast promotion, only create a lightweight end-of-recovery record
7478 * instead of a full checkpoint. A checkpoint is requested later,
7479 * after we're fully out of recovery mode and already accepting
7480 * queries.
7481 */
7482 if (bgwriterLaunched)
7483 {
7484 if (fast_promote)
7485 {
7486 checkPointLoc = ControlFile->prevCheckPoint;
7487
7488 /*
7489 * Confirm the last checkpoint is available for us to recover
7490 * from if we fail. Note that we don't check for the secondary
7491 * checkpoint since that isn't available in most base backups.
7492 */
7493 record = ReadCheckpointRecord(xlogreader, checkPointLoc, 1, false);
7494 if (record != NULL)
7495 {
7496 fast_promoted = true;
7497
7498 /*
7499 * Insert a special WAL record to mark the end of
7500 * recovery, since we aren't doing a checkpoint. That
7501 * means that the checkpointer process may likely be in
7502 * the middle of a time-smoothed restartpoint and could
7503 * continue to be for minutes after this. That sounds
7504 * strange, but the effect is roughly the same and it
7505 * would be stranger to try to come out of the
7506 * restartpoint and then checkpoint. We request a
7507 * checkpoint later anyway, just for safety.
7508 */
7509 CreateEndOfRecoveryRecord();
7510 }
7511 }
7512
7513 if (!fast_promoted)
7514 RequestCheckpoint(CHECKPOINT_END_OF_RECOVERY |
7515 CHECKPOINT_IMMEDIATE |
7516 CHECKPOINT_WAIT);
7517 }
7518 else
7519 CreateCheckPoint(CHECKPOINT_END_OF_RECOVERY | CHECKPOINT_IMMEDIATE);
7520
7521 /*
7522 * And finally, execute the recovery_end_command, if any.
7523 */
7524 if (recoveryEndCommand)
7525 ExecuteRecoveryCommand(recoveryEndCommand,
7526 "recovery_end_command",
7527 true);
7528 }
7529
7530 if (ArchiveRecoveryRequested)
7531 {
7532 /*
7533 * We switched to a new timeline. Clean up segments on the old
7534 * timeline.
7535 *
7536 * If there are any higher-numbered segments on the old timeline,
7537 * remove them. They might contain valid WAL, but they might also be
7538 * pre-allocated files containing garbage. In any case, they are not
7539 * part of the new timeline's history so we don't need them.
7540 */
7541 RemoveNonParentXlogFiles(EndOfLog, ThisTimeLineID);
7542
7543 /*
7544 * If the switch happened in the middle of a segment, what to do with
7545 * the last, partial segment on the old timeline? If we don't archive
7546 * it, and the server that created the WAL never archives it either
7547 * (e.g. because it was hit by a meteor), it will never make it to the
7548 * archive. That's OK from our point of view, because the new segment
7549 * that we created with the new TLI contains all the WAL from the old
7550 * timeline up to the switch point. But if you later try to do PITR to
7551 * the "missing" WAL on the old timeline, recovery won't find it in
7552 * the archive. It's physically present in the new file with new TLI,
7553 * but recovery won't look there when it's recovering to the older
7554 * timeline. On the other hand, if we archive the partial segment, and
7555 * the original server on that timeline is still running and archives
7556 * the completed version of the same segment later, it will fail. (We
7557 * used to do that in 9.4 and below, and it caused such problems).
7558 *
7559 * As a compromise, we rename the last segment with the .partial
7560 * suffix, and archive it. Archive recovery will never try to read
7561 * .partial segments, so they will normally go unused. But in the odd
7562 * PITR case, the administrator can copy them manually to the pg_xlog
7563 * directory (removing the suffix). They can be useful in debugging,
7564 * too.
7565 *
7566 * If a .done or .ready file already exists for the old timeline,
7567 * however, we had already determined that the segment is complete, so
7568 * we can let it be archived normally. (In particular, if it was
7569 * restored from the archive to begin with, it's expected to have a
7570 * .done file).
7571 */
7572 if (EndOfLog % XLOG_SEG_SIZE != 0 && XLogArchivingActive())
7573 {
7574 char origfname[MAXFNAMELEN];
7575 XLogSegNo endLogSegNo;
7576
7577 XLByteToPrevSeg(EndOfLog, endLogSegNo);
7578 XLogFileName(origfname, EndOfLogTLI, endLogSegNo);
7579
7580 if (!XLogArchiveIsReadyOrDone(origfname))
7581 {
7582 char origpath[MAXPGPATH];
7583 char partialfname[MAXFNAMELEN];
7584 char partialpath[MAXPGPATH];
7585
7586 XLogFilePath(origpath, EndOfLogTLI, endLogSegNo);
7587 snprintf(partialfname, MAXFNAMELEN, "%s.partial", origfname);
7588 snprintf(partialpath, MAXPGPATH, "%s.partial", origpath);
7589
7590 /*
7591 * Make sure there's no .done or .ready file for the .partial
7592 * file.
7593 */
7594 XLogArchiveCleanup(partialfname);
7595
7596 durable_rename(origpath, partialpath, ERROR);
7597 XLogArchiveNotify(partialfname);
7598 }
7599 }
7600 }
7601
7602 /*
7603 * Preallocate additional log files, if wanted.
7604 */
7605 PreallocXlogFiles(EndOfLog);
7606
7607 /*
7608 * Okay, we're officially UP.
7609 */
7610 InRecovery = false;
7611
7612 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
7613 ControlFile->state = DB_IN_PRODUCTION;
7614 ControlFile->time = (pg_time_t) time(NULL);
7615 UpdateControlFile();
7616 LWLockRelease(ControlFileLock);
7617
7618 /* start the archive_timeout timer running */
7619 XLogCtl->lastSegSwitchTime = (pg_time_t) time(NULL);
7620
7621 /* also initialize latestCompletedXid, to nextXid - 1 */
7622 LWLockAcquire(ProcArrayLock, LW_EXCLUSIVE);
7623 ShmemVariableCache->latestCompletedXid = ShmemVariableCache->nextXid;
7624 TransactionIdRetreat(ShmemVariableCache->latestCompletedXid);
7625 LWLockRelease(ProcArrayLock);
7626
7627 /*
7628 * Start up the commit log and subtrans, if not already done for hot
7629 * standby. (commit timestamps are started below, if necessary.)
7630 */
7631 if (standbyState == STANDBY_DISABLED)
7632 {
7633 StartupCLOG();
7634 StartupSUBTRANS(oldestActiveXID);
7635 }
7636
7637 /*
7638 * Perform end of recovery actions for any SLRUs that need it.
7639 */
7640 TrimCLOG();
7641 TrimMultiXact();
7642
7643 /* Reload shared-memory state for prepared transactions */
7644 RecoverPreparedTransactions();
7645
7646 /* Shut down xlogreader */
7647 if (readFile >= 0)
7648 {
7649 close(readFile);
7650 readFile = -1;
7651 }
7652 XLogReaderFree(xlogreader);
7653
7654 /*
7655 * If any of the critical GUCs have changed, log them before we allow
7656 * backends to write WAL.
7657 */
7658 LocalSetXLogInsertAllowed();
7659 XLogReportParameters();
7660
7661 /*
7662 * Local WAL inserts enabled, so it's time to finish initialization of
7663 * commit timestamp.
7664 */
7665 CompleteCommitTsInitialization();
7666
7667 /*
7668 * All done. Allow backends to write WAL. (Although the bool flag is
7669 * probably atomic in itself, we use the info_lck here to ensure that
7670 * there are no race conditions concerning visibility of other recent
7671 * updates to shared memory.)
7672 */
7673 SpinLockAcquire(&XLogCtl->info_lck);
7674 XLogCtl->SharedRecoveryState = RECOVERY_STATE_DONE;
7675 SpinLockRelease(&XLogCtl->info_lck);
7676
7677 /*
7678 * Shutdown the recovery environment. This must occur after
7679 * RecoverPreparedTransactions() (see notes in lock_twophase_recover())
7680 * and after switching SharedRecoveryState to RECOVERY_STATE_DONE so as
7681 * any session building a snapshot will not rely on KnownAssignedXids as
7682 * RecoveryInProgress() would return false at this stage. This is
7683 * particularly critical for prepared 2PC transactions, that would still
7684 * need to be included in snapshots once recovery has ended.
7685 */
7686 if (standbyState != STANDBY_DISABLED)
7687 ShutdownRecoveryTransactionEnvironment();
7688
7689 /*
7690 * If there were cascading standby servers connected to us, nudge any wal
7691 * sender processes to notice that we've been promoted.
7692 */
7693 WalSndWakeup();
7694
7695 /*
7696 * If this was a fast promotion, request an (online) checkpoint now. This
7697 * isn't required for consistency, but the last restartpoint might be far
7698 * back, and in case of a crash, recovering from it might take a longer
7699 * than is appropriate now that we're not in standby mode anymore.
7700 */
7701 if (fast_promoted)
7702 RequestCheckpoint(CHECKPOINT_FORCE);
7703 }
7704
/*
 * Checks if recovery has reached a consistent state. When consistency is
 * reached and we have a valid starting standby snapshot, tell postmaster
 * that it can start accepting read-only connections.
 *
 * Called repeatedly during WAL replay.  "Consistent" means the data on disk
 * has been brought at least up to minRecoveryPoint, and any base backup we
 * started from has been fully replayed past its end point.
 */
static void
CheckRecoveryConsistency(void)
{
	XLogRecPtr	lastReplayedEndRecPtr;

	/*
	 * During crash recovery, we don't reach a consistent state until we've
	 * replayed all the WAL.  An invalid minRecoveryPoint indicates crash
	 * recovery, so there is nothing to check yet.
	 */
	if (XLogRecPtrIsInvalid(minRecoveryPoint))
		return;

	Assert(InArchiveRecovery);

	/*
	 * assume that we are called in the startup process, and hence don't need
	 * a lock to read lastReplayedEndRecPtr
	 */
	lastReplayedEndRecPtr = XLogCtl->lastReplayedEndRecPtr;

	/*
	 * Have we reached the point where our base backup was completed?
	 */
	if (!XLogRecPtrIsInvalid(ControlFile->backupEndPoint) &&
		ControlFile->backupEndPoint <= lastReplayedEndRecPtr)
	{
		/*
		 * We have reached the end of base backup, as indicated by pg_control.
		 * The data on disk is now consistent. Reset backupStartPoint and
		 * backupEndPoint, and update minRecoveryPoint to make sure we don't
		 * allow starting up at an earlier point even if recovery is stopped
		 * and restarted soon after this.
		 */
		elog(DEBUG1, "end of backup reached");

		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

		/* Never move minRecoveryPoint backwards */
		if (ControlFile->minRecoveryPoint < lastReplayedEndRecPtr)
			ControlFile->minRecoveryPoint = lastReplayedEndRecPtr;

		ControlFile->backupStartPoint = InvalidXLogRecPtr;
		ControlFile->backupEndPoint = InvalidXLogRecPtr;
		ControlFile->backupEndRequired = false;
		UpdateControlFile();

		LWLockRelease(ControlFileLock);
	}

	/*
	 * Have we passed our safe starting point? Note that minRecoveryPoint is
	 * known to be incorrectly set if ControlFile->backupEndRequired, until
	 * the XLOG_BACKUP_END arrives to advise us of the correct
	 * minRecoveryPoint. All we know prior to that is that we're not
	 * consistent yet.
	 */
	if (!reachedConsistency && !ControlFile->backupEndRequired &&
		minRecoveryPoint <= lastReplayedEndRecPtr &&
		XLogRecPtrIsInvalid(ControlFile->backupStartPoint))
	{
		/*
		 * Check to see if the XLOG sequence contained any unresolved
		 * references to uninitialized pages.
		 */
		XLogCheckInvalidPages();

		reachedConsistency = true;
		ereport(LOG,
				(errmsg("consistent recovery state reached at %X/%X",
						(uint32) (lastReplayedEndRecPtr >> 32),
						(uint32) lastReplayedEndRecPtr)));
	}

	/*
	 * Have we got a valid starting snapshot that will allow queries to be
	 * run? If so, we can tell postmaster that the database is consistent now,
	 * enabling connections.  This fires at most once: LocalHotStandbyActive
	 * guards against re-signaling.
	 */
	if (standbyState == STANDBY_SNAPSHOT_READY &&
		!LocalHotStandbyActive &&
		reachedConsistency &&
		IsUnderPostmaster)
	{
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->SharedHotStandbyActive = true;
		SpinLockRelease(&XLogCtl->info_lck);

		LocalHotStandbyActive = true;

		SendPostmasterSignal(PMSIGNAL_BEGIN_HOT_STANDBY);
	}
}
7801
/*
 * Is the system still in recovery?
 *
 * Unlike testing InRecovery, this works in any process that's connected to
 * shared memory.
 *
 * As a side-effect, we initialize the local TimeLineID and RedoRecPtr
 * variables the first time we see that recovery is finished.
 */
bool
RecoveryInProgress(void)
{
	/*
	 * We check shared state each time only until we leave recovery mode. We
	 * can't re-enter recovery, so there's no need to keep checking after the
	 * shared variable has once been seen false.
	 */
	if (!LocalRecoveryInProgress)
		return false;
	else
	{
		/*
		 * use volatile pointer to make sure we make a fresh read of the
		 * shared variable.
		 */
		volatile XLogCtlData *xlogctl = XLogCtl;

		/* Any state other than RECOVERY_STATE_DONE still counts as "in recovery" */
		LocalRecoveryInProgress = (xlogctl->SharedRecoveryState != RECOVERY_STATE_DONE);

		/*
		 * Initialize TimeLineID and RedoRecPtr when we discover that recovery
		 * is finished. InitPostgres() relies upon this behaviour to ensure
		 * that InitXLOGAccess() is called at backend startup. (If you change
		 * this, see also LocalSetXLogInsertAllowed.)
		 */
		if (!LocalRecoveryInProgress)
		{
			/*
			 * If we just exited recovery, make sure we read TimeLineID and
			 * RedoRecPtr after SharedRecoveryState (for machines with weak
			 * memory ordering).
			 */
			pg_memory_barrier();
			InitXLOGAccess();
		}

		/*
		 * Note: We don't need a memory barrier when we're still in recovery.
		 * We might exit recovery immediately after return, so the caller
		 * can't rely on 'true' meaning that we're still in recovery anyway.
		 */

		return LocalRecoveryInProgress;
	}
}
7857
7858 /*
7859 * Returns current recovery state from shared memory.
7860 *
7861 * This returned state is kept consistent with the contents of the control
7862 * file. See details about the possible values of RecoveryState in xlog.h.
7863 */
7864 RecoveryState
GetRecoveryState(void)7865 GetRecoveryState(void)
7866 {
7867 RecoveryState retval;
7868
7869 SpinLockAcquire(&XLogCtl->info_lck);
7870 retval = XLogCtl->SharedRecoveryState;
7871 SpinLockRelease(&XLogCtl->info_lck);
7872
7873 return retval;
7874 }
7875
7876 /*
7877 * Is HotStandby active yet? This is only important in special backends
7878 * since normal backends won't ever be able to connect until this returns
7879 * true. Postmaster knows this by way of signal, not via shared memory.
7880 *
7881 * Unlike testing standbyState, this works in any process that's connected to
7882 * shared memory. (And note that standbyState alone doesn't tell the truth
7883 * anyway.)
7884 */
7885 bool
HotStandbyActive(void)7886 HotStandbyActive(void)
7887 {
7888 /*
7889 * We check shared state each time only until Hot Standby is active. We
7890 * can't de-activate Hot Standby, so there's no need to keep checking
7891 * after the shared variable has once been seen true.
7892 */
7893 if (LocalHotStandbyActive)
7894 return true;
7895 else
7896 {
7897 /* spinlock is essential on machines with weak memory ordering! */
7898 SpinLockAcquire(&XLogCtl->info_lck);
7899 LocalHotStandbyActive = XLogCtl->SharedHotStandbyActive;
7900 SpinLockRelease(&XLogCtl->info_lck);
7901
7902 return LocalHotStandbyActive;
7903 }
7904 }
7905
7906 /*
7907 * Like HotStandbyActive(), but to be used only in WAL replay code,
7908 * where we don't need to ask any other process what the state is.
7909 */
7910 bool
HotStandbyActiveInReplay(void)7911 HotStandbyActiveInReplay(void)
7912 {
7913 Assert(AmStartupProcess() || !IsPostmasterEnvironment);
7914 return LocalHotStandbyActive;
7915 }
7916
7917 /*
7918 * Is this process allowed to insert new WAL records?
7919 *
7920 * Ordinarily this is essentially equivalent to !RecoveryInProgress().
7921 * But we also have provisions for forcing the result "true" or "false"
7922 * within specific processes regardless of the global state.
7923 */
7924 bool
XLogInsertAllowed(void)7925 XLogInsertAllowed(void)
7926 {
7927 /*
7928 * If value is "unconditionally true" or "unconditionally false", just
7929 * return it. This provides the normal fast path once recovery is known
7930 * done.
7931 */
7932 if (LocalXLogInsertAllowed >= 0)
7933 return (bool) LocalXLogInsertAllowed;
7934
7935 /*
7936 * Else, must check to see if we're still in recovery.
7937 */
7938 if (RecoveryInProgress())
7939 return false;
7940
7941 /*
7942 * On exit from recovery, reset to "unconditionally true", since there is
7943 * no need to keep checking.
7944 */
7945 LocalXLogInsertAllowed = 1;
7946 return true;
7947 }
7948
/*
 * Make XLogInsertAllowed() return true in the current process only.
 *
 * Note: it is allowed to switch LocalXLogInsertAllowed back to -1 later,
 * and even call LocalSetXLogInsertAllowed() again after that.
 */
static void
LocalSetXLogInsertAllowed(void)
{
	/* must currently be in the "ask shared memory" state, not forced */
	Assert(LocalXLogInsertAllowed == -1);
	LocalXLogInsertAllowed = 1;

	/* Initialize as RecoveryInProgress() would do when switching state */
	InitXLOGAccess();
}
7964
/*
 * Subroutine to try to fetch and validate a prior checkpoint record.
 *
 * whichChkpt identifies the checkpoint (merely for reporting purposes).
 * 1 for "primary", 2 for "secondary", 0 for "other" (backup_label)
 *
 * If 'report' is false, the first two failure modes (bad link, unreadable
 * record) return NULL silently; the later sanity checks on the record
 * contents always log.  Returns NULL on any failure, or the validated
 * record (owned by the xlogreader) on success.
 */
static XLogRecord *
ReadCheckpointRecord(XLogReaderState *xlogreader, XLogRecPtr RecPtr,
					 int whichChkpt, bool report)
{
	XLogRecord *record;

	/* First, the pointer itself must be a plausible record position */
	if (!XRecOffIsValid(RecPtr))
	{
		if (!report)
			return NULL;

		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid primary checkpoint link in control file")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint link in control file")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid checkpoint link in backup_label file")));
				break;
		}
		return NULL;
	}

	record = ReadRecord(xlogreader, RecPtr, LOG, true);

	/* Could the record be read at all? */
	if (record == NULL)
	{
		if (!report)
			return NULL;

		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid checkpoint record")));
				break;
		}
		return NULL;
	}
	/* It must belong to the XLOG resource manager... */
	if (record->xl_rmid != RM_XLOG_ID)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid resource manager ID in primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid resource manager ID in secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid resource manager ID in checkpoint record")));
				break;
		}
		return NULL;
	}
	/* ... and actually be one of the two checkpoint record types ... */
	if (record->xl_info != XLOG_CHECKPOINT_SHUTDOWN &&
		record->xl_info != XLOG_CHECKPOINT_ONLINE)
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid xl_info in primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid xl_info in secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid xl_info in checkpoint record")));
				break;
		}
		return NULL;
	}
	/* ... with exactly the length a checkpoint payload must have. */
	if (record->xl_tot_len != SizeOfXLogRecord + SizeOfXLogRecordDataHeaderShort + sizeof(CheckPoint))
	{
		switch (whichChkpt)
		{
			case 1:
				ereport(LOG,
						(errmsg("invalid length of primary checkpoint record")));
				break;
			case 2:
				ereport(LOG,
						(errmsg("invalid length of secondary checkpoint record")));
				break;
			default:
				ereport(LOG,
						(errmsg("invalid length of checkpoint record")));
				break;
		}
		return NULL;
	}
	return record;
}
8084
8085 /*
8086 * This must be called in a backend process before creating WAL records
8087 * (except in a standalone backend, which does StartupXLOG instead). We need
8088 * to initialize the local copies of ThisTimeLineID and RedoRecPtr.
8089 *
8090 * Note: before Postgres 8.0, we went to some effort to keep the postmaster
8091 * process's copies of ThisTimeLineID and RedoRecPtr valid too. This was
8092 * unnecessary however, since the postmaster itself never touches XLOG anyway.
8093 */
8094 void
InitXLOGAccess(void)8095 InitXLOGAccess(void)
8096 {
8097 XLogCtlInsert *Insert = &XLogCtl->Insert;
8098
8099 /* ThisTimeLineID doesn't change so we need no lock to copy it */
8100 ThisTimeLineID = XLogCtl->ThisTimeLineID;
8101 Assert(ThisTimeLineID != 0 || IsBootstrapProcessingMode());
8102
8103 /* Use GetRedoRecPtr to copy the RedoRecPtr safely */
8104 (void) GetRedoRecPtr();
8105 /* Also update our copy of doPageWrites. */
8106 doPageWrites = (Insert->fullPageWrites || Insert->forcePageWrites);
8107
8108 /* Also initialize the working areas for constructing WAL records */
8109 InitXLogInsert();
8110 }
8111
8112 /*
8113 * Return the current Redo pointer from shared memory.
8114 *
8115 * As a side-effect, the local RedoRecPtr copy is updated.
8116 */
8117 XLogRecPtr
GetRedoRecPtr(void)8118 GetRedoRecPtr(void)
8119 {
8120 XLogRecPtr ptr;
8121
8122 /*
8123 * The possibly not up-to-date copy in XlogCtl is enough. Even if we
8124 * grabbed a WAL insertion lock to read the master copy, someone might
8125 * update it just after we've released the lock.
8126 */
8127 SpinLockAcquire(&XLogCtl->info_lck);
8128 ptr = XLogCtl->RedoRecPtr;
8129 SpinLockRelease(&XLogCtl->info_lck);
8130
8131 if (RedoRecPtr < ptr)
8132 RedoRecPtr = ptr;
8133
8134 return RedoRecPtr;
8135 }
8136
8137 /*
8138 * Return information needed to decide whether a modified block needs a
8139 * full-page image to be included in the WAL record.
8140 *
8141 * The returned values are cached copies from backend-private memory, and
8142 * possibly out-of-date. XLogInsertRecord will re-check them against
8143 * up-to-date values, while holding the WAL insert lock.
8144 */
8145 void
GetFullPageWriteInfo(XLogRecPtr * RedoRecPtr_p,bool * doPageWrites_p)8146 GetFullPageWriteInfo(XLogRecPtr *RedoRecPtr_p, bool *doPageWrites_p)
8147 {
8148 *RedoRecPtr_p = RedoRecPtr;
8149 *doPageWrites_p = doPageWrites;
8150 }
8151
8152 /*
8153 * GetInsertRecPtr -- Returns the current insert position.
8154 *
8155 * NOTE: The value *actually* returned is the position of the last full
8156 * xlog page. It lags behind the real insert position by at most 1 page.
8157 * For that, we don't need to scan through WAL insertion locks, and an
8158 * approximation is enough for the current usage of this function.
8159 */
8160 XLogRecPtr
GetInsertRecPtr(void)8161 GetInsertRecPtr(void)
8162 {
8163 XLogRecPtr recptr;
8164
8165 SpinLockAcquire(&XLogCtl->info_lck);
8166 recptr = XLogCtl->LogwrtRqst.Write;
8167 SpinLockRelease(&XLogCtl->info_lck);
8168
8169 return recptr;
8170 }
8171
8172 /*
8173 * GetFlushRecPtr -- Returns the current flush position, ie, the last WAL
8174 * position known to be fsync'd to disk.
8175 */
8176 XLogRecPtr
GetFlushRecPtr(void)8177 GetFlushRecPtr(void)
8178 {
8179 SpinLockAcquire(&XLogCtl->info_lck);
8180 LogwrtResult = XLogCtl->LogwrtResult;
8181 SpinLockRelease(&XLogCtl->info_lck);
8182
8183 return LogwrtResult.Flush;
8184 }
8185
8186 /*
8187 * Get the time of the last xlog segment switch
8188 */
8189 pg_time_t
GetLastSegSwitchTime(void)8190 GetLastSegSwitchTime(void)
8191 {
8192 pg_time_t result;
8193
8194 /* Need WALWriteLock, but shared lock is sufficient */
8195 LWLockAcquire(WALWriteLock, LW_SHARED);
8196 result = XLogCtl->lastSegSwitchTime;
8197 LWLockRelease(WALWriteLock);
8198
8199 return result;
8200 }
8201
8202 /*
8203 * GetNextXidAndEpoch - get the current nextXid value and associated epoch
8204 *
8205 * This is exported for use by code that would like to have 64-bit XIDs.
8206 * We don't really support such things, but all XIDs within the system
8207 * can be presumed "close to" the result, and thus the epoch associated
8208 * with them can be determined.
8209 */
8210 void
GetNextXidAndEpoch(TransactionId * xid,uint32 * epoch)8211 GetNextXidAndEpoch(TransactionId *xid, uint32 *epoch)
8212 {
8213 uint32 ckptXidEpoch;
8214 TransactionId ckptXid;
8215 TransactionId nextXid;
8216
8217 /* Must read checkpoint info first, else have race condition */
8218 SpinLockAcquire(&XLogCtl->info_lck);
8219 ckptXidEpoch = XLogCtl->ckptXidEpoch;
8220 ckptXid = XLogCtl->ckptXid;
8221 SpinLockRelease(&XLogCtl->info_lck);
8222
8223 /* Now fetch current nextXid */
8224 nextXid = ReadNewTransactionId();
8225
8226 /*
8227 * nextXid is certainly logically later than ckptXid. So if it's
8228 * numerically less, it must have wrapped into the next epoch.
8229 */
8230 if (nextXid < ckptXid)
8231 ckptXidEpoch++;
8232
8233 *xid = nextXid;
8234 *epoch = ckptXidEpoch;
8235 }
8236
/*
 * This must be called ONCE during postmaster or standalone-backend shutdown
 *
 * 'code' and 'arg' are unused here; presumably they exist to match the
 * shutdown-callback signature — verify against the registration site.
 */
void
ShutdownXLOG(int code, Datum arg)
{
	/* Don't be chatty in standalone mode */
	ereport(IsPostmasterEnvironment ? LOG : NOTICE,
			(errmsg("shutting down")));

	/*
	 * Signal walsenders to move to stopping state.
	 */
	WalSndInitStopping();

	/*
	 * Wait for WAL senders to be in stopping state. This prevents commands
	 * from writing new WAL.
	 */
	WalSndWaitStopping();

	/* In recovery we can only create a restartpoint, not a checkpoint */
	if (RecoveryInProgress())
		CreateRestartPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	else
	{
		/*
		 * If archiving is enabled, rotate the last XLOG file so that all the
		 * remaining records are archived (postmaster wakes up the archiver
		 * process one more time at the end of shutdown). The checkpoint
		 * record will go to the next XLOG file and won't be archived (yet).
		 */
		if (XLogArchivingActive() && XLogArchiveCommandSet())
			RequestXLogSwitch();

		CreateCheckPoint(CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_IMMEDIATE);
	}
	/* Flush the SLRU subsystems after the final checkpoint/restartpoint */
	ShutdownCLOG();
	ShutdownCommitTs();
	ShutdownSUBTRANS();
	ShutdownMultiXact();
}
8278
8279 /*
8280 * Log start of a checkpoint.
8281 */
8282 static void
LogCheckpointStart(int flags,bool restartpoint)8283 LogCheckpointStart(int flags, bool restartpoint)
8284 {
8285 elog(LOG, "%s starting:%s%s%s%s%s%s%s%s",
8286 restartpoint ? "restartpoint" : "checkpoint",
8287 (flags & CHECKPOINT_IS_SHUTDOWN) ? " shutdown" : "",
8288 (flags & CHECKPOINT_END_OF_RECOVERY) ? " end-of-recovery" : "",
8289 (flags & CHECKPOINT_IMMEDIATE) ? " immediate" : "",
8290 (flags & CHECKPOINT_FORCE) ? " force" : "",
8291 (flags & CHECKPOINT_WAIT) ? " wait" : "",
8292 (flags & CHECKPOINT_CAUSE_XLOG) ? " xlog" : "",
8293 (flags & CHECKPOINT_CAUSE_TIME) ? " time" : "",
8294 (flags & CHECKPOINT_FLUSH_ALL) ? " flush-all" : "");
8295 }
8296
8297 /*
8298 * Log end of a checkpoint.
8299 */
static void
LogCheckpointEnd(bool restartpoint)
{
	long		write_msecs,
				sync_msecs,
				total_msecs,
				longest_msecs,
				average_msecs;
	uint64		average_sync_time;

	/* Record end time first so elapsed-time figures below are complete */
	CheckpointStats.ckpt_end_t = GetCurrentTimestamp();

	/* Elapsed time of the buffer-write phase, in milliseconds */
	write_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_write_t,
												  CheckpointStats.ckpt_sync_t);

	/* Elapsed time of the file-sync phase, in milliseconds */
	sync_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_sync_t,
												 CheckpointStats.ckpt_sync_end_t);

	/* Accumulate checkpoint timing summary data, in milliseconds. */
	BgWriterStats.m_checkpoint_write_time += write_msecs;
	BgWriterStats.m_checkpoint_sync_time += sync_msecs;

	/*
	 * All of the published timing statistics are accounted for.  Only
	 * continue if a log message is to be written.
	 */
	if (!log_checkpoints)
		return;

	total_msecs = TimestampDifferenceMilliseconds(CheckpointStats.ckpt_start_t,
												  CheckpointStats.ckpt_end_t);

	/*
	 * Timing values returned from CheckpointStats are in microseconds.
	 * Convert to milliseconds for consistent printing.  (The +999 rounds
	 * up, so sub-millisecond syncs don't report as zero.)
	 */
	longest_msecs = (long) ((CheckpointStats.ckpt_longest_sync + 999) / 1000);

	/* Average sync time per file, in microseconds; 0 if no files synced */
	average_sync_time = 0;
	if (CheckpointStats.ckpt_sync_rels > 0)
		average_sync_time = CheckpointStats.ckpt_agg_sync_time /
			CheckpointStats.ckpt_sync_rels;
	average_msecs = (long) ((average_sync_time + 999) / 1000);

	elog(LOG, "%s complete: wrote %d buffers (%.1f%%); "
		 "%d transaction log file(s) added, %d removed, %d recycled; "
		 "write=%ld.%03d s, sync=%ld.%03d s, total=%ld.%03d s; "
		 "sync files=%d, longest=%ld.%03d s, average=%ld.%03d s; "
		 "distance=%d kB, estimate=%d kB",
		 restartpoint ? "restartpoint" : "checkpoint",
		 CheckpointStats.ckpt_bufs_written,
		 (double) CheckpointStats.ckpt_bufs_written * 100 / NBuffers,
		 CheckpointStats.ckpt_segs_added,
		 CheckpointStats.ckpt_segs_removed,
		 CheckpointStats.ckpt_segs_recycled,
		 write_msecs / 1000, (int) (write_msecs % 1000),
		 sync_msecs / 1000, (int) (sync_msecs % 1000),
		 total_msecs / 1000, (int) (total_msecs % 1000),
		 CheckpointStats.ckpt_sync_rels,
		 longest_msecs / 1000, (int) (longest_msecs % 1000),
		 average_msecs / 1000, (int) (average_msecs % 1000),
		 (int) (PrevCheckPointDistance / 1024.0),
		 (int) (CheckPointDistanceEstimate / 1024.0));
}
8364
8365 /*
8366 * Update the estimate of distance between checkpoints.
8367 *
8368 * The estimate is used to calculate the number of WAL segments to keep
8369 * preallocated, see XLOGFileSlop().
8370 */
8371 static void
UpdateCheckPointDistanceEstimate(uint64 nbytes)8372 UpdateCheckPointDistanceEstimate(uint64 nbytes)
8373 {
8374 /*
8375 * To estimate the number of segments consumed between checkpoints, keep a
8376 * moving average of the amount of WAL generated in previous checkpoint
8377 * cycles. However, if the load is bursty, with quiet periods and busy
8378 * periods, we want to cater for the peak load. So instead of a plain
8379 * moving average, let the average decline slowly if the previous cycle
8380 * used less WAL than estimated, but bump it up immediately if it used
8381 * more.
8382 *
8383 * When checkpoints are triggered by max_wal_size, this should converge to
8384 * CheckpointSegments * XLOG_SEG_SIZE,
8385 *
8386 * Note: This doesn't pay any attention to what caused the checkpoint.
8387 * Checkpoints triggered manually with CHECKPOINT command, or by e.g.
8388 * starting a base backup, are counted the same as those created
8389 * automatically. The slow-decline will largely mask them out, if they are
8390 * not frequent. If they are frequent, it seems reasonable to count them
8391 * in as any others; if you issue a manual checkpoint every 5 minutes and
8392 * never let a timed checkpoint happen, it makes sense to base the
8393 * preallocation on that 5 minute interval rather than whatever
8394 * checkpoint_timeout is set to.
8395 */
8396 PrevCheckPointDistance = nbytes;
8397 if (CheckPointDistanceEstimate < nbytes)
8398 CheckPointDistanceEstimate = nbytes;
8399 else
8400 CheckPointDistanceEstimate =
8401 (0.90 * CheckPointDistanceEstimate + 0.10 * (double) nbytes);
8402 }
8403
8404 /*
8405 * Perform a checkpoint --- either during shutdown, or on-the-fly
8406 *
8407 * flags is a bitwise OR of the following:
8408 * CHECKPOINT_IS_SHUTDOWN: checkpoint is for database shutdown.
8409 * CHECKPOINT_END_OF_RECOVERY: checkpoint is for end of WAL recovery.
8410 * CHECKPOINT_IMMEDIATE: finish the checkpoint ASAP,
8411 * ignoring checkpoint_completion_target parameter.
8412 * CHECKPOINT_FORCE: force a checkpoint even if no XLOG activity has occurred
8413 * since the last one (implied by CHECKPOINT_IS_SHUTDOWN or
8414 * CHECKPOINT_END_OF_RECOVERY).
8415 * CHECKPOINT_FLUSH_ALL: also flush buffers of unlogged tables.
8416 *
8417 * Note: flags contains other bits, of interest here only for logging purposes.
8418 * In particular note that this routine is synchronous and does not pay
8419 * attention to CHECKPOINT_WAIT.
8420 *
8421 * If !shutdown then we are writing an online checkpoint. This is a very special
8422 * kind of operation and WAL record because the checkpoint action occurs over
8423 * a period of time yet logically occurs at just a single LSN. The logical
8424 * position of the WAL record (redo ptr) is the same or earlier than the
8425 * physical position. When we replay WAL we locate the checkpoint via its
8426 * physical position then read the redo ptr and actually start replay at the
8427 * earlier logical position. Note that we don't write *anything* to WAL at
8428 * the logical position, so that location could be any other kind of WAL record.
8429 * All of this mechanism allows us to continue working while we checkpoint.
8430 * As a result, timing of actions is critical here and be careful to note that
8431 * this function will likely take minutes to execute on a busy system.
8432 */
void
CreateCheckPoint(int flags)
{
	bool		shutdown;
	CheckPoint	checkPoint;
	XLogRecPtr	recptr;
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	uint32		freespace;
	XLogRecPtr	PriorRedoPtr;
	XLogRecPtr	curInsert;
	XLogRecPtr	prevPtr;
	VirtualTransactionId *vxids;
	int			nvxids;

	/*
	 * An end-of-recovery checkpoint is really a shutdown checkpoint, just
	 * issued at a different time.
	 */
	if (flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY))
		shutdown = true;
	else
		shutdown = false;

	/* sanity check */
	if (RecoveryInProgress() && (flags & CHECKPOINT_END_OF_RECOVERY) == 0)
		elog(ERROR, "can't create a checkpoint during recovery");

	/*
	 * Initialize InitXLogInsert working areas before entering the critical
	 * section.  Normally, this is done by the first call to
	 * RecoveryInProgress() or LocalSetXLogInsertAllowed(), but when creating
	 * an end-of-recovery checkpoint, the LocalSetXLogInsertAllowed call is
	 * done below in a critical section, and InitXLogInsert cannot be called
	 * in a critical section.
	 */
	InitXLogInsert();

	/*
	 * Acquire CheckpointLock to ensure only one checkpoint happens at a time.
	 * (This is just pro forma, since in the present system structure there is
	 * only one process that is allowed to issue checkpoints at any given
	 * time.)
	 */
	LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);

	/*
	 * Prepare to accumulate statistics.
	 *
	 * Note: because it is possible for log_checkpoints to change while a
	 * checkpoint proceeds, we always accumulate stats, even if
	 * log_checkpoints is currently off.
	 */
	MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
	CheckpointStats.ckpt_start_t = GetCurrentTimestamp();

	/*
	 * Use a critical section to force system panic if we have trouble.
	 */
	START_CRIT_SECTION();

	/* For a shutdown checkpoint, advertise the state change in pg_control */
	if (shutdown)
	{
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->state = DB_SHUTDOWNING;
		ControlFile->time = (pg_time_t) time(NULL);
		UpdateControlFile();
		LWLockRelease(ControlFileLock);
	}

	/*
	 * Let smgr prepare for checkpoint; this has to happen before we determine
	 * the REDO pointer.  Note that smgr must not do anything that'd have to
	 * be undone if we decide no checkpoint is needed.
	 */
	smgrpreckpt();

	/* Begin filling in the checkpoint WAL record */
	MemSet(&checkPoint, 0, sizeof(checkPoint));
	checkPoint.time = (pg_time_t) time(NULL);

	/*
	 * For Hot Standby, derive the oldestActiveXid before we fix the redo
	 * pointer. This allows us to begin accumulating changes to assemble our
	 * starting snapshot of locks and transactions.
	 */
	if (!shutdown && XLogStandbyInfoActive())
		checkPoint.oldestActiveXid = GetOldestActiveTransactionId();
	else
		checkPoint.oldestActiveXid = InvalidTransactionId;

	/*
	 * We must block concurrent insertions while examining insert state to
	 * determine the checkpoint REDO pointer.
	 */
	WALInsertLockAcquireExclusive();
	curInsert = XLogBytePosToRecPtr(Insert->CurrBytePos);
	prevPtr = XLogBytePosToRecPtr(Insert->PrevBytePos);

	/*
	 * If this isn't a shutdown or forced checkpoint, and we have not inserted
	 * any XLOG records since the start of the last checkpoint, skip the
	 * checkpoint.  The idea here is to avoid inserting duplicate checkpoints
	 * when the system is idle. That wastes log space, and more importantly it
	 * exposes us to possible loss of both current and previous checkpoint
	 * records if the machine crashes just as we're writing the update.
	 * (Perhaps it'd make even more sense to checkpoint only when the previous
	 * checkpoint record is in a different xlog page?)
	 *
	 * If the previous checkpoint crossed a WAL segment, however, we create
	 * the checkpoint anyway, to have the latest checkpoint fully contained in
	 * the new segment. This is for a little bit of extra robustness: it's
	 * better if you don't need to keep two WAL segments around to recover the
	 * checkpoint.
	 */
	if ((flags & (CHECKPOINT_IS_SHUTDOWN | CHECKPOINT_END_OF_RECOVERY |
				  CHECKPOINT_FORCE)) == 0)
	{
		if (prevPtr == ControlFile->checkPointCopy.redo &&
			prevPtr / XLOG_SEG_SIZE == curInsert / XLOG_SEG_SIZE)
		{
			/* Nothing to do: release locks and exit the critical section */
			WALInsertLockRelease();
			LWLockRelease(CheckpointLock);
			END_CRIT_SECTION();
			return;
		}
	}

	/*
	 * An end-of-recovery checkpoint is created before anyone is allowed to
	 * write WAL. To allow us to write the checkpoint record, temporarily
	 * enable XLogInsertAllowed.  (This also ensures ThisTimeLineID is
	 * initialized, which we need here and in AdvanceXLInsertBuffer.)
	 */
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		LocalSetXLogInsertAllowed();

	checkPoint.ThisTimeLineID = ThisTimeLineID;
	if (flags & CHECKPOINT_END_OF_RECOVERY)
		checkPoint.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	else
		checkPoint.PrevTimeLineID = ThisTimeLineID;

	checkPoint.fullPageWrites = Insert->fullPageWrites;

	/*
	 * Compute new REDO record ptr = location of next XLOG record.
	 *
	 * NB: this is NOT necessarily where the checkpoint record itself will be,
	 * since other backends may insert more XLOG records while we're off doing
	 * the buffer flush work.  Those XLOG records are logically after the
	 * checkpoint, even though physically before it.  Got that?
	 */
	freespace = INSERT_FREESPACE(curInsert);
	if (freespace == 0)
	{
		/* Page is exactly full: REDO starts after the next page header */
		if (curInsert % XLogSegSize == 0)
			curInsert += SizeOfXLogLongPHD;
		else
			curInsert += SizeOfXLogShortPHD;
	}
	checkPoint.redo = curInsert;

	/*
	 * Here we update the shared RedoRecPtr for future XLogInsert calls; this
	 * must be done while holding all the insertion locks.
	 *
	 * Note: if we fail to complete the checkpoint, RedoRecPtr will be left
	 * pointing past where it really needs to point.  This is okay; the only
	 * consequence is that XLogInsert might back up whole buffers that it
	 * didn't really need to.  We can't postpone advancing RedoRecPtr because
	 * XLogInserts that happen while we are dumping buffers must assume that
	 * their buffer changes are not included in the checkpoint.
	 */
	RedoRecPtr = XLogCtl->Insert.RedoRecPtr = checkPoint.redo;

	/*
	 * Now we can release the WAL insertion locks, allowing other xacts to
	 * proceed while we are flushing disk buffers.
	 */
	WALInsertLockRelease();

	/* Update the info_lck-protected copy of RedoRecPtr as well */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->RedoRecPtr = checkPoint.redo;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * If enabled, log checkpoint start.  We postpone this until now so as not
	 * to log anything if we decided to skip the checkpoint.
	 */
	if (log_checkpoints)
		LogCheckpointStart(flags, false);

	TRACE_POSTGRESQL_CHECKPOINT_START(flags);

	/*
	 * Get the other info we need for the checkpoint record.
	 */
	LWLockAcquire(XidGenLock, LW_SHARED);
	checkPoint.nextXid = ShmemVariableCache->nextXid;
	checkPoint.oldestXid = ShmemVariableCache->oldestXid;
	checkPoint.oldestXidDB = ShmemVariableCache->oldestXidDB;
	LWLockRelease(XidGenLock);

	LWLockAcquire(CommitTsLock, LW_SHARED);
	checkPoint.oldestCommitTsXid = ShmemVariableCache->oldestCommitTsXid;
	checkPoint.newestCommitTsXid = ShmemVariableCache->newestCommitTsXid;
	LWLockRelease(CommitTsLock);

	/* Increase XID epoch if we've wrapped around since last checkpoint */
	checkPoint.nextXidEpoch = ControlFile->checkPointCopy.nextXidEpoch;
	if (checkPoint.nextXid < ControlFile->checkPointCopy.nextXid)
		checkPoint.nextXidEpoch++;

	LWLockAcquire(OidGenLock, LW_SHARED);
	checkPoint.nextOid = ShmemVariableCache->nextOid;
	/* include unassigned-but-reserved OIDs for non-shutdown checkpoints */
	if (!shutdown)
		checkPoint.nextOid += ShmemVariableCache->oidCount;
	LWLockRelease(OidGenLock);

	MultiXactGetCheckptMulti(shutdown,
							 &checkPoint.nextMulti,
							 &checkPoint.nextMultiOffset,
							 &checkPoint.oldestMulti,
							 &checkPoint.oldestMultiDB);

	/*
	 * Having constructed the checkpoint record, ensure all shmem disk buffers
	 * and commit-log buffers are flushed to disk.
	 *
	 * This I/O could fail for various reasons.  If so, we will fail to
	 * complete the checkpoint, but there is no reason to force a system
	 * panic. Accordingly, exit critical section while doing it.
	 */
	END_CRIT_SECTION();

	/*
	 * In some cases there are groups of actions that must all occur on one
	 * side or the other of a checkpoint record. Before flushing the
	 * checkpoint record we must explicitly wait for any backend currently
	 * performing those groups of actions.
	 *
	 * One example is end of transaction, so we must wait for any transactions
	 * that are currently in commit critical sections.  If an xact inserted
	 * its commit record into XLOG just before the REDO point, then a crash
	 * restart from the REDO point would not replay that record, which means
	 * that our flushing had better include the xact's update of pg_clog.  So
	 * we wait till he's out of his commit critical section before proceeding.
	 * See notes in RecordTransactionCommit().
	 *
	 * Because we've already released the insertion locks, this test is a bit
	 * fuzzy: it is possible that we will wait for xacts we didn't really need
	 * to wait for.  But the delay should be short and it seems better to make
	 * checkpoint take a bit longer than to hold off insertions longer than
	 * necessary. (In fact, the whole reason we have this issue is that xact.c
	 * does commit record XLOG insertion and clog update as two separate steps
	 * protected by different locks, but again that seems best on grounds of
	 * minimizing lock contention.)
	 *
	 * A transaction that has not yet set delayChkpt when we look cannot be at
	 * risk, since he's not inserted his commit record yet; and one that's
	 * already cleared it is not at risk either, since he's done fixing clog
	 * and we will correctly flush the update below.  So we cannot miss any
	 * xacts we need to wait for.
	 */
	vxids = GetVirtualXIDsDelayingChkpt(&nvxids);
	if (nvxids > 0)
	{
		do
		{
			pg_usleep(10000L);	/* wait for 10 msec */
		} while (HaveVirtualXIDsDelayingChkpt(vxids, nvxids));
	}
	pfree(vxids);

	/* Flush all dirty shared buffers, SLRUs, slots, etc. to disk */
	CheckPointGuts(checkPoint.redo, flags);

	/*
	 * Take a snapshot of running transactions and write this to WAL. This
	 * allows us to reconstruct the state of running transactions during
	 * archive recovery, if required. Skip, if this info disabled.
	 *
	 * If we are shutting down, or Startup process is completing crash
	 * recovery we don't need to write running xact data.
	 */
	if (!shutdown && XLogStandbyInfoActive())
		LogStandbySnapshot();

	START_CRIT_SECTION();

	/*
	 * Now insert the checkpoint record into XLOG.
	 */
	XLogBeginInsert();
	XLogRegisterData((char *) (&checkPoint), sizeof(checkPoint));
	recptr = XLogInsert(RM_XLOG_ID,
						shutdown ? XLOG_CHECKPOINT_SHUTDOWN :
						XLOG_CHECKPOINT_ONLINE);

	XLogFlush(recptr);

	/*
	 * We mustn't write any new WAL after a shutdown checkpoint, or it will be
	 * overwritten at next startup.  No-one should even try, this just allows
	 * sanity-checking.  In the case of an end-of-recovery checkpoint, we want
	 * to just temporarily disable writing until the system has exited
	 * recovery.
	 */
	if (shutdown)
	{
		if (flags & CHECKPOINT_END_OF_RECOVERY)
			LocalXLogInsertAllowed = -1;	/* return to "check" state */
		else
			LocalXLogInsertAllowed = 0; /* never again write WAL */
	}

	/*
	 * We now have ProcLastRecPtr = start of actual checkpoint record, recptr
	 * = end of actual checkpoint record.
	 */
	if (shutdown && checkPoint.redo != ProcLastRecPtr)
		ereport(PANIC,
				(errmsg("concurrent transaction log activity while database system is shutting down")));

	/*
	 * Remember the prior checkpoint's redo pointer, used later to determine
	 * the point where the log can be truncated.
	 */
	PriorRedoPtr = ControlFile->checkPointCopy.redo;

	/*
	 * Update the control file.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	if (shutdown)
		ControlFile->state = DB_SHUTDOWNED;
	ControlFile->prevCheckPoint = ControlFile->checkPoint;
	ControlFile->checkPoint = ProcLastRecPtr;
	ControlFile->checkPointCopy = checkPoint;
	ControlFile->time = (pg_time_t) time(NULL);
	/* crash recovery should always recover to the end of WAL */
	ControlFile->minRecoveryPoint = InvalidXLogRecPtr;
	ControlFile->minRecoveryPointTLI = 0;

	/*
	 * Persist unloggedLSN value. It's reset on crash recovery, so this goes
	 * unused on non-shutdown checkpoints, but seems useful to store it always
	 * for debugging purposes.
	 */
	SpinLockAcquire(&XLogCtl->ulsn_lck);
	ControlFile->unloggedLSN = XLogCtl->unloggedLSN;
	SpinLockRelease(&XLogCtl->ulsn_lck);

	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	/* Update shared-memory copy of checkpoint XID/epoch */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
	XLogCtl->ckptXid = checkPoint.nextXid;
	SpinLockRelease(&XLogCtl->info_lck);

	/*
	 * We are now done with critical updates; no need for system panic if we
	 * have trouble while fooling with old log segments.
	 */
	END_CRIT_SECTION();

	/*
	 * Let smgr do post-checkpoint cleanup (eg, deleting old files).
	 */
	smgrpostckpt();

	/*
	 * Delete old log files (those no longer needed even for previous
	 * checkpoint or the standbys in XLOG streaming).
	 */
	if (PriorRedoPtr != InvalidXLogRecPtr)
	{
		XLogSegNo	_logSegNo;

		/* Update the average distance between checkpoints. */
		UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);

		XLByteToSeg(PriorRedoPtr, _logSegNo);
		KeepLogSeg(recptr, &_logSegNo);
		_logSegNo--;
		RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, recptr);
	}

	/*
	 * Make more log segments if needed.  (Do this after recycling old log
	 * segments, since that may supply some of the needed files.)
	 */
	if (!shutdown)
		PreallocXlogFiles(recptr);

	/*
	 * Truncate pg_subtrans if possible.  We can throw away all data before
	 * the oldest XMIN of any running transaction.  No future transaction will
	 * attempt to reference any pg_subtrans entry older than that (see Asserts
	 * in subtrans.c).  During recovery, though, we mustn't do this because
	 * StartupSUBTRANS hasn't been called yet.
	 */
	if (!RecoveryInProgress())
		TruncateSUBTRANS(GetOldestXmin(NULL, false));

	/* Real work is done, but log and update stats before releasing lock. */
	LogCheckpointEnd(false);

	TRACE_POSTGRESQL_CHECKPOINT_DONE(CheckpointStats.ckpt_bufs_written,
									 NBuffers,
									 CheckpointStats.ckpt_segs_added,
									 CheckpointStats.ckpt_segs_removed,
									 CheckpointStats.ckpt_segs_recycled);

	LWLockRelease(CheckpointLock);
}
8851
8852 /*
8853 * Mark the end of recovery in WAL though without running a full checkpoint.
8854 * We can expect that a restartpoint is likely to be in progress as we
8855 * do this, though we are unwilling to wait for it to complete. So be
8856 * careful to avoid taking the CheckpointLock anywhere here.
8857 *
8858 * CreateRestartPoint() allows for the case where recovery may end before
8859 * the restartpoint completes so there is no concern of concurrent behaviour.
8860 */
static void
CreateEndOfRecoveryRecord(void)
{
	xl_end_of_recovery xlrec;
	XLogRecPtr	recptr;

	/* sanity check */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used to end recovery");

	xlrec.end_time = GetCurrentTimestamp();

	/* Read the timeline IDs under the insertion locks for a consistent pair */
	WALInsertLockAcquireExclusive();
	xlrec.ThisTimeLineID = ThisTimeLineID;
	xlrec.PrevTimeLineID = XLogCtl->PrevTimeLineID;
	WALInsertLockRelease();

	/* Temporarily permit this process to write WAL while still in recovery */
	LocalSetXLogInsertAllowed();

	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_end_of_recovery));
	recptr = XLogInsert(RM_XLOG_ID, XLOG_END_OF_RECOVERY);

	/* Make sure the record is durable before advancing minRecoveryPoint */
	XLogFlush(recptr);

	/*
	 * Update the control file so that crash recovery can follow the timeline
	 * changes to this point.
	 */
	LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
	ControlFile->time = (pg_time_t) time(NULL);
	ControlFile->minRecoveryPoint = recptr;
	ControlFile->minRecoveryPointTLI = ThisTimeLineID;
	UpdateControlFile();
	LWLockRelease(ControlFileLock);

	END_CRIT_SECTION();

	LocalXLogInsertAllowed = -1;	/* return to "check" state */
}
8903
8904 /*
8905 * Write an OVERWRITE_CONTRECORD message.
8906 *
8907 * When on WAL replay we expect a continuation record at the start of a page
8908 * that is not there, recovery ends and WAL writing resumes at that point.
8909 * But it's wrong to resume writing new WAL back at the start of the record
8910 * that was broken, because downstream consumers of that WAL (physical
8911 * replicas) are not prepared to "rewind". So the first action after
8912 * finishing replay of all valid WAL must be to write a record of this type
8913 * at the point where the contrecord was missing; to support xlogreader
8914 * detecting the special case, XLP_FIRST_IS_OVERWRITE_CONTRECORD is also added
8915 * to the page header where the record occurs. xlogreader has an ad-hoc
8916 * mechanism to report metadata about the broken record, which is what we
8917 * use here.
8918 *
8919 * At replay time, XLP_FIRST_IS_OVERWRITE_CONTRECORD instructs xlogreader to
8920 * skip the record it was reading, and pass back the LSN of the skipped
8921 * record, so that its caller can verify (on "replay" of that record) that the
8922 * XLOG_OVERWRITE_CONTRECORD matches what was effectively overwritten.
8923 */
static XLogRecPtr
CreateOverwriteContrecordRecord(XLogRecPtr aborted_lsn)
{
	xl_overwrite_contrecord xlrec;
	XLogRecPtr	recptr;

	/* sanity check */
	if (!RecoveryInProgress())
		elog(ERROR, "can only be used at end of recovery");

	/* Record which LSN is being overwritten, and when, for later verification */
	xlrec.overwritten_lsn = aborted_lsn;
	xlrec.overwrite_time = GetCurrentTimestamp();

	START_CRIT_SECTION();

	XLogBeginInsert();
	XLogRegisterData((char *) &xlrec, sizeof(xl_overwrite_contrecord));

	recptr = XLogInsert(RM_XLOG_ID, XLOG_OVERWRITE_CONTRECORD);

	/* Flush so downstream consumers see the overwrite record promptly */
	XLogFlush(recptr);

	END_CRIT_SECTION();

	/* Return end of the record; caller resumes WAL writing from here */
	return recptr;
}
8950
8951 /*
8952 * Flush all data in shared memory to disk, and fsync
8953 *
8954 * This is the common code shared between regular checkpoints and
8955 * recovery restartpoints.
8956 */
static void
CheckPointGuts(XLogRecPtr checkPointRedo, int flags)
{
	/* Flush the SLRU-based subsystems first */
	CheckPointCLOG();
	CheckPointCommitTs();
	CheckPointSUBTRANS();
	CheckPointMultiXact();
	CheckPointPredicate();
	/* Then the relation map, replication state, and logical-decoding data */
	CheckPointRelationMap();
	CheckPointReplicationSlots();
	CheckPointSnapBuild();
	CheckPointLogicalRewriteHeap();
	CheckPointBuffers(flags);	/* performs all required fsyncs */
	CheckPointReplicationOrigin();
	/* We deliberately delay 2PC checkpointing as long as possible */
	CheckPointTwoPhase(checkPointRedo);
}
8974
8975 /*
8976 * Save a checkpoint for recovery restart if appropriate
8977 *
8978 * This function is called each time a checkpoint record is read from XLOG.
8979 * It must determine whether the checkpoint represents a safe restartpoint or
8980 * not. If so, the checkpoint record is stashed in shared memory so that
8981 * CreateRestartPoint can consult it. (Note that the latter function is
8982 * executed by the checkpointer, while this one will be executed by the
8983 * startup process.)
8984 */
static void
RecoveryRestartPoint(const CheckPoint *checkPoint)
{
	/*
	 * Also refrain from creating a restartpoint if we have seen any
	 * references to non-existent pages. Restarting recovery from the
	 * restartpoint would not see the references, so we would lose the
	 * cross-check that the pages belonged to a relation that was dropped
	 * later.
	 */
	if (XLogHaveInvalidPages())
	{
		elog(trace_recovery(DEBUG2),
			 "could not record restart point at %X/%X because there "
			 "are unresolved references to invalid pages",
			 (uint32) (checkPoint->redo >> 32),
			 (uint32) checkPoint->redo);
		return;
	}

	/*
	 * Copy the checkpoint record to shared memory, so that checkpointer can
	 * work out the next time it wants to perform a restartpoint.
	 * (ReadRecPtr/EndRecPtr are the start/end of the record just replayed
	 * by the startup process.)
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->lastCheckPointRecPtr = ReadRecPtr;
	XLogCtl->lastCheckPointEndPtr = EndRecPtr;
	XLogCtl->lastCheckPoint = *checkPoint;
	SpinLockRelease(&XLogCtl->info_lck);
}
9015
9016 /*
9017 * Establish a restartpoint if possible.
9018 *
9019 * This is similar to CreateCheckPoint, but is used during WAL recovery
9020 * to establish a point from which recovery can roll forward without
9021 * replaying the entire recovery log.
9022 *
9023 * Returns true if a new restartpoint was established. We can only establish
9024 * a restartpoint if we have replayed a safe checkpoint record since last
9025 * restartpoint.
9026 */
9027 bool
CreateRestartPoint(int flags)9028 CreateRestartPoint(int flags)
9029 {
9030 XLogRecPtr lastCheckPointRecPtr;
9031 XLogRecPtr lastCheckPointEndPtr;
9032 CheckPoint lastCheckPoint;
9033 XLogRecPtr PriorRedoPtr;
9034 TimestampTz xtime;
9035
9036 /*
9037 * Acquire CheckpointLock to ensure only one restartpoint or checkpoint
9038 * happens at a time.
9039 */
9040 LWLockAcquire(CheckpointLock, LW_EXCLUSIVE);
9041
9042 /* Get a local copy of the last safe checkpoint record. */
9043 SpinLockAcquire(&XLogCtl->info_lck);
9044 lastCheckPointRecPtr = XLogCtl->lastCheckPointRecPtr;
9045 lastCheckPointEndPtr = XLogCtl->lastCheckPointEndPtr;
9046 lastCheckPoint = XLogCtl->lastCheckPoint;
9047 SpinLockRelease(&XLogCtl->info_lck);
9048
9049 /*
9050 * Check that we're still in recovery mode. It's ok if we exit recovery
9051 * mode after this check, the restart point is valid anyway.
9052 */
9053 if (!RecoveryInProgress())
9054 {
9055 ereport(DEBUG2,
9056 (errmsg("skipping restartpoint, recovery has already ended")));
9057 LWLockRelease(CheckpointLock);
9058 return false;
9059 }
9060
9061 /*
9062 * If the last checkpoint record we've replayed is already our last
9063 * restartpoint, we can't perform a new restart point. We still update
9064 * minRecoveryPoint in that case, so that if this is a shutdown restart
9065 * point, we won't start up earlier than before. That's not strictly
9066 * necessary, but when hot standby is enabled, it would be rather weird if
9067 * the database opened up for read-only connections at a point-in-time
9068 * before the last shutdown. Such time travel is still possible in case of
9069 * immediate shutdown, though.
9070 *
9071 * We don't explicitly advance minRecoveryPoint when we do create a
9072 * restartpoint. It's assumed that flushing the buffers will do that as a
9073 * side-effect.
9074 */
9075 if (XLogRecPtrIsInvalid(lastCheckPointRecPtr) ||
9076 lastCheckPoint.redo <= ControlFile->checkPointCopy.redo)
9077 {
9078 ereport(DEBUG2,
9079 (errmsg("skipping restartpoint, already performed at %X/%X",
9080 (uint32) (lastCheckPoint.redo >> 32),
9081 (uint32) lastCheckPoint.redo)));
9082
9083 UpdateMinRecoveryPoint(InvalidXLogRecPtr, true);
9084 if (flags & CHECKPOINT_IS_SHUTDOWN)
9085 {
9086 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9087 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9088 ControlFile->time = (pg_time_t) time(NULL);
9089 UpdateControlFile();
9090 LWLockRelease(ControlFileLock);
9091 }
9092 LWLockRelease(CheckpointLock);
9093 return false;
9094 }
9095
9096 /*
9097 * Update the shared RedoRecPtr so that the startup process can calculate
9098 * the number of segments replayed since last restartpoint, and request a
9099 * restartpoint if it exceeds CheckPointSegments.
9100 *
9101 * Like in CreateCheckPoint(), hold off insertions to update it, although
9102 * during recovery this is just pro forma, because no WAL insertions are
9103 * happening.
9104 */
9105 WALInsertLockAcquireExclusive();
9106 RedoRecPtr = XLogCtl->Insert.RedoRecPtr = lastCheckPoint.redo;
9107 WALInsertLockRelease();
9108
9109 /* Also update the info_lck-protected copy */
9110 SpinLockAcquire(&XLogCtl->info_lck);
9111 XLogCtl->RedoRecPtr = lastCheckPoint.redo;
9112 SpinLockRelease(&XLogCtl->info_lck);
9113
9114 /*
9115 * Prepare to accumulate statistics.
9116 *
9117 * Note: because it is possible for log_checkpoints to change while a
9118 * checkpoint proceeds, we always accumulate stats, even if
9119 * log_checkpoints is currently off.
9120 */
9121 MemSet(&CheckpointStats, 0, sizeof(CheckpointStats));
9122 CheckpointStats.ckpt_start_t = GetCurrentTimestamp();
9123
9124 if (log_checkpoints)
9125 LogCheckpointStart(flags, true);
9126
9127 CheckPointGuts(lastCheckPoint.redo, flags);
9128
9129 /*
9130 * Remember the prior checkpoint's redo pointer, used later to determine
9131 * the point at which we can truncate the log.
9132 */
9133 PriorRedoPtr = ControlFile->checkPointCopy.redo;
9134
9135 /*
9136 * Update pg_control, using current time. Check that it still shows
9137 * IN_ARCHIVE_RECOVERY state and an older checkpoint, else do nothing;
9138 * this is a quick hack to make sure nothing really bad happens if somehow
9139 * we get here after the end-of-recovery checkpoint.
9140 */
9141 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9142 if (ControlFile->state == DB_IN_ARCHIVE_RECOVERY &&
9143 ControlFile->checkPointCopy.redo < lastCheckPoint.redo)
9144 {
9145 ControlFile->prevCheckPoint = ControlFile->checkPoint;
9146 ControlFile->checkPoint = lastCheckPointRecPtr;
9147 ControlFile->checkPointCopy = lastCheckPoint;
9148 ControlFile->time = (pg_time_t) time(NULL);
9149
9150 /*
9151 * Ensure minRecoveryPoint is past the checkpoint record. Normally,
9152 * this will have happened already while writing out dirty buffers,
9153 * but not necessarily - e.g. because no buffers were dirtied. We do
9154 * this because a non-exclusive base backup uses minRecoveryPoint to
9155 * determine which WAL files must be included in the backup, and the
9156 * file (or files) containing the checkpoint record must be included,
9157 * at a minimum. Note that for an ordinary restart of recovery there's
9158 * no value in having the minimum recovery point any earlier than this
9159 * anyway, because redo will begin just after the checkpoint record.
9160 */
9161 if (ControlFile->minRecoveryPoint < lastCheckPointEndPtr)
9162 {
9163 ControlFile->minRecoveryPoint = lastCheckPointEndPtr;
9164 ControlFile->minRecoveryPointTLI = lastCheckPoint.ThisTimeLineID;
9165
9166 /* update local copy */
9167 minRecoveryPoint = ControlFile->minRecoveryPoint;
9168 minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
9169 }
9170 if (flags & CHECKPOINT_IS_SHUTDOWN)
9171 ControlFile->state = DB_SHUTDOWNED_IN_RECOVERY;
9172 UpdateControlFile();
9173 }
9174 LWLockRelease(ControlFileLock);
9175
9176 /*
9177 * Delete old log files (those no longer needed even for previous
9178 * checkpoint/restartpoint) to prevent the disk holding the xlog from
9179 * growing full.
9180 */
9181 if (PriorRedoPtr != InvalidXLogRecPtr)
9182 {
9183 XLogRecPtr receivePtr;
9184 XLogRecPtr replayPtr;
9185 TimeLineID replayTLI;
9186 XLogRecPtr endptr;
9187 XLogSegNo _logSegNo;
9188
9189 /* Update the average distance between checkpoints/restartpoints. */
9190 UpdateCheckPointDistanceEstimate(RedoRecPtr - PriorRedoPtr);
9191
9192 XLByteToSeg(PriorRedoPtr, _logSegNo);
9193
9194 /*
9195 * Get the current end of xlog replayed or received, whichever is
9196 * later.
9197 */
9198 receivePtr = GetWalRcvWriteRecPtr(NULL, NULL);
9199 replayPtr = GetXLogReplayRecPtr(&replayTLI);
9200 endptr = (receivePtr < replayPtr) ? replayPtr : receivePtr;
9201
9202 KeepLogSeg(endptr, &_logSegNo);
9203 _logSegNo--;
9204
9205 /*
9206 * Try to recycle segments on a useful timeline. If we've been
9207 * promoted since the beginning of this restartpoint, use the new
9208 * timeline chosen at end of recovery (RecoveryInProgress() sets
9209 * ThisTimeLineID in that case). If we're still in recovery, use the
9210 * timeline we're currently replaying.
9211 *
9212 * There is no guarantee that the WAL segments will be useful on the
9213 * current timeline; if recovery proceeds to a new timeline right
9214 * after this, the pre-allocated WAL segments on this timeline will
9215 * not be used, and will go wasted until recycled on the next
9216 * restartpoint. We'll live with that.
9217 */
9218 if (RecoveryInProgress())
9219 ThisTimeLineID = replayTLI;
9220
9221 RemoveOldXlogFiles(_logSegNo, PriorRedoPtr, endptr);
9222
9223 /*
9224 * Make more log segments if needed. (Do this after recycling old log
9225 * segments, since that may supply some of the needed files.)
9226 */
9227 PreallocXlogFiles(endptr);
9228
9229 /*
9230 * ThisTimeLineID is normally not set when we're still in recovery.
9231 * However, recycling/preallocating segments above needed
9232 * ThisTimeLineID to determine which timeline to install the segments
9233 * on. Reset it now, to restore the normal state of affairs for
9234 * debugging purposes.
9235 */
9236 if (RecoveryInProgress())
9237 ThisTimeLineID = 0;
9238 }
9239
9240 /*
9241 * Truncate pg_subtrans if possible. We can throw away all data before
9242 * the oldest XMIN of any running transaction. No future transaction will
9243 * attempt to reference any pg_subtrans entry older than that (see Asserts
9244 * in subtrans.c). When hot standby is disabled, though, we mustn't do
9245 * this because StartupSUBTRANS hasn't been called yet.
9246 */
9247 if (EnableHotStandby)
9248 TruncateSUBTRANS(GetOldestXmin(NULL, false));
9249
9250 /* Real work is done, but log and update before releasing lock. */
9251 LogCheckpointEnd(true);
9252
9253 xtime = GetLatestXTime();
9254 ereport((log_checkpoints ? LOG : DEBUG2),
9255 (errmsg("recovery restart point at %X/%X",
9256 (uint32) (lastCheckPoint.redo >> 32), (uint32) lastCheckPoint.redo),
9257 xtime ? errdetail("last completed transaction was at log time %s",
9258 timestamptz_to_str(xtime)) : 0));
9259
9260 LWLockRelease(CheckpointLock);
9261
9262 /*
9263 * Finally, execute archive_cleanup_command, if any.
9264 */
9265 if (XLogCtl->archiveCleanupCommand[0])
9266 ExecuteRecoveryCommand(XLogCtl->archiveCleanupCommand,
9267 "archive_cleanup_command",
9268 false);
9269
9270 return true;
9271 }
9272
9273 /*
9274 * Retreat *logSegNo to the last segment that we need to retain because of
9275 * either wal_keep_segments or replication slots.
9276 *
9277 * This is calculated by subtracting wal_keep_segments from the given xlog
9278 * location, recptr and by making sure that that result is below the
9279 * requirement of replication slots.
9280 */
9281 static void
KeepLogSeg(XLogRecPtr recptr,XLogSegNo * logSegNo)9282 KeepLogSeg(XLogRecPtr recptr, XLogSegNo *logSegNo)
9283 {
9284 XLogSegNo segno;
9285 XLogRecPtr keep;
9286
9287 XLByteToSeg(recptr, segno);
9288 keep = XLogGetReplicationSlotMinimumLSN();
9289
9290 /* compute limit for wal_keep_segments first */
9291 if (wal_keep_segments > 0)
9292 {
9293 /* avoid underflow, don't go below 1 */
9294 if (segno <= wal_keep_segments)
9295 segno = 1;
9296 else
9297 segno = segno - wal_keep_segments;
9298 }
9299
9300 /* then check whether slots limit removal further */
9301 if (max_replication_slots > 0 && keep != InvalidXLogRecPtr)
9302 {
9303 XLogSegNo slotSegNo;
9304
9305 XLByteToSeg(keep, slotSegNo);
9306
9307 if (slotSegNo <= 0)
9308 segno = 1;
9309 else if (slotSegNo < segno)
9310 segno = slotSegNo;
9311 }
9312
9313 /* don't delete WAL segments newer than the calculated segment */
9314 if (segno < *logSegNo)
9315 *logSegNo = segno;
9316 }
9317
9318 /*
9319 * Write a NEXTOID log record
9320 */
9321 void
XLogPutNextOid(Oid nextOid)9322 XLogPutNextOid(Oid nextOid)
9323 {
9324 XLogBeginInsert();
9325 XLogRegisterData((char *) (&nextOid), sizeof(Oid));
9326 (void) XLogInsert(RM_XLOG_ID, XLOG_NEXTOID);
9327
9328 /*
9329 * We need not flush the NEXTOID record immediately, because any of the
9330 * just-allocated OIDs could only reach disk as part of a tuple insert or
9331 * update that would have its own XLOG record that must follow the NEXTOID
9332 * record. Therefore, the standard buffer LSN interlock applied to those
9333 * records will ensure no such OID reaches disk before the NEXTOID record
9334 * does.
9335 *
9336 * Note, however, that the above statement only covers state "within" the
9337 * database. When we use a generated OID as a file or directory name, we
9338 * are in a sense violating the basic WAL rule, because that filesystem
9339 * change may reach disk before the NEXTOID WAL record does. The impact
9340 * of this is that if a database crash occurs immediately afterward, we
9341 * might after restart re-generate the same OID and find that it conflicts
9342 * with the leftover file or directory. But since for safety's sake we
9343 * always loop until finding a nonconflicting filename, this poses no real
9344 * problem in practice. See pgsql-hackers discussion 27-Sep-2006.
9345 */
9346 }
9347
9348 /*
9349 * Write an XLOG SWITCH record.
9350 *
9351 * Here we just blindly issue an XLogInsert request for the record.
9352 * All the magic happens inside XLogInsert.
9353 *
9354 * The return value is either the end+1 address of the switch record,
9355 * or the end+1 address of the prior segment if we did not need to
9356 * write a switch record because we are already at segment start.
9357 */
9358 XLogRecPtr
RequestXLogSwitch(void)9359 RequestXLogSwitch(void)
9360 {
9361 XLogRecPtr RecPtr;
9362
9363 /* XLOG SWITCH has no data */
9364 XLogBeginInsert();
9365 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_SWITCH);
9366
9367 return RecPtr;
9368 }
9369
9370 /*
9371 * Write a RESTORE POINT record
9372 */
9373 XLogRecPtr
XLogRestorePoint(const char * rpName)9374 XLogRestorePoint(const char *rpName)
9375 {
9376 XLogRecPtr RecPtr;
9377 xl_restore_point xlrec;
9378
9379 xlrec.rp_time = GetCurrentTimestamp();
9380 strlcpy(xlrec.rp_name, rpName, MAXFNAMELEN);
9381
9382 XLogBeginInsert();
9383 XLogRegisterData((char *) &xlrec, sizeof(xl_restore_point));
9384
9385 RecPtr = XLogInsert(RM_XLOG_ID, XLOG_RESTORE_POINT);
9386
9387 ereport(LOG,
9388 (errmsg("restore point \"%s\" created at %X/%X",
9389 rpName, (uint32) (RecPtr >> 32), (uint32) RecPtr)));
9390
9391 return RecPtr;
9392 }
9393
9394 /*
9395 * Check if any of the GUC parameters that are critical for hot standby
9396 * have changed, and update the value in pg_control file if necessary.
9397 */
9398 static void
XLogReportParameters(void)9399 XLogReportParameters(void)
9400 {
9401 if (wal_level != ControlFile->wal_level ||
9402 wal_log_hints != ControlFile->wal_log_hints ||
9403 MaxConnections != ControlFile->MaxConnections ||
9404 max_worker_processes != ControlFile->max_worker_processes ||
9405 max_prepared_xacts != ControlFile->max_prepared_xacts ||
9406 max_locks_per_xact != ControlFile->max_locks_per_xact ||
9407 track_commit_timestamp != ControlFile->track_commit_timestamp)
9408 {
9409 /*
9410 * The change in number of backend slots doesn't need to be WAL-logged
9411 * if archiving is not enabled, as you can't start archive recovery
9412 * with wal_level=minimal anyway. We don't really care about the
9413 * values in pg_control either if wal_level=minimal, but seems better
9414 * to keep them up-to-date to avoid confusion.
9415 */
9416 if (wal_level != ControlFile->wal_level || XLogIsNeeded())
9417 {
9418 xl_parameter_change xlrec;
9419 XLogRecPtr recptr;
9420
9421 xlrec.MaxConnections = MaxConnections;
9422 xlrec.max_worker_processes = max_worker_processes;
9423 xlrec.max_prepared_xacts = max_prepared_xacts;
9424 xlrec.max_locks_per_xact = max_locks_per_xact;
9425 xlrec.wal_level = wal_level;
9426 xlrec.wal_log_hints = wal_log_hints;
9427 xlrec.track_commit_timestamp = track_commit_timestamp;
9428
9429 XLogBeginInsert();
9430 XLogRegisterData((char *) &xlrec, sizeof(xlrec));
9431
9432 recptr = XLogInsert(RM_XLOG_ID, XLOG_PARAMETER_CHANGE);
9433 XLogFlush(recptr);
9434 }
9435
9436 LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
9437
9438 ControlFile->MaxConnections = MaxConnections;
9439 ControlFile->max_worker_processes = max_worker_processes;
9440 ControlFile->max_prepared_xacts = max_prepared_xacts;
9441 ControlFile->max_locks_per_xact = max_locks_per_xact;
9442 ControlFile->wal_level = wal_level;
9443 ControlFile->wal_log_hints = wal_log_hints;
9444 ControlFile->track_commit_timestamp = track_commit_timestamp;
9445 UpdateControlFile();
9446
9447 LWLockRelease(ControlFileLock);
9448 }
9449 }
9450
/*
 * Update full_page_writes in shared memory, and write an
 * XLOG_FPW_CHANGE record if necessary.
 *
 * Note: this function assumes there is no other process running
 * concurrently that could update it.
 */
void
UpdateFullPageWrites(void)
{
	XLogCtlInsert *Insert = &XLogCtl->Insert;
	bool		recoveryInProgress;

	/*
	 * Do nothing if full_page_writes has not been changed.
	 *
	 * It's safe to check the shared full_page_writes without the lock,
	 * because we assume that there is no concurrently running process which
	 * can update it.
	 */
	if (fullPageWrites == Insert->fullPageWrites)
		return;

	/*
	 * Perform this outside critical section so that the WAL insert
	 * initialization done by RecoveryInProgress() doesn't trigger an
	 * assertion failure.
	 */
	recoveryInProgress = RecoveryInProgress();

	START_CRIT_SECTION();

	/*
	 * It's always safe to take full page images, even when not strictly
	 * required, but not the other round. So if we're setting full_page_writes
	 * to true, first set it true and then write the WAL record. If we're
	 * setting it to false, first write the WAL record and then set the global
	 * flag.
	 *
	 * Note the ordering here is load-bearing: together with the symmetric
	 * block at the bottom, it guarantees the shared flag is never false
	 * while the WAL stream still claims full_page_writes is on.
	 */
	if (fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = true;
		WALInsertLockRelease();
	}

	/*
	 * Write an XLOG_FPW_CHANGE record. This allows us to keep track of
	 * full_page_writes during archive recovery, if required.  Skipped while
	 * in recovery, since we cannot insert WAL records then.
	 */
	if (XLogStandbyInfoActive() && !recoveryInProgress)
	{
		XLogBeginInsert();
		XLogRegisterData((char *) (&fullPageWrites), sizeof(bool));

		XLogInsert(RM_XLOG_ID, XLOG_FPW_CHANGE);
	}

	/* Turning the flag off happens only after the WAL record is out. */
	if (!fullPageWrites)
	{
		WALInsertLockAcquireExclusive();
		Insert->fullPageWrites = false;
		WALInsertLockRelease();
	}
	END_CRIT_SECTION();
}
9517
9518 /*
9519 * Check that it's OK to switch to new timeline during recovery.
9520 *
9521 * 'lsn' is the address of the shutdown checkpoint record we're about to
9522 * replay. (Currently, timeline can only change at a shutdown checkpoint).
9523 */
9524 static void
checkTimeLineSwitch(XLogRecPtr lsn,TimeLineID newTLI,TimeLineID prevTLI)9525 checkTimeLineSwitch(XLogRecPtr lsn, TimeLineID newTLI, TimeLineID prevTLI)
9526 {
9527 /* Check that the record agrees on what the current (old) timeline is */
9528 if (prevTLI != ThisTimeLineID)
9529 ereport(PANIC,
9530 (errmsg("unexpected previous timeline ID %u (current timeline ID %u) in checkpoint record",
9531 prevTLI, ThisTimeLineID)));
9532
9533 /*
9534 * The new timeline better be in the list of timelines we expect to see,
9535 * according to the timeline history. It should also not decrease.
9536 */
9537 if (newTLI < ThisTimeLineID || !tliInHistory(newTLI, expectedTLEs))
9538 ereport(PANIC,
9539 (errmsg("unexpected timeline ID %u (after %u) in checkpoint record",
9540 newTLI, ThisTimeLineID)));
9541
9542 /*
9543 * If we have not yet reached min recovery point, and we're about to
9544 * switch to a timeline greater than the timeline of the min recovery
9545 * point: trouble. After switching to the new timeline, we could not
9546 * possibly visit the min recovery point on the correct timeline anymore.
9547 * This can happen if there is a newer timeline in the archive that
9548 * branched before the timeline the min recovery point is on, and you
9549 * attempt to do PITR to the new timeline.
9550 */
9551 if (!XLogRecPtrIsInvalid(minRecoveryPoint) &&
9552 lsn < minRecoveryPoint &&
9553 newTLI > minRecoveryPointTLI)
9554 ereport(PANIC,
9555 (errmsg("unexpected timeline ID %u in checkpoint record, before reaching minimum recovery point %X/%X on timeline %u",
9556 newTLI,
9557 (uint32) (minRecoveryPoint >> 32),
9558 (uint32) minRecoveryPoint,
9559 minRecoveryPointTLI)));
9560
9561 /* Looks good */
9562 }
9563
/*
 * XLOG resource manager's routines
 *
 * Definitions of info values are in include/catalog/pg_control.h, though
 * not all record types are related to control file updates.
 */
void
xlog_redo(XLogReaderState *record)
{
	uint8		info = XLogRecGetInfo(record) & ~XLR_INFO_MASK;
	XLogRecPtr	lsn = record->EndRecPtr;	/* end+1 of the record being replayed */

	/* in XLOG rmgr, backup blocks are only used by XLOG_FPI records */
	Assert(info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
		   info == XLOG_FPI_MULTI || !XLogRecHasAnyBlockRefs(record));

	if (info == XLOG_NEXTOID)
	{
		Oid			nextOid;

		/*
		 * We used to try to take the maximum of ShmemVariableCache->nextOid
		 * and the recorded nextOid, but that fails if the OID counter wraps
		 * around. Since no OID allocation should be happening during replay
		 * anyway, better to just believe the record exactly. We still take
		 * OidGenLock while setting the variable, just in case.
		 */
		memcpy(&nextOid, XLogRecGetData(record), sizeof(Oid));
		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
		ShmemVariableCache->nextOid = nextOid;
		ShmemVariableCache->oidCount = 0;
		LWLockRelease(OidGenLock);
	}
	else if (info == XLOG_CHECKPOINT_SHUTDOWN)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In a SHUTDOWN checkpoint, believe the counters exactly */
		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
		ShmemVariableCache->nextXid = checkPoint.nextXid;
		LWLockRelease(XidGenLock);
		LWLockAcquire(OidGenLock, LW_EXCLUSIVE);
		ShmemVariableCache->nextOid = checkPoint.nextOid;
		ShmemVariableCache->oidCount = 0;
		LWLockRelease(OidGenLock);
		MultiXactSetNextMXact(checkPoint.nextMulti,
							  checkPoint.nextMultiOffset);

		MultiXactAdvanceOldest(checkPoint.oldestMulti,
							   checkPoint.oldestMultiDB);
		SetTransactionIdLimit(checkPoint.oldestXid, checkPoint.oldestXidDB);

		/*
		 * If we see a shutdown checkpoint while waiting for an end-of-backup
		 * record, the backup was canceled and the end-of-backup record will
		 * never arrive.
		 */
		if (ArchiveRecoveryRequested &&
			!XLogRecPtrIsInvalid(ControlFile->backupStartPoint) &&
			XLogRecPtrIsInvalid(ControlFile->backupEndPoint))
			ereport(PANIC,
					(errmsg("online backup was canceled, recovery cannot continue")));

		/*
		 * If we see a shutdown checkpoint, we know that nothing was running
		 * on the master at this point. So fake-up an empty running-xacts
		 * record and use that here and now. Recover additional standby state
		 * for prepared transactions.
		 */
		if (standbyState >= STANDBY_INITIALIZED)
		{
			TransactionId *xids;
			int			nxids;
			TransactionId oldestActiveXID;
			TransactionId latestCompletedXid;
			RunningTransactionsData running;

			oldestActiveXID = PrescanPreparedTransactions(&xids, &nxids);

			/*
			 * Construct a RunningTransactions snapshot representing a shut
			 * down server, with only prepared transactions still alive. We're
			 * never overflowed at this point because all subxids are listed
			 * with their parent prepared transactions.
			 */
			running.xcnt = nxids;
			running.subxcnt = 0;
			running.subxid_overflow = false;
			running.nextXid = checkPoint.nextXid;
			running.oldestRunningXid = oldestActiveXID;
			latestCompletedXid = checkPoint.nextXid;
			TransactionIdRetreat(latestCompletedXid);
			Assert(TransactionIdIsNormal(latestCompletedXid));
			running.latestCompletedXid = latestCompletedXid;
			running.xids = xids;

			ProcArrayApplyRecoveryInfo(&running);

			StandbyRecoverPreparedTransactions(true);
		}

		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
		LWLockRelease(ControlFileLock);

		/* Update shared-memory copy of checkpoint XID/epoch */
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
		XLogCtl->ckptXid = checkPoint.nextXid;
		SpinLockRelease(&XLogCtl->info_lck);

		/*
		 * We should've already switched to the new TLI before replaying this
		 * record.
		 */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
			ereport(PANIC,
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							checkPoint.ThisTimeLineID, ThisTimeLineID)));

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_CHECKPOINT_ONLINE)
	{
		CheckPoint	checkPoint;

		memcpy(&checkPoint, XLogRecGetData(record), sizeof(CheckPoint));
		/* In an ONLINE checkpoint, treat the XID counter as a minimum */
		LWLockAcquire(XidGenLock, LW_EXCLUSIVE);
		if (TransactionIdPrecedes(ShmemVariableCache->nextXid,
								  checkPoint.nextXid))
			ShmemVariableCache->nextXid = checkPoint.nextXid;
		LWLockRelease(XidGenLock);

		/*
		 * We ignore the nextOid counter in an ONLINE checkpoint, preferring
		 * to track OID assignment through XLOG_NEXTOID records. The nextOid
		 * counter is from the start of the checkpoint and might well be stale
		 * compared to later XLOG_NEXTOID records. We could try to take the
		 * maximum of the nextOid counter and our latest value, but since
		 * there's no particular guarantee about the speed with which the OID
		 * counter wraps around, that's a risky thing to do. In any case,
		 * users of the nextOid counter are required to avoid assignment of
		 * duplicates, so that a somewhat out-of-date value should be safe.
		 */

		/* Handle multixact */
		MultiXactAdvanceNextMXact(checkPoint.nextMulti,
								  checkPoint.nextMultiOffset);

		/*
		 * NB: This may perform multixact truncation when replaying WAL
		 * generated by an older primary.
		 */
		MultiXactAdvanceOldest(checkPoint.oldestMulti,
							   checkPoint.oldestMultiDB);
		if (TransactionIdPrecedes(ShmemVariableCache->oldestXid,
								  checkPoint.oldestXid))
			SetTransactionIdLimit(checkPoint.oldestXid,
								  checkPoint.oldestXidDB);
		/* ControlFile->checkPointCopy always tracks the latest ckpt XID */
		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->checkPointCopy.nextXidEpoch = checkPoint.nextXidEpoch;
		ControlFile->checkPointCopy.nextXid = checkPoint.nextXid;
		LWLockRelease(ControlFileLock);

		/* Update shared-memory copy of checkpoint XID/epoch */
		SpinLockAcquire(&XLogCtl->info_lck);
		XLogCtl->ckptXidEpoch = checkPoint.nextXidEpoch;
		XLogCtl->ckptXid = checkPoint.nextXid;
		SpinLockRelease(&XLogCtl->info_lck);

		/* TLI should not change in an on-line checkpoint */
		if (checkPoint.ThisTimeLineID != ThisTimeLineID)
			ereport(PANIC,
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							checkPoint.ThisTimeLineID, ThisTimeLineID)));

		RecoveryRestartPoint(&checkPoint);
	}
	else if (info == XLOG_OVERWRITE_CONTRECORD)
	{
		/* Sanity-check a record that overwrote a broken continuation record */
		xl_overwrite_contrecord xlrec;

		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_overwrite_contrecord));
		VerifyOverwriteContrecord(&xlrec, record);
	}
	else if (info == XLOG_END_OF_RECOVERY)
	{
		xl_end_of_recovery xlrec;

		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_end_of_recovery));

		/*
		 * For Hot Standby, we could treat this like a Shutdown Checkpoint,
		 * but this case is rarer and harder to test, so the benefit doesn't
		 * outweigh the potential extra cost of maintenance.
		 */

		/*
		 * We should've already switched to the new TLI before replaying this
		 * record.
		 */
		if (xlrec.ThisTimeLineID != ThisTimeLineID)
			ereport(PANIC,
					(errmsg("unexpected timeline ID %u (should be %u) in checkpoint record",
							xlrec.ThisTimeLineID, ThisTimeLineID)));
	}
	else if (info == XLOG_NOOP)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_SWITCH)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_RESTORE_POINT)
	{
		/* nothing to do here */
	}
	else if (info == XLOG_FPI || info == XLOG_FPI_FOR_HINT ||
			 info == XLOG_FPI_MULTI)
	{
		uint8		block_id;

		/*
		 * Full-page image (FPI) records contain nothing else but a backup
		 * block (or multiple backup blocks). Every block reference must
		 * include a full-page image - otherwise there would be no point in
		 * this record.
		 *
		 * No recovery conflicts are generated by these generic records - if a
		 * resource manager needs to generate conflicts, it has to define a
		 * separate WAL record type and redo routine.
		 *
		 * XLOG_FPI_FOR_HINT records are generated when a page needs to be
		 * WAL- logged because of a hint bit update. They are only generated
		 * when checksums are enabled. There is no difference in handling
		 * XLOG_FPI and XLOG_FPI_FOR_HINT records, they use a different info
		 * code just to distinguish them for statistics purposes.
		 */
		for (block_id = 0; block_id <= record->max_block_id; block_id++)
		{
			Buffer		buffer;

			/* Every referenced block must be restored from its image */
			if (XLogReadBufferForRedo(record, block_id, &buffer) != BLK_RESTORED)
				elog(ERROR, "unexpected XLogReadBufferForRedo result when restoring backup block");
			UnlockReleaseBuffer(buffer);
		}
	}
	else if (info == XLOG_BACKUP_END)
	{
		XLogRecPtr	startpoint;

		memcpy(&startpoint, XLogRecGetData(record), sizeof(startpoint));

		/* Only act if this matches the backup we are waiting to finish */
		if (ControlFile->backupStartPoint == startpoint)
		{
			/*
			 * We have reached the end of base backup, the point where
			 * pg_stop_backup() was done. The data on disk is now consistent.
			 * Reset backupStartPoint, and update minRecoveryPoint to make
			 * sure we don't allow starting up at an earlier point even if
			 * recovery is stopped and restarted soon after this.
			 */
			elog(DEBUG1, "end of backup reached");

			LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);

			if (ControlFile->minRecoveryPoint < lsn)
			{
				ControlFile->minRecoveryPoint = lsn;
				ControlFile->minRecoveryPointTLI = ThisTimeLineID;
			}
			ControlFile->backupStartPoint = InvalidXLogRecPtr;
			ControlFile->backupEndRequired = false;
			UpdateControlFile();

			LWLockRelease(ControlFileLock);
		}
	}
	else if (info == XLOG_PARAMETER_CHANGE)
	{
		xl_parameter_change xlrec;

		/* Update our copy of the parameters in pg_control */
		memcpy(&xlrec, XLogRecGetData(record), sizeof(xl_parameter_change));

		LWLockAcquire(ControlFileLock, LW_EXCLUSIVE);
		ControlFile->MaxConnections = xlrec.MaxConnections;
		ControlFile->max_worker_processes = xlrec.max_worker_processes;
		ControlFile->max_prepared_xacts = xlrec.max_prepared_xacts;
		ControlFile->max_locks_per_xact = xlrec.max_locks_per_xact;
		ControlFile->wal_level = xlrec.wal_level;
		ControlFile->wal_log_hints = xlrec.wal_log_hints;

		/*
		 * Update minRecoveryPoint to ensure that if recovery is aborted, we
		 * recover back up to this point before allowing hot standby again.
		 * This is important if the max_* settings are decreased, to ensure
		 * you don't run queries against the WAL preceding the change. The
		 * local copies cannot be updated as long as crash recovery is
		 * happening and we expect all the WAL to be replayed.
		 */
		if (InArchiveRecovery)
		{
			minRecoveryPoint = ControlFile->minRecoveryPoint;
			minRecoveryPointTLI = ControlFile->minRecoveryPointTLI;
		}
		if (minRecoveryPoint != InvalidXLogRecPtr && minRecoveryPoint < lsn)
		{
			ControlFile->minRecoveryPoint = lsn;
			ControlFile->minRecoveryPointTLI = ThisTimeLineID;
		}

		CommitTsParameterChange(xlrec.track_commit_timestamp,
								ControlFile->track_commit_timestamp);
		ControlFile->track_commit_timestamp = xlrec.track_commit_timestamp;

		UpdateControlFile();
		LWLockRelease(ControlFileLock);

		/* Check to see if any changes to max_connections give problems */
		CheckRequiredParameterValues();
	}
	else if (info == XLOG_FPW_CHANGE)
	{
		bool		fpw;

		memcpy(&fpw, XLogRecGetData(record), sizeof(bool));

		/*
		 * Update the LSN of the last replayed XLOG_FPW_CHANGE record so that
		 * do_pg_start_backup() and do_pg_stop_backup() can check whether
		 * full_page_writes has been disabled during online backup.
		 */
		if (!fpw)
		{
			SpinLockAcquire(&XLogCtl->info_lck);
			if (XLogCtl->lastFpwDisableRecPtr < ReadRecPtr)
				XLogCtl->lastFpwDisableRecPtr = ReadRecPtr;
			SpinLockRelease(&XLogCtl->info_lck);
		}

		/* Keep track of full_page_writes */
		lastFullPageWrites = fpw;
	}
}
9915
9916 /*
9917 * Verify the payload of a XLOG_OVERWRITE_CONTRECORD record.
9918 */
9919 static void
VerifyOverwriteContrecord(xl_overwrite_contrecord * xlrec,XLogReaderState * state)9920 VerifyOverwriteContrecord(xl_overwrite_contrecord *xlrec, XLogReaderState *state)
9921 {
9922 if (xlrec->overwritten_lsn != state->overwrittenRecPtr)
9923 elog(FATAL, "mismatching overwritten LSN %X/%X -> %X/%X",
9924 (uint32) (xlrec->overwritten_lsn >> 32),
9925 (uint32) xlrec->overwritten_lsn,
9926 (uint32) (state->overwrittenRecPtr >> 32),
9927 (uint32) state->overwrittenRecPtr);
9928
9929 ereport(LOG,
9930 (errmsg("successfully skipped missing contrecord at %X/%X, overwritten at %s",
9931 (uint32) (xlrec->overwritten_lsn >> 32),
9932 (uint32) xlrec->overwritten_lsn,
9933 timestamptz_to_str(xlrec->overwrite_time))));
9934
9935 /* Verifying the record should only happen once */
9936 state->overwrittenRecPtr = InvalidXLogRecPtr;
9937 }
9938
9939 #ifdef WAL_DEBUG
9940
9941 static void
xlog_outrec(StringInfo buf,XLogReaderState * record)9942 xlog_outrec(StringInfo buf, XLogReaderState *record)
9943 {
9944 int block_id;
9945
9946 appendStringInfo(buf, "prev %X/%X; xid %u",
9947 (uint32) (XLogRecGetPrev(record) >> 32),
9948 (uint32) XLogRecGetPrev(record),
9949 XLogRecGetXid(record));
9950
9951 appendStringInfo(buf, "; len %u",
9952 XLogRecGetDataLen(record));
9953
9954 /* decode block references */
9955 for (block_id = 0; block_id <= record->max_block_id; block_id++)
9956 {
9957 RelFileNode rnode;
9958 ForkNumber forknum;
9959 BlockNumber blk;
9960
9961 if (!XLogRecHasBlockRef(record, block_id))
9962 continue;
9963
9964 XLogRecGetBlockTag(record, block_id, &rnode, &forknum, &blk);
9965 if (forknum != MAIN_FORKNUM)
9966 appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, fork %u, blk %u",
9967 block_id,
9968 rnode.spcNode, rnode.dbNode, rnode.relNode,
9969 forknum,
9970 blk);
9971 else
9972 appendStringInfo(buf, "; blkref #%u: rel %u/%u/%u, blk %u",
9973 block_id,
9974 rnode.spcNode, rnode.dbNode, rnode.relNode,
9975 blk);
9976 if (XLogRecHasBlockImage(record, block_id))
9977 appendStringInfoString(buf, " FPW");
9978 }
9979 }
9980 #endif /* WAL_DEBUG */
9981
9982 /*
9983 * Returns a string describing an XLogRecord, consisting of its identity
9984 * optionally followed by a colon, a space, and a further description.
9985 */
9986 static void
xlog_outdesc(StringInfo buf,XLogReaderState * record)9987 xlog_outdesc(StringInfo buf, XLogReaderState *record)
9988 {
9989 RmgrId rmid = XLogRecGetRmid(record);
9990 uint8 info = XLogRecGetInfo(record);
9991 const char *id;
9992
9993 appendStringInfoString(buf, RmgrTable[rmid].rm_name);
9994 appendStringInfoChar(buf, '/');
9995
9996 id = RmgrTable[rmid].rm_identify(info);
9997 if (id == NULL)
9998 appendStringInfo(buf, "UNKNOWN (%X): ", info & ~XLR_INFO_MASK);
9999 else
10000 appendStringInfo(buf, "%s: ", id);
10001
10002 RmgrTable[rmid].rm_desc(buf, record);
10003 }
10004
10005
10006 /*
10007 * Return the (possible) sync flag used for opening a file, depending on the
10008 * value of the GUC wal_sync_method.
10009 */
10010 static int
get_sync_bit(int method)10011 get_sync_bit(int method)
10012 {
10013 int o_direct_flag = 0;
10014
10015 /* If fsync is disabled, never open in sync mode */
10016 if (!enableFsync)
10017 return 0;
10018
10019 /*
10020 * Optimize writes by bypassing kernel cache with O_DIRECT when using
10021 * O_SYNC/O_FSYNC and O_DSYNC. But only if archiving and streaming are
10022 * disabled, otherwise the archive command or walsender process will read
10023 * the WAL soon after writing it, which is guaranteed to cause a physical
10024 * read if we bypassed the kernel cache. We also skip the
10025 * posix_fadvise(POSIX_FADV_DONTNEED) call in XLogFileClose() for the same
10026 * reason.
10027 *
10028 * Never use O_DIRECT in walreceiver process for similar reasons; the WAL
10029 * written by walreceiver is normally read by the startup process soon
10030 * after its written. Also, walreceiver performs unaligned writes, which
10031 * don't work with O_DIRECT, so it is required for correctness too.
10032 */
10033 if (!XLogIsNeeded() && !AmWalReceiverProcess())
10034 o_direct_flag = PG_O_DIRECT;
10035
10036 switch (method)
10037 {
10038 /*
10039 * enum values for all sync options are defined even if they are
10040 * not supported on the current platform. But if not, they are
10041 * not included in the enum option array, and therefore will never
10042 * be seen here.
10043 */
10044 case SYNC_METHOD_FSYNC:
10045 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10046 case SYNC_METHOD_FDATASYNC:
10047 return 0;
10048 #ifdef OPEN_SYNC_FLAG
10049 case SYNC_METHOD_OPEN:
10050 return OPEN_SYNC_FLAG | o_direct_flag;
10051 #endif
10052 #ifdef OPEN_DATASYNC_FLAG
10053 case SYNC_METHOD_OPEN_DSYNC:
10054 return OPEN_DATASYNC_FLAG | o_direct_flag;
10055 #endif
10056 default:
10057 /* can't happen (unless we are out of sync with option array) */
10058 elog(ERROR, "unrecognized wal_sync_method: %d", method);
10059 return 0; /* silence warning */
10060 }
10061 }
10062
10063 /*
10064 * GUC support
10065 */
10066 void
assign_xlog_sync_method(int new_sync_method,void * extra)10067 assign_xlog_sync_method(int new_sync_method, void *extra)
10068 {
10069 if (sync_method != new_sync_method)
10070 {
10071 /*
10072 * To ensure that no blocks escape unsynced, force an fsync on the
10073 * currently open log segment (if any). Also, if the open flag is
10074 * changing, close the log file so it will be reopened (with new flag
10075 * bit) at next use.
10076 */
10077 if (openLogFile >= 0)
10078 {
10079 if (pg_fsync(openLogFile) != 0)
10080 ereport(PANIC,
10081 (errcode_for_file_access(),
10082 errmsg("could not fsync log segment %s: %m",
10083 XLogFileNameP(ThisTimeLineID, openLogSegNo))));
10084 if (get_sync_bit(sync_method) != get_sync_bit(new_sync_method))
10085 XLogFileClose();
10086 }
10087 }
10088 }
10089
10090
10091 /*
10092 * Issue appropriate kind of fsync (if any) for an XLOG output file.
10093 *
10094 * 'fd' is a file descriptor for the XLOG file to be fsync'd.
10095 * 'log' and 'seg' are for error reporting purposes.
10096 */
10097 void
issue_xlog_fsync(int fd,XLogSegNo segno)10098 issue_xlog_fsync(int fd, XLogSegNo segno)
10099 {
10100 switch (sync_method)
10101 {
10102 case SYNC_METHOD_FSYNC:
10103 if (pg_fsync_no_writethrough(fd) != 0)
10104 ereport(PANIC,
10105 (errcode_for_file_access(),
10106 errmsg("could not fsync log file %s: %m",
10107 XLogFileNameP(ThisTimeLineID, segno))));
10108 break;
10109 #ifdef HAVE_FSYNC_WRITETHROUGH
10110 case SYNC_METHOD_FSYNC_WRITETHROUGH:
10111 if (pg_fsync_writethrough(fd) != 0)
10112 ereport(PANIC,
10113 (errcode_for_file_access(),
10114 errmsg("could not fsync write-through log file %s: %m",
10115 XLogFileNameP(ThisTimeLineID, segno))));
10116 break;
10117 #endif
10118 #ifdef HAVE_FDATASYNC
10119 case SYNC_METHOD_FDATASYNC:
10120 if (pg_fdatasync(fd) != 0)
10121 ereport(PANIC,
10122 (errcode_for_file_access(),
10123 errmsg("could not fdatasync log file %s: %m",
10124 XLogFileNameP(ThisTimeLineID, segno))));
10125 break;
10126 #endif
10127 case SYNC_METHOD_OPEN:
10128 case SYNC_METHOD_OPEN_DSYNC:
10129 /* write synced it already */
10130 break;
10131 default:
10132 elog(PANIC, "unrecognized wal_sync_method: %d", sync_method);
10133 break;
10134 }
10135 }
10136
10137 /*
10138 * Return the filename of given log segment, as a palloc'd string.
10139 */
10140 char *
XLogFileNameP(TimeLineID tli,XLogSegNo segno)10141 XLogFileNameP(TimeLineID tli, XLogSegNo segno)
10142 {
10143 char *result = palloc(MAXFNAMELEN);
10144
10145 XLogFileName(result, tli, segno);
10146 return result;
10147 }
10148
10149 /*
10150 * do_pg_start_backup is the workhorse of the user-visible pg_start_backup()
10151 * function. It creates the necessary starting checkpoint and constructs the
10152 * backup label file.
10153 *
10154 * There are two kind of backups: exclusive and non-exclusive. An exclusive
10155 * backup is started with pg_start_backup(), and there can be only one active
10156 * at a time. The backup and tablespace map files of an exclusive backup are
10157 * written to $PGDATA/backup_label and $PGDATA/tablespace_map, and they are
10158 * removed by pg_stop_backup().
10159 *
10160 * A non-exclusive backup is used for the streaming base backups (see
10161 * src/backend/replication/basebackup.c). The difference to exclusive backups
10162 * is that the backup label and tablespace map files are not written to disk.
10163 * Instead, their would-be contents are returned in *labelfile and *tblspcmapfile,
10164 * and the caller is responsible for including them in the backup archive as
10165 * 'backup_label' and 'tablespace_map'. There can be many non-exclusive backups
10166 * active at the same time, and they don't conflict with an exclusive backup
10167 * either.
10168 *
10169 * tblspcmapfile is required mainly for tar format in windows as native windows
10170 * utilities are not able to create symlinks while extracting files from tar.
10171 * However for consistency, the same is used for all platforms.
10172 *
10173 * needtblspcmapfile is true for the cases (exclusive backup and for
10174 * non-exclusive backup only when tar format is used for taking backup)
10175 * when backup needs to generate tablespace_map file, it is used to
10176 * embed escape character before newline character in tablespace path.
10177 *
10178 * Returns the minimum WAL position that must be present to restore from this
10179 * backup, and the corresponding timeline ID in *starttli_p.
10180 *
10181 * Every successfully started non-exclusive backup must be stopped by calling
10182 * do_pg_stop_backup() or do_pg_abort_backup().
10183 *
10184 * It is the responsibility of the caller of this function to verify the
10185 * permissions of the calling user!
10186 */
10187 XLogRecPtr
do_pg_start_backup(const char * backupidstr,bool fast,TimeLineID * starttli_p,StringInfo labelfile,DIR * tblspcdir,List ** tablespaces,StringInfo tblspcmapfile,bool infotbssize,bool needtblspcmapfile)10188 do_pg_start_backup(const char *backupidstr, bool fast, TimeLineID *starttli_p,
10189 StringInfo labelfile, DIR *tblspcdir, List **tablespaces,
10190 StringInfo tblspcmapfile, bool infotbssize,
10191 bool needtblspcmapfile)
10192 {
10193 bool exclusive = (labelfile == NULL);
10194 bool backup_started_in_recovery = false;
10195 XLogRecPtr checkpointloc;
10196 XLogRecPtr startpoint;
10197 TimeLineID starttli;
10198 pg_time_t stamp_time;
10199 char strfbuf[128];
10200 char xlogfilename[MAXFNAMELEN];
10201 XLogSegNo _logSegNo;
10202 struct stat stat_buf;
10203 FILE *fp;
10204
10205 backup_started_in_recovery = RecoveryInProgress();
10206
10207 /*
10208 * Currently only non-exclusive backup can be taken during recovery.
10209 */
10210 if (backup_started_in_recovery && exclusive)
10211 ereport(ERROR,
10212 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10213 errmsg("recovery is in progress"),
10214 errhint("WAL control functions cannot be executed during recovery.")));
10215
10216 /*
10217 * During recovery, we don't need to check WAL level. Because, if WAL
10218 * level is not sufficient, it's impossible to get here during recovery.
10219 */
10220 if (!backup_started_in_recovery && !XLogIsNeeded())
10221 ereport(ERROR,
10222 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10223 errmsg("WAL level not sufficient for making an online backup"),
10224 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10225
10226 if (strlen(backupidstr) > MAXPGPATH)
10227 ereport(ERROR,
10228 (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
10229 errmsg("backup label too long (max %d bytes)",
10230 MAXPGPATH)));
10231
10232 /*
10233 * Mark backup active in shared memory. We must do full-page WAL writes
10234 * during an on-line backup even if not doing so at other times, because
10235 * it's quite possible for the backup dump to obtain a "torn" (partially
10236 * written) copy of a database page if it reads the page concurrently with
10237 * our write to the same page. This can be fixed as long as the first
10238 * write to the page in the WAL sequence is a full-page write. Hence, we
10239 * turn on forcePageWrites and then force a CHECKPOINT, to ensure there
10240 * are no dirty pages in shared memory that might get dumped while the
10241 * backup is in progress without having a corresponding WAL record. (Once
10242 * the backup is complete, we need not force full-page writes anymore,
10243 * since we expect that any pages not modified during the backup interval
10244 * must have been correctly captured by the backup.)
10245 *
10246 * Note that forcePageWrites has no effect during an online backup from
10247 * the standby.
10248 *
10249 * We must hold all the insertion locks to change the value of
10250 * forcePageWrites, to ensure adequate interlocking against
10251 * XLogInsertRecord().
10252 */
10253 WALInsertLockAcquireExclusive();
10254 if (exclusive)
10255 {
10256 /*
10257 * At first, mark that we're now starting an exclusive backup,
10258 * to ensure that there are no other sessions currently running
10259 * pg_start_backup() or pg_stop_backup().
10260 */
10261 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_NONE)
10262 {
10263 WALInsertLockRelease();
10264 ereport(ERROR,
10265 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10266 errmsg("a backup is already in progress"),
10267 errhint("Run pg_stop_backup() and try again.")));
10268 }
10269 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STARTING;
10270 }
10271 else
10272 XLogCtl->Insert.nonExclusiveBackups++;
10273 XLogCtl->Insert.forcePageWrites = true;
10274 WALInsertLockRelease();
10275
10276 /* Ensure we release forcePageWrites if fail below */
10277 PG_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10278 {
10279 bool gotUniqueStartpoint = false;
10280 struct dirent *de;
10281 tablespaceinfo *ti;
10282 int datadirpathlen;
10283
10284 /*
10285 * Force an XLOG file switch before the checkpoint, to ensure that the
10286 * WAL segment the checkpoint is written to doesn't contain pages with
10287 * old timeline IDs. That would otherwise happen if you called
10288 * pg_start_backup() right after restoring from a PITR archive: the
10289 * first WAL segment containing the startup checkpoint has pages in
10290 * the beginning with the old timeline ID. That can cause trouble at
10291 * recovery: we won't have a history file covering the old timeline if
10292 * pg_xlog directory was not included in the base backup and the WAL
10293 * archive was cleared too before starting the backup.
10294 *
10295 * This also ensures that we have emitted a WAL page header that has
10296 * XLP_BKP_REMOVABLE off before we emit the checkpoint record.
10297 * Therefore, if a WAL archiver (such as pglesslog) is trying to
10298 * compress out removable backup blocks, it won't remove any that
10299 * occur after this point.
10300 *
10301 * During recovery, we skip forcing XLOG file switch, which means that
10302 * the backup taken during recovery is not available for the special
10303 * recovery case described above.
10304 */
10305 if (!backup_started_in_recovery)
10306 RequestXLogSwitch();
10307
10308 do
10309 {
10310 bool checkpointfpw;
10311
10312 /*
10313 * Force a CHECKPOINT. Aside from being necessary to prevent torn
10314 * page problems, this guarantees that two successive backup runs
10315 * will have different checkpoint positions and hence different
10316 * history file names, even if nothing happened in between.
10317 *
10318 * During recovery, establish a restartpoint if possible. We use
10319 * the last restartpoint as the backup starting checkpoint. This
10320 * means that two successive backup runs can have same checkpoint
10321 * positions.
10322 *
10323 * Since the fact that we are executing do_pg_start_backup()
10324 * during recovery means that checkpointer is running, we can use
10325 * RequestCheckpoint() to establish a restartpoint.
10326 *
10327 * We use CHECKPOINT_IMMEDIATE only if requested by user (via
10328 * passing fast = true). Otherwise this can take awhile.
10329 */
10330 RequestCheckpoint(CHECKPOINT_FORCE | CHECKPOINT_WAIT |
10331 (fast ? CHECKPOINT_IMMEDIATE : 0));
10332
10333 /*
10334 * Now we need to fetch the checkpoint record location, and also
10335 * its REDO pointer. The oldest point in WAL that would be needed
10336 * to restore starting from the checkpoint is precisely the REDO
10337 * pointer.
10338 */
10339 LWLockAcquire(ControlFileLock, LW_SHARED);
10340 checkpointloc = ControlFile->checkPoint;
10341 startpoint = ControlFile->checkPointCopy.redo;
10342 starttli = ControlFile->checkPointCopy.ThisTimeLineID;
10343 checkpointfpw = ControlFile->checkPointCopy.fullPageWrites;
10344 LWLockRelease(ControlFileLock);
10345
10346 if (backup_started_in_recovery)
10347 {
10348 XLogRecPtr recptr;
10349
10350 /*
10351 * Check to see if all WAL replayed during online backup
10352 * (i.e., since last restartpoint used as backup starting
10353 * checkpoint) contain full-page writes.
10354 */
10355 SpinLockAcquire(&XLogCtl->info_lck);
10356 recptr = XLogCtl->lastFpwDisableRecPtr;
10357 SpinLockRelease(&XLogCtl->info_lck);
10358
10359 if (!checkpointfpw || startpoint <= recptr)
10360 ereport(ERROR,
10361 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10362 errmsg("WAL generated with full_page_writes=off was replayed "
10363 "since last restartpoint"),
10364 errhint("This means that the backup being taken on the standby "
10365 "is corrupt and should not be used. "
10366 "Enable full_page_writes and run CHECKPOINT on the master, "
10367 "and then try an online backup again.")));
10368
10369 /*
10370 * During recovery, since we don't use the end-of-backup WAL
10371 * record and don't write the backup history file, the
10372 * starting WAL location doesn't need to be unique. This means
10373 * that two base backups started at the same time might use
10374 * the same checkpoint as starting locations.
10375 */
10376 gotUniqueStartpoint = true;
10377 }
10378
10379 /*
10380 * If two base backups are started at the same time (in WAL sender
10381 * processes), we need to make sure that they use different
10382 * checkpoints as starting locations, because we use the starting
10383 * WAL location as a unique identifier for the base backup in the
10384 * end-of-backup WAL record and when we write the backup history
10385 * file. Perhaps it would be better generate a separate unique ID
10386 * for each backup instead of forcing another checkpoint, but
10387 * taking a checkpoint right after another is not that expensive
10388 * either because only few buffers have been dirtied yet.
10389 */
10390 WALInsertLockAcquireExclusive();
10391 if (XLogCtl->Insert.lastBackupStart < startpoint)
10392 {
10393 XLogCtl->Insert.lastBackupStart = startpoint;
10394 gotUniqueStartpoint = true;
10395 }
10396 WALInsertLockRelease();
10397 } while (!gotUniqueStartpoint);
10398
10399 XLByteToSeg(startpoint, _logSegNo);
10400 XLogFileName(xlogfilename, starttli, _logSegNo);
10401
10402 /*
10403 * Construct tablespace_map file
10404 */
10405 if (exclusive)
10406 tblspcmapfile = makeStringInfo();
10407
10408 datadirpathlen = strlen(DataDir);
10409
10410 /* Collect information about all tablespaces */
10411 while ((de = ReadDir(tblspcdir, "pg_tblspc")) != NULL)
10412 {
10413 char fullpath[MAXPGPATH + 10];
10414 char linkpath[MAXPGPATH];
10415 char *relpath = NULL;
10416 int rllen;
10417 StringInfoData buflinkpath;
10418 char *s = linkpath;
10419
10420 /* Skip special stuff */
10421 if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
10422 continue;
10423
10424 snprintf(fullpath, sizeof(fullpath), "pg_tblspc/%s", de->d_name);
10425
10426 #if defined(HAVE_READLINK) || defined(WIN32)
10427 rllen = readlink(fullpath, linkpath, sizeof(linkpath));
10428 if (rllen < 0)
10429 {
10430 ereport(WARNING,
10431 (errmsg("could not read symbolic link \"%s\": %m",
10432 fullpath)));
10433 continue;
10434 }
10435 else if (rllen >= sizeof(linkpath))
10436 {
10437 ereport(WARNING,
10438 (errmsg("symbolic link \"%s\" target is too long",
10439 fullpath)));
10440 continue;
10441 }
10442 linkpath[rllen] = '\0';
10443
10444 /*
10445 * Add the escape character '\\' before newline in a string to
10446 * ensure that we can distinguish between the newline in the
10447 * tablespace path and end of line while reading tablespace_map
10448 * file during archive recovery.
10449 */
10450 initStringInfo(&buflinkpath);
10451
10452 while (*s)
10453 {
10454 if ((*s == '\n' || *s == '\r') && needtblspcmapfile)
10455 appendStringInfoChar(&buflinkpath, '\\');
10456 appendStringInfoChar(&buflinkpath, *s++);
10457 }
10458
10459
10460 /*
10461 * Relpath holds the relative path of the tablespace directory
10462 * when it's located within PGDATA, or NULL if it's located
10463 * elsewhere.
10464 */
10465 if (rllen > datadirpathlen &&
10466 strncmp(linkpath, DataDir, datadirpathlen) == 0 &&
10467 IS_DIR_SEP(linkpath[datadirpathlen]))
10468 relpath = linkpath + datadirpathlen + 1;
10469
10470 ti = palloc(sizeof(tablespaceinfo));
10471 ti->oid = pstrdup(de->d_name);
10472 ti->path = pstrdup(buflinkpath.data);
10473 ti->rpath = relpath ? pstrdup(relpath) : NULL;
10474 ti->size = infotbssize ? sendTablespace(fullpath, true) : -1;
10475
10476 if (tablespaces)
10477 *tablespaces = lappend(*tablespaces, ti);
10478
10479 appendStringInfo(tblspcmapfile, "%s %s\n", ti->oid, ti->path);
10480
10481 pfree(buflinkpath.data);
10482 #else
10483
10484 /*
10485 * If the platform does not have symbolic links, it should not be
10486 * possible to have tablespaces - clearly somebody else created
10487 * them. Warn about it and ignore.
10488 */
10489 ereport(WARNING,
10490 (errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
10491 errmsg("tablespaces are not supported on this platform")));
10492 #endif
10493 }
10494
10495 /*
10496 * Construct backup label file
10497 */
10498 if (exclusive)
10499 labelfile = makeStringInfo();
10500
10501 /* Use the log timezone here, not the session timezone */
10502 stamp_time = (pg_time_t) time(NULL);
10503 pg_strftime(strfbuf, sizeof(strfbuf),
10504 "%Y-%m-%d %H:%M:%S %Z",
10505 pg_localtime(&stamp_time, log_timezone));
10506 appendStringInfo(labelfile, "START WAL LOCATION: %X/%X (file %s)\n",
10507 (uint32) (startpoint >> 32), (uint32) startpoint, xlogfilename);
10508 appendStringInfo(labelfile, "CHECKPOINT LOCATION: %X/%X\n",
10509 (uint32) (checkpointloc >> 32), (uint32) checkpointloc);
10510 appendStringInfo(labelfile, "BACKUP METHOD: %s\n",
10511 exclusive ? "pg_start_backup" : "streamed");
10512 appendStringInfo(labelfile, "BACKUP FROM: %s\n",
10513 backup_started_in_recovery ? "standby" : "master");
10514 appendStringInfo(labelfile, "START TIME: %s\n", strfbuf);
10515 appendStringInfo(labelfile, "LABEL: %s\n", backupidstr);
10516
10517 /*
10518 * Okay, write the file, or return its contents to caller.
10519 */
10520 if (exclusive)
10521 {
10522 /*
10523 * Check for existing backup label --- implies a backup is already
10524 * running. (XXX given that we checked exclusiveBackupState above,
10525 * maybe it would be OK to just unlink any such label file?)
10526 */
10527 if (stat(BACKUP_LABEL_FILE, &stat_buf) != 0)
10528 {
10529 if (errno != ENOENT)
10530 ereport(ERROR,
10531 (errcode_for_file_access(),
10532 errmsg("could not stat file \"%s\": %m",
10533 BACKUP_LABEL_FILE)));
10534 }
10535 else
10536 ereport(ERROR,
10537 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10538 errmsg("a backup is already in progress"),
10539 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10540 BACKUP_LABEL_FILE)));
10541
10542 fp = AllocateFile(BACKUP_LABEL_FILE, "w");
10543
10544 if (!fp)
10545 ereport(ERROR,
10546 (errcode_for_file_access(),
10547 errmsg("could not create file \"%s\": %m",
10548 BACKUP_LABEL_FILE)));
10549 if (fwrite(labelfile->data, labelfile->len, 1, fp) != 1 ||
10550 fflush(fp) != 0 ||
10551 pg_fsync(fileno(fp)) != 0 ||
10552 ferror(fp) ||
10553 FreeFile(fp))
10554 ereport(ERROR,
10555 (errcode_for_file_access(),
10556 errmsg("could not write file \"%s\": %m",
10557 BACKUP_LABEL_FILE)));
10558 /* Allocated locally for exclusive backups, so free separately */
10559 pfree(labelfile->data);
10560 pfree(labelfile);
10561
10562 /* Write backup tablespace_map file. */
10563 if (tblspcmapfile->len > 0)
10564 {
10565 if (stat(TABLESPACE_MAP, &stat_buf) != 0)
10566 {
10567 if (errno != ENOENT)
10568 ereport(ERROR,
10569 (errcode_for_file_access(),
10570 errmsg("could not stat file \"%s\": %m",
10571 TABLESPACE_MAP)));
10572 }
10573 else
10574 ereport(ERROR,
10575 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10576 errmsg("a backup is already in progress"),
10577 errhint("If you're sure there is no backup in progress, remove file \"%s\" and try again.",
10578 TABLESPACE_MAP)));
10579
10580 fp = AllocateFile(TABLESPACE_MAP, "w");
10581
10582 if (!fp)
10583 ereport(ERROR,
10584 (errcode_for_file_access(),
10585 errmsg("could not create file \"%s\": %m",
10586 TABLESPACE_MAP)));
10587 if (fwrite(tblspcmapfile->data, tblspcmapfile->len, 1, fp) != 1 ||
10588 fflush(fp) != 0 ||
10589 pg_fsync(fileno(fp)) != 0 ||
10590 ferror(fp) ||
10591 FreeFile(fp))
10592 ereport(ERROR,
10593 (errcode_for_file_access(),
10594 errmsg("could not write file \"%s\": %m",
10595 TABLESPACE_MAP)));
10596 }
10597
10598 /* Allocated locally for exclusive backups, so free separately */
10599 pfree(tblspcmapfile->data);
10600 pfree(tblspcmapfile);
10601 }
10602 }
10603 PG_END_ENSURE_ERROR_CLEANUP(pg_start_backup_callback, (Datum) BoolGetDatum(exclusive));
10604
10605 /*
10606 * Mark that start phase has correctly finished for an exclusive backup.
10607 * Session-level locks are updated as well to reflect that state.
10608 *
10609 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating
10610 * backup counters and session-level lock. Otherwise they can be
10611 * updated inconsistently, and which might cause do_pg_abort_backup()
10612 * to fail.
10613 */
10614 if (exclusive)
10615 {
10616 WALInsertLockAcquireExclusive();
10617 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10618
10619 /* Set session-level lock */
10620 sessionBackupState = SESSION_BACKUP_EXCLUSIVE;
10621 WALInsertLockRelease();
10622 }
10623 else
10624 sessionBackupState = SESSION_BACKUP_NON_EXCLUSIVE;
10625
10626 /*
10627 * We're done. As a convenience, return the starting WAL location.
10628 */
10629 if (starttli_p)
10630 *starttli_p = starttli;
10631 return startpoint;
10632 }
10633
10634 /* Error cleanup callback for pg_start_backup */
10635 static void
pg_start_backup_callback(int code,Datum arg)10636 pg_start_backup_callback(int code, Datum arg)
10637 {
10638 bool exclusive = DatumGetBool(arg);
10639
10640 /* Update backup counters and forcePageWrites on failure */
10641 WALInsertLockAcquireExclusive();
10642 if (exclusive)
10643 {
10644 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STARTING);
10645 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10646 }
10647 else
10648 {
10649 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10650 XLogCtl->Insert.nonExclusiveBackups--;
10651 }
10652
10653 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10654 XLogCtl->Insert.nonExclusiveBackups == 0)
10655 {
10656 XLogCtl->Insert.forcePageWrites = false;
10657 }
10658 WALInsertLockRelease();
10659 }
10660
10661 /*
10662 * Error cleanup callback for pg_stop_backup
10663 */
10664 static void
pg_stop_backup_callback(int code,Datum arg)10665 pg_stop_backup_callback(int code, Datum arg)
10666 {
10667 bool exclusive = DatumGetBool(arg);
10668
10669 /* Update backup status on failure */
10670 WALInsertLockAcquireExclusive();
10671 if (exclusive)
10672 {
10673 Assert(XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_STOPPING);
10674 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_IN_PROGRESS;
10675 }
10676 WALInsertLockRelease();
10677 }
10678
10679 /*
10680 * Utility routine to fetch the session-level status of a backup running.
10681 */
10682 SessionBackupState
get_backup_status(void)10683 get_backup_status(void)
10684 {
10685 return sessionBackupState;
10686 }
10687
10688 /*
10689 * do_pg_stop_backup is the workhorse of the user-visible pg_stop_backup()
 * function.
 *
 * If labelfile is NULL, this stops an exclusive backup. Otherwise this stops
10693 * the non-exclusive backup specified by 'labelfile'.
10694 *
10695 * Returns the last WAL position that must be present to restore from this
10696 * backup, and the corresponding timeline ID in *stoptli_p.
10697 *
10698 * It is the responsibility of the caller of this function to verify the
10699 * permissions of the calling user!
10700 */
10701 XLogRecPtr
do_pg_stop_backup(char * labelfile,bool waitforarchive,TimeLineID * stoptli_p)10702 do_pg_stop_backup(char *labelfile, bool waitforarchive, TimeLineID *stoptli_p)
10703 {
10704 bool exclusive = (labelfile == NULL);
10705 bool backup_started_in_recovery = false;
10706 XLogRecPtr startpoint;
10707 XLogRecPtr stoppoint;
10708 TimeLineID stoptli;
10709 pg_time_t stamp_time;
10710 char strfbuf[128];
10711 char histfilepath[MAXPGPATH];
10712 char startxlogfilename[MAXFNAMELEN];
10713 char stopxlogfilename[MAXFNAMELEN];
10714 char lastxlogfilename[MAXFNAMELEN];
10715 char histfilename[MAXFNAMELEN];
10716 char backupfrom[20];
10717 XLogSegNo _logSegNo;
10718 FILE *lfp;
10719 FILE *fp;
10720 char ch;
10721 int seconds_before_warning;
10722 int waits = 0;
10723 bool reported_waiting = false;
10724 char *remaining;
10725 char *ptr;
10726 uint32 hi,
10727 lo;
10728
10729 backup_started_in_recovery = RecoveryInProgress();
10730
10731 /*
10732 * Currently only non-exclusive backup can be taken during recovery.
10733 */
10734 if (backup_started_in_recovery && exclusive)
10735 ereport(ERROR,
10736 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10737 errmsg("recovery is in progress"),
10738 errhint("WAL control functions cannot be executed during recovery.")));
10739
10740 /*
10741 * During recovery, we don't need to check WAL level. Because, if WAL
10742 * level is not sufficient, it's impossible to get here during recovery.
10743 */
10744 if (!backup_started_in_recovery && !XLogIsNeeded())
10745 ereport(ERROR,
10746 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10747 errmsg("WAL level not sufficient for making an online backup"),
10748 errhint("wal_level must be set to \"replica\" or \"logical\" at server start.")));
10749
10750 if (exclusive)
10751 {
10752 /*
10753 * At first, mark that we're now stopping an exclusive backup,
10754 * to ensure that there are no other sessions currently running
10755 * pg_start_backup() or pg_stop_backup().
10756 */
10757 WALInsertLockAcquireExclusive();
10758 if (XLogCtl->Insert.exclusiveBackupState != EXCLUSIVE_BACKUP_IN_PROGRESS)
10759 {
10760 WALInsertLockRelease();
10761 ereport(ERROR,
10762 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10763 errmsg("exclusive backup not in progress")));
10764 }
10765 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_STOPPING;
10766 WALInsertLockRelease();
10767
10768 /*
10769 * Remove backup_label. In case of failure, the state for an exclusive
10770 * backup is switched back to in-progress.
10771 */
10772 PG_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10773 {
10774 /*
10775 * Read the existing label file into memory.
10776 */
10777 struct stat statbuf;
10778 int r;
10779
10780 if (stat(BACKUP_LABEL_FILE, &statbuf))
10781 {
10782 /* should not happen per the upper checks */
10783 if (errno != ENOENT)
10784 ereport(ERROR,
10785 (errcode_for_file_access(),
10786 errmsg("could not stat file \"%s\": %m",
10787 BACKUP_LABEL_FILE)));
10788 ereport(ERROR,
10789 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10790 errmsg("a backup is not in progress")));
10791 }
10792
10793 lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
10794 if (!lfp)
10795 {
10796 ereport(ERROR,
10797 (errcode_for_file_access(),
10798 errmsg("could not read file \"%s\": %m",
10799 BACKUP_LABEL_FILE)));
10800 }
10801 labelfile = palloc(statbuf.st_size + 1);
10802 r = fread(labelfile, statbuf.st_size, 1, lfp);
10803 labelfile[statbuf.st_size] = '\0';
10804
10805 /*
10806 * Close and remove the backup label file
10807 */
10808 if (r != 1 || ferror(lfp) || FreeFile(lfp))
10809 ereport(ERROR,
10810 (errcode_for_file_access(),
10811 errmsg("could not read file \"%s\": %m",
10812 BACKUP_LABEL_FILE)));
10813 if (unlink(BACKUP_LABEL_FILE) != 0)
10814 ereport(ERROR,
10815 (errcode_for_file_access(),
10816 errmsg("could not remove file \"%s\": %m",
10817 BACKUP_LABEL_FILE)));
10818
10819 /*
10820 * Remove tablespace_map file if present, it is created only if there
10821 * are tablespaces.
10822 */
10823 unlink(TABLESPACE_MAP);
10824 }
10825 PG_END_ENSURE_ERROR_CLEANUP(pg_stop_backup_callback, (Datum) BoolGetDatum(exclusive));
10826 }
10827
10828 /*
10829 * OK to update backup counters, forcePageWrites and session-level lock.
10830 *
10831 * Note that CHECK_FOR_INTERRUPTS() must not occur while updating them.
10832 * Otherwise they can be updated inconsistently, and which might cause
10833 * do_pg_abort_backup() to fail.
10834 */
10835 WALInsertLockAcquireExclusive();
10836 if (exclusive)
10837 {
10838 XLogCtl->Insert.exclusiveBackupState = EXCLUSIVE_BACKUP_NONE;
10839 }
10840 else
10841 {
10842 /*
10843 * The user-visible pg_start/stop_backup() functions that operate on
10844 * exclusive backups can be called at any time, but for non-exclusive
10845 * backups, it is expected that each do_pg_start_backup() call is
10846 * matched by exactly one do_pg_stop_backup() call.
10847 */
10848 Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
10849 XLogCtl->Insert.nonExclusiveBackups--;
10850 }
10851
10852 if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
10853 XLogCtl->Insert.nonExclusiveBackups == 0)
10854 {
10855 XLogCtl->Insert.forcePageWrites = false;
10856 }
10857
10858 /*
10859 * Clean up session-level lock.
10860 *
10861 * You might think that WALInsertLockRelease() can be called
10862 * before cleaning up session-level lock because session-level
10863 * lock doesn't need to be protected with WAL insertion lock.
10864 * But since CHECK_FOR_INTERRUPTS() can occur in it,
10865 * session-level lock must be cleaned up before it.
10866 */
10867 sessionBackupState = SESSION_BACKUP_NONE;
10868
10869 WALInsertLockRelease();
10870
10871 /*
10872 * Read and parse the START WAL LOCATION line (this code is pretty crude,
10873 * but we are not expecting any variability in the file format).
10874 */
10875 if (sscanf(labelfile, "START WAL LOCATION: %X/%X (file %24s)%c",
10876 &hi, &lo, startxlogfilename,
10877 &ch) != 4 || ch != '\n')
10878 ereport(ERROR,
10879 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10880 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10881 startpoint = ((uint64) hi) << 32 | lo;
10882 remaining = strchr(labelfile, '\n') + 1; /* %n is not portable enough */
10883
10884 /*
10885 * Parse the BACKUP FROM line. If we are taking an online backup from the
10886 * standby, we confirm that the standby has not been promoted during the
10887 * backup.
10888 */
10889 ptr = strstr(remaining, "BACKUP FROM:");
10890 if (!ptr || sscanf(ptr, "BACKUP FROM: %19s\n", backupfrom) != 1)
10891 ereport(ERROR,
10892 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10893 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
10894 if (strcmp(backupfrom, "standby") == 0 && !backup_started_in_recovery)
10895 ereport(ERROR,
10896 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10897 errmsg("the standby was promoted during online backup"),
10898 errhint("This means that the backup being taken is corrupt "
10899 "and should not be used. "
10900 "Try taking another online backup.")));
10901
10902 /*
10903 * During recovery, we don't write an end-of-backup record. We assume that
10904 * pg_control was backed up last and its minimum recovery point can be
10905 * available as the backup end location. Since we don't have an
10906 * end-of-backup record, we use the pg_control value to check whether
10907 * we've reached the end of backup when starting recovery from this
10908 * backup. We have no way of checking if pg_control wasn't backed up last
10909 * however.
10910 *
10911 * We don't force a switch to new WAL file and wait for all the required
10912 * files to be archived. This is okay if we use the backup to start the
10913 * standby. But, if it's for an archive recovery, to ensure all the
10914 * required files are available, a user should wait for them to be
10915 * archived, or include them into the backup.
10916 *
10917 * We return the current minimum recovery point as the backup end
10918 * location. Note that it can be greater than the exact backup end
10919 * location if the minimum recovery point is updated after the backup of
10920 * pg_control. This is harmless for current uses.
10921 *
10922 * XXX currently a backup history file is for informational and debug
10923 * purposes only. It's not essential for an online backup. Furthermore,
10924 * even if it's created, it will not be archived during recovery because
10925 * an archiver is not invoked. So it doesn't seem worthwhile to write a
10926 * backup history file during recovery.
10927 */
10928 if (backup_started_in_recovery)
10929 {
10930 XLogRecPtr recptr;
10931
10932 /*
10933 * Check to see if all WAL replayed during online backup contain
10934 * full-page writes.
10935 */
10936 SpinLockAcquire(&XLogCtl->info_lck);
10937 recptr = XLogCtl->lastFpwDisableRecPtr;
10938 SpinLockRelease(&XLogCtl->info_lck);
10939
10940 if (startpoint <= recptr)
10941 ereport(ERROR,
10942 (errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
10943 errmsg("WAL generated with full_page_writes=off was replayed "
10944 "during online backup"),
10945 errhint("This means that the backup being taken on the standby "
10946 "is corrupt and should not be used. "
10947 "Enable full_page_writes and run CHECKPOINT on the master, "
10948 "and then try an online backup again.")));
10949
10950
10951 LWLockAcquire(ControlFileLock, LW_SHARED);
10952 stoppoint = ControlFile->minRecoveryPoint;
10953 stoptli = ControlFile->minRecoveryPointTLI;
10954 LWLockRelease(ControlFileLock);
10955
10956 if (stoptli_p)
10957 *stoptli_p = stoptli;
10958 return stoppoint;
10959 }
10960
10961 /*
10962 * Write the backup-end xlog record
10963 */
10964 XLogBeginInsert();
10965 XLogRegisterData((char *) (&startpoint), sizeof(startpoint));
10966 stoppoint = XLogInsert(RM_XLOG_ID, XLOG_BACKUP_END);
10967 stoptli = ThisTimeLineID;
10968
10969 /*
10970 * Force a switch to a new xlog segment file, so that the backup is valid
10971 * as soon as archiver moves out the current segment file.
10972 */
10973 RequestXLogSwitch();
10974
10975 XLByteToPrevSeg(stoppoint, _logSegNo);
10976 XLogFileName(stopxlogfilename, ThisTimeLineID, _logSegNo);
10977
10978 /* Use the log timezone here, not the session timezone */
10979 stamp_time = (pg_time_t) time(NULL);
10980 pg_strftime(strfbuf, sizeof(strfbuf),
10981 "%Y-%m-%d %H:%M:%S %Z",
10982 pg_localtime(&stamp_time, log_timezone));
10983
10984 /*
10985 * Write the backup history file
10986 */
10987 XLByteToSeg(startpoint, _logSegNo);
10988 BackupHistoryFilePath(histfilepath, ThisTimeLineID, _logSegNo,
10989 (uint32) (startpoint % XLogSegSize));
10990 fp = AllocateFile(histfilepath, "w");
10991 if (!fp)
10992 ereport(ERROR,
10993 (errcode_for_file_access(),
10994 errmsg("could not create file \"%s\": %m",
10995 histfilepath)));
10996 fprintf(fp, "START WAL LOCATION: %X/%X (file %s)\n",
10997 (uint32) (startpoint >> 32), (uint32) startpoint, startxlogfilename);
10998 fprintf(fp, "STOP WAL LOCATION: %X/%X (file %s)\n",
10999 (uint32) (stoppoint >> 32), (uint32) stoppoint, stopxlogfilename);
11000 /* transfer remaining lines from label to history file */
11001 fprintf(fp, "%s", remaining);
11002 fprintf(fp, "STOP TIME: %s\n", strfbuf);
11003 if (fflush(fp) || ferror(fp) || FreeFile(fp))
11004 ereport(ERROR,
11005 (errcode_for_file_access(),
11006 errmsg("could not write file \"%s\": %m",
11007 histfilepath)));
11008
11009 /*
11010 * Clean out any no-longer-needed history files. As a side effect, this
11011 * will post a .ready file for the newly created history file, notifying
11012 * the archiver that history file may be archived immediately.
11013 */
11014 CleanupBackupHistory();
11015
11016 /*
11017 * If archiving is enabled, wait for all the required WAL files to be
11018 * archived before returning. If archiving isn't enabled, the required WAL
11019 * needs to be transported via streaming replication (hopefully with
11020 * wal_keep_segments set high enough), or some more exotic mechanism like
11021 * polling and copying files from pg_xlog with script. We have no
11022 * knowledge of those mechanisms, so it's up to the user to ensure that he
11023 * gets all the required WAL.
11024 *
11025 * We wait until both the last WAL file filled during backup and the
11026 * history file have been archived, and assume that the alphabetic sorting
11027 * property of the WAL files ensures any earlier WAL files are safely
11028 * archived as well.
11029 *
11030 * We wait forever, since archive_command is supposed to work and we
11031 * assume the admin wanted his backup to work completely. If you don't
11032 * wish to wait, you can set statement_timeout. Also, some notices are
11033 * issued to clue in anyone who might be doing this interactively.
11034 */
11035 if (waitforarchive && XLogArchivingActive())
11036 {
11037 XLByteToPrevSeg(stoppoint, _logSegNo);
11038 XLogFileName(lastxlogfilename, ThisTimeLineID, _logSegNo);
11039
11040 XLByteToSeg(startpoint, _logSegNo);
11041 BackupHistoryFileName(histfilename, ThisTimeLineID, _logSegNo,
11042 (uint32) (startpoint % XLogSegSize));
11043
11044 seconds_before_warning = 60;
11045 waits = 0;
11046
11047 while (XLogArchiveIsBusy(lastxlogfilename) ||
11048 XLogArchiveIsBusy(histfilename))
11049 {
11050 CHECK_FOR_INTERRUPTS();
11051
11052 if (!reported_waiting && waits > 5)
11053 {
11054 ereport(NOTICE,
11055 (errmsg("pg_stop_backup cleanup done, waiting for required WAL segments to be archived")));
11056 reported_waiting = true;
11057 }
11058
11059 pg_usleep(1000000L);
11060
11061 if (++waits >= seconds_before_warning)
11062 {
11063 seconds_before_warning *= 2; /* This wraps in >10 years... */
11064 ereport(WARNING,
11065 (errmsg("pg_stop_backup still waiting for all required WAL segments to be archived (%d seconds elapsed)",
11066 waits),
11067 errhint("Check that your archive_command is executing properly. "
11068 "pg_stop_backup can be canceled safely, "
11069 "but the database backup will not be usable without all the WAL segments.")));
11070 }
11071 }
11072
11073 ereport(NOTICE,
11074 (errmsg("pg_stop_backup complete, all required WAL segments have been archived")));
11075 }
11076 else if (waitforarchive)
11077 ereport(NOTICE,
11078 (errmsg("WAL archiving is not enabled; you must ensure that all required WAL segments are copied through other means to complete the backup")));
11079
11080 /*
11081 * We're done. As a convenience, return the ending WAL location.
11082 */
11083 if (stoptli_p)
11084 *stoptli_p = stoptli;
11085 return stoppoint;
11086 }
11087
11088
11089 /*
11090 * do_pg_abort_backup: abort a running backup
11091 *
11092 * This does just the most basic steps of do_pg_stop_backup(), by taking the
11093 * system out of backup mode, thus making it a lot more safe to call from
11094 * an error handler.
11095 *
11096 * NB: This is only for aborting a non-exclusive backup that doesn't write
11097 * backup_label. A backup started with pg_start_backup() needs to be finished
11098 * with pg_stop_backup().
11099 */
void
do_pg_abort_backup(void)
{
	/*
	 * Quick exit if session is not keeping around a non-exclusive backup
	 * already started.
	 */
	if (sessionBackupState == SESSION_BACKUP_NONE)
		return;

	/*
	 * Decrement the shared count of in-progress non-exclusive backups.  The
	 * asserts are sanity checks: we only get here when this session started
	 * a non-exclusive backup that has not yet been stopped.
	 */
	WALInsertLockAcquireExclusive();
	Assert(XLogCtl->Insert.nonExclusiveBackups > 0);
	Assert(sessionBackupState == SESSION_BACKUP_NON_EXCLUSIVE);
	XLogCtl->Insert.nonExclusiveBackups--;

	/* If no backup of any kind remains, stop forcing full-page writes. */
	if (XLogCtl->Insert.exclusiveBackupState == EXCLUSIVE_BACKUP_NONE &&
		XLogCtl->Insert.nonExclusiveBackups == 0)
	{
		XLogCtl->Insert.forcePageWrites = false;
	}
	WALInsertLockRelease();
}
11122
11123 /*
11124 * Get latest redo apply position.
11125 *
11126 * Exported to allow WALReceiver to read the pointer directly.
11127 */
11128 XLogRecPtr
GetXLogReplayRecPtr(TimeLineID * replayTLI)11129 GetXLogReplayRecPtr(TimeLineID *replayTLI)
11130 {
11131 XLogRecPtr recptr;
11132 TimeLineID tli;
11133
11134 SpinLockAcquire(&XLogCtl->info_lck);
11135 recptr = XLogCtl->lastReplayedEndRecPtr;
11136 tli = XLogCtl->lastReplayedTLI;
11137 SpinLockRelease(&XLogCtl->info_lck);
11138
11139 if (replayTLI)
11140 *replayTLI = tli;
11141 return recptr;
11142 }
11143
11144 /*
11145 * Get latest WAL insert pointer
11146 */
11147 XLogRecPtr
GetXLogInsertRecPtr(void)11148 GetXLogInsertRecPtr(void)
11149 {
11150 XLogCtlInsert *Insert = &XLogCtl->Insert;
11151 uint64 current_bytepos;
11152
11153 SpinLockAcquire(&Insert->insertpos_lck);
11154 current_bytepos = Insert->CurrBytePos;
11155 SpinLockRelease(&Insert->insertpos_lck);
11156
11157 return XLogBytePosToRecPtr(current_bytepos);
11158 }
11159
11160 /*
11161 * Get latest WAL write pointer
11162 */
XLogRecPtr
GetXLogWriteRecPtr(void)
{
	/*
	 * Refresh this backend's local LogwrtResult copy from shared memory
	 * under the info spinlock, then report the write pointer from it.
	 * Note that the local (file-scope) copy is deliberately updated as a
	 * side effect.
	 */
	SpinLockAcquire(&XLogCtl->info_lck);
	LogwrtResult = XLogCtl->LogwrtResult;
	SpinLockRelease(&XLogCtl->info_lck);

	return LogwrtResult.Write;
}
11172
11173 /*
11174 * Returns the redo pointer of the last checkpoint or restartpoint. This is
11175 * the oldest point in WAL that we still need, if we have to restart recovery.
11176 */
void
GetOldestRestartPoint(XLogRecPtr *oldrecptr, TimeLineID *oldtli)
{
	/*
	 * Read the REDO pointer and timeline of the last checkpoint (or
	 * restartpoint) from the pg_control copy in shared memory, under a
	 * shared lock since we only read.
	 */
	LWLockAcquire(ControlFileLock, LW_SHARED);
	*oldrecptr = ControlFile->checkPointCopy.redo;
	*oldtli = ControlFile->checkPointCopy.ThisTimeLineID;
	LWLockRelease(ControlFileLock);
}
11185
11186 /*
11187 * read_backup_label: check to see if a backup_label file is present
11188 *
11189 * If we see a backup_label during recovery, we assume that we are recovering
11190 * from a backup dump file, and we therefore roll forward from the checkpoint
11191 * identified by the label file, NOT what pg_control says. This avoids the
11192 * problem that pg_control might have been archived one or more checkpoints
11193 * later than the start of the dump, and so if we rely on it as the start
11194 * point, we will fail to restore a consistent database state.
11195 *
11196 * Returns TRUE if a backup_label was found (and fills the checkpoint
11197 * location and its REDO location into *checkPointLoc and RedoStartLSN,
11198 * respectively); returns FALSE if not. If this backup_label came from a
11199 * streamed backup, *backupEndRequired is set to TRUE. If this backup_label
11200 * was created during recovery, *backupFromStandby is set to TRUE.
11201 */
static bool
read_backup_label(XLogRecPtr *checkPointLoc, bool *backupEndRequired,
				  bool *backupFromStandby)
{
	char		startxlogfilename[MAXFNAMELEN];
	TimeLineID	tli;
	FILE	   *lfp;
	char		ch;
	char		backuptype[20];
	char		backupfrom[20];
	uint32		hi,
				lo;

	/* Default outputs: plain (non-streamed, non-standby) backup. */
	*backupEndRequired = false;
	*backupFromStandby = false;

	/*
	 * See if label file is present
	 */
	lfp = AllocateFile(BACKUP_LABEL_FILE, "r");
	if (!lfp)
	{
		/* ENOENT just means there is no backup_label: not an error. */
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							BACKUP_LABEL_FILE)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the START WAL LOCATION and CHECKPOINT lines (this code
	 * is pretty crude, but we are not expecting any variability in the file
	 * format).  Both lines are mandatory; a parse failure is FATAL since we
	 * cannot safely pick a recovery start point without them.
	 */
	if (fscanf(lfp, "START WAL LOCATION: %X/%X (file %08X%16s)%c",
			   &hi, &lo, &tli, startxlogfilename, &ch) != 5 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	/* Reassemble the 64-bit LSN from its two 32-bit halves. */
	RedoStartLSN = ((uint64) hi) << 32 | lo;
	if (fscanf(lfp, "CHECKPOINT LOCATION: %X/%X%c",
			   &hi, &lo, &ch) != 3 || ch != '\n')
		ereport(FATAL,
				(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
				 errmsg("invalid data in file \"%s\"", BACKUP_LABEL_FILE)));
	*checkPointLoc = ((uint64) hi) << 32 | lo;

	/*
	 * BACKUP METHOD and BACKUP FROM lines are new in 9.2. We can't restore
	 * from an older backup anyway, but since the information on it is not
	 * strictly required, don't error out if it's missing for some reason.
	 */
	if (fscanf(lfp, "BACKUP METHOD: %19s\n", backuptype) == 1)
	{
		if (strcmp(backuptype, "streamed") == 0)
			*backupEndRequired = true;
	}

	if (fscanf(lfp, "BACKUP FROM: %19s\n", backupfrom) == 1)
	{
		if (strcmp(backupfrom, "standby") == 0)
			*backupFromStandby = true;
	}

	/* FreeFile also closes the file; check for read errors first. */
	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						BACKUP_LABEL_FILE)));

	return true;
}
11275
11276 /*
11277 * read_tablespace_map: check to see if a tablespace_map file is present
11278 *
11279 * If we see a tablespace_map file during recovery, we assume that we are
11280 * recovering from a backup dump file, and we therefore need to create symlinks
11281 * as per the information present in tablespace_map file.
11282 *
11283 * Returns TRUE if a tablespace_map file was found (and fills the link
11284 * information for all the tablespace links present in file); returns FALSE
11285 * if not.
11286 */
static bool
read_tablespace_map(List **tablespaces)
{
	tablespaceinfo *ti;
	FILE	   *lfp;
	char		tbsoid[MAXPGPATH];
	char	   *tbslinkpath;
	char		str[MAXPGPATH];
	int			ch,
				prev_ch = -1,	/* previous character, for escape detection */
				i = 0,			/* current fill position in str[] */
				n;

	/*
	 * See if tablespace_map file is present
	 */
	lfp = AllocateFile(TABLESPACE_MAP, "r");
	if (!lfp)
	{
		/* ENOENT just means there is no tablespace_map: not an error. */
		if (errno != ENOENT)
			ereport(FATAL,
					(errcode_for_file_access(),
					 errmsg("could not read file \"%s\": %m",
							TABLESPACE_MAP)));
		return false;			/* it's not there, all is fine */
	}

	/*
	 * Read and parse the link name and path lines from tablespace_map file
	 * (this code is pretty crude, but we are not expecting any variability in
	 * the file format). While taking backup we embed escape character '\\'
	 * before newline in tablespace path, so that during reading of
	 * tablespace_map file, we could distinguish newline in tablespace path
	 * and end of line. Now while reading tablespace_map file, remove the
	 * escape character that has been added in tablespace path during backup.
	 */
	while ((ch = fgetc(lfp)) != EOF)
	{
		/* An unescaped newline/CR terminates one "OID path" entry. */
		if ((ch == '\n' || ch == '\r') && prev_ch != '\\')
		{
			str[i] = '\0';
			/* %n records where the path part begins, after the OID token */
			if (sscanf(str, "%s %n", tbsoid, &n) != 1)
				ereport(FATAL,
						(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
						 errmsg("invalid data in file \"%s\"", TABLESPACE_MAP)));
			tbslinkpath = str + n;
			i = 0;

			ti = palloc(sizeof(tablespaceinfo));
			ti->oid = pstrdup(tbsoid);
			ti->path = pstrdup(tbslinkpath);

			*tablespaces = lappend(*tablespaces, ti);
			continue;
		}
		/* Escaped newline: overwrite the backslash with the literal char. */
		else if ((ch == '\n' || ch == '\r') && prev_ch == '\\')
			str[i - 1] = ch;
		/* Ordinary character; silently drop input past the buffer limit. */
		else if (i < sizeof(str) - 1)
			str[i++] = ch;
		prev_ch = ch;
	}

	/* FreeFile also closes the file; check for read errors first. */
	if (ferror(lfp) || FreeFile(lfp))
		ereport(FATAL,
				(errcode_for_file_access(),
				 errmsg("could not read file \"%s\": %m",
						TABLESPACE_MAP)));

	return true;
}
11357
11358 /*
11359 * Error context callback for errors occurring during rm_redo().
11360 */
11361 static void
rm_redo_error_callback(void * arg)11362 rm_redo_error_callback(void *arg)
11363 {
11364 XLogReaderState *record = (XLogReaderState *) arg;
11365 StringInfoData buf;
11366
11367 initStringInfo(&buf);
11368 xlog_outdesc(&buf, record);
11369
11370 /* translator: %s is an XLog record description */
11371 errcontext("xlog redo at %X/%X for %s",
11372 (uint32) (record->ReadRecPtr >> 32),
11373 (uint32) record->ReadRecPtr,
11374 buf.data);
11375
11376 pfree(buf.data);
11377 }
11378
11379 /*
11380 * BackupInProgress: check if online backup mode is active
11381 *
11382 * This is done by checking for existence of the "backup_label" file.
11383 */
11384 bool
BackupInProgress(void)11385 BackupInProgress(void)
11386 {
11387 struct stat stat_buf;
11388
11389 return (stat(BACKUP_LABEL_FILE, &stat_buf) == 0);
11390 }
11391
11392 /*
11393 * CancelBackup: rename the "backup_label" and "tablespace_map"
11394 * files to cancel backup mode
11395 *
11396 * If the "backup_label" file exists, it will be renamed to "backup_label.old".
11397 * Similarly, if the "tablespace_map" file exists, it will be renamed to
11398 * "tablespace_map.old".
11399 *
11400 * Note that this will render an online backup in progress
11401 * useless. To correctly finish an online backup, pg_stop_backup must be
11402 * called.
11403 */
void
CancelBackup(void)
{
	struct stat stat_buf;

	/* if the backup_label file is not there, return */
	if (stat(BACKUP_LABEL_FILE, &stat_buf) < 0)
		return;

	/* remove leftover file from previously canceled backup if it exists */
	unlink(BACKUP_LABEL_OLD);

	/*
	 * durable_rename() fsyncs as needed; on failure we warn and bail out
	 * without touching tablespace_map, leaving backup mode in effect.
	 */
	if (durable_rename(BACKUP_LABEL_FILE, BACKUP_LABEL_OLD, DEBUG1) != 0)
	{
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode was not canceled"),
				 errdetail("File \"%s\" could not be renamed to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		return;
	}

	/* if the tablespace_map file is not there, return */
	if (stat(TABLESPACE_MAP, &stat_buf) < 0)
	{
		/* No tablespace_map (no tablespaces): cancelation is complete. */
		ereport(LOG,
				(errmsg("online backup mode canceled"),
				 errdetail("File \"%s\" was renamed to \"%s\".",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD)));
		return;
	}

	/* remove leftover file from previously canceled backup if it exists */
	unlink(TABLESPACE_MAP_OLD);

	if (durable_rename(TABLESPACE_MAP, TABLESPACE_MAP_OLD, DEBUG1) == 0)
	{
		ereport(LOG,
				(errmsg("online backup mode canceled"),
				 errdetail("Files \"%s\" and \"%s\" were renamed to "
						   "\"%s\" and \"%s\", respectively.",
						   BACKUP_LABEL_FILE, TABLESPACE_MAP,
						   BACKUP_LABEL_OLD, TABLESPACE_MAP_OLD)));
	}
	else
	{
		/*
		 * backup_label was renamed but tablespace_map was not; backup mode
		 * is still canceled, so report LOG-level with a WARNING wrapper.
		 */
		ereport(WARNING,
				(errcode_for_file_access(),
				 errmsg("online backup mode canceled"),
				 errdetail("File \"%s\" was renamed to \"%s\", but "
						   "file \"%s\" could not be renamed to \"%s\": %m.",
						   BACKUP_LABEL_FILE, BACKUP_LABEL_OLD,
						   TABLESPACE_MAP, TABLESPACE_MAP_OLD)));
	}
}
11459
11460 /*
11461 * Read the XLOG page containing RecPtr into readBuf (if not read already).
11462 * Returns number of bytes read, if the page is read successfully, or -1
11463 * in case of errors. When errors occur, they are ereport'ed, but only
11464 * if they have not been previously reported.
11465 *
11466 * This is responsible for restoring files from archive as needed, as well
11467 * as for waiting for the requested WAL record to arrive in standby mode.
11468 *
11469 * 'emode' specifies the log level used for reporting "file not found" or
11470 * "end of WAL" situations in archive recovery, or in standby mode when a
11471 * trigger file is found. If set to WARNING or below, XLogPageRead() returns
11472 * false in those situations, on higher log levels the ereport() won't
11473 * return.
11474 *
11475 * In standby mode, if after a successful return of XLogPageRead() the
11476 * caller finds the record it's interested in to be broken, it should
11477 * ereport the error with the level determined by
11478 * emode_for_corrupt_record(), and then set lastSourceFailed
11479 * and call XLogPageRead() again with the same arguments. This lets
11480 * XLogPageRead() to try fetching the record from another source, or to
11481 * sleep and retry.
11482 */
static int
XLogPageRead(XLogReaderState *xlogreader, XLogRecPtr targetPagePtr, int reqLen,
			 XLogRecPtr targetRecPtr, char *readBuf, TimeLineID *readTLI)
{
	XLogPageReadPrivate *private =
	(XLogPageReadPrivate *) xlogreader->private_data;
	int			emode = private->emode;
	uint32		targetPageOff;	/* offset of target page within its segment */
	XLogSegNo	targetSegNo PG_USED_FOR_ASSERTS_ONLY;

	XLByteToSeg(targetPagePtr, targetSegNo);
	targetPageOff = targetPagePtr % XLogSegSize;

	/*
	 * See if we need to switch to a new segment because the requested record
	 * is not in the currently open one.
	 */
	if (readFile >= 0 && !XLByteInSeg(targetPagePtr, readSegNo))
	{
		/*
		 * Request a restartpoint if we've replayed too much xlog since the
		 * last one.
		 */
		if (bgwriterLaunched)
		{
			if (XLogCheckpointNeeded(readSegNo))
			{
				/*
				 * Refresh our notion of the redo pointer, then re-check, so
				 * we don't request a checkpoint based on stale information.
				 */
				(void) GetRedoRecPtr();
				if (XLogCheckpointNeeded(readSegNo))
					RequestCheckpoint(CHECKPOINT_CAUSE_XLOG);
			}
		}

		/* Close the old segment; a new one will be opened below. */
		close(readFile);
		readFile = -1;
		readSource = 0;
	}

	XLByteToSeg(targetPagePtr, readSegNo);

retry:
	/* See if we need to retrieve more data */
	if (readFile < 0 ||
		(readSource == XLOG_FROM_STREAM &&
		 receivedUpto < targetPagePtr + reqLen))
	{
		if (!WaitForWALToBecomeAvailable(targetPagePtr + reqLen,
										 private->randAccess,
										 private->fetching_ckpt,
										 targetRecPtr))
		{
			/* WAL will not become available: reset state and give up. */
			if (readFile >= 0)
				close(readFile);
			readFile = -1;
			readLen = 0;
			readSource = 0;

			return -1;
		}
	}

	/*
	 * At this point, we have the right segment open and if we're streaming we
	 * know the requested record is in it.
	 */
	Assert(readFile != -1);

	/*
	 * If the current segment is being streamed from master, calculate how
	 * much of the current page we have received already. We know the
	 * requested record has been received, but this is for the benefit of
	 * future calls, to allow quick exit at the top of this function.
	 */
	if (readSource == XLOG_FROM_STREAM)
	{
		if (((targetPagePtr) / XLOG_BLCKSZ) != (receivedUpto / XLOG_BLCKSZ))
			readLen = XLOG_BLCKSZ;
		else
			readLen = receivedUpto % XLogSegSize - targetPageOff;
	}
	else
		readLen = XLOG_BLCKSZ;

	/* Read the requested page */
	readOff = targetPageOff;
	if (lseek(readFile, (off_t) readOff, SEEK_SET) < 0)
	{
		char		fname[MAXFNAMELEN];
		/* XLogFileName() might clobber errno; save it for the %m below */
		int			save_errno = errno;

		XLogFileName(fname, curFileTLI, readSegNo);
		errno = save_errno;
		ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
				(errcode_for_file_access(),
				 errmsg("could not seek in log segment %s to offset %u: %m",
						fname, readOff)));
		goto next_record_is_invalid;
	}

	if (read(readFile, readBuf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
	{
		char		fname[MAXFNAMELEN];
		/* XLogFileName() might clobber errno; save it for the %m below */
		int			save_errno = errno;

		XLogFileName(fname, curFileTLI, readSegNo);
		errno = save_errno;
		ereport(emode_for_corrupt_record(emode, targetPagePtr + reqLen),
				(errcode_for_file_access(),
				 errmsg("could not read from log segment %s, offset %u: %m",
						fname, readOff)));
		goto next_record_is_invalid;
	}

	Assert(targetSegNo == readSegNo);
	Assert(targetPageOff == readOff);
	Assert(reqLen <= readLen);

	*readTLI = curFileTLI;

	/*
	 * Check the page header immediately, so that we can retry immediately if
	 * it's not valid. This may seem unnecessary, because XLogReadRecord()
	 * validates the page header anyway, and would propagate the failure up to
	 * ReadRecord(), which would retry. However, there's a corner case with
	 * continuation records, if a record is split across two pages such that
	 * we would need to read the two pages from different sources. For
	 * example, imagine a scenario where a streaming replica is started up,
	 * and replay reaches a record that's split across two WAL segments. The
	 * first page is only available locally, in pg_wal, because it's already
	 * been recycled in the master. The second page, however, is not present
	 * in pg_wal, and we should stream it from the master. There is a recycled
	 * WAL segment present in pg_wal, with garbage contents, however. We would
	 * read the first page from the local WAL segment, but when reading the
	 * second page, we would read the bogus, recycled, WAL segment. If we
	 * didn't catch that case here, we would never recover, because
	 * ReadRecord() would retry reading the whole record from the beginning.
	 *
	 * Of course, this only catches errors in the page header, which is what
	 * happens in the case of a recycled WAL segment. Other kinds of errors or
	 * corruption still has the same problem. But this at least fixes the
	 * common case, which can happen as part of normal operation.
	 *
	 * Validating the page header is cheap enough that doing it twice
	 * shouldn't be a big deal from a performance point of view.
	 */
	if (!XLogReaderValidatePageHeader(xlogreader, targetPagePtr, readBuf))
	{
		/* reset any error XLogReaderValidatePageHeader() might have set */
		xlogreader->errormsg_buf[0] = '\0';
		goto next_record_is_invalid;
	}

	return readLen;

next_record_is_invalid:
	/* Flag the failure so the state machine can try another WAL source. */
	lastSourceFailed = true;

	if (readFile >= 0)
		close(readFile);
	readFile = -1;
	readLen = 0;
	readSource = 0;

	/* In standby-mode, keep trying */
	if (StandbyMode)
		goto retry;
	else
		return -1;
}
11652
11653 /*
11654 * Open the WAL segment containing WAL position 'RecPtr'.
11655 *
11656 * The segment can be fetched via restore_command, or via walreceiver having
11657 * streamed the record, or it can already be present in pg_xlog. Checking
11658 * pg_xlog is mainly for crash recovery, but it will be polled in standby mode
11659 * too, in case someone copies a new segment directly to pg_xlog. That is not
11660 * documented or recommended, though.
11661 *
11662 * If 'fetching_ckpt' is true, we're fetching a checkpoint record, and should
11663 * prepare to read WAL starting from RedoStartLSN after this.
11664 *
11665 * 'RecPtr' might not point to the beginning of the record we're interested
11666 * in, it might also point to the page or segment header. In that case,
11667 * 'tliRecPtr' is the position of the WAL record we're interested in. It is
11668 * used to decide which timeline to stream the requested WAL from.
11669 *
11670 * If the record is not immediately available, the function returns false
11671 * if we're not in standby mode. In standby mode, waits for it to become
11672 * available.
11673 *
11674 * When the requested record becomes available, the function opens the file
11675 * containing it (if not open already), and returns true. When end of standby
11676 * mode is triggered by the user, and there is no more WAL available, returns
11677 * false.
11678 */
static bool
WaitForWALToBecomeAvailable(XLogRecPtr RecPtr, bool randAccess,
							bool fetching_ckpt, XLogRecPtr tliRecPtr)
{
	/* Time of the last failed read attempt, used to throttle retries. */
	static TimestampTz last_fail_time = 0;
	TimestampTz now;

	/*-------
	 * Standby mode is implemented by a state machine:
	 *
	 * 1. Read from either archive or pg_xlog (XLOG_FROM_ARCHIVE), or just
	 *	  pg_xlog (XLOG_FROM_XLOG)
	 * 2. Check trigger file
	 * 3. Read from primary server via walreceiver (XLOG_FROM_STREAM)
	 * 4. Rescan timelines
	 * 5. Sleep wal_retrieve_retry_interval milliseconds, and loop back to 1.
	 *
	 * Failure to read from the current source advances the state machine to
	 * the next state.
	 *
	 * 'currentSource' indicates the current state. There are no currentSource
	 * values for "check trigger", "rescan timelines", and "sleep" states,
	 * those actions are taken when reading from the previous source fails, as
	 * part of advancing to the next state.
	 *
	 * If standby mode is turned off while reading WAL from stream, we move
	 * to XLOG_FROM_ARCHIVE and reset lastSourceFailed, to force fetching
	 * the files (which would be required at end of recovery, e.g., timeline
	 * history file) from archive or pg_xlog. We don't need to kill WAL receiver
	 * here because it's already stopped when standby mode is turned off at
	 * the end of recovery.
	 *-------
	 */
	if (!InArchiveRecovery)
		currentSource = XLOG_FROM_PG_XLOG;
	else if (currentSource == 0 ||
			 (!StandbyMode && currentSource == XLOG_FROM_STREAM))
	{
		lastSourceFailed = false;
		currentSource = XLOG_FROM_ARCHIVE;
	}

	for (;;)
	{
		/* Remember source on entry so we can log a source switch below. */
		int			oldSource = currentSource;

		/*
		 * First check if we failed to read from the current source, and
		 * advance the state machine if so. The failure to read might've
		 * happened outside this function, e.g when a CRC check fails on a
		 * record, or within this loop.
		 */
		if (lastSourceFailed)
		{
			switch (currentSource)
			{
				case XLOG_FROM_ARCHIVE:
				case XLOG_FROM_PG_XLOG:

					/*
					 * Check to see if the trigger file exists. Note that we
					 * do this only after failure, so when you create the
					 * trigger file, we still finish replaying as much as we
					 * can from archive and pg_xlog before failover.
					 */
					if (StandbyMode && CheckForStandbyTrigger())
					{
						ShutdownWalRcv();
						return false;
					}

					/*
					 * Not in standby mode, and we've now tried the archive
					 * and pg_xlog.
					 */
					if (!StandbyMode)
						return false;

					/*
					 * If primary_conninfo is set, launch walreceiver to try
					 * to stream the missing WAL.
					 *
					 * If fetching_ckpt is TRUE, RecPtr points to the initial
					 * checkpoint location. In that case, we use RedoStartLSN
					 * as the streaming start position instead of RecPtr, so
					 * that when we later jump backwards to start redo at
					 * RedoStartLSN, we will have the logs streamed already.
					 */
					if (PrimaryConnInfo)
					{
						XLogRecPtr	ptr;
						TimeLineID	tli;

						if (fetching_ckpt)
						{
							ptr = RedoStartLSN;
							tli = ControlFile->checkPointCopy.ThisTimeLineID;
						}
						else
						{
							ptr = RecPtr;

							/*
							 * Use the record begin position to determine the
							 * TLI, rather than the position we're reading.
							 */
							tli = tliOfPointInHistory(tliRecPtr, expectedTLEs);

							if (curFileTLI > 0 && tli < curFileTLI)
								elog(ERROR, "according to history file, WAL location %X/%X belongs to timeline %u, but previous recovered WAL file came from timeline %u",
									 (uint32) (tliRecPtr >> 32),
									 (uint32) tliRecPtr,
									 tli, curFileTLI);
						}
						curFileTLI = tli;
						RequestXLogStreaming(tli, ptr, PrimaryConnInfo,
											 PrimarySlotName);

						/*
						 * Invalidate the cached flush position; it is
						 * re-fetched from the walreceiver via
						 * GetWalRcvWriteRecPtr() in the XLOG_FROM_STREAM
						 * state below.
						 */
						receivedUpto = 0;
					}

					/*
					 * Move to XLOG_FROM_STREAM state in either case. We'll
					 * get immediate failure if we didn't launch walreceiver,
					 * and move on to the next state.
					 */
					currentSource = XLOG_FROM_STREAM;
					break;

				case XLOG_FROM_STREAM:

					/*
					 * Failure while streaming. Most likely, we got here
					 * because streaming replication was terminated, or
					 * promotion was triggered. But we also get here if we
					 * find an invalid record in the WAL streamed from master,
					 * in which case something is seriously wrong. There's
					 * little chance that the problem will just go away, but
					 * PANIC is not good for availability either, especially
					 * in hot standby mode. So, we treat that the same as
					 * disconnection, and retry from archive/pg_xlog again.
					 * The WAL in the archive should be identical to what was
					 * streamed, so it's unlikely that it helps, but one can
					 * hope...
					 */

					/*
					 * We should be able to move to XLOG_FROM_STREAM
					 * only in standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * Before we leave XLOG_FROM_STREAM state, make sure that
					 * walreceiver is not active, so that it won't overwrite
					 * WAL that we restore from archive.
					 */
					if (WalRcvStreaming())
						ShutdownWalRcv();

					/*
					 * Before we sleep, re-scan for possible new timelines if
					 * we were requested to recover to the latest timeline.
					 */
					if (recoveryTargetIsLatest)
					{
						if (rescanLatestTimeLine())
						{
							currentSource = XLOG_FROM_ARCHIVE;
							break;
						}
					}

					/*
					 * XLOG_FROM_STREAM is the last state in our state
					 * machine, so we've exhausted all the options for
					 * obtaining the requested WAL. We're going to loop back
					 * and retry from the archive, but if it hasn't been long
					 * since last attempt, sleep wal_retrieve_retry_interval
					 * milliseconds to avoid busy-waiting.
					 */
					now = GetCurrentTimestamp();
					if (!TimestampDifferenceExceeds(last_fail_time, now,
												wal_retrieve_retry_interval))
					{
						long		wait_time;

						/* Sleep only for the remainder of the interval. */
						wait_time = wal_retrieve_retry_interval -
							TimestampDifferenceMilliseconds(last_fail_time, now);

						WaitLatch(&XLogCtl->recoveryWakeupLatch,
								  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
								  wait_time);
						ResetLatch(&XLogCtl->recoveryWakeupLatch);
						now = GetCurrentTimestamp();

						/* Handle interrupt signals of startup process */
						HandleStartupProcInterrupts();
					}
					/* Restart the retry timer from this attempt. */
					last_fail_time = now;
					currentSource = XLOG_FROM_ARCHIVE;
					break;

				default:
					elog(ERROR, "unexpected WAL source %d", currentSource);
			}
		}
		else if (currentSource == XLOG_FROM_PG_XLOG)
		{
			/*
			 * We just successfully read a file in pg_xlog. We prefer files in
			 * the archive over ones in pg_xlog, so try the next file again
			 * from the archive first.
			 */
			if (InArchiveRecovery)
				currentSource = XLOG_FROM_ARCHIVE;
		}

		if (currentSource != oldSource)
			elog(DEBUG2, "switched WAL source from %s to %s after %s",
				 xlogSourceNames[oldSource], xlogSourceNames[currentSource],
				 lastSourceFailed ? "failure" : "success");

		/*
		 * We've now handled possible failure. Try to read from the chosen
		 * source.
		 */
		lastSourceFailed = false;

		switch (currentSource)
		{
			case XLOG_FROM_ARCHIVE:
			case XLOG_FROM_PG_XLOG:
				/*
				 * WAL receiver must not be running when reading WAL from
				 * archive or pg_xlog.
				 */
				Assert(!WalRcvStreaming());

				/* Close any old file we might have open. */
				if (readFile >= 0)
				{
					close(readFile);
					readFile = -1;
				}
				/* Reset curFileTLI if random fetch. */
				if (randAccess)
					curFileTLI = 0;

				/*
				 * Try to restore the file from archive, or read an existing
				 * file from pg_xlog.
				 */
				readFile = XLogFileReadAnyTLI(readSegNo, DEBUG2,
						 currentSource == XLOG_FROM_ARCHIVE ? XLOG_FROM_ANY :
											  currentSource);
				if (readFile >= 0)
					return true;	/* success! */

				/*
				 * Nope, not found in archive or pg_xlog.
				 */
				lastSourceFailed = true;
				break;

			case XLOG_FROM_STREAM:
				{
					bool		havedata;

					/*
					 * We should be able to move to XLOG_FROM_STREAM
					 * only in standby mode.
					 */
					Assert(StandbyMode);

					/*
					 * Check if WAL receiver is still active.
					 */
					if (!WalRcvStreaming())
					{
						lastSourceFailed = true;
						break;
					}

					/*
					 * Walreceiver is active, so see if new data has arrived.
					 *
					 * We only advance XLogReceiptTime when we obtain fresh
					 * WAL from walreceiver and observe that we had already
					 * processed everything before the most recent "chunk"
					 * that it flushed to disk. In steady state where we are
					 * keeping up with the incoming data, XLogReceiptTime will
					 * be updated on each cycle. When we are behind,
					 * XLogReceiptTime will not advance, so the grace time
					 * allotted to conflicting queries will decrease.
					 */
					if (RecPtr < receivedUpto)
						havedata = true;
					else
					{
						XLogRecPtr	latestChunkStart;

						receivedUpto = GetWalRcvWriteRecPtr(&latestChunkStart, &receiveTLI);
						if (RecPtr < receivedUpto && receiveTLI == curFileTLI)
						{
							havedata = true;
							if (latestChunkStart <= RecPtr)
							{
								XLogReceiptTime = GetCurrentTimestamp();
								SetCurrentChunkStartTime(XLogReceiptTime);
							}
						}
						else
							havedata = false;
					}
					if (havedata)
					{
						/*
						 * Great, streamed far enough. Open the file if it's
						 * not open already. Also read the timeline history
						 * file if we haven't initialized timeline history
						 * yet; it should be streamed over and present in
						 * pg_xlog by now. Use XLOG_FROM_STREAM so that
						 * source info is set correctly and XLogReceiptTime
						 * isn't changed.
						 *
						 * NB: We must set readTimeLineHistory based on
						 * recoveryTargetTLI, not receiveTLI. Normally they'll
						 * be the same, but if recovery_target_timeline is
						 * 'latest' and archiving is configured, then it's
						 * possible that we managed to retrieve one or more
						 * new timeline history files from the archive,
						 * updating recoveryTargetTLI.
						 */
						if (readFile < 0)
						{
							if (!expectedTLEs)
								expectedTLEs = readTimeLineHistory(recoveryTargetTLI);
							readFile = XLogFileRead(readSegNo, PANIC,
													receiveTLI,
													XLOG_FROM_STREAM, false);
							Assert(readFile >= 0);
						}
						else
						{
							/* just make sure source info is correct... */
							readSource = XLOG_FROM_STREAM;
							XLogReceiptSource = XLOG_FROM_STREAM;
							return true;
						}
						break;
					}

					/*
					 * Data not here yet. Check for trigger, then wait for
					 * walreceiver to wake us up when new WAL arrives.
					 */
					if (CheckForStandbyTrigger())
					{
						/*
						 * Note that we don't "return false" immediately here.
						 * After being triggered, we still want to replay all
						 * the WAL that was already streamed. It's in pg_xlog
						 * now, so we just treat this as a failure, and the
						 * state machine will move on to replay the streamed
						 * WAL from pg_xlog, and then recheck the trigger and
						 * exit replay.
						 */
						lastSourceFailed = true;
						break;
					}

					/*
					 * Wait for more WAL to arrive. Time out after 5 seconds
					 * to react to a trigger file promptly.
					 */
					WaitLatch(&XLogCtl->recoveryWakeupLatch,
							  WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
							  5000L);
					ResetLatch(&XLogCtl->recoveryWakeupLatch);
					break;
				}

			default:
				elog(ERROR, "unexpected WAL source %d", currentSource);
		}

		/*
		 * This possibly-long loop needs to handle interrupts of startup
		 * process.
		 */
		HandleStartupProcInterrupts();
	}

	return false;				/* not reached */
}
12074
12075 /*
12076 * Determine what log level should be used to report a corrupt WAL record
12077 * in the current WAL page, previously read by XLogPageRead().
12078 *
12079 * 'emode' is the error mode that would be used to report a file-not-found
12080 * or legitimate end-of-WAL situation. Generally, we use it as-is, but if
12081 * we're retrying the exact same record that we've tried previously, only
12082 * complain the first time to keep the noise down. However, we only do when
12083 * reading from pg_xlog, because we don't expect any invalid records in archive
12084 * or in records streamed from master. Files in the archive should be complete,
12085 * and we should never hit the end of WAL because we stop and wait for more WAL
12086 * to arrive before replaying it.
12087 *
12088 * NOTE: This function remembers the RecPtr value it was last called with,
12089 * to suppress repeated messages about the same record. Only call this when
12090 * you are about to ereport(), or you might cause a later message to be
12091 * erroneously suppressed.
12092 */
12093 static int
emode_for_corrupt_record(int emode,XLogRecPtr RecPtr)12094 emode_for_corrupt_record(int emode, XLogRecPtr RecPtr)
12095 {
12096 static XLogRecPtr lastComplaint = 0;
12097
12098 if (readSource == XLOG_FROM_PG_XLOG && emode == LOG)
12099 {
12100 if (RecPtr == lastComplaint)
12101 emode = DEBUG1;
12102 else
12103 lastComplaint = RecPtr;
12104 }
12105 return emode;
12106 }
12107
12108 /*
12109 * Check to see whether the user-specified trigger file exists and whether a
12110 * promote request has arrived. If either condition holds, return true.
12111 */
12112 static bool
CheckForStandbyTrigger(void)12113 CheckForStandbyTrigger(void)
12114 {
12115 struct stat stat_buf;
12116 static bool triggered = false;
12117
12118 if (triggered)
12119 return true;
12120
12121 if (IsPromoteTriggered())
12122 {
12123 /*
12124 * In 9.1 and 9.2 the postmaster unlinked the promote file inside the
12125 * signal handler. It now leaves the file in place and lets the
12126 * Startup process do the unlink. This allows Startup to know whether
12127 * it should create a full checkpoint before starting up (fallback
12128 * mode). Fast promotion takes precedence.
12129 */
12130 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12131 {
12132 unlink(PROMOTE_SIGNAL_FILE);
12133 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12134 fast_promote = true;
12135 }
12136 else if (stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12137 {
12138 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12139 fast_promote = false;
12140 }
12141
12142 ereport(LOG, (errmsg("received promote request")));
12143
12144 ResetPromoteTriggered();
12145 triggered = true;
12146 return true;
12147 }
12148
12149 if (TriggerFile == NULL)
12150 return false;
12151
12152 if (stat(TriggerFile, &stat_buf) == 0)
12153 {
12154 ereport(LOG,
12155 (errmsg("trigger file found: %s", TriggerFile)));
12156 unlink(TriggerFile);
12157 triggered = true;
12158 fast_promote = true;
12159 return true;
12160 }
12161 else if (errno != ENOENT)
12162 ereport(ERROR,
12163 (errcode_for_file_access(),
12164 errmsg("could not stat trigger file \"%s\": %m",
12165 TriggerFile)));
12166
12167 return false;
12168 }
12169
12170 /*
12171 * Remove the files signaling a standby promotion request.
12172 */
12173 void
RemovePromoteSignalFiles(void)12174 RemovePromoteSignalFiles(void)
12175 {
12176 unlink(PROMOTE_SIGNAL_FILE);
12177 unlink(FALLBACK_PROMOTE_SIGNAL_FILE);
12178 }
12179
12180 /*
12181 * Check to see if a promote request has arrived. Should be
12182 * called by postmaster after receiving SIGUSR1.
12183 */
12184 bool
CheckPromoteSignal(void)12185 CheckPromoteSignal(void)
12186 {
12187 struct stat stat_buf;
12188
12189 if (stat(PROMOTE_SIGNAL_FILE, &stat_buf) == 0 ||
12190 stat(FALLBACK_PROMOTE_SIGNAL_FILE, &stat_buf) == 0)
12191 return true;
12192
12193 return false;
12194 }
12195
12196 /*
12197 * Wake up startup process to replay newly arrived WAL, or to notice that
12198 * failover has been requested.
12199 */
void
WakeupRecovery(void)
{
	/*
	 * The startup process waits on this latch in
	 * WaitForWALToBecomeAvailable(); setting it ends the wait early.
	 */
	SetLatch(&XLogCtl->recoveryWakeupLatch);
}
12205
12206 /*
12207 * Update the WalWriterSleeping flag.
12208 */
void
SetWalWriterSleeping(bool sleeping)
{
	/* info_lck guards the shared WalWriterSleeping flag. */
	SpinLockAcquire(&XLogCtl->info_lck);
	XLogCtl->WalWriterSleeping = sleeping;
	SpinLockRelease(&XLogCtl->info_lck);
}
12216
12217 /*
12218 * Schedule a walreceiver wakeup in the main recovery loop.
12219 */
void
XLogRequestWalReceiverReply(void)
{
	/*
	 * Just set the flag; presumably it is consumed by the main recovery loop
	 * (the consumer is not visible in this part of the file).
	 */
	doRequestWalReceiverReply = true;
}
12225