1 /*-------------------------------------------------------------------------
2  *
3  * basebackup.c
4  *	  code for taking a base backup and streaming it to a standby
5  *
6  * Portions Copyright (c) 2010-2018, PostgreSQL Global Development Group
7  *
8  * IDENTIFICATION
9  *	  src/backend/replication/basebackup.c
10  *
11  *-------------------------------------------------------------------------
12  */
13 #include "postgres.h"
14 
15 #include <sys/stat.h>
16 #include <unistd.h>
17 #include <time.h>
18 
19 #include "access/xlog_internal.h"	/* for pg_start/stop_backup */
20 #include "catalog/pg_type.h"
21 #include "common/file_perm.h"
22 #include "lib/stringinfo.h"
23 #include "libpq/libpq.h"
24 #include "libpq/pqformat.h"
25 #include "miscadmin.h"
26 #include "nodes/pg_list.h"
27 #include "pgtar.h"
28 #include "pgstat.h"
29 #include "port.h"
30 #include "postmaster/syslogger.h"
31 #include "replication/basebackup.h"
32 #include "replication/walsender.h"
33 #include "replication/walsender_private.h"
34 #include "storage/bufpage.h"
35 #include "storage/checksum.h"
36 #include "storage/dsm_impl.h"
37 #include "storage/fd.h"
38 #include "storage/ipc.h"
39 #include "storage/reinit.h"
40 #include "utils/builtins.h"
41 #include "utils/ps_status.h"
42 #include "utils/relcache.h"
43 #include "utils/timestamp.h"
44 
45 
/*
 * Options of one BASE_BACKUP command, as filled in by
 * parse_basebackup_options() and consumed by perform_base_backup().
 */
typedef struct
{
	const char *label;			/* backup label; defaults to "base backup" */
	bool		progress;		/* compute tablespace sizes before sending */
	bool		fastcheckpoint; /* passed on to do_pg_start_backup() */
	bool		nowait;			/* negated and passed to do_pg_stop_backup() */
	bool		includewal;		/* append the needed WAL to the last tar */
	uint32		maxrate;		/* rate limit, multiplied by 1024 to get
								 * bytes per second; 0 disables throttling */
	bool		sendtblspcmapfile;	/* inject tablespace_map into main tar */
} basebackup_options;
56 
57 
/* Prototypes for the file-local helpers defined further down in this file. */
static int64 sendDir(const char *path, int basepathlen, bool sizeonly,
		List *tablespaces, bool sendtblspclinks);
static bool sendFile(const char *readfilename, const char *tarfilename,
		 struct stat *statbuf, bool missing_ok);
static void sendFileWithContent(const char *filename, const char *content);
static int64 _tarWriteHeader(const char *filename, const char *linktarget,
				struct stat *statbuf, bool sizeonly);
static int64 _tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf,
			 bool sizeonly);
static void send_int8_string(StringInfoData *buf, int64 intval);
static void SendBackupHeader(List *tablespaces);
static void perform_base_backup(basebackup_options *opt);
static void parse_basebackup_options(List *options, basebackup_options *opt);
static void SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli);
static int	compareWalFileNames(const void *a, const void *b);
static void throttle(size_t increment);
static bool is_checksummed_file(const char *fullpath, const char *filename);
75 
/* Was the backup currently in-progress initiated in recovery mode? */
static bool backup_started_in_recovery = false;

/*
 * Relative path of temporary statistics directory.  Computed by
 * perform_base_backup(); always starts with "./".
 */
static char *statrelpath = NULL;

/*
 * Size of each block sent into the tar stream for larger files.
 */
#define TAR_SEND_SIZE 32768

/*
 * How frequently to throttle, as a fraction of the specified rate-second.
 * The transfer budget is checked once per 1/THROTTLING_FREQUENCY of a
 * second's worth of data.
 */
#define THROTTLING_FREQUENCY	8

/*
 * Checks whether we encountered any error in fread().  fread() doesn't give
 * any clue what has happened, so we check with ferror().  Also, neither
 * fread() nor ferror() set errno, so we just throw a generic error.
 */
#define CHECK_FREAD_ERROR(fp, filename) \
do { \
	if (ferror(fp)) \
		ereport(ERROR, \
				(errmsg("could not read from file \"%s\"", filename))); \
} while (0)

/*
 * The actual number of bytes, transfer of which may cause sleep.  Set to
 * maxrate * 1024 / THROTTLING_FREQUENCY when throttling is requested.
 */
static uint64 throttling_sample;

/*
 * Amount of data already transferred but not yet throttled.  Set to -1 when
 * throttling is disabled.
 */
static int64 throttling_counter;

/*
 * The minimum time required to transfer throttling_sample bytes, i.e.
 * USECS_PER_SEC / THROTTLING_FREQUENCY.
 */
static TimeOffset elapsed_min_unit;

/* The last check of the transfer rate. */
static TimestampTz throttled_last;

/* The starting XLOG position of the base backup. */
static XLogRecPtr startptr;

/*
 * Total number of checksum failures during base backup.  Reset to zero at
 * the start of perform_base_backup().
 */
static int64 total_checksum_failures;

/*
 * Do not verify checksums.  Set by parse_basebackup_options() when the
 * "noverify_checksums" option is given.
 */
static bool noverify_checksums = false;
124 
/*
 * Definition of one element part of an exclusion list, used for paths part
 * of checksum validation or base backups.  "name" is the name of the file
 * or path to check for exclusion.  If "match_prefix" is true, any items
 * matching the name as prefix are excluded.
 *
 * Lists built from this struct are terminated by an entry whose name is NULL.
 */
struct exclude_list_item
{
	const char *name;			/* file or path name; NULL ends a list */
	bool		match_prefix;	/* true: "name" is matched as a prefix */
};
136 
/*
 * The contents of these directories are removed or recreated during server
 * start so they are not included in backups.  The directories themselves are
 * kept and included as empty to preserve access permissions.
 *
 * The array is NULL-terminated.
 *
 * Note: this list should be kept in sync with the filter lists in pg_rewind's
 * filemap.c.
 */
static const char *excludeDirContents[] =
{
	/*
	 * Skip temporary statistics files. PG_STAT_TMP_DIR must be skipped even
	 * when stats_temp_directory is set because PGSS_TEXT_FILE is always
	 * created there.
	 */
	PG_STAT_TMP_DIR,

	/*
	 * It is generally not useful to backup the contents of this directory
	 * even if the intention is to restore to another master. See backup.sgml
	 * for a more detailed description.
	 */
	"pg_replslot",

	/* Contents removed on startup, see dsm_cleanup_for_mmap(). */
	PG_DYNSHMEM_DIR,

	/* Contents removed on startup, see AsyncShmemInit(). */
	"pg_notify",

	/*
	 * Old contents are loaded for possible debugging but are not required for
	 * normal operation, see OldSerXidInit().
	 */
	"pg_serial",

	/* Contents removed on startup, see DeleteAllExportedSnapshotFiles(). */
	"pg_snapshots",

	/* Contents zeroed on startup, see StartupSUBTRANS(). */
	"pg_subtrans",

	/* end of list */
	NULL
};
182 
/*
 * List of files excluded from backups.
 *
 * Terminated by an entry with a NULL name.  Entries with match_prefix set
 * exclude every file whose name starts with the given string.
 */
static const struct exclude_list_item excludeFiles[] =
{
	/* Skip auto conf temporary file. */
	{PG_AUTOCONF_FILENAME ".tmp", false},

	/* Skip current log file temporary file */
	{LOG_METAINFO_DATAFILE_TMP, false},

	/*
	 * Skip relation cache because it is rebuilt on startup.  This includes
	 * temporary files.
	 */
	{RELCACHE_INIT_FILENAME, true},

	/*
	 * If there's a backup_label or tablespace_map file, it belongs to a
	 * backup started by the user with pg_start_backup().  It is *not* correct
	 * for this backup.  Our backup_label/tablespace_map is injected into the
	 * tar separately.
	 */
	{BACKUP_LABEL_FILE, false},
	{TABLESPACE_MAP, false},

	/*
	 * NOTE(review): presumably these describe the running postmaster and are
	 * meaningless in a copy of the data directory -- confirm.
	 */
	{"postmaster.pid", false},
	{"postmaster.opts", false},

	/* end of list */
	{NULL, false}
};
215 
/*
 * List of files excluded from checksum validation.
 *
 * Terminated by an entry with a NULL name.  Entries with match_prefix set
 * match any file name that begins with the given string.
 *
 * Note: this list should be kept in sync with what pg_verify_checksums.c
 * includes.
 */
static const struct exclude_list_item noChecksumFiles[] = {
	{"pg_control", false},
	{"pg_filenode.map", false},
	{"pg_internal.init", true},
	{"PG_VERSION", false},
	/* Only present in the list when built with EXEC_BACKEND. */
#ifdef EXEC_BACKEND
	{"config_exec_params", true},
#endif
	{NULL, false}
};
232 
/*
 * Actually do a base backup for the specified tablespaces.
 *
 * This is split out mainly to avoid complaints about "variable might be
 * clobbered by longjmp" from stupider versions of gcc.
 *
 * The sequence of work is:
 *	1. do_pg_start_backup(), then send the start position as a result set;
 *	2. send the tablespace header (SendBackupHeader);
 *	3. send every tablespace, then the main data directory, each as its own
 *	   CopyOut stream;
 *	4. do_pg_stop_backup();
 *	5. if opt->includewal, append the required WAL segments and timeline
 *	   history files to the still-open last tar stream;
 *	6. send the end position as a result set;
 *	7. raise an ERROR if any checksum failures were counted.
 *
 * Steps 1 (after the start call) through 4 run inside an error cleanup block
 * that calls do_pg_abort_backup() on failure.
 */
static void
perform_base_backup(basebackup_options *opt)
{
	TimeLineID	starttli;
	XLogRecPtr	endptr;
	TimeLineID	endtli;
	StringInfo	labelfile;
	StringInfo	tblspc_map_file = NULL;
	int			datadirpathlen;
	List	   *tablespaces = NIL;

	datadirpathlen = strlen(DataDir);

	backup_started_in_recovery = RecoveryInProgress();

	labelfile = makeStringInfo();
	tblspc_map_file = makeStringInfo();

	/* Start counting checksum failures afresh for this backup. */
	total_checksum_failures = 0;

	startptr = do_pg_start_backup(opt->label, opt->fastcheckpoint, &starttli,
								  labelfile, &tablespaces,
								  tblspc_map_file,
								  opt->progress, opt->sendtblspcmapfile);

	/*
	 * Once do_pg_start_backup has been called, ensure that any failure causes
	 * us to abort the backup so we don't "leak" a backup counter. For this
	 * reason, *all* functionality between do_pg_start_backup() and the end of
	 * do_pg_stop_backup() should be inside the error cleanup block!
	 */

	PG_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));
	{
		ListCell   *lc;
		tablespaceinfo *ti;		/* note: shadowed by the loop-local "ti" below */

		SendXlogRecPtrResult(startptr, starttli);

		/*
		 * Calculate the relative path of temporary statistics directory in
		 * order to skip the files which are located in that directory later.
		 *
		 * NOTE(review): the "+ 1" in the first branch skips one character
		 * after the DataDir prefix, which assumes that character is a
		 * directory separator -- confirm.
		 */
		if (is_absolute_path(pgstat_stat_directory) &&
			strncmp(pgstat_stat_directory, DataDir, datadirpathlen) == 0)
			statrelpath = psprintf("./%s", pgstat_stat_directory + datadirpathlen + 1);
		else if (strncmp(pgstat_stat_directory, "./", 2) != 0)
			statrelpath = psprintf("./%s", pgstat_stat_directory);
		else
			statrelpath = pgstat_stat_directory;

		/*
		 * Add a node for the base directory at the end.  It is zero-filled,
		 * so its path is NULL; that is how the code below recognizes it.  The
		 * size is only computed when progress reporting was requested.
		 */
		ti = palloc0(sizeof(tablespaceinfo));
		ti->size = opt->progress ? sendDir(".", 1, true, tablespaces, true) : -1;
		tablespaces = lappend(tablespaces, ti);

		/* Send tablespace header */
		SendBackupHeader(tablespaces);

		/* Setup and activate network throttling, if client requested it */
		if (opt->maxrate > 0)
		{
			/* maxrate * 1024 bytes per second, split into equal samples */
			throttling_sample =
				(int64) opt->maxrate * (int64) 1024 / THROTTLING_FREQUENCY;

			/*
			 * The minimum amount of time for throttling_sample bytes to be
			 * transferred.
			 */
			elapsed_min_unit = USECS_PER_SEC / THROTTLING_FREQUENCY;

			/* Enable throttling. */
			throttling_counter = 0;

			/* The 'real data' starts now (header was ignored). */
			throttled_last = GetCurrentTimestamp();
		}
		else
		{
			/* Disable throttling. */
			throttling_counter = -1;
		}

		/* Send off our tablespaces one by one */
		foreach(lc, tablespaces)
		{
			tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
			StringInfoData buf;

			/* Send CopyOutResponse message */
			pq_beginmessage(&buf, 'H');
			pq_sendbyte(&buf, 0);	/* overall format */
			pq_sendint16(&buf, 0);	/* natts */
			pq_endmessage(&buf);

			if (ti->path == NULL)
			{
				struct stat statbuf;

				/* In the main tar, include the backup_label first... */
				sendFileWithContent(BACKUP_LABEL_FILE, labelfile->data);

				/*
				 * Send tablespace_map file if required and then the bulk of
				 * the files.  When the map file is sent, sendDir() is told
				 * not to include the tablespace symlink information.
				 */
				if (tblspc_map_file && opt->sendtblspcmapfile)
				{
					sendFileWithContent(TABLESPACE_MAP, tblspc_map_file->data);
					sendDir(".", 1, false, tablespaces, false);
				}
				else
					sendDir(".", 1, false, tablespaces, true);

				/* ... and pg_control after everything else. */
				if (lstat(XLOG_CONTROL_FILE, &statbuf) != 0)
					ereport(ERROR,
							(errcode_for_file_access(),
							 errmsg("could not stat control file \"%s\": %m",
									XLOG_CONTROL_FILE)));
				sendFile(XLOG_CONTROL_FILE, XLOG_CONTROL_FILE, &statbuf, false);
			}
			else
				sendTablespace(ti->path, false);

			/*
			 * If we're including WAL, and this is the main data directory we
			 * don't terminate the tar stream here. Instead, we will append
			 * the xlog files below and terminate it then. This is safe since
			 * the main data directory is always sent *last*.
			 */
			if (opt->includewal && ti->path == NULL)
			{
				Assert(lnext(lc) == NULL);
			}
			else
				pq_putemptymessage('c');	/* CopyDone */
		}

		endptr = do_pg_stop_backup(labelfile->data, !opt->nowait, &endtli);
	}
	PG_END_ENSURE_ERROR_CLEANUP(do_pg_abort_backup, BoolGetDatum(false));


	if (opt->includewal)
	{
		/*
		 * We've left the last tar file "open", so we can now append the
		 * required WAL files to it.
		 */
		char		pathbuf[MAXPGPATH];
		XLogSegNo	segno;
		XLogSegNo	startsegno;
		XLogSegNo	endsegno;
		struct stat statbuf;
		List	   *historyFileList = NIL;
		List	   *walFileList = NIL;
		char	  **walFiles;
		int			nWalFiles;
		char		firstoff[MAXFNAMELEN];
		char		lastoff[MAXFNAMELEN];
		DIR		   *dir;
		struct dirent *de;
		int			i;
		ListCell   *lc;
		TimeLineID	tli;

		/*
		 * I'd rather not worry about timelines here, so scan pg_wal and
		 * include all WAL files in the range between 'startptr' and 'endptr',
		 * regardless of the timeline the file is stamped with. If there are
		 * some spurious WAL files belonging to timelines that don't belong in
		 * this server's history, they will be included too. Normally there
		 * shouldn't be such files, but if there are, there's little harm in
		 * including them.
		 */
		XLByteToSeg(startptr, startsegno, wal_segment_size);
		XLogFileName(firstoff, ThisTimeLineID, startsegno, wal_segment_size);
		XLByteToPrevSeg(endptr, endsegno, wal_segment_size);
		XLogFileName(lastoff, ThisTimeLineID, endsegno, wal_segment_size);

		dir = AllocateDir("pg_wal");
		while ((de = ReadDir(dir, "pg_wal")) != NULL)
		{
			/*
			 * Does it look like a WAL segment, and is it in the range?  The
			 * "+ 8" skips the leading timeline portion of the file names, so
			 * only the log/seg portion is compared.
			 */
			if (IsXLogFileName(de->d_name) &&
				strcmp(de->d_name + 8, firstoff + 8) >= 0 &&
				strcmp(de->d_name + 8, lastoff + 8) <= 0)
			{
				walFileList = lappend(walFileList, pstrdup(de->d_name));
			}
			/* Does it look like a timeline history file? */
			else if (IsTLHistoryFileName(de->d_name))
			{
				historyFileList = lappend(historyFileList, pstrdup(de->d_name));
			}
		}
		FreeDir(dir);

		/*
		 * Before we go any further, check that none of the WAL segments we
		 * need were removed.
		 */
		CheckXLogRemoved(startsegno, ThisTimeLineID);

		/*
		 * Put the WAL filenames into an array, and sort. We send the files in
		 * order from oldest to newest, to reduce the chance that a file is
		 * recycled before we get a chance to send it over.
		 */
		nWalFiles = list_length(walFileList);
		walFiles = palloc(nWalFiles * sizeof(char *));
		i = 0;
		foreach(lc, walFileList)
		{
			walFiles[i++] = lfirst(lc);
		}
		qsort(walFiles, nWalFiles, sizeof(char *), compareWalFileNames);

		/*
		 * There must be at least one xlog file in the pg_wal directory, since
		 * we are doing backup-including-xlog.
		 */
		if (nWalFiles < 1)
			ereport(ERROR,
					(errmsg("could not find any WAL files")));

		/*
		 * Sanity check: the first and last segment should cover startptr and
		 * endptr, with no gaps in between.
		 */
		XLogFromFileName(walFiles[0], &tli, &segno, wal_segment_size);
		if (segno != startsegno)
		{
			char		startfname[MAXFNAMELEN];

			XLogFileName(startfname, ThisTimeLineID, startsegno,
						 wal_segment_size);
			ereport(ERROR,
					(errmsg("could not find WAL file \"%s\"", startfname)));
		}
		for (i = 0; i < nWalFiles; i++)
		{
			XLogSegNo	currsegno = segno;
			XLogSegNo	nextsegno = segno + 1;

			/*
			 * Each file must carry either the next segment number or the
			 * same one as the previous file; the latter covers the first
			 * iteration and equal segment numbers under different timelines.
			 */
			XLogFromFileName(walFiles[i], &tli, &segno, wal_segment_size);
			if (!(nextsegno == segno || currsegno == segno))
			{
				char		nextfname[MAXFNAMELEN];

				XLogFileName(nextfname, ThisTimeLineID, nextsegno,
							 wal_segment_size);
				ereport(ERROR,
						(errmsg("could not find WAL file \"%s\"", nextfname)));
			}
		}
		if (segno != endsegno)
		{
			char		endfname[MAXFNAMELEN];

			XLogFileName(endfname, ThisTimeLineID, endsegno, wal_segment_size);
			ereport(ERROR,
					(errmsg("could not find WAL file \"%s\"", endfname)));
		}

		/* Ok, we have everything we need. Send the WAL files. */
		for (i = 0; i < nWalFiles; i++)
		{
			FILE	   *fp;
			char		buf[TAR_SEND_SIZE];
			size_t		cnt;
			pgoff_t		len = 0;

			snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", walFiles[i]);
			XLogFromFileName(walFiles[i], &tli, &segno, wal_segment_size);

			fp = AllocateFile(pathbuf, "rb");
			if (fp == NULL)
			{
				/* keep errno for the %m below across the check that follows */
				int			save_errno = errno;

				/*
				 * Most likely reason for this is that the file was already
				 * removed by a checkpoint, so check for that to get a better
				 * error message.
				 */
				CheckXLogRemoved(segno, tli);

				errno = save_errno;
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not open file \"%s\": %m", pathbuf)));
			}

			if (fstat(fileno(fp), &statbuf) != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m",
								pathbuf)));
			if (statbuf.st_size != wal_segment_size)
			{
				CheckXLogRemoved(segno, tli);
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("unexpected WAL file size \"%s\"", walFiles[i])));
			}

			/* send the WAL file itself */
			_tarWriteHeader(pathbuf, NULL, &statbuf, false);

			/* never read more than one segment's worth of bytes */
			while ((cnt = fread(buf, 1,
								Min(sizeof(buf), wal_segment_size - len),
								fp)) > 0)
			{
				CheckXLogRemoved(segno, tli);
				/* Send the chunk as a CopyData message */
				if (pq_putmessage('d', buf, cnt))
					ereport(ERROR,
							(errmsg("base backup could not send data, aborting backup")));

				len += cnt;
				throttle(cnt);

				if (len == wal_segment_size)
					break;
			}

			CHECK_FREAD_ERROR(fp, pathbuf);

			/* a short read means the file is smaller than fstat() reported */
			if (len != wal_segment_size)
			{
				CheckXLogRemoved(segno, tli);
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("unexpected WAL file size \"%s\"", walFiles[i])));
			}

			/* wal_segment_size is a multiple of 512, so no need for padding */

			FreeFile(fp);

			/*
			 * Mark file as archived, otherwise files can get archived again
			 * after promotion of a new node. This is in line with
			 * walreceiver.c always doing an XLogArchiveForceDone() after a
			 * complete segment.
			 */
			StatusFilePath(pathbuf, walFiles[i], ".done");
			sendFileWithContent(pathbuf, "");
		}

		/*
		 * Send timeline history files too. Only the latest timeline history
		 * file is required for recovery, and even that only if there happens
		 * to be a timeline switch in the first WAL segment that contains the
		 * checkpoint record, or if we're taking a base backup from a standby
		 * server and the target timeline changes while the backup is taken.
		 * But they are small and highly useful for debugging purposes, so
		 * better include them all, always.
		 */
		foreach(lc, historyFileList)
		{
			char	   *fname = lfirst(lc);

			snprintf(pathbuf, MAXPGPATH, XLOGDIR "/%s", fname);

			if (lstat(pathbuf, &statbuf) != 0)
				ereport(ERROR,
						(errcode_for_file_access(),
						 errmsg("could not stat file \"%s\": %m", pathbuf)));

			sendFile(pathbuf, pathbuf, &statbuf, false);

			/* unconditionally mark file as archived */
			StatusFilePath(pathbuf, fname, ".done");
			sendFileWithContent(pathbuf, "");
		}

		/* Send CopyDone message for the last tar file */
		pq_putemptymessage('c');
	}
	SendXlogRecPtrResult(endptr, endtli);

	/*
	 * Any checksum failure turns the whole backup into an ERROR.  The extra
	 * WARNING carrying the total is only emitted when more than one failure
	 * was counted.
	 */
	if (total_checksum_failures)
	{
		if (total_checksum_failures > 1)
		{
			char		buf[64];

			snprintf(buf, sizeof(buf), INT64_FORMAT, total_checksum_failures);

			ereport(WARNING,
					(errmsg("%s total checksum verification failures", buf)));
		}
		ereport(ERROR,
				(errcode(ERRCODE_DATA_CORRUPTED),
				 errmsg("checksum verification failure during base backup")));
	}

}
640 
/*
 * qsort comparator for an array of WAL segment file names.
 *
 * Both arguments point at "char *" array elements.  The first 8 characters
 * of each name (the timeline portion) are skipped, so the ordering is decided
 * by the log/seg portion alone.
 */
static int
compareWalFileNames(const void *a, const void *b)
{
	const char *const *lhs = (const char *const *) a;
	const char *const *rhs = (const char *const *) b;
	const char *lhs_logseg = *lhs + 8;
	const char *rhs_logseg = *rhs + 8;

	return strcmp(lhs_logseg, rhs_logseg);
}
653 
654 /*
655  * Parse the base backup options passed down by the parser
656  */
657 static void
parse_basebackup_options(List * options,basebackup_options * opt)658 parse_basebackup_options(List *options, basebackup_options *opt)
659 {
660 	ListCell   *lopt;
661 	bool		o_label = false;
662 	bool		o_progress = false;
663 	bool		o_fast = false;
664 	bool		o_nowait = false;
665 	bool		o_wal = false;
666 	bool		o_maxrate = false;
667 	bool		o_tablespace_map = false;
668 	bool		o_noverify_checksums = false;
669 
670 	MemSet(opt, 0, sizeof(*opt));
671 	foreach(lopt, options)
672 	{
673 		DefElem    *defel = (DefElem *) lfirst(lopt);
674 
675 		if (strcmp(defel->defname, "label") == 0)
676 		{
677 			if (o_label)
678 				ereport(ERROR,
679 						(errcode(ERRCODE_SYNTAX_ERROR),
680 						 errmsg("duplicate option \"%s\"", defel->defname)));
681 			opt->label = strVal(defel->arg);
682 			o_label = true;
683 		}
684 		else if (strcmp(defel->defname, "progress") == 0)
685 		{
686 			if (o_progress)
687 				ereport(ERROR,
688 						(errcode(ERRCODE_SYNTAX_ERROR),
689 						 errmsg("duplicate option \"%s\"", defel->defname)));
690 			opt->progress = true;
691 			o_progress = true;
692 		}
693 		else if (strcmp(defel->defname, "fast") == 0)
694 		{
695 			if (o_fast)
696 				ereport(ERROR,
697 						(errcode(ERRCODE_SYNTAX_ERROR),
698 						 errmsg("duplicate option \"%s\"", defel->defname)));
699 			opt->fastcheckpoint = true;
700 			o_fast = true;
701 		}
702 		else if (strcmp(defel->defname, "nowait") == 0)
703 		{
704 			if (o_nowait)
705 				ereport(ERROR,
706 						(errcode(ERRCODE_SYNTAX_ERROR),
707 						 errmsg("duplicate option \"%s\"", defel->defname)));
708 			opt->nowait = true;
709 			o_nowait = true;
710 		}
711 		else if (strcmp(defel->defname, "wal") == 0)
712 		{
713 			if (o_wal)
714 				ereport(ERROR,
715 						(errcode(ERRCODE_SYNTAX_ERROR),
716 						 errmsg("duplicate option \"%s\"", defel->defname)));
717 			opt->includewal = true;
718 			o_wal = true;
719 		}
720 		else if (strcmp(defel->defname, "max_rate") == 0)
721 		{
722 			long		maxrate;
723 
724 			if (o_maxrate)
725 				ereport(ERROR,
726 						(errcode(ERRCODE_SYNTAX_ERROR),
727 						 errmsg("duplicate option \"%s\"", defel->defname)));
728 
729 			maxrate = intVal(defel->arg);
730 			if (maxrate < MAX_RATE_LOWER || maxrate > MAX_RATE_UPPER)
731 				ereport(ERROR,
732 						(errcode(ERRCODE_NUMERIC_VALUE_OUT_OF_RANGE),
733 						 errmsg("%d is outside the valid range for parameter \"%s\" (%d .. %d)",
734 								(int) maxrate, "MAX_RATE", MAX_RATE_LOWER, MAX_RATE_UPPER)));
735 
736 			opt->maxrate = (uint32) maxrate;
737 			o_maxrate = true;
738 		}
739 		else if (strcmp(defel->defname, "tablespace_map") == 0)
740 		{
741 			if (o_tablespace_map)
742 				ereport(ERROR,
743 						(errcode(ERRCODE_SYNTAX_ERROR),
744 						 errmsg("duplicate option \"%s\"", defel->defname)));
745 			opt->sendtblspcmapfile = true;
746 			o_tablespace_map = true;
747 		}
748 		else if (strcmp(defel->defname, "noverify_checksums") == 0)
749 		{
750 			if (o_noverify_checksums)
751 				ereport(ERROR,
752 						(errcode(ERRCODE_SYNTAX_ERROR),
753 						 errmsg("duplicate option \"%s\"", defel->defname)));
754 			noverify_checksums = true;
755 			o_noverify_checksums = true;
756 		}
757 		else
758 			elog(ERROR, "option \"%s\" not recognized",
759 				 defel->defname);
760 	}
761 	if (opt->label == NULL)
762 		opt->label = "base backup";
763 }
764 
765 
766 /*
767  * SendBaseBackup() - send a complete base backup.
768  *
769  * The function will put the system into backup mode like pg_start_backup()
770  * does, so that the backup is consistent even though we read directly from
771  * the filesystem, bypassing the buffer cache.
772  */
773 void
SendBaseBackup(BaseBackupCmd * cmd)774 SendBaseBackup(BaseBackupCmd *cmd)
775 {
776 	basebackup_options opt;
777 
778 	parse_basebackup_options(cmd->options, &opt);
779 
780 	WalSndSetState(WALSNDSTATE_BACKUP);
781 
782 	if (update_process_title)
783 	{
784 		char		activitymsg[50];
785 
786 		snprintf(activitymsg, sizeof(activitymsg), "sending backup \"%s\"",
787 				 opt.label);
788 		set_ps_display(activitymsg, false);
789 	}
790 
791 	perform_base_backup(&opt);
792 }
793 
794 static void
send_int8_string(StringInfoData * buf,int64 intval)795 send_int8_string(StringInfoData *buf, int64 intval)
796 {
797 	char		is[32];
798 
799 	sprintf(is, INT64_FORMAT, intval);
800 	pq_sendint32(buf, strlen(is));
801 	pq_sendbytes(buf, is, strlen(is));
802 }
803 
804 static void
SendBackupHeader(List * tablespaces)805 SendBackupHeader(List *tablespaces)
806 {
807 	StringInfoData buf;
808 	ListCell   *lc;
809 
810 	/* Construct and send the directory information */
811 	pq_beginmessage(&buf, 'T'); /* RowDescription */
812 	pq_sendint16(&buf, 3);		/* 3 fields */
813 
814 	/* First field - spcoid */
815 	pq_sendstring(&buf, "spcoid");
816 	pq_sendint32(&buf, 0);		/* table oid */
817 	pq_sendint16(&buf, 0);		/* attnum */
818 	pq_sendint32(&buf, OIDOID); /* type oid */
819 	pq_sendint16(&buf, 4);		/* typlen */
820 	pq_sendint32(&buf, 0);		/* typmod */
821 	pq_sendint16(&buf, 0);		/* format code */
822 
823 	/* Second field - spcpath */
824 	pq_sendstring(&buf, "spclocation");
825 	pq_sendint32(&buf, 0);
826 	pq_sendint16(&buf, 0);
827 	pq_sendint32(&buf, TEXTOID);
828 	pq_sendint16(&buf, -1);
829 	pq_sendint32(&buf, 0);
830 	pq_sendint16(&buf, 0);
831 
832 	/* Third field - size */
833 	pq_sendstring(&buf, "size");
834 	pq_sendint32(&buf, 0);
835 	pq_sendint16(&buf, 0);
836 	pq_sendint32(&buf, INT8OID);
837 	pq_sendint16(&buf, 8);
838 	pq_sendint32(&buf, 0);
839 	pq_sendint16(&buf, 0);
840 	pq_endmessage(&buf);
841 
842 	foreach(lc, tablespaces)
843 	{
844 		tablespaceinfo *ti = lfirst(lc);
845 
846 		/* Send one datarow message */
847 		pq_beginmessage(&buf, 'D');
848 		pq_sendint16(&buf, 3);	/* number of columns */
849 		if (ti->path == NULL)
850 		{
851 			pq_sendint32(&buf, -1); /* Length = -1 ==> NULL */
852 			pq_sendint32(&buf, -1);
853 		}
854 		else
855 		{
856 			Size		len;
857 
858 			len = strlen(ti->oid);
859 			pq_sendint32(&buf, len);
860 			pq_sendbytes(&buf, ti->oid, len);
861 
862 			len = strlen(ti->path);
863 			pq_sendint32(&buf, len);
864 			pq_sendbytes(&buf, ti->path, len);
865 		}
866 		if (ti->size >= 0)
867 			send_int8_string(&buf, ti->size / 1024);
868 		else
869 			pq_sendint32(&buf, -1); /* NULL */
870 
871 		pq_endmessage(&buf);
872 	}
873 
874 	/* Send a CommandComplete message */
875 	pq_puttextmessage('C', "SELECT");
876 }
877 
878 /*
879  * Send a single resultset containing just a single
880  * XLogRecPtr record (in text format)
881  */
882 static void
SendXlogRecPtrResult(XLogRecPtr ptr,TimeLineID tli)883 SendXlogRecPtrResult(XLogRecPtr ptr, TimeLineID tli)
884 {
885 	StringInfoData buf;
886 	char		str[MAXFNAMELEN];
887 	Size		len;
888 
889 	pq_beginmessage(&buf, 'T'); /* RowDescription */
890 	pq_sendint16(&buf, 2);		/* 2 fields */
891 
892 	/* Field headers */
893 	pq_sendstring(&buf, "recptr");
894 	pq_sendint32(&buf, 0);		/* table oid */
895 	pq_sendint16(&buf, 0);		/* attnum */
896 	pq_sendint32(&buf, TEXTOID);	/* type oid */
897 	pq_sendint16(&buf, -1);
898 	pq_sendint32(&buf, 0);
899 	pq_sendint16(&buf, 0);
900 
901 	pq_sendstring(&buf, "tli");
902 	pq_sendint32(&buf, 0);		/* table oid */
903 	pq_sendint16(&buf, 0);		/* attnum */
904 
905 	/*
906 	 * int8 may seem like a surprising data type for this, but in theory int4
907 	 * would not be wide enough for this, as TimeLineID is unsigned.
908 	 */
909 	pq_sendint32(&buf, INT8OID);	/* type oid */
910 	pq_sendint16(&buf, -1);
911 	pq_sendint32(&buf, 0);
912 	pq_sendint16(&buf, 0);
913 	pq_endmessage(&buf);
914 
915 	/* Data row */
916 	pq_beginmessage(&buf, 'D');
917 	pq_sendint16(&buf, 2);		/* number of columns */
918 
919 	len = snprintf(str, sizeof(str),
920 				   "%X/%X", (uint32) (ptr >> 32), (uint32) ptr);
921 	pq_sendint32(&buf, len);
922 	pq_sendbytes(&buf, str, len);
923 
924 	len = snprintf(str, sizeof(str), "%u", tli);
925 	pq_sendint32(&buf, len);
926 	pq_sendbytes(&buf, str, len);
927 
928 	pq_endmessage(&buf);
929 
930 	/* Send a CommandComplete message */
931 	pq_puttextmessage('C', "SELECT");
932 }
933 
934 /*
935  * Inject a file with given name and content in the output tar stream.
936  */
937 static void
sendFileWithContent(const char * filename,const char * content)938 sendFileWithContent(const char *filename, const char *content)
939 {
940 	struct stat statbuf;
941 	int			pad,
942 				len;
943 
944 	len = strlen(content);
945 
946 	/*
947 	 * Construct a stat struct for the backup_label file we're injecting in
948 	 * the tar.
949 	 */
950 	/* Windows doesn't have the concept of uid and gid */
951 #ifdef WIN32
952 	statbuf.st_uid = 0;
953 	statbuf.st_gid = 0;
954 #else
955 	statbuf.st_uid = geteuid();
956 	statbuf.st_gid = getegid();
957 #endif
958 	statbuf.st_mtime = time(NULL);
959 	statbuf.st_mode = pg_file_create_mode;
960 	statbuf.st_size = len;
961 
962 	_tarWriteHeader(filename, NULL, &statbuf, false);
963 	/* Send the contents as a CopyData message */
964 	pq_putmessage('d', content, len);
965 
966 	/* Pad to 512 byte boundary, per tar format requirements */
967 	pad = ((len + 511) & ~511) - len;
968 	if (pad > 0)
969 	{
970 		char		buf[512];
971 
972 		MemSet(buf, 0, pad);
973 		pq_putmessage('d', buf, pad);
974 	}
975 }
976 
977 /*
978  * Include the tablespace directory pointed to by 'path' in the output tar
979  * stream.  If 'sizeonly' is true, we just calculate a total length and return
980  * it, without actually sending anything.
981  *
982  * Only used to send auxiliary tablespaces, not PGDATA.
983  */
984 int64
sendTablespace(char * path,bool sizeonly)985 sendTablespace(char *path, bool sizeonly)
986 {
987 	int64		size;
988 	char		pathbuf[MAXPGPATH];
989 	struct stat statbuf;
990 
991 	/*
992 	 * 'path' points to the tablespace location, but we only want to include
993 	 * the version directory in it that belongs to us.
994 	 */
995 	snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path,
996 			 TABLESPACE_VERSION_DIRECTORY);
997 
998 	/*
999 	 * Store a directory entry in the tar file so we get the permissions
1000 	 * right.
1001 	 */
1002 	if (lstat(pathbuf, &statbuf) != 0)
1003 	{
1004 		if (errno != ENOENT)
1005 			ereport(ERROR,
1006 					(errcode_for_file_access(),
1007 					 errmsg("could not stat file or directory \"%s\": %m",
1008 							pathbuf)));
1009 
1010 		/* If the tablespace went away while scanning, it's no error. */
1011 		return 0;
1012 	}
1013 
1014 	size = _tarWriteHeader(TABLESPACE_VERSION_DIRECTORY, NULL, &statbuf,
1015 						   sizeonly);
1016 
1017 	/* Send all the files in the tablespace version directory */
1018 	size += sendDir(pathbuf, strlen(path), sizeonly, NIL, true);
1019 
1020 	return size;
1021 }
1022 
1023 /*
1024  * Include all files from the given directory in the output tar stream. If
1025  * 'sizeonly' is true, we just calculate a total length and return it, without
1026  * actually sending anything.
1027  *
1028  * Omit any directory in the tablespaces list, to avoid backing up
1029  * tablespaces twice when they were created inside PGDATA.
1030  *
1031  * If sendtblspclinks is true, we need to include symlink
1032  * information in the tar file. If not, we can skip that
1033  * as it will be sent separately in the tablespace_map file.
1034  */
1035 static int64
sendDir(const char * path,int basepathlen,bool sizeonly,List * tablespaces,bool sendtblspclinks)1036 sendDir(const char *path, int basepathlen, bool sizeonly, List *tablespaces,
1037 		bool sendtblspclinks)
1038 {
1039 	DIR		   *dir;
1040 	struct dirent *de;
1041 	char		pathbuf[MAXPGPATH * 2];
1042 	struct stat statbuf;
1043 	int64		size = 0;
1044 	const char *lastDir;		/* Split last dir from parent path. */
1045 	bool		isDbDir = false;	/* Does this directory contain relations? */
1046 
1047 	/*
1048 	 * Determine if the current path is a database directory that can contain
1049 	 * relations.
1050 	 *
1051 	 * Start by finding the location of the delimiter between the parent path
1052 	 * and the current path.
1053 	 */
1054 	lastDir = last_dir_separator(path);
1055 
1056 	/* Does this path look like a database path (i.e. all digits)? */
1057 	if (lastDir != NULL &&
1058 		strspn(lastDir + 1, "0123456789") == strlen(lastDir + 1))
1059 	{
1060 		/* Part of path that contains the parent directory. */
1061 		int			parentPathLen = lastDir - path;
1062 
1063 		/*
1064 		 * Mark path as a database directory if the parent path is either
1065 		 * $PGDATA/base or a tablespace version path.
1066 		 */
1067 		if (strncmp(path, "./base", parentPathLen) == 0 ||
1068 			(parentPathLen >= (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) &&
1069 			 strncmp(lastDir - (sizeof(TABLESPACE_VERSION_DIRECTORY) - 1),
1070 					 TABLESPACE_VERSION_DIRECTORY,
1071 					 sizeof(TABLESPACE_VERSION_DIRECTORY) - 1) == 0))
1072 			isDbDir = true;
1073 	}
1074 
1075 	dir = AllocateDir(path);
1076 	while ((de = ReadDir(dir, path)) != NULL)
1077 	{
1078 		int			excludeIdx;
1079 		bool		excludeFound;
1080 		ForkNumber	relForkNum; /* Type of fork if file is a relation */
1081 		int			relOidChars;	/* Chars in filename that are the rel oid */
1082 
1083 		/* Skip special stuff */
1084 		if (strcmp(de->d_name, ".") == 0 || strcmp(de->d_name, "..") == 0)
1085 			continue;
1086 
1087 		/* Skip temporary files */
1088 		if (strncmp(de->d_name,
1089 					PG_TEMP_FILE_PREFIX,
1090 					strlen(PG_TEMP_FILE_PREFIX)) == 0)
1091 			continue;
1092 
1093 		/*
1094 		 * Check if the postmaster has signaled us to exit, and abort with an
1095 		 * error in that case. The error handler further up will call
1096 		 * do_pg_abort_backup() for us. Also check that if the backup was
1097 		 * started while still in recovery, the server wasn't promoted.
1098 		 * dp_pg_stop_backup() will check that too, but it's better to stop
1099 		 * the backup early than continue to the end and fail there.
1100 		 */
1101 		CHECK_FOR_INTERRUPTS();
1102 		if (RecoveryInProgress() != backup_started_in_recovery)
1103 			ereport(ERROR,
1104 					(errcode(ERRCODE_OBJECT_NOT_IN_PREREQUISITE_STATE),
1105 					 errmsg("the standby was promoted during online backup"),
1106 					 errhint("This means that the backup being taken is corrupt "
1107 							 "and should not be used. "
1108 							 "Try taking another online backup.")));
1109 
1110 		/* Scan for files that should be excluded */
1111 		excludeFound = false;
1112 		for (excludeIdx = 0; excludeFiles[excludeIdx].name != NULL; excludeIdx++)
1113 		{
1114 			int			cmplen = strlen(excludeFiles[excludeIdx].name);
1115 
1116 			if (!excludeFiles[excludeIdx].match_prefix)
1117 				cmplen++;
1118 			if (strncmp(de->d_name, excludeFiles[excludeIdx].name, cmplen) == 0)
1119 			{
1120 				elog(DEBUG1, "file \"%s\" excluded from backup", de->d_name);
1121 				excludeFound = true;
1122 				break;
1123 			}
1124 		}
1125 
1126 		if (excludeFound)
1127 			continue;
1128 
1129 		/* Exclude all forks for unlogged tables except the init fork */
1130 		if (isDbDir &&
1131 			parse_filename_for_nontemp_relation(de->d_name, &relOidChars,
1132 												&relForkNum))
1133 		{
1134 			/* Never exclude init forks */
1135 			if (relForkNum != INIT_FORKNUM)
1136 			{
1137 				char		initForkFile[MAXPGPATH];
1138 				char		relOid[OIDCHARS + 1];
1139 
1140 				/*
1141 				 * If any other type of fork, check if there is an init fork
1142 				 * with the same OID. If so, the file can be excluded.
1143 				 */
1144 				memcpy(relOid, de->d_name, relOidChars);
1145 				relOid[relOidChars] = '\0';
1146 				snprintf(initForkFile, sizeof(initForkFile), "%s/%s_init",
1147 						 path, relOid);
1148 
1149 				if (lstat(initForkFile, &statbuf) == 0)
1150 				{
1151 					elog(DEBUG2,
1152 						 "unlogged relation file \"%s\" excluded from backup",
1153 						 de->d_name);
1154 
1155 					continue;
1156 				}
1157 			}
1158 		}
1159 
1160 		/* Exclude temporary relations */
1161 		if (isDbDir && looks_like_temp_rel_name(de->d_name))
1162 		{
1163 			elog(DEBUG2,
1164 				 "temporary relation file \"%s\" excluded from backup",
1165 				 de->d_name);
1166 
1167 			continue;
1168 		}
1169 
1170 		snprintf(pathbuf, sizeof(pathbuf), "%s/%s", path, de->d_name);
1171 
1172 		/* Skip pg_control here to back up it last */
1173 		if (strcmp(pathbuf, "./global/pg_control") == 0)
1174 			continue;
1175 
1176 		if (lstat(pathbuf, &statbuf) != 0)
1177 		{
1178 			if (errno != ENOENT)
1179 				ereport(ERROR,
1180 						(errcode_for_file_access(),
1181 						 errmsg("could not stat file or directory \"%s\": %m",
1182 								pathbuf)));
1183 
1184 			/* If the file went away while scanning, it's not an error. */
1185 			continue;
1186 		}
1187 
1188 		/* Scan for directories whose contents should be excluded */
1189 		excludeFound = false;
1190 		for (excludeIdx = 0; excludeDirContents[excludeIdx] != NULL; excludeIdx++)
1191 		{
1192 			if (strcmp(de->d_name, excludeDirContents[excludeIdx]) == 0)
1193 			{
1194 				elog(DEBUG1, "contents of directory \"%s\" excluded from backup", de->d_name);
1195 				size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
1196 				excludeFound = true;
1197 				break;
1198 			}
1199 		}
1200 
1201 		if (excludeFound)
1202 			continue;
1203 
1204 		/*
1205 		 * Exclude contents of directory specified by statrelpath if not set
1206 		 * to the default (pg_stat_tmp) which is caught in the loop above.
1207 		 */
1208 		if (statrelpath != NULL && strcmp(pathbuf, statrelpath) == 0)
1209 		{
1210 			elog(DEBUG1, "contents of directory \"%s\" excluded from backup", statrelpath);
1211 			size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
1212 			continue;
1213 		}
1214 
1215 		/*
1216 		 * We can skip pg_wal, the WAL segments need to be fetched from the
1217 		 * WAL archive anyway. But include it as an empty directory anyway, so
1218 		 * we get permissions right.
1219 		 */
1220 		if (strcmp(pathbuf, "./pg_wal") == 0)
1221 		{
1222 			/* If pg_wal is a symlink, write it as a directory anyway */
1223 			size += _tarWriteDir(pathbuf, basepathlen, &statbuf, sizeonly);
1224 
1225 			/*
1226 			 * Also send archive_status directory (by hackishly reusing
1227 			 * statbuf from above ...).
1228 			 */
1229 			size += _tarWriteHeader("./pg_wal/archive_status", NULL, &statbuf,
1230 									sizeonly);
1231 
1232 			continue;			/* don't recurse into pg_wal */
1233 		}
1234 
1235 		/* Allow symbolic links in pg_tblspc only */
1236 		if (strcmp(path, "./pg_tblspc") == 0 &&
1237 #ifndef WIN32
1238 			S_ISLNK(statbuf.st_mode)
1239 #else
1240 			pgwin32_is_junction(pathbuf)
1241 #endif
1242 			)
1243 		{
1244 #if defined(HAVE_READLINK) || defined(WIN32)
1245 			char		linkpath[MAXPGPATH];
1246 			int			rllen;
1247 
1248 			rllen = readlink(pathbuf, linkpath, sizeof(linkpath));
1249 			if (rllen < 0)
1250 				ereport(ERROR,
1251 						(errcode_for_file_access(),
1252 						 errmsg("could not read symbolic link \"%s\": %m",
1253 								pathbuf)));
1254 			if (rllen >= sizeof(linkpath))
1255 				ereport(ERROR,
1256 						(errcode(ERRCODE_PROGRAM_LIMIT_EXCEEDED),
1257 						 errmsg("symbolic link \"%s\" target is too long",
1258 								pathbuf)));
1259 			linkpath[rllen] = '\0';
1260 
1261 			size += _tarWriteHeader(pathbuf + basepathlen + 1, linkpath,
1262 									&statbuf, sizeonly);
1263 #else
1264 
1265 			/*
1266 			 * If the platform does not have symbolic links, it should not be
1267 			 * possible to have tablespaces - clearly somebody else created
1268 			 * them. Warn about it and ignore.
1269 			 */
1270 			ereport(WARNING,
1271 					(errcode(ERRCODE_FEATURE_NOT_SUPPORTED),
1272 					 errmsg("tablespaces are not supported on this platform")));
1273 			continue;
1274 #endif							/* HAVE_READLINK */
1275 		}
1276 		else if (S_ISDIR(statbuf.st_mode))
1277 		{
1278 			bool		skip_this_dir = false;
1279 			ListCell   *lc;
1280 
1281 			/*
1282 			 * Store a directory entry in the tar file so we can get the
1283 			 * permissions right.
1284 			 */
1285 			size += _tarWriteHeader(pathbuf + basepathlen + 1, NULL, &statbuf,
1286 									sizeonly);
1287 
1288 			/*
1289 			 * Call ourselves recursively for a directory, unless it happens
1290 			 * to be a separate tablespace located within PGDATA.
1291 			 */
1292 			foreach(lc, tablespaces)
1293 			{
1294 				tablespaceinfo *ti = (tablespaceinfo *) lfirst(lc);
1295 
1296 				/*
1297 				 * ti->rpath is the tablespace relative path within PGDATA, or
1298 				 * NULL if the tablespace has been properly located somewhere
1299 				 * else.
1300 				 *
1301 				 * Skip past the leading "./" in pathbuf when comparing.
1302 				 */
1303 				if (ti->rpath && strcmp(ti->rpath, pathbuf + 2) == 0)
1304 				{
1305 					skip_this_dir = true;
1306 					break;
1307 				}
1308 			}
1309 
1310 			/*
1311 			 * skip sending directories inside pg_tblspc, if not required.
1312 			 */
1313 			if (strcmp(pathbuf, "./pg_tblspc") == 0 && !sendtblspclinks)
1314 				skip_this_dir = true;
1315 
1316 			if (!skip_this_dir)
1317 				size += sendDir(pathbuf, basepathlen, sizeonly, tablespaces, sendtblspclinks);
1318 		}
1319 		else if (S_ISREG(statbuf.st_mode))
1320 		{
1321 			bool		sent = false;
1322 
1323 			if (!sizeonly)
1324 				sent = sendFile(pathbuf, pathbuf + basepathlen + 1, &statbuf,
1325 								true);
1326 
1327 			if (sent || sizeonly)
1328 			{
1329 				/* Add size, rounded up to 512byte block */
1330 				size += ((statbuf.st_size + 511) & ~511);
1331 				size += 512;	/* Size of the header of the file */
1332 			}
1333 		}
1334 		else
1335 			ereport(WARNING,
1336 					(errmsg("skipping special file \"%s\"", pathbuf)));
1337 	}
1338 	FreeDir(dir);
1339 	return size;
1340 }
1341 
1342 /*
1343  * Check if a file should have its checksum validated.
1344  * We validate checksums on files in regular tablespaces
1345  * (including global and default) only, and in those there
1346  * are some files that are explicitly excluded.
1347  */
1348 static bool
is_checksummed_file(const char * fullpath,const char * filename)1349 is_checksummed_file(const char *fullpath, const char *filename)
1350 {
1351 	/* Check that the file is in a tablespace */
1352 	if (strncmp(fullpath, "./global/", 9) == 0 ||
1353 		strncmp(fullpath, "./base/", 7) == 0 ||
1354 		strncmp(fullpath, "/", 1) == 0)
1355 	{
1356 		int			excludeIdx;
1357 
1358 		/* Compare file against noChecksumFiles skip list */
1359 		for (excludeIdx = 0; noChecksumFiles[excludeIdx].name != NULL; excludeIdx++)
1360 		{
1361 			int			cmplen = strlen(noChecksumFiles[excludeIdx].name);
1362 
1363 			if (!noChecksumFiles[excludeIdx].match_prefix)
1364 				cmplen++;
1365 			if (strncmp(filename, noChecksumFiles[excludeIdx].name,
1366 						cmplen) == 0)
1367 				return false;
1368 		}
1369 
1370 		return true;
1371 	}
1372 	else
1373 		return false;
1374 }
1375 
1376 /*****
1377  * Functions for handling tar file format
1378  *
1379  * Copied from pg_dump, but modified to work with libpq for sending
1380  */
1381 
1382 
/*
 * Given the member, write the TAR header & send the file.
 *
 * 'readfilename' is the path the file is opened under; 'tarfilename' is the
 * member name written into the tar header.  'statbuf' supplies the metadata
 * for the header, and statbuf->st_size fixes how many bytes of file data are
 * sent: a file that turns out shorter is padded with zeros, and data beyond
 * that size is not sent.
 *
 * If 'missing_ok' is true, will not throw an error if the file is not found.
 *
 * When checksum verification has not been disabled (noverify_checksums),
 * data checksums are enabled, and is_checksummed_file() accepts the file,
 * each block read is verified and failures are reported as warnings and
 * added to total_checksum_failures.
 *
 * Returns true if the file was successfully sent, false if 'missing_ok',
 * and the file did not exist.
 */
static bool
sendFile(const char *readfilename, const char *tarfilename, struct stat *statbuf,
		 bool missing_ok)
{
	FILE	   *fp;
	BlockNumber blkno = 0;
	bool		block_retry = false;
	char		buf[TAR_SEND_SIZE];
	uint16		checksum;
	int			checksum_failures = 0;
	off_t		cnt;
	int			i;
	pgoff_t		len = 0;		/* bytes of file data sent so far */
	char	   *page;
	size_t		pad;
	PageHeader	phdr;
	int			segmentno = 0;
	char	   *segmentpath;
	bool		verify_checksum = false;

	fp = AllocateFile(readfilename, "rb");
	if (fp == NULL)
	{
		/* A vanished file is tolerated only when the caller allows it */
		if (errno == ENOENT && missing_ok)
			return false;
		ereport(ERROR,
				(errcode_for_file_access(),
				 errmsg("could not open file \"%s\": %m", readfilename)));
	}

	/* The header is always really written here (sizeonly = false) */
	_tarWriteHeader(tarfilename, NULL, statbuf, false);

	if (!noverify_checksums && DataChecksumsEnabled())
	{
		char	   *filename;

		/*
		 * Get the filename (excluding path).  As last_dir_separator()
		 * includes the last directory separator, we chop that off by
		 * incrementing the pointer.
		 */
		filename = last_dir_separator(readfilename) + 1;

		if (is_checksummed_file(readfilename, filename))
		{
			verify_checksum = true;

			/*
			 * Cut off at the segment boundary (".") to get the segment number
			 * in order to mix it into the checksum.
			 */
			segmentpath = strstr(filename, ".");
			if (segmentpath != NULL)
			{
				/*
				 * A name with a "." suffix must carry a nonzero segment
				 * number; atoi() yielding 0 is treated as an error.
				 */
				segmentno = atoi(segmentpath + 1);
				if (segmentno == 0)
					ereport(ERROR,
							(errmsg("invalid segment number %d in file \"%s\"",
									segmentno, filename)));
			}
		}
	}

	/* Read at most the bytes still missing up to statbuf->st_size */
	while ((cnt = fread(buf, 1, Min(sizeof(buf), statbuf->st_size - len), fp)) > 0)
	{
		/*
		 * The checksums are verified at block level, so we iterate over the
		 * buffer in chunks of BLCKSZ, after making sure that
		 * TAR_SEND_SIZE/buf is divisible by BLCKSZ and we read a multiple of
		 * BLCKSZ bytes.
		 */
		Assert(TAR_SEND_SIZE % BLCKSZ == 0);

		/*
		 * A read that is not a whole number of blocks switches verification
		 * off for the remainder of this file.
		 */
		if (verify_checksum && (cnt % BLCKSZ != 0))
		{
			ereport(WARNING,
					(errmsg("cannot verify checksum in file \"%s\", block "
							"%d: read buffer size %d and page size %d "
							"differ",
							readfilename, blkno, (int) cnt, BLCKSZ)));
			verify_checksum = false;
		}

		if (verify_checksum)
		{
			for (i = 0; i < cnt / BLCKSZ; i++)
			{
				page = buf + BLCKSZ * i;

				/*
				 * Only check pages which have not been modified since the
				 * start of the base backup. Otherwise, they might have been
				 * written only halfway and the checksum would not be valid.
				 * However, replaying WAL would reinstate the correct page in
				 * this case. We also skip completely new pages, since they
				 * don't have a checksum yet.
				 */
				if (!PageIsNew(page) && PageGetLSN(page) < startptr)
				{
					/* Block number is file-relative; add the segment offset */
					checksum = pg_checksum_page((char *) page, blkno + segmentno * RELSEG_SIZE);
					phdr = (PageHeader) page;
					if (phdr->pd_checksum != checksum)
					{
						/*
						 * Retry the block on the first failure.  It's
						 * possible that we read the first 4K page of the
						 * block just before postgres updated the entire block
						 * so it ends up looking torn to us.  We only need to
						 * retry once because the LSN should be updated to
						 * something we can ignore on the next pass.  If the
						 * error happens again then it is a true validation
						 * failure.
						 */
						if (block_retry == false)
						{
							/*
							 * Reread the failed block: seek back from the
							 * end of the chunk just read to the start of
							 * block i.
							 */
							if (fseek(fp, -(cnt - BLCKSZ * i), SEEK_CUR) == -1)
							{
								ereport(ERROR,
										(errcode_for_file_access(),
										 errmsg("could not fseek in file \"%s\": %m",
												readfilename)));
							}

							if (fread(buf + BLCKSZ * i, 1, BLCKSZ, fp) != BLCKSZ)
							{
								/*
								 * If we hit end-of-file, a concurrent
								 * truncation must have occurred, so break out
								 * of this loop just as if the initial fread()
								 * returned 0. We'll drop through to the same
								 * code that handles that case. (We must fix
								 * up cnt first, though.)
								 */
								if (feof(fp))
								{
									cnt = BLCKSZ * i;
									break;
								}

								ereport(ERROR,
										(errcode_for_file_access(),
										 errmsg("could not reread block %d of file \"%s\": %m",
												blkno, readfilename)));
							}

							/*
							 * Move forward again past the blocks after block
							 * i, restoring the position we had before the
							 * retry.
							 */
							if (fseek(fp, cnt - BLCKSZ * i - BLCKSZ, SEEK_CUR) == -1)
							{
								ereport(ERROR,
										(errcode_for_file_access(),
										 errmsg("could not fseek in file \"%s\": %m",
												readfilename)));
							}

							/* Set flag so we know a retry was attempted */
							block_retry = true;

							/*
							 * Reset loop to validate the block again; the
							 * decrement cancels the loop's i++, and blkno is
							 * left untouched.
							 */
							i--;
							continue;
						}

						checksum_failures++;

						/* Report only the first five failures individually */
						if (checksum_failures <= 5)
							ereport(WARNING,
									(errmsg("checksum verification failed in "
											"file \"%s\", block %d: calculated "
											"%X but expected %X",
											readfilename, blkno, checksum,
											phdr->pd_checksum)));
						if (checksum_failures == 5)
							ereport(WARNING,
									(errmsg("further checksum verification "
											"failures in file \"%s\" will not "
											"be reported", readfilename)));
					}
				}
				/* Done with this block: allow one retry for the next one */
				block_retry = false;
				blkno++;
			}
		}

		/* Send the chunk as a CopyData message */
		if (pq_putmessage('d', buf, cnt))
			ereport(ERROR,
					(errmsg("base backup could not send data, aborting backup")));

		len += cnt;
		throttle(cnt);

		if (feof(fp) || len >= statbuf->st_size)
		{
			/*
			 * Reached end of file. The file could be longer, if it was
			 * extended while we were sending it, but for a base backup we can
			 * ignore such extended data. It will be restored from WAL.
			 */
			break;
		}
	}

	CHECK_FREAD_ERROR(fp, readfilename);

	/* If the file was truncated while we were sending it, pad it with zeros */
	if (len < statbuf->st_size)
	{
		MemSet(buf, 0, sizeof(buf));
		while (len < statbuf->st_size)
		{
			cnt = Min(sizeof(buf), statbuf->st_size - len);
			pq_putmessage('d', buf, cnt);
			len += cnt;
			throttle(cnt);
		}
	}

	/*
	 * Pad to 512 byte boundary, per tar format requirements. (This small
	 * piece of data is probably not worth throttling.)
	 */
	pad = ((len + 511) & ~511) - len;
	if (pad > 0)
	{
		MemSet(buf, 0, pad);
		pq_putmessage('d', buf, pad);
	}

	FreeFile(fp);

	/* Per-file summary, emitted only when more than one block failed */
	if (checksum_failures > 1)
	{
		ereport(WARNING,
				(errmsg("file \"%s\" has a total of %d checksum verification "
						"failures", readfilename, checksum_failures)));
	}
	total_checksum_failures += checksum_failures;

	return true;
}
1631 
1632 
1633 static int64
_tarWriteHeader(const char * filename,const char * linktarget,struct stat * statbuf,bool sizeonly)1634 _tarWriteHeader(const char *filename, const char *linktarget,
1635 				struct stat *statbuf, bool sizeonly)
1636 {
1637 	char		h[512];
1638 	enum tarError rc;
1639 
1640 	if (!sizeonly)
1641 	{
1642 		rc = tarCreateHeader(h, filename, linktarget, statbuf->st_size,
1643 							 statbuf->st_mode, statbuf->st_uid, statbuf->st_gid,
1644 							 statbuf->st_mtime);
1645 
1646 		switch (rc)
1647 		{
1648 			case TAR_OK:
1649 				break;
1650 			case TAR_NAME_TOO_LONG:
1651 				ereport(ERROR,
1652 						(errmsg("file name too long for tar format: \"%s\"",
1653 								filename)));
1654 				break;
1655 			case TAR_SYMLINK_TOO_LONG:
1656 				ereport(ERROR,
1657 						(errmsg("symbolic link target too long for tar format: "
1658 								"file name \"%s\", target \"%s\"",
1659 								filename, linktarget)));
1660 				break;
1661 			default:
1662 				elog(ERROR, "unrecognized tar error: %d", rc);
1663 		}
1664 
1665 		pq_putmessage('d', h, sizeof(h));
1666 	}
1667 
1668 	return sizeof(h);
1669 }
1670 
1671 /*
1672  * Write tar header for a directory.  If the entry in statbuf is a link then
1673  * write it as a directory anyway.
1674  */
1675 static int64
_tarWriteDir(const char * pathbuf,int basepathlen,struct stat * statbuf,bool sizeonly)1676 _tarWriteDir(const char *pathbuf, int basepathlen, struct stat *statbuf,
1677 			 bool sizeonly)
1678 {
1679 	/* If symlink, write it as a directory anyway */
1680 #ifndef WIN32
1681 	if (S_ISLNK(statbuf->st_mode))
1682 #else
1683 	if (pgwin32_is_junction(pathbuf))
1684 #endif
1685 		statbuf->st_mode = S_IFDIR | pg_dir_create_mode;
1686 
1687 	return _tarWriteHeader(pathbuf + basepathlen + 1, NULL, statbuf, sizeonly);
1688 }
1689 
1690 /*
1691  * Increment the network transfer counter by the given number of bytes,
1692  * and sleep if necessary to comply with the requested network transfer
1693  * rate.
1694  */
1695 static void
throttle(size_t increment)1696 throttle(size_t increment)
1697 {
1698 	TimeOffset	elapsed_min;
1699 
1700 	if (throttling_counter < 0)
1701 		return;
1702 
1703 	throttling_counter += increment;
1704 	if (throttling_counter < throttling_sample)
1705 		return;
1706 
1707 	/* How much time should have elapsed at minimum? */
1708 	elapsed_min = elapsed_min_unit *
1709 		(throttling_counter / throttling_sample);
1710 
1711 	/*
1712 	 * Since the latch could be set repeatedly because of concurrently WAL
1713 	 * activity, sleep in a loop to ensure enough time has passed.
1714 	 */
1715 	for (;;)
1716 	{
1717 		TimeOffset	elapsed,
1718 					sleep;
1719 		int			wait_result;
1720 
1721 		/* Time elapsed since the last measurement (and possible wake up). */
1722 		elapsed = GetCurrentTimestamp() - throttled_last;
1723 
1724 		/* sleep if the transfer is faster than it should be */
1725 		sleep = elapsed_min - elapsed;
1726 		if (sleep <= 0)
1727 			break;
1728 
1729 		ResetLatch(MyLatch);
1730 
1731 		/* We're eating a potentially set latch, so check for interrupts */
1732 		CHECK_FOR_INTERRUPTS();
1733 
1734 		/*
1735 		 * (TAR_SEND_SIZE / throttling_sample * elapsed_min_unit) should be
1736 		 * the maximum time to sleep. Thus the cast to long is safe.
1737 		 */
1738 		wait_result = WaitLatch(MyLatch,
1739 								WL_LATCH_SET | WL_TIMEOUT | WL_POSTMASTER_DEATH,
1740 								(long) (sleep / 1000),
1741 								WAIT_EVENT_BASE_BACKUP_THROTTLE);
1742 
1743 		if (wait_result & WL_LATCH_SET)
1744 			CHECK_FOR_INTERRUPTS();
1745 
1746 		/* Done waiting? */
1747 		if (wait_result & WL_TIMEOUT)
1748 			break;
1749 	}
1750 
1751 	/*
1752 	 * As we work with integers, only whole multiple of throttling_sample was
1753 	 * processed. The rest will be done during the next call of this function.
1754 	 */
1755 	throttling_counter %= throttling_sample;
1756 
1757 	/*
1758 	 * Time interval for the remaining amount and possible next increments
1759 	 * starts now.
1760 	 */
1761 	throttled_last = GetCurrentTimestamp();
1762 }
1763