1 /*-------------------------------------------------------------------------
2  *
3  * pg_checksums.c
4  *	  Checks, enables or disables page level checksums for an offline
5  *	  cluster
6  *
7  * Copyright (c) 2010-2020, PostgreSQL Global Development Group
8  *
9  * IDENTIFICATION
10  *	  src/bin/pg_checksums/pg_checksums.c
11  *
12  *-------------------------------------------------------------------------
13  */
14 
15 #include "postgres_fe.h"
16 
17 #include <dirent.h>
18 #include <time.h>
19 #include <sys/stat.h>
20 #include <unistd.h>
21 
22 #include "access/xlog_internal.h"
23 #include "common/controldata_utils.h"
24 #include "common/file_perm.h"
25 #include "common/file_utils.h"
26 #include "common/logging.h"
27 #include "getopt_long.h"
28 #include "pg_getopt.h"
29 #include "storage/bufpage.h"
30 #include "storage/checksum.h"
31 #include "storage/checksum_impl.h"
32 
33 
/* Number of files opened by scan_file() */
static int64 files = 0;
/* Number of full blocks read by scan_file() */
static int64 blocks = 0;
/* Number of blocks whose stored checksum did not match (check mode only) */
static int64 badblocks = 0;
/* Contents of pg_control, loaded in main() via get_controlfile() */
static ControlFileData *ControlFile;

/* Filenode given with -f/--filenode, or NULL to process every relation */
static char *only_filenode = NULL;
/* Cleared by -N/--no-sync */
static bool do_sync = true;
/* Set by -v/--verbose */
static bool verbose = false;
/* Set by -P/--progress */
static bool showprogress = false;
43 
/*
 * Operation requested on the command line (-c, -d or -e).  The last of
 * these options seen while parsing wins.
 */
typedef enum
{
	PG_MODE_CHECK,				/* -c: verify stored checksums */
	PG_MODE_DISABLE,			/* -d: clear the checksum version in pg_control */
	PG_MODE_ENABLE				/* -e: write checksums into every page */
} PgChecksumMode;
50 
/*
 * Filename components.
 *
 * XXX: fd.h is not included here as frontend side code is not able to
 * interact with the backend-side definitions for the various fsync
 * wrappers.
 *
 * Both names are used by scan_directory() as prefixes: directory entries
 * starting with either of them are skipped.
 */
#define PG_TEMP_FILES_DIR "pgsql_tmp"
#define PG_TEMP_FILE_PREFIX "pgsql_tmp"

/* Selected operation; checking is the default when no mode option is given */
static PgChecksumMode mode = PG_MODE_CHECK;

/* Program name derived from argv[0] in main(), used in help and hints */
static const char *progname;
64 
65 /*
66  * Progress status information.
67  */
68 int64		total_size = 0;
69 int64		current_size = 0;
70 static pg_time_t last_progress_report = 0;
71 
/*
 * Print the command-line help text on standard output.
 *
 * progname is substituted into the synopsis lines; the bug-report address
 * and project URL come from the PACKAGE_* build-time macros.
 */
static void
usage(void)
{
	printf(_("%s enables, disables, or verifies data checksums in a PostgreSQL database cluster.\n\n"), progname);
	printf(_("Usage:\n"));
	printf(_("  %s [OPTION]... [DATADIR]\n"), progname);
	printf(_("\nOptions:\n"));
	printf(_(" [-D, --pgdata=]DATADIR    data directory\n"));
	printf(_("  -c, --check              check data checksums (default)\n"));
	printf(_("  -d, --disable            disable data checksums\n"));
	printf(_("  -e, --enable             enable data checksums\n"));
	printf(_("  -f, --filenode=FILENODE  check only relation with specified filenode\n"));
	printf(_("  -N, --no-sync            do not wait for changes to be written safely to disk\n"));
	printf(_("  -P, --progress           show progress information\n"));
	printf(_("  -v, --verbose            output verbose messages\n"));
	printf(_("  -V, --version            output version information, then exit\n"));
	printf(_("  -?, --help               show this help, then exit\n"));
	printf(_("\nIf no data directory (DATADIR) is specified, "
			 "the environment variable PGDATA\nis used.\n\n"));
	printf(_("Report bugs to <%s>.\n"), PACKAGE_BUGREPORT);
	printf(_("%s home page: <%s>\n"), PACKAGE_NAME, PACKAGE_URL);
}
94 
/*
 * Definition of one element of an exclusion list, used for files
 * to exclude from checksum validation.  "name" is the name of the file
 * or path to check for exclusion.  If "match_prefix" is true, any items
 * matching the name as prefix are excluded; otherwise only an exact match
 * of the whole name is excluded.
 */
struct exclude_list_item
{
	const char *name;			/* file name, or leading part of one */
	bool		match_prefix;	/* true: prefix match, false: exact match */
};
106 
/*
 * List of files excluded from checksum validation.
 *
 * Each entry pairs a name with its match_prefix flag.  The list is
 * terminated by an entry whose name is NULL, which is what skipfile()
 * uses to stop iterating.
 *
 * Note: this list should be kept in sync with what basebackup.c includes.
 */
static const struct exclude_list_item skip[] = {
	{"pg_control", false},
	{"pg_filenode.map", false},
	{"pg_internal.init", true},
	{"PG_VERSION", false},
#ifdef EXEC_BACKEND
	{"config_exec_params", true},
#endif
	{NULL, false}				/* end-of-list marker */
};
122 
123 /*
124  * Report current progress status.  Parts borrowed from
125  * src/bin/pg_basebackup/pg_basebackup.c.
126  */
127 static void
progress_report(bool finished)128 progress_report(bool finished)
129 {
130 	int			percent;
131 	char		total_size_str[32];
132 	char		current_size_str[32];
133 	pg_time_t	now;
134 
135 	Assert(showprogress);
136 
137 	now = time(NULL);
138 	if (now == last_progress_report && !finished)
139 		return;					/* Max once per second */
140 
141 	/* Save current time */
142 	last_progress_report = now;
143 
144 	/* Adjust total size if current_size is larger */
145 	if (current_size > total_size)
146 		total_size = current_size;
147 
148 	/* Calculate current percentage of size done */
149 	percent = total_size ? (int) ((current_size) * 100 / total_size) : 0;
150 
151 	/*
152 	 * Separate step to keep platform-dependent format code out of
153 	 * translatable strings.  And we only test for INT64_FORMAT availability
154 	 * in snprintf, not fprintf.
155 	 */
156 	snprintf(total_size_str, sizeof(total_size_str), INT64_FORMAT,
157 			 total_size / (1024 * 1024));
158 	snprintf(current_size_str, sizeof(current_size_str), INT64_FORMAT,
159 			 current_size / (1024 * 1024));
160 
161 	fprintf(stderr, _("%*s/%s MB (%d%%) computed"),
162 			(int) strlen(current_size_str), current_size_str, total_size_str,
163 			percent);
164 
165 	/*
166 	 * Stay on the same line if reporting to a terminal and we're not done
167 	 * yet.
168 	 */
169 	fputc((!finished && isatty(fileno(stderr))) ? '\r' : '\n', stderr);
170 }
171 
172 static bool
skipfile(const char * fn)173 skipfile(const char *fn)
174 {
175 	int			excludeIdx;
176 
177 	for (excludeIdx = 0; skip[excludeIdx].name != NULL; excludeIdx++)
178 	{
179 		int			cmplen = strlen(skip[excludeIdx].name);
180 
181 		if (!skip[excludeIdx].match_prefix)
182 			cmplen++;
183 		if (strncmp(skip[excludeIdx].name, fn, cmplen) == 0)
184 			return true;
185 	}
186 
187 	return false;
188 }
189 
190 static void
scan_file(const char * fn,BlockNumber segmentno)191 scan_file(const char *fn, BlockNumber segmentno)
192 {
193 	PGAlignedBlock buf;
194 	PageHeader	header = (PageHeader) buf.data;
195 	int			f;
196 	BlockNumber blockno;
197 	int			flags;
198 
199 	Assert(mode == PG_MODE_ENABLE ||
200 		   mode == PG_MODE_CHECK);
201 
202 	flags = (mode == PG_MODE_ENABLE) ? O_RDWR : O_RDONLY;
203 	f = open(fn, PG_BINARY | flags, 0);
204 
205 	if (f < 0)
206 	{
207 		pg_log_error("could not open file \"%s\": %m", fn);
208 		exit(1);
209 	}
210 
211 	files++;
212 
213 	for (blockno = 0;; blockno++)
214 	{
215 		uint16		csum;
216 		int			r = read(f, buf.data, BLCKSZ);
217 
218 		if (r == 0)
219 			break;
220 		if (r != BLCKSZ)
221 		{
222 			if (r < 0)
223 				pg_log_error("could not read block %u in file \"%s\": %m",
224 							 blockno, fn);
225 			else
226 				pg_log_error("could not read block %u in file \"%s\": read %d of %d",
227 							 blockno, fn, r, BLCKSZ);
228 			exit(1);
229 		}
230 		blocks++;
231 
232 		/*
233 		 * Since the file size is counted as total_size for progress status
234 		 * information, the sizes of all pages including new ones in the file
235 		 * should be counted as current_size. Otherwise the progress reporting
236 		 * calculated using those counters may not reach 100%.
237 		 */
238 		current_size += r;
239 
240 		/* New pages have no checksum yet */
241 		if (PageIsNew(header))
242 			continue;
243 
244 		csum = pg_checksum_page(buf.data, blockno + segmentno * RELSEG_SIZE);
245 		if (mode == PG_MODE_CHECK)
246 		{
247 			if (csum != header->pd_checksum)
248 			{
249 				if (ControlFile->data_checksum_version == PG_DATA_CHECKSUM_VERSION)
250 					pg_log_error("checksum verification failed in file \"%s\", block %u: calculated checksum %X but block contains %X",
251 								 fn, blockno, csum, header->pd_checksum);
252 				badblocks++;
253 			}
254 		}
255 		else if (mode == PG_MODE_ENABLE)
256 		{
257 			int			w;
258 
259 			/* Set checksum in page header */
260 			header->pd_checksum = csum;
261 
262 			/* Seek back to beginning of block */
263 			if (lseek(f, -BLCKSZ, SEEK_CUR) < 0)
264 			{
265 				pg_log_error("seek failed for block %u in file \"%s\": %m", blockno, fn);
266 				exit(1);
267 			}
268 
269 			/* Write block with checksum */
270 			w = write(f, buf.data, BLCKSZ);
271 			if (w != BLCKSZ)
272 			{
273 				if (w < 0)
274 					pg_log_error("could not write block %u in file \"%s\": %m",
275 								 blockno, fn);
276 				else
277 					pg_log_error("could not write block %u in file \"%s\": wrote %d of %d",
278 								 blockno, fn, w, BLCKSZ);
279 				exit(1);
280 			}
281 		}
282 
283 		if (showprogress)
284 			progress_report(false);
285 	}
286 
287 	if (verbose)
288 	{
289 		if (mode == PG_MODE_CHECK)
290 			pg_log_info("checksums verified in file \"%s\"", fn);
291 		if (mode == PG_MODE_ENABLE)
292 			pg_log_info("checksums enabled in file \"%s\"", fn);
293 	}
294 
295 	close(f);
296 }
297 
298 /*
299  * Scan the given directory for items which can be checksummed and
300  * operate on each one of them.  If "sizeonly" is true, the size of
301  * all the items which have checksums is computed and returned back
302  * to the caller without operating on the files.  This is used to compile
303  * the total size of the data directory for progress reports.
304  */
305 static int64
scan_directory(const char * basedir,const char * subdir,bool sizeonly)306 scan_directory(const char *basedir, const char *subdir, bool sizeonly)
307 {
308 	int64		dirsize = 0;
309 	char		path[MAXPGPATH];
310 	DIR		   *dir;
311 	struct dirent *de;
312 
313 	snprintf(path, sizeof(path), "%s/%s", basedir, subdir);
314 	dir = opendir(path);
315 	if (!dir)
316 	{
317 		pg_log_error("could not open directory \"%s\": %m", path);
318 		exit(1);
319 	}
320 	while ((de = readdir(dir)) != NULL)
321 	{
322 		char		fn[MAXPGPATH];
323 		struct stat st;
324 
325 		if (strcmp(de->d_name, ".") == 0 ||
326 			strcmp(de->d_name, "..") == 0)
327 			continue;
328 
329 		/* Skip temporary files */
330 		if (strncmp(de->d_name,
331 					PG_TEMP_FILE_PREFIX,
332 					strlen(PG_TEMP_FILE_PREFIX)) == 0)
333 			continue;
334 
335 		/* Skip temporary folders */
336 		if (strncmp(de->d_name,
337 					PG_TEMP_FILES_DIR,
338 					strlen(PG_TEMP_FILES_DIR)) == 0)
339 			continue;
340 
341 		snprintf(fn, sizeof(fn), "%s/%s", path, de->d_name);
342 		if (lstat(fn, &st) < 0)
343 		{
344 			pg_log_error("could not stat file \"%s\": %m", fn);
345 			exit(1);
346 		}
347 		if (S_ISREG(st.st_mode))
348 		{
349 			char		fnonly[MAXPGPATH];
350 			char	   *forkpath,
351 					   *segmentpath;
352 			BlockNumber segmentno = 0;
353 
354 			if (skipfile(de->d_name))
355 				continue;
356 
357 			/*
358 			 * Cut off at the segment boundary (".") to get the segment number
359 			 * in order to mix it into the checksum. Then also cut off at the
360 			 * fork boundary, to get the filenode the file belongs to for
361 			 * filtering.
362 			 */
363 			strlcpy(fnonly, de->d_name, sizeof(fnonly));
364 			segmentpath = strchr(fnonly, '.');
365 			if (segmentpath != NULL)
366 			{
367 				*segmentpath++ = '\0';
368 				segmentno = atoi(segmentpath);
369 				if (segmentno == 0)
370 				{
371 					pg_log_error("invalid segment number %d in file name \"%s\"",
372 								 segmentno, fn);
373 					exit(1);
374 				}
375 			}
376 
377 			forkpath = strchr(fnonly, '_');
378 			if (forkpath != NULL)
379 				*forkpath++ = '\0';
380 
381 			if (only_filenode && strcmp(only_filenode, fnonly) != 0)
382 				/* filenode not to be included */
383 				continue;
384 
385 			dirsize += st.st_size;
386 
387 			/*
388 			 * No need to work on the file when calculating only the size of
389 			 * the items in the data folder.
390 			 */
391 			if (!sizeonly)
392 				scan_file(fn, segmentno);
393 		}
394 #ifndef WIN32
395 		else if (S_ISDIR(st.st_mode) || S_ISLNK(st.st_mode))
396 #else
397 		else if (S_ISDIR(st.st_mode) || pgwin32_is_junction(fn))
398 #endif
399 		{
400 			/*
401 			 * If going through the entries of pg_tblspc, we assume to operate
402 			 * on tablespace locations where only TABLESPACE_VERSION_DIRECTORY
403 			 * is valid, resolving the linked locations and dive into them
404 			 * directly.
405 			 */
406 			if (strncmp("pg_tblspc", subdir, strlen("pg_tblspc")) == 0)
407 			{
408 				char		tblspc_path[MAXPGPATH];
409 				struct stat tblspc_st;
410 
411 				/*
412 				 * Resolve tablespace location path and check whether
413 				 * TABLESPACE_VERSION_DIRECTORY exists.  Not finding a valid
414 				 * location is unexpected, since there should be no orphaned
415 				 * links and no links pointing to something else than a
416 				 * directory.
417 				 */
418 				snprintf(tblspc_path, sizeof(tblspc_path), "%s/%s/%s",
419 						 path, de->d_name, TABLESPACE_VERSION_DIRECTORY);
420 
421 				if (lstat(tblspc_path, &tblspc_st) < 0)
422 				{
423 					pg_log_error("could not stat file \"%s\": %m",
424 								 tblspc_path);
425 					exit(1);
426 				}
427 
428 				/*
429 				 * Move backwards once as the scan needs to happen for the
430 				 * contents of TABLESPACE_VERSION_DIRECTORY.
431 				 */
432 				snprintf(tblspc_path, sizeof(tblspc_path), "%s/%s",
433 						 path, de->d_name);
434 
435 				/* Looks like a valid tablespace location */
436 				dirsize += scan_directory(tblspc_path,
437 										  TABLESPACE_VERSION_DIRECTORY,
438 										  sizeonly);
439 			}
440 			else
441 			{
442 				dirsize += scan_directory(path, de->d_name, sizeonly);
443 			}
444 		}
445 	}
446 	closedir(dir);
447 	return dirsize;
448 }
449 
450 int
main(int argc,char * argv[])451 main(int argc, char *argv[])
452 {
453 	static struct option long_options[] = {
454 		{"check", no_argument, NULL, 'c'},
455 		{"pgdata", required_argument, NULL, 'D'},
456 		{"disable", no_argument, NULL, 'd'},
457 		{"enable", no_argument, NULL, 'e'},
458 		{"filenode", required_argument, NULL, 'f'},
459 		{"no-sync", no_argument, NULL, 'N'},
460 		{"progress", no_argument, NULL, 'P'},
461 		{"verbose", no_argument, NULL, 'v'},
462 		{NULL, 0, NULL, 0}
463 	};
464 
465 	char	   *DataDir = NULL;
466 	int			c;
467 	int			option_index;
468 	bool		crc_ok;
469 
470 	pg_logging_init(argv[0]);
471 	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_checksums"));
472 	progname = get_progname(argv[0]);
473 
474 	if (argc > 1)
475 	{
476 		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
477 		{
478 			usage();
479 			exit(0);
480 		}
481 		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
482 		{
483 			puts("pg_checksums (PostgreSQL) " PG_VERSION);
484 			exit(0);
485 		}
486 	}
487 
488 	while ((c = getopt_long(argc, argv, "cD:deNPf:v", long_options, &option_index)) != -1)
489 	{
490 		switch (c)
491 		{
492 			case 'c':
493 				mode = PG_MODE_CHECK;
494 				break;
495 			case 'd':
496 				mode = PG_MODE_DISABLE;
497 				break;
498 			case 'e':
499 				mode = PG_MODE_ENABLE;
500 				break;
501 			case 'f':
502 				if (atoi(optarg) == 0)
503 				{
504 					pg_log_error("invalid filenode specification, must be numeric: %s", optarg);
505 					exit(1);
506 				}
507 				only_filenode = pstrdup(optarg);
508 				break;
509 			case 'N':
510 				do_sync = false;
511 				break;
512 			case 'v':
513 				verbose = true;
514 				break;
515 			case 'D':
516 				DataDir = optarg;
517 				break;
518 			case 'P':
519 				showprogress = true;
520 				break;
521 			default:
522 				fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
523 				exit(1);
524 		}
525 	}
526 
527 	if (DataDir == NULL)
528 	{
529 		if (optind < argc)
530 			DataDir = argv[optind++];
531 		else
532 			DataDir = getenv("PGDATA");
533 
534 		/* If no DataDir was specified, and none could be found, error out */
535 		if (DataDir == NULL)
536 		{
537 			pg_log_error("no data directory specified");
538 			fprintf(stderr, _("Try \"%s --help\" for more information.\n"), progname);
539 			exit(1);
540 		}
541 	}
542 
543 	/* Complain if any arguments remain */
544 	if (optind < argc)
545 	{
546 		pg_log_error("too many command-line arguments (first is \"%s\")",
547 					 argv[optind]);
548 		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
549 				progname);
550 		exit(1);
551 	}
552 
553 	/* filenode checking only works in --check mode */
554 	if (mode != PG_MODE_CHECK && only_filenode)
555 	{
556 		pg_log_error("option -f/--filenode can only be used with --check");
557 		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
558 				progname);
559 		exit(1);
560 	}
561 
562 	/* Read the control file and check compatibility */
563 	ControlFile = get_controlfile(DataDir, &crc_ok);
564 	if (!crc_ok)
565 	{
566 		pg_log_error("pg_control CRC value is incorrect");
567 		exit(1);
568 	}
569 
570 	if (ControlFile->pg_control_version != PG_CONTROL_VERSION)
571 	{
572 		pg_log_error("cluster is not compatible with this version of pg_checksums");
573 		exit(1);
574 	}
575 
576 	if (ControlFile->blcksz != BLCKSZ)
577 	{
578 		pg_log_error("database cluster is not compatible");
579 		fprintf(stderr, _("The database cluster was initialized with block size %u, but pg_checksums was compiled with block size %u.\n"),
580 				ControlFile->blcksz, BLCKSZ);
581 		exit(1);
582 	}
583 
584 	/*
585 	 * Check if cluster is running.  A clean shutdown is required to avoid
586 	 * random checksum failures caused by torn pages.  Note that this doesn't
587 	 * guard against someone starting the cluster concurrently.
588 	 */
589 	if (ControlFile->state != DB_SHUTDOWNED &&
590 		ControlFile->state != DB_SHUTDOWNED_IN_RECOVERY)
591 	{
592 		pg_log_error("cluster must be shut down");
593 		exit(1);
594 	}
595 
596 	if (ControlFile->data_checksum_version == 0 &&
597 		mode == PG_MODE_CHECK)
598 	{
599 		pg_log_error("data checksums are not enabled in cluster");
600 		exit(1);
601 	}
602 
603 	if (ControlFile->data_checksum_version == 0 &&
604 		mode == PG_MODE_DISABLE)
605 	{
606 		pg_log_error("data checksums are already disabled in cluster");
607 		exit(1);
608 	}
609 
610 	if (ControlFile->data_checksum_version > 0 &&
611 		mode == PG_MODE_ENABLE)
612 	{
613 		pg_log_error("data checksums are already enabled in cluster");
614 		exit(1);
615 	}
616 
617 	/* Operate on all files if checking or enabling checksums */
618 	if (mode == PG_MODE_CHECK || mode == PG_MODE_ENABLE)
619 	{
620 		/*
621 		 * If progress status information is requested, we need to scan the
622 		 * directory tree twice: once to know how much total data needs to be
623 		 * processed and once to do the real work.
624 		 */
625 		if (showprogress)
626 		{
627 			total_size = scan_directory(DataDir, "global", true);
628 			total_size += scan_directory(DataDir, "base", true);
629 			total_size += scan_directory(DataDir, "pg_tblspc", true);
630 		}
631 
632 		(void) scan_directory(DataDir, "global", false);
633 		(void) scan_directory(DataDir, "base", false);
634 		(void) scan_directory(DataDir, "pg_tblspc", false);
635 
636 		if (showprogress)
637 			progress_report(true);
638 
639 		printf(_("Checksum operation completed\n"));
640 		printf(_("Files scanned:  %s\n"), psprintf(INT64_FORMAT, files));
641 		printf(_("Blocks scanned: %s\n"), psprintf(INT64_FORMAT, blocks));
642 		if (mode == PG_MODE_CHECK)
643 		{
644 			printf(_("Bad checksums:  %s\n"), psprintf(INT64_FORMAT, badblocks));
645 			printf(_("Data checksum version: %u\n"), ControlFile->data_checksum_version);
646 
647 			if (badblocks > 0)
648 				exit(1);
649 		}
650 	}
651 
652 	/*
653 	 * Finally make the data durable on disk if enabling or disabling
654 	 * checksums.  Flush first the data directory for safety, and then update
655 	 * the control file to keep the switch consistent.
656 	 */
657 	if (mode == PG_MODE_ENABLE || mode == PG_MODE_DISABLE)
658 	{
659 		ControlFile->data_checksum_version =
660 			(mode == PG_MODE_ENABLE) ? PG_DATA_CHECKSUM_VERSION : 0;
661 
662 		if (do_sync)
663 		{
664 			pg_log_info("syncing data directory");
665 			fsync_pgdata(DataDir, PG_VERSION_NUM);
666 		}
667 
668 		pg_log_info("updating control file");
669 		update_controlfile(DataDir, ControlFile, do_sync);
670 
671 		if (verbose)
672 			printf(_("Data checksum version: %u\n"), ControlFile->data_checksum_version);
673 		if (mode == PG_MODE_ENABLE)
674 			printf(_("Checksums enabled in cluster\n"));
675 		else
676 			printf(_("Checksums disabled in cluster\n"));
677 	}
678 
679 	return 0;
680 }
681