1 /*
2  *	pg_test_fsync.c
3  *		tests all supported fsync() methods
4  */
5 
6 #include "postgres_fe.h"
7 
8 #include <sys/stat.h>
9 #include <sys/time.h>
10 #include <fcntl.h>
11 #include <time.h>
12 #include <unistd.h>
13 #include <signal.h>
14 
15 #include "access/xlogdefs.h"
16 #include "common/logging.h"
17 #include "getopt_long.h"
18 
19 /*
20  * put the temp files in the local directory
21  * unless the user specifies otherwise
22  */
23 #define FSYNC_FILENAME	"./pg_test_fsync.out"
24 
25 #define XLOG_BLCKSZ_K	(XLOG_BLCKSZ / 1024)
26 
27 #define LABEL_FORMAT		"        %-30s"
28 #define NA_FORMAT			"%21s\n"
29 /* translator: maintain alignment with NA_FORMAT */
30 #define OPS_FORMAT			gettext_noop("%13.3f ops/sec  %6.0f usecs/op\n")
31 #define USECS_SEC			1000000
32 
33 /* These are macros to avoid timing the function call overhead. */
34 #ifndef WIN32
35 #define START_TIMER \
36 do { \
37 	alarm_triggered = false; \
38 	alarm(secs_per_test); \
39 	gettimeofday(&start_t, NULL); \
40 } while (0)
41 #else
42 /* WIN32 doesn't support alarm, so we create a thread and sleep there */
43 #define START_TIMER \
44 do { \
45 	alarm_triggered = false; \
46 	if (CreateThread(NULL, 0, process_alarm, NULL, 0, NULL) == \
47 		INVALID_HANDLE_VALUE) \
48 	{ \
49 		pg_log_error("could not create thread for alarm"); \
50 		exit(1); \
51 	} \
52 	gettimeofday(&start_t, NULL); \
53 } while (0)
54 #endif
55 
56 #define STOP_TIMER	\
57 do { \
58 	gettimeofday(&stop_t, NULL); \
59 	print_elapse(start_t, stop_t, ops); \
60 } while (0)
61 
62 
63 static const char *progname;
64 
65 static int	secs_per_test = 5;
66 static int	needs_unlink = 0;
67 static char full_buf[DEFAULT_XLOG_SEG_SIZE],
68 		   *buf,
69 		   *filename = FSYNC_FILENAME;
70 static struct timeval start_t,
71 			stop_t;
72 static bool alarm_triggered = false;
73 
74 
75 static void handle_args(int argc, char *argv[]);
76 static void prepare_buf(void);
77 static void test_open(void);
78 static void test_non_sync(void);
79 static void test_sync(int writes_per_op);
80 static void test_open_syncs(void);
81 static void test_open_sync(const char *msg, int writes_size);
82 static void test_file_descriptor_sync(void);
83 
84 #ifndef WIN32
85 static void process_alarm(int sig);
86 #else
87 static DWORD WINAPI process_alarm(LPVOID param);
88 #endif
89 static void signal_cleanup(int sig);
90 
91 #ifdef HAVE_FSYNC_WRITETHROUGH
92 static int	pg_fsync_writethrough(int fd);
93 #endif
94 static void print_elapse(struct timeval start_t, struct timeval stop_t, int ops);
95 
96 #define die(msg) do { pg_log_error("%s: %m", _(msg)); exit(1); } while(0)
97 
98 
99 int
main(int argc,char * argv[])100 main(int argc, char *argv[])
101 {
102 	pg_logging_init(argv[0]);
103 	set_pglocale_pgservice(argv[0], PG_TEXTDOMAIN("pg_test_fsync"));
104 	progname = get_progname(argv[0]);
105 
106 	handle_args(argc, argv);
107 
108 	/* Prevent leaving behind the test file */
109 	pqsignal(SIGINT, signal_cleanup);
110 	pqsignal(SIGTERM, signal_cleanup);
111 #ifndef WIN32
112 	pqsignal(SIGALRM, process_alarm);
113 #endif
114 #ifdef SIGHUP
115 	/* Not defined on win32 */
116 	pqsignal(SIGHUP, signal_cleanup);
117 #endif
118 
119 	prepare_buf();
120 
121 	test_open();
122 
123 	/* Test using 1 XLOG_BLCKSZ write */
124 	test_sync(1);
125 
126 	/* Test using 2 XLOG_BLCKSZ writes */
127 	test_sync(2);
128 
129 	test_open_syncs();
130 
131 	test_file_descriptor_sync();
132 
133 	test_non_sync();
134 
135 	unlink(filename);
136 
137 	return 0;
138 }
139 
140 static void
handle_args(int argc,char * argv[])141 handle_args(int argc, char *argv[])
142 {
143 	static struct option long_options[] = {
144 		{"filename", required_argument, NULL, 'f'},
145 		{"secs-per-test", required_argument, NULL, 's'},
146 		{NULL, 0, NULL, 0}
147 	};
148 
149 	int			option;			/* Command line option */
150 	int			optindex = 0;	/* used by getopt_long */
151 
152 	if (argc > 1)
153 	{
154 		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
155 		{
156 			printf(_("Usage: %s [-f FILENAME] [-s SECS-PER-TEST]\n"), progname);
157 			exit(0);
158 		}
159 		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
160 		{
161 			puts("pg_test_fsync (PostgreSQL) " PG_VERSION);
162 			exit(0);
163 		}
164 	}
165 
166 	while ((option = getopt_long(argc, argv, "f:s:",
167 								 long_options, &optindex)) != -1)
168 	{
169 		switch (option)
170 		{
171 			case 'f':
172 				filename = pg_strdup(optarg);
173 				break;
174 
175 			case 's':
176 				secs_per_test = atoi(optarg);
177 				break;
178 
179 			default:
180 				fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
181 						progname);
182 				exit(1);
183 				break;
184 		}
185 	}
186 
187 	if (argc > optind)
188 	{
189 		pg_log_error("too many command-line arguments (first is \"%s\")",
190 					 argv[optind]);
191 		fprintf(stderr, _("Try \"%s --help\" for more information.\n"),
192 				progname);
193 		exit(1);
194 	}
195 
196 	printf(ngettext("%d second per test\n",
197 					"%d seconds per test\n",
198 					secs_per_test),
199 		   secs_per_test);
200 #if PG_O_DIRECT != 0
201 	printf(_("O_DIRECT supported on this platform for open_datasync and open_sync.\n"));
202 #else
203 	printf(_("Direct I/O is not supported on this platform.\n"));
204 #endif
205 }
206 
207 static void
prepare_buf(void)208 prepare_buf(void)
209 {
210 	int			ops;
211 
212 	/* write random data into buffer */
213 	for (ops = 0; ops < DEFAULT_XLOG_SEG_SIZE; ops++)
214 		full_buf[ops] = random();
215 
216 	buf = (char *) TYPEALIGN(XLOG_BLCKSZ, full_buf);
217 }
218 
219 static void
test_open(void)220 test_open(void)
221 {
222 	int			tmpfile;
223 
224 	/*
225 	 * test if we can open the target file
226 	 */
227 	if ((tmpfile = open(filename, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
228 		die("could not open output file");
229 	needs_unlink = 1;
230 	if (write(tmpfile, full_buf, DEFAULT_XLOG_SEG_SIZE) !=
231 		DEFAULT_XLOG_SEG_SIZE)
232 		die("write failed");
233 
234 	/* fsync now so that dirty buffers don't skew later tests */
235 	if (fsync(tmpfile) != 0)
236 		die("fsync failed");
237 
238 	close(tmpfile);
239 }
240 
241 static void
test_sync(int writes_per_op)242 test_sync(int writes_per_op)
243 {
244 	int			tmpfile,
245 				ops,
246 				writes;
247 	bool		fs_warning = false;
248 
249 	if (writes_per_op == 1)
250 		printf(_("\nCompare file sync methods using one %dkB write:\n"), XLOG_BLCKSZ_K);
251 	else
252 		printf(_("\nCompare file sync methods using two %dkB writes:\n"), XLOG_BLCKSZ_K);
253 	printf(_("(in wal_sync_method preference order, except fdatasync is Linux's default)\n"));
254 
255 	/*
256 	 * Test open_datasync if available
257 	 */
258 	printf(LABEL_FORMAT, "open_datasync");
259 	fflush(stdout);
260 
261 #ifdef OPEN_DATASYNC_FLAG
262 	if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT | PG_BINARY, 0)) == -1)
263 	{
264 		printf(NA_FORMAT, _("n/a*"));
265 		fs_warning = true;
266 	}
267 	else
268 	{
269 		START_TIMER;
270 		for (ops = 0; alarm_triggered == false; ops++)
271 		{
272 			for (writes = 0; writes < writes_per_op; writes++)
273 				if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
274 					die("write failed");
275 			if (lseek(tmpfile, 0, SEEK_SET) == -1)
276 				die("seek failed");
277 		}
278 		STOP_TIMER;
279 		close(tmpfile);
280 	}
281 #else
282 	printf(NA_FORMAT, _("n/a"));
283 #endif
284 
285 /*
286  * Test fdatasync if available
287  */
288 	printf(LABEL_FORMAT, "fdatasync");
289 	fflush(stdout);
290 
291 #ifdef HAVE_FDATASYNC
292 	if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
293 		die("could not open output file");
294 	START_TIMER;
295 	for (ops = 0; alarm_triggered == false; ops++)
296 	{
297 		for (writes = 0; writes < writes_per_op; writes++)
298 			if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
299 				die("write failed");
300 		fdatasync(tmpfile);
301 		if (lseek(tmpfile, 0, SEEK_SET) == -1)
302 			die("seek failed");
303 	}
304 	STOP_TIMER;
305 	close(tmpfile);
306 #else
307 	printf(NA_FORMAT, _("n/a"));
308 #endif
309 
310 /*
311  * Test fsync
312  */
313 	printf(LABEL_FORMAT, "fsync");
314 	fflush(stdout);
315 
316 	if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
317 		die("could not open output file");
318 	START_TIMER;
319 	for (ops = 0; alarm_triggered == false; ops++)
320 	{
321 		for (writes = 0; writes < writes_per_op; writes++)
322 			if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
323 				die("write failed");
324 		if (fsync(tmpfile) != 0)
325 			die("fsync failed");
326 		if (lseek(tmpfile, 0, SEEK_SET) == -1)
327 			die("seek failed");
328 	}
329 	STOP_TIMER;
330 	close(tmpfile);
331 
332 /*
333  * If fsync_writethrough is available, test as well
334  */
335 	printf(LABEL_FORMAT, "fsync_writethrough");
336 	fflush(stdout);
337 
338 #ifdef HAVE_FSYNC_WRITETHROUGH
339 	if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
340 		die("could not open output file");
341 	START_TIMER;
342 	for (ops = 0; alarm_triggered == false; ops++)
343 	{
344 		for (writes = 0; writes < writes_per_op; writes++)
345 			if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
346 				die("write failed");
347 		if (pg_fsync_writethrough(tmpfile) != 0)
348 			die("fsync failed");
349 		if (lseek(tmpfile, 0, SEEK_SET) == -1)
350 			die("seek failed");
351 	}
352 	STOP_TIMER;
353 	close(tmpfile);
354 #else
355 	printf(NA_FORMAT, _("n/a"));
356 #endif
357 
358 /*
359  * Test open_sync if available
360  */
361 	printf(LABEL_FORMAT, "open_sync");
362 	fflush(stdout);
363 
364 #ifdef OPEN_SYNC_FLAG
365 	if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1)
366 	{
367 		printf(NA_FORMAT, _("n/a*"));
368 		fs_warning = true;
369 	}
370 	else
371 	{
372 		START_TIMER;
373 		for (ops = 0; alarm_triggered == false; ops++)
374 		{
375 			for (writes = 0; writes < writes_per_op; writes++)
376 				if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
377 
378 					/*
379 					 * This can generate write failures if the filesystem has
380 					 * a large block size, e.g. 4k, and there is no support
381 					 * for O_DIRECT writes smaller than the file system block
382 					 * size, e.g. XFS.
383 					 */
384 					die("write failed");
385 			if (lseek(tmpfile, 0, SEEK_SET) == -1)
386 				die("seek failed");
387 		}
388 		STOP_TIMER;
389 		close(tmpfile);
390 	}
391 #else
392 	printf(NA_FORMAT, _("n/a"));
393 #endif
394 
395 	if (fs_warning)
396 	{
397 		printf(_("* This file system and its mount options do not support direct\n"
398 				 "  I/O, e.g. ext4 in journaled mode.\n"));
399 	}
400 }
401 
402 static void
test_open_syncs(void)403 test_open_syncs(void)
404 {
405 	printf(_("\nCompare open_sync with different write sizes:\n"));
406 	printf(_("(This is designed to compare the cost of writing 16kB in different write\n"
407 			 "open_sync sizes.)\n"));
408 
409 	test_open_sync(_(" 1 * 16kB open_sync write"), 16);
410 	test_open_sync(_(" 2 *  8kB open_sync writes"), 8);
411 	test_open_sync(_(" 4 *  4kB open_sync writes"), 4);
412 	test_open_sync(_(" 8 *  2kB open_sync writes"), 2);
413 	test_open_sync(_("16 *  1kB open_sync writes"), 1);
414 }
415 
416 /*
417  * Test open_sync with different size files
418  */
419 static void
test_open_sync(const char * msg,int writes_size)420 test_open_sync(const char *msg, int writes_size)
421 {
422 #ifdef OPEN_SYNC_FLAG
423 	int			tmpfile,
424 				ops,
425 				writes;
426 #endif
427 
428 	printf(LABEL_FORMAT, msg);
429 	fflush(stdout);
430 
431 #ifdef OPEN_SYNC_FLAG
432 	if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1)
433 		printf(NA_FORMAT, _("n/a*"));
434 	else
435 	{
436 		START_TIMER;
437 		for (ops = 0; alarm_triggered == false; ops++)
438 		{
439 			for (writes = 0; writes < 16 / writes_size; writes++)
440 				if (write(tmpfile, buf, writes_size * 1024) !=
441 					writes_size * 1024)
442 					die("write failed");
443 			if (lseek(tmpfile, 0, SEEK_SET) == -1)
444 				die("seek failed");
445 		}
446 		STOP_TIMER;
447 		close(tmpfile);
448 	}
449 #else
450 	printf(NA_FORMAT, _("n/a"));
451 #endif
452 }
453 
454 static void
test_file_descriptor_sync(void)455 test_file_descriptor_sync(void)
456 {
457 	int			tmpfile,
458 				ops;
459 
460 	/*
461 	 * Test whether fsync can sync data written on a different descriptor for
462 	 * the same file.  This checks the efficiency of multi-process fsyncs
463 	 * against the same file. Possibly this should be done with writethrough
464 	 * on platforms which support it.
465 	 */
466 	printf(_("\nTest if fsync on non-write file descriptor is honored:\n"));
467 	printf(_("(If the times are similar, fsync() can sync data written on a different\n"
468 			 "descriptor.)\n"));
469 
470 	/*
471 	 * first write, fsync and close, which is the normal behavior without
472 	 * multiple descriptors
473 	 */
474 	printf(LABEL_FORMAT, "write, fsync, close");
475 	fflush(stdout);
476 
477 	START_TIMER;
478 	for (ops = 0; alarm_triggered == false; ops++)
479 	{
480 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
481 			die("could not open output file");
482 		if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
483 			die("write failed");
484 		if (fsync(tmpfile) != 0)
485 			die("fsync failed");
486 		close(tmpfile);
487 
488 		/*
489 		 * open and close the file again to be consistent with the following
490 		 * test
491 		 */
492 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
493 			die("could not open output file");
494 		close(tmpfile);
495 	}
496 	STOP_TIMER;
497 
498 	/*
499 	 * Now open, write, close, open again and fsync This simulates processes
500 	 * fsyncing each other's writes.
501 	 */
502 	printf(LABEL_FORMAT, "write, close, fsync");
503 	fflush(stdout);
504 
505 	START_TIMER;
506 	for (ops = 0; alarm_triggered == false; ops++)
507 	{
508 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
509 			die("could not open output file");
510 		if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
511 			die("write failed");
512 		close(tmpfile);
513 		/* reopen file */
514 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
515 			die("could not open output file");
516 		if (fsync(tmpfile) != 0)
517 			die("fsync failed");
518 		close(tmpfile);
519 	}
520 	STOP_TIMER;
521 }
522 
523 static void
test_non_sync(void)524 test_non_sync(void)
525 {
526 	int			tmpfile,
527 				ops;
528 
529 	/*
530 	 * Test a simple write without fsync
531 	 */
532 	printf(_("\nNon-sync'ed %dkB writes:\n"), XLOG_BLCKSZ_K);
533 	printf(LABEL_FORMAT, "write");
534 	fflush(stdout);
535 
536 	START_TIMER;
537 	for (ops = 0; alarm_triggered == false; ops++)
538 	{
539 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
540 			die("could not open output file");
541 		if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
542 			die("write failed");
543 		close(tmpfile);
544 	}
545 	STOP_TIMER;
546 }
547 
548 static void
signal_cleanup(int signum)549 signal_cleanup(int signum)
550 {
551 	/* Delete the file if it exists. Ignore errors */
552 	if (needs_unlink)
553 		unlink(filename);
554 	/* Finish incomplete line on stdout */
555 	puts("");
556 	exit(signum);
557 }
558 
559 #ifdef HAVE_FSYNC_WRITETHROUGH
560 
561 static int
pg_fsync_writethrough(int fd)562 pg_fsync_writethrough(int fd)
563 {
564 #ifdef WIN32
565 	return _commit(fd);
566 #elif defined(F_FULLFSYNC)
567 	return (fcntl(fd, F_FULLFSYNC, 0) == -1) ? -1 : 0;
568 #else
569 	errno = ENOSYS;
570 	return -1;
571 #endif
572 }
573 #endif
574 
575 /*
576  * print out the writes per second for tests
577  */
578 static void
print_elapse(struct timeval start_t,struct timeval stop_t,int ops)579 print_elapse(struct timeval start_t, struct timeval stop_t, int ops)
580 {
581 	double		total_time = (stop_t.tv_sec - start_t.tv_sec) +
582 	(stop_t.tv_usec - start_t.tv_usec) * 0.000001;
583 	double		per_second = ops / total_time;
584 	double		avg_op_time_us = (total_time / ops) * USECS_SEC;
585 
586 	printf(_(OPS_FORMAT), per_second, avg_op_time_us);
587 }
588 
589 #ifndef WIN32
590 static void
process_alarm(int sig)591 process_alarm(int sig)
592 {
593 	alarm_triggered = true;
594 }
595 #else
596 static DWORD WINAPI
process_alarm(LPVOID param)597 process_alarm(LPVOID param)
598 {
599 	/* WIN32 doesn't support alarm, so we create a thread and sleep here */
600 	Sleep(secs_per_test * 1000);
601 	alarm_triggered = true;
602 	ExitThread(0);
603 }
604 #endif
605