1 /*
2  *	pg_test_fsync.c
3  *		tests all supported fsync() methods
4  */
5 
#include "postgres_fe.h"

#include <sys/stat.h>
#include <sys/time.h>
#include <errno.h>
#include <fcntl.h>
#include <limits.h>
#include <signal.h>
#include <time.h>
#include <unistd.h>

#include "getopt_long.h"
#include "access/xlogdefs.h"
17 
18 
/*
 * Default path of the scratch file.  It lives in the current working
 * directory unless the user specifies another file name.
 */
#define FSYNC_FILENAME	"./pg_test_fsync.out"

/* XLOG_BLCKSZ expressed in kilobytes; used in the report headings */
#define XLOG_BLCKSZ_K	(XLOG_BLCKSZ / 1024)

/* Number of microseconds in one second */
#define USECS_SEC			1000000

/* printf() formats shared by every result line */
#define LABEL_FORMAT		"        %-30s"
#define NA_FORMAT			"%20s"
#define OPS_FORMAT			"%13.3f ops/sec  %6.0f usecs/op"
31 
/*
 * START_TIMER / STOP_TIMER bracket one timed test loop.  These are macros
 * to avoid timing the function call overhead.
 *
 * START_TIMER clears alarm_triggered, arranges for it to be set again after
 * secs_per_test seconds, and records the start time in start_t.
 * STOP_TIMER records the stop time in stop_t and prints the result; it
 * expects a variable named "ops" in the caller's scope holding the number of
 * completed operations.
 */
#ifndef WIN32
#define START_TIMER \
do { \
	alarm_triggered = false; \
	alarm(secs_per_test); \
	gettimeofday(&start_t, NULL); \
} while (0)
#else
/*
 * WIN32 doesn't support alarm, so we create a thread and sleep there.
 *
 * CreateThread() reports failure by returning NULL, not
 * INVALID_HANDLE_VALUE, so that is what must be tested.  Nothing ever waits
 * on the thread, so its handle is closed immediately to avoid leaking one
 * handle per test; closing the handle does not terminate the thread.
 */
#define START_TIMER \
do { \
	HANDLE		alarm_thread; \
	alarm_triggered = false; \
	alarm_thread = CreateThread(NULL, 0, process_alarm, NULL, 0, NULL); \
	if (alarm_thread == NULL) \
	{ \
		fprintf(stderr, "Cannot create thread for alarm\n"); \
		exit(1); \
	} \
	CloseHandle(alarm_thread); \
	gettimeofday(&start_t, NULL); \
} while (0)
#endif

#define STOP_TIMER	\
do { \
	gettimeofday(&stop_t, NULL); \
	print_elapse(start_t, stop_t, ops); \
} while (0)
60 
61 
62 static const char *progname;
63 
64 static int	secs_per_test = 5;
65 static int	needs_unlink = 0;
66 static char full_buf[XLOG_SEG_SIZE],
67 		   *buf,
68 		   *filename = FSYNC_FILENAME;
69 static struct timeval start_t,
70 			stop_t;
71 static bool alarm_triggered = false;
72 
73 
/* Setup */
static void handle_args(int argc, char *argv[]);
static void prepare_buf(void);

/* The individual test groups */
static void test_open(void);
static void test_non_sync(void);
static void test_sync(int writes_per_op);
static void test_open_syncs(void);
static void test_open_sync(const char *msg, int writes_size);
static void test_file_descriptor_sync(void);

/* Timer expiry: a signal handler, or a thread entry point on WIN32 */
#ifndef WIN32
static void process_alarm(int sig);
#else
static DWORD WINAPI process_alarm(LPVOID param);
#endif

/* Handler for termination signals */
static void signal_cleanup(int sig);

/* Support routines */
#ifdef HAVE_FSYNC_WRITETHROUGH
static int	pg_fsync_writethrough(int fd);
#endif
static void print_elapse(struct timeval start_t, struct timeval stop_t, int ops);
static void die(const char *str);
95 
96 
97 int
main(int argc,char * argv[])98 main(int argc, char *argv[])
99 {
100 	progname = get_progname(argv[0]);
101 
102 	handle_args(argc, argv);
103 
104 	/* Prevent leaving behind the test file */
105 	pqsignal(SIGINT, signal_cleanup);
106 	pqsignal(SIGTERM, signal_cleanup);
107 #ifndef WIN32
108 	pqsignal(SIGALRM, process_alarm);
109 #endif
110 #ifdef SIGHUP
111 	/* Not defined on win32 */
112 	pqsignal(SIGHUP, signal_cleanup);
113 #endif
114 
115 	prepare_buf();
116 
117 	test_open();
118 
119 	/* Test using 1 XLOG_BLCKSZ write */
120 	test_sync(1);
121 
122 	/* Test using 2 XLOG_BLCKSZ writes */
123 	test_sync(2);
124 
125 	test_open_syncs();
126 
127 	test_file_descriptor_sync();
128 
129 	test_non_sync();
130 
131 	unlink(filename);
132 
133 	return 0;
134 }
135 
136 static void
handle_args(int argc,char * argv[])137 handle_args(int argc, char *argv[])
138 {
139 	static struct option long_options[] = {
140 		{"filename", required_argument, NULL, 'f'},
141 		{"secs-per-test", required_argument, NULL, 's'},
142 		{NULL, 0, NULL, 0}
143 	};
144 
145 	int			option;			/* Command line option */
146 	int			optindex = 0;	/* used by getopt_long */
147 
148 	if (argc > 1)
149 	{
150 		if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
151 		{
152 			printf("Usage: %s [-f FILENAME] [-s SECS-PER-TEST]\n", progname);
153 			exit(0);
154 		}
155 		if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
156 		{
157 			puts("pg_test_fsync (PostgreSQL) " PG_VERSION);
158 			exit(0);
159 		}
160 	}
161 
162 	while ((option = getopt_long(argc, argv, "f:s:",
163 								 long_options, &optindex)) != -1)
164 	{
165 		switch (option)
166 		{
167 			case 'f':
168 				filename = strdup(optarg);
169 				break;
170 
171 			case 's':
172 				secs_per_test = atoi(optarg);
173 				break;
174 
175 			default:
176 				fprintf(stderr, "Try \"%s --help\" for more information.\n",
177 						progname);
178 				exit(1);
179 				break;
180 		}
181 	}
182 
183 	if (argc > optind)
184 	{
185 		fprintf(stderr,
186 				"%s: too many command-line arguments (first is \"%s\")\n",
187 				progname, argv[optind]);
188 		fprintf(stderr, "Try \"%s --help\" for more information.\n",
189 				progname);
190 		exit(1);
191 	}
192 
193 	printf("%d seconds per test\n", secs_per_test);
194 #if PG_O_DIRECT != 0
195 	printf("O_DIRECT supported on this platform for open_datasync and open_sync.\n");
196 #else
197 	printf("Direct I/O is not supported on this platform.\n");
198 #endif
199 }
200 
201 static void
prepare_buf(void)202 prepare_buf(void)
203 {
204 	int			ops;
205 
206 	/* write random data into buffer */
207 	for (ops = 0; ops < XLOG_SEG_SIZE; ops++)
208 		full_buf[ops] = random();
209 
210 	buf = (char *) TYPEALIGN(XLOG_BLCKSZ, full_buf);
211 }
212 
213 static void
test_open(void)214 test_open(void)
215 {
216 	int			tmpfile;
217 
218 	/*
219 	 * test if we can open the target file
220 	 */
221 	if ((tmpfile = open(filename, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
222 		die("could not open output file");
223 	needs_unlink = 1;
224 	if (write(tmpfile, full_buf, XLOG_SEG_SIZE) != XLOG_SEG_SIZE)
225 		die("write failed");
226 
227 	/* fsync now so that dirty buffers don't skew later tests */
228 	if (fsync(tmpfile) != 0)
229 		die("fsync failed");
230 
231 	close(tmpfile);
232 }
233 
234 static void
test_sync(int writes_per_op)235 test_sync(int writes_per_op)
236 {
237 	int			tmpfile,
238 				ops,
239 				writes;
240 	bool		fs_warning = false;
241 
242 	if (writes_per_op == 1)
243 		printf("\nCompare file sync methods using one %dkB write:\n", XLOG_BLCKSZ_K);
244 	else
245 		printf("\nCompare file sync methods using two %dkB writes:\n", XLOG_BLCKSZ_K);
246 	printf("(in wal_sync_method preference order, except fdatasync is Linux's default)\n");
247 
248 	/*
249 	 * Test open_datasync if available
250 	 */
251 	printf(LABEL_FORMAT, "open_datasync");
252 	fflush(stdout);
253 
254 #ifdef OPEN_DATASYNC_FLAG
255 	if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT | PG_BINARY, 0)) == -1)
256 	{
257 		printf(NA_FORMAT, "n/a*\n");
258 		fs_warning = true;
259 	}
260 	else
261 	{
262 		START_TIMER;
263 		for (ops = 0; alarm_triggered == false; ops++)
264 		{
265 			for (writes = 0; writes < writes_per_op; writes++)
266 				if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
267 					die("write failed");
268 			if (lseek(tmpfile, 0, SEEK_SET) == -1)
269 				die("seek failed");
270 		}
271 		STOP_TIMER;
272 		close(tmpfile);
273 	}
274 #else
275 	printf(NA_FORMAT, "n/a\n");
276 #endif
277 
278 /*
279  * Test fdatasync if available
280  */
281 	printf(LABEL_FORMAT, "fdatasync");
282 	fflush(stdout);
283 
284 #ifdef HAVE_FDATASYNC
285 	if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
286 		die("could not open output file");
287 	START_TIMER;
288 	for (ops = 0; alarm_triggered == false; ops++)
289 	{
290 		for (writes = 0; writes < writes_per_op; writes++)
291 			if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
292 				die("write failed");
293 		fdatasync(tmpfile);
294 		if (lseek(tmpfile, 0, SEEK_SET) == -1)
295 			die("seek failed");
296 	}
297 	STOP_TIMER;
298 	close(tmpfile);
299 #else
300 	printf(NA_FORMAT, "n/a\n");
301 #endif
302 
303 /*
304  * Test fsync
305  */
306 	printf(LABEL_FORMAT, "fsync");
307 	fflush(stdout);
308 
309 	if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
310 		die("could not open output file");
311 	START_TIMER;
312 	for (ops = 0; alarm_triggered == false; ops++)
313 	{
314 		for (writes = 0; writes < writes_per_op; writes++)
315 			if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
316 				die("write failed");
317 		if (fsync(tmpfile) != 0)
318 			die("fsync failed");
319 		if (lseek(tmpfile, 0, SEEK_SET) == -1)
320 			die("seek failed");
321 	}
322 	STOP_TIMER;
323 	close(tmpfile);
324 
325 /*
326  * If fsync_writethrough is available, test as well
327  */
328 	printf(LABEL_FORMAT, "fsync_writethrough");
329 	fflush(stdout);
330 
331 #ifdef HAVE_FSYNC_WRITETHROUGH
332 	if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
333 		die("could not open output file");
334 	START_TIMER;
335 	for (ops = 0; alarm_triggered == false; ops++)
336 	{
337 		for (writes = 0; writes < writes_per_op; writes++)
338 			if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
339 				die("write failed");
340 		if (pg_fsync_writethrough(tmpfile) != 0)
341 			die("fsync failed");
342 		if (lseek(tmpfile, 0, SEEK_SET) == -1)
343 			die("seek failed");
344 	}
345 	STOP_TIMER;
346 	close(tmpfile);
347 #else
348 	printf(NA_FORMAT, "n/a\n");
349 #endif
350 
351 /*
352  * Test open_sync if available
353  */
354 	printf(LABEL_FORMAT, "open_sync");
355 	fflush(stdout);
356 
357 #ifdef OPEN_SYNC_FLAG
358 	if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1)
359 	{
360 		printf(NA_FORMAT, "n/a*\n");
361 		fs_warning = true;
362 	}
363 	else
364 	{
365 		START_TIMER;
366 		for (ops = 0; alarm_triggered == false; ops++)
367 		{
368 			for (writes = 0; writes < writes_per_op; writes++)
369 				if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
370 
371 					/*
372 					 * This can generate write failures if the filesystem has
373 					 * a large block size, e.g. 4k, and there is no support
374 					 * for O_DIRECT writes smaller than the file system block
375 					 * size, e.g. XFS.
376 					 */
377 					die("write failed");
378 			if (lseek(tmpfile, 0, SEEK_SET) == -1)
379 				die("seek failed");
380 		}
381 		STOP_TIMER;
382 		close(tmpfile);
383 	}
384 #else
385 	printf(NA_FORMAT, "n/a\n");
386 #endif
387 
388 	if (fs_warning)
389 	{
390 		printf("* This file system and its mount options do not support direct\n");
391 		printf("I/O, e.g. ext4 in journaled mode.\n");
392 	}
393 }
394 
/*
 * Run test_open_sync() once for each way of splitting 16kB into equal-sized
 * writes, from a single 16kB write down to sixteen 1kB writes.
 */
static void
test_open_syncs(void)
{
	/* label to print, and size of each individual write in kB */
	static const struct
	{
		const char *label;
		int			size_kb;
	}			cases[] = {
		{" 1 * 16kB open_sync write", 16},
		{" 2 *  8kB open_sync writes", 8},
		{" 4 *  4kB open_sync writes", 4},
		{" 8 *  2kB open_sync writes", 2},
		{"16 *  1kB open_sync writes", 1}
	};
	int			ncases = (int) (sizeof(cases) / sizeof(cases[0]));
	int			i;

	printf("\nCompare open_sync with different write sizes:\n");
	printf("(This is designed to compare the cost of writing 16kB in different write\n"
		   "open_sync sizes.)\n");

	for (i = 0; i < ncases; i++)
		test_open_sync(cases[i].label, cases[i].size_kb);
}
408 
409 /*
410  * Test open_sync with different size files
411  */
412 static void
test_open_sync(const char * msg,int writes_size)413 test_open_sync(const char *msg, int writes_size)
414 {
415 #ifdef OPEN_SYNC_FLAG
416 	int			tmpfile,
417 				ops,
418 				writes;
419 #endif
420 
421 	printf(LABEL_FORMAT, msg);
422 	fflush(stdout);
423 
424 #ifdef OPEN_SYNC_FLAG
425 	if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1)
426 		printf(NA_FORMAT, "n/a*\n");
427 	else
428 	{
429 		START_TIMER;
430 		for (ops = 0; alarm_triggered == false; ops++)
431 		{
432 			for (writes = 0; writes < 16 / writes_size; writes++)
433 				if (write(tmpfile, buf, writes_size * 1024) !=
434 					writes_size * 1024)
435 					die("write failed");
436 			if (lseek(tmpfile, 0, SEEK_SET) == -1)
437 				die("seek failed");
438 		}
439 		STOP_TIMER;
440 		close(tmpfile);
441 	}
442 #else
443 	printf(NA_FORMAT, "n/a\n");
444 #endif
445 }
446 
447 static void
test_file_descriptor_sync(void)448 test_file_descriptor_sync(void)
449 {
450 	int			tmpfile,
451 				ops;
452 
453 	/*
454 	 * Test whether fsync can sync data written on a different descriptor for
455 	 * the same file.  This checks the efficiency of multi-process fsyncs
456 	 * against the same file. Possibly this should be done with writethrough
457 	 * on platforms which support it.
458 	 */
459 	printf("\nTest if fsync on non-write file descriptor is honored:\n");
460 	printf("(If the times are similar, fsync() can sync data written on a different\n"
461 		   "descriptor.)\n");
462 
463 	/*
464 	 * first write, fsync and close, which is the normal behavior without
465 	 * multiple descriptors
466 	 */
467 	printf(LABEL_FORMAT, "write, fsync, close");
468 	fflush(stdout);
469 
470 	START_TIMER;
471 	for (ops = 0; alarm_triggered == false; ops++)
472 	{
473 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
474 			die("could not open output file");
475 		if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
476 			die("write failed");
477 		if (fsync(tmpfile) != 0)
478 			die("fsync failed");
479 		close(tmpfile);
480 
481 		/*
482 		 * open and close the file again to be consistent with the following
483 		 * test
484 		 */
485 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
486 			die("could not open output file");
487 		close(tmpfile);
488 	}
489 	STOP_TIMER;
490 
491 	/*
492 	 * Now open, write, close, open again and fsync This simulates processes
493 	 * fsyncing each other's writes.
494 	 */
495 	printf(LABEL_FORMAT, "write, close, fsync");
496 	fflush(stdout);
497 
498 	START_TIMER;
499 	for (ops = 0; alarm_triggered == false; ops++)
500 	{
501 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
502 			die("could not open output file");
503 		if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
504 			die("write failed");
505 		close(tmpfile);
506 		/* reopen file */
507 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
508 			die("could not open output file");
509 		if (fsync(tmpfile) != 0)
510 			die("fsync failed");
511 		close(tmpfile);
512 	}
513 	STOP_TIMER;
514 }
515 
516 static void
test_non_sync(void)517 test_non_sync(void)
518 {
519 	int			tmpfile,
520 				ops;
521 
522 	/*
523 	 * Test a simple write without fsync
524 	 */
525 	printf("\nNon-sync'ed %dkB writes:\n", XLOG_BLCKSZ_K);
526 	printf(LABEL_FORMAT, "write");
527 	fflush(stdout);
528 
529 	START_TIMER;
530 	for (ops = 0; alarm_triggered == false; ops++)
531 	{
532 		if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
533 			die("could not open output file");
534 		if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
535 			die("write failed");
536 		close(tmpfile);
537 	}
538 	STOP_TIMER;
539 }
540 
541 static void
signal_cleanup(int signum)542 signal_cleanup(int signum)
543 {
544 	/* Delete the file if it exists. Ignore errors */
545 	if (needs_unlink)
546 		unlink(filename);
547 	/* Finish incomplete line on stdout */
548 	puts("");
549 	exit(signum);
550 }
551 
#ifdef HAVE_FSYNC_WRITETHROUGH

/*
 * Flush fd using the platform's write-through primitive: _commit() on
 * WIN32, otherwise fcntl(F_FULLFSYNC) where that is defined.
 *
 * Returns 0 on success and -1 on failure.  When neither primitive is
 * available it fails with errno set to ENOSYS.
 */
static int
pg_fsync_writethrough(int fd)
{
#ifdef WIN32
	return _commit(fd);
#elif defined(F_FULLFSYNC)
	if (fcntl(fd, F_FULLFSYNC, 0) == -1)
		return -1;
	return 0;
#else
	errno = ENOSYS;
	return -1;
#endif
}
#endif
567 
568 /*
569  * print out the writes per second for tests
570  */
571 static void
print_elapse(struct timeval start_t,struct timeval stop_t,int ops)572 print_elapse(struct timeval start_t, struct timeval stop_t, int ops)
573 {
574 	double		total_time = (stop_t.tv_sec - start_t.tv_sec) +
575 	(stop_t.tv_usec - start_t.tv_usec) * 0.000001;
576 	double		per_second = ops / total_time;
577 	double		avg_op_time_us = (total_time / ops) * USECS_SEC;
578 
579 	printf(OPS_FORMAT "\n", per_second, avg_op_time_us);
580 }
581 
582 #ifndef WIN32
583 static void
process_alarm(int sig)584 process_alarm(int sig)
585 {
586 	alarm_triggered = true;
587 }
588 #else
589 static DWORD WINAPI
process_alarm(LPVOID param)590 process_alarm(LPVOID param)
591 {
592 	/* WIN32 doesn't support alarm, so we create a thread and sleep here */
593 	Sleep(secs_per_test * 1000);
594 	alarm_triggered = true;
595 	ExitThread(0);
596 }
597 #endif
598 
/*
 * Report a fatal error and terminate the program with exit status 1.
 *
 * str describes what failed; the system's description of the current errno
 * value is appended after a colon.
 */
static void
die(const char *str)
{
	const char *reason = strerror(errno);

	fprintf(stderr, "%s: %s\n", str, reason);
	exit(1);
}
605