1 /*
2 * pg_test_fsync.c
3 * tests all supported fsync() methods
4 */
5
6 #include "postgres_fe.h"
7
8 #include <sys/stat.h>
9 #include <sys/time.h>
10 #include <fcntl.h>
11 #include <time.h>
12 #include <unistd.h>
13 #include <signal.h>
14
15 #include "getopt_long.h"
16 #include "access/xlogdefs.h"
17
18
19 /*
20 * put the temp files in the local directory
21 * unless the user specifies otherwise
22 */
23 #define FSYNC_FILENAME "./pg_test_fsync.out"
24
25 #define XLOG_BLCKSZ_K (XLOG_BLCKSZ / 1024)
26
27 #define LABEL_FORMAT " %-30s"
28 #define NA_FORMAT "%20s"
29 #define OPS_FORMAT "%13.3f ops/sec %6.0f usecs/op"
30 #define USECS_SEC 1000000
31
/*
 * Timer control.  These are macros rather than functions to avoid timing
 * the function call overhead.
 *
 * START_TIMER clears alarm_triggered, arranges for it to be set again
 * after secs_per_test seconds, and records the start time in start_t.
 * STOP_TIMER records the stop time in stop_t and prints the result via
 * print_elapse(); it expects an integer variable named "ops" to be in
 * scope at the point of use.
 */
#ifndef WIN32
#define START_TIMER \
do { \
    alarm_triggered = false; \
    alarm(secs_per_test); \
    gettimeofday(&start_t, NULL); \
} while (0)
#else
/*
 * WIN32 doesn't support alarm, so we create a thread and sleep there.
 *
 * CreateThread() reports failure by returning NULL, not
 * INVALID_HANDLE_VALUE.  The handle is closed immediately because nothing
 * waits on it; closing a thread handle does not stop the thread, it only
 * keeps one handle from being leaked per timed test.
 */
#define START_TIMER \
do { \
    HANDLE alarm_thread; \
    alarm_triggered = false; \
    alarm_thread = CreateThread(NULL, 0, process_alarm, NULL, 0, NULL); \
    if (alarm_thread == NULL) \
    { \
        fprintf(stderr, "Cannot create thread for alarm\n"); \
        exit(1); \
    } \
    CloseHandle(alarm_thread); \
    gettimeofday(&start_t, NULL); \
} while (0)
#endif

#define STOP_TIMER \
do { \
    gettimeofday(&stop_t, NULL); \
    print_elapse(start_t, stop_t, ops); \
} while (0)
60
61
62 static const char *progname;
63
64 static int secs_per_test = 5;
65 static int needs_unlink = 0;
66 static char full_buf[XLOG_SEG_SIZE],
67 *buf,
68 *filename = FSYNC_FILENAME;
69 static struct timeval start_t,
70 stop_t;
71 static bool alarm_triggered = false;
72
73
74 static void handle_args(int argc, char *argv[]);
75 static void prepare_buf(void);
76 static void test_open(void);
77 static void test_non_sync(void);
78 static void test_sync(int writes_per_op);
79 static void test_open_syncs(void);
80 static void test_open_sync(const char *msg, int writes_size);
81 static void test_file_descriptor_sync(void);
82
83 #ifndef WIN32
84 static void process_alarm(int sig);
85 #else
86 static DWORD WINAPI process_alarm(LPVOID param);
87 #endif
88 static void signal_cleanup(int sig);
89
90 #ifdef HAVE_FSYNC_WRITETHROUGH
91 static int pg_fsync_writethrough(int fd);
92 #endif
93 static void print_elapse(struct timeval start_t, struct timeval stop_t, int ops);
94 static void die(const char *str);
95
96
97 int
main(int argc,char * argv[])98 main(int argc, char *argv[])
99 {
100 progname = get_progname(argv[0]);
101
102 handle_args(argc, argv);
103
104 /* Prevent leaving behind the test file */
105 pqsignal(SIGINT, signal_cleanup);
106 pqsignal(SIGTERM, signal_cleanup);
107 #ifndef WIN32
108 pqsignal(SIGALRM, process_alarm);
109 #endif
110 #ifdef SIGHUP
111 /* Not defined on win32 */
112 pqsignal(SIGHUP, signal_cleanup);
113 #endif
114
115 prepare_buf();
116
117 test_open();
118
119 /* Test using 1 XLOG_BLCKSZ write */
120 test_sync(1);
121
122 /* Test using 2 XLOG_BLCKSZ writes */
123 test_sync(2);
124
125 test_open_syncs();
126
127 test_file_descriptor_sync();
128
129 test_non_sync();
130
131 unlink(filename);
132
133 return 0;
134 }
135
136 static void
handle_args(int argc,char * argv[])137 handle_args(int argc, char *argv[])
138 {
139 static struct option long_options[] = {
140 {"filename", required_argument, NULL, 'f'},
141 {"secs-per-test", required_argument, NULL, 's'},
142 {NULL, 0, NULL, 0}
143 };
144
145 int option; /* Command line option */
146 int optindex = 0; /* used by getopt_long */
147
148 if (argc > 1)
149 {
150 if (strcmp(argv[1], "--help") == 0 || strcmp(argv[1], "-?") == 0)
151 {
152 printf("Usage: %s [-f FILENAME] [-s SECS-PER-TEST]\n", progname);
153 exit(0);
154 }
155 if (strcmp(argv[1], "--version") == 0 || strcmp(argv[1], "-V") == 0)
156 {
157 puts("pg_test_fsync (PostgreSQL) " PG_VERSION);
158 exit(0);
159 }
160 }
161
162 while ((option = getopt_long(argc, argv, "f:s:",
163 long_options, &optindex)) != -1)
164 {
165 switch (option)
166 {
167 case 'f':
168 filename = strdup(optarg);
169 break;
170
171 case 's':
172 secs_per_test = atoi(optarg);
173 break;
174
175 default:
176 fprintf(stderr, "Try \"%s --help\" for more information.\n",
177 progname);
178 exit(1);
179 break;
180 }
181 }
182
183 if (argc > optind)
184 {
185 fprintf(stderr,
186 "%s: too many command-line arguments (first is \"%s\")\n",
187 progname, argv[optind]);
188 fprintf(stderr, "Try \"%s --help\" for more information.\n",
189 progname);
190 exit(1);
191 }
192
193 printf("%d seconds per test\n", secs_per_test);
194 #if PG_O_DIRECT != 0
195 printf("O_DIRECT supported on this platform for open_datasync and open_sync.\n");
196 #else
197 printf("Direct I/O is not supported on this platform.\n");
198 #endif
199 }
200
201 static void
prepare_buf(void)202 prepare_buf(void)
203 {
204 int ops;
205
206 /* write random data into buffer */
207 for (ops = 0; ops < XLOG_SEG_SIZE; ops++)
208 full_buf[ops] = random();
209
210 buf = (char *) TYPEALIGN(XLOG_BLCKSZ, full_buf);
211 }
212
213 static void
test_open(void)214 test_open(void)
215 {
216 int tmpfile;
217
218 /*
219 * test if we can open the target file
220 */
221 if ((tmpfile = open(filename, O_RDWR | O_CREAT | PG_BINARY, S_IRUSR | S_IWUSR)) == -1)
222 die("could not open output file");
223 needs_unlink = 1;
224 if (write(tmpfile, full_buf, XLOG_SEG_SIZE) != XLOG_SEG_SIZE)
225 die("write failed");
226
227 /* fsync now so that dirty buffers don't skew later tests */
228 if (fsync(tmpfile) != 0)
229 die("fsync failed");
230
231 close(tmpfile);
232 }
233
234 static void
test_sync(int writes_per_op)235 test_sync(int writes_per_op)
236 {
237 int tmpfile,
238 ops,
239 writes;
240 bool fs_warning = false;
241
242 if (writes_per_op == 1)
243 printf("\nCompare file sync methods using one %dkB write:\n", XLOG_BLCKSZ_K);
244 else
245 printf("\nCompare file sync methods using two %dkB writes:\n", XLOG_BLCKSZ_K);
246 printf("(in wal_sync_method preference order, except fdatasync is Linux's default)\n");
247
248 /*
249 * Test open_datasync if available
250 */
251 printf(LABEL_FORMAT, "open_datasync");
252 fflush(stdout);
253
254 #ifdef OPEN_DATASYNC_FLAG
255 if ((tmpfile = open(filename, O_RDWR | O_DSYNC | PG_O_DIRECT | PG_BINARY, 0)) == -1)
256 {
257 printf(NA_FORMAT, "n/a*\n");
258 fs_warning = true;
259 }
260 else
261 {
262 START_TIMER;
263 for (ops = 0; alarm_triggered == false; ops++)
264 {
265 for (writes = 0; writes < writes_per_op; writes++)
266 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
267 die("write failed");
268 if (lseek(tmpfile, 0, SEEK_SET) == -1)
269 die("seek failed");
270 }
271 STOP_TIMER;
272 close(tmpfile);
273 }
274 #else
275 printf(NA_FORMAT, "n/a\n");
276 #endif
277
278 /*
279 * Test fdatasync if available
280 */
281 printf(LABEL_FORMAT, "fdatasync");
282 fflush(stdout);
283
284 #ifdef HAVE_FDATASYNC
285 if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
286 die("could not open output file");
287 START_TIMER;
288 for (ops = 0; alarm_triggered == false; ops++)
289 {
290 for (writes = 0; writes < writes_per_op; writes++)
291 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
292 die("write failed");
293 fdatasync(tmpfile);
294 if (lseek(tmpfile, 0, SEEK_SET) == -1)
295 die("seek failed");
296 }
297 STOP_TIMER;
298 close(tmpfile);
299 #else
300 printf(NA_FORMAT, "n/a\n");
301 #endif
302
303 /*
304 * Test fsync
305 */
306 printf(LABEL_FORMAT, "fsync");
307 fflush(stdout);
308
309 if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
310 die("could not open output file");
311 START_TIMER;
312 for (ops = 0; alarm_triggered == false; ops++)
313 {
314 for (writes = 0; writes < writes_per_op; writes++)
315 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
316 die("write failed");
317 if (fsync(tmpfile) != 0)
318 die("fsync failed");
319 if (lseek(tmpfile, 0, SEEK_SET) == -1)
320 die("seek failed");
321 }
322 STOP_TIMER;
323 close(tmpfile);
324
325 /*
326 * If fsync_writethrough is available, test as well
327 */
328 printf(LABEL_FORMAT, "fsync_writethrough");
329 fflush(stdout);
330
331 #ifdef HAVE_FSYNC_WRITETHROUGH
332 if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
333 die("could not open output file");
334 START_TIMER;
335 for (ops = 0; alarm_triggered == false; ops++)
336 {
337 for (writes = 0; writes < writes_per_op; writes++)
338 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
339 die("write failed");
340 if (pg_fsync_writethrough(tmpfile) != 0)
341 die("fsync failed");
342 if (lseek(tmpfile, 0, SEEK_SET) == -1)
343 die("seek failed");
344 }
345 STOP_TIMER;
346 close(tmpfile);
347 #else
348 printf(NA_FORMAT, "n/a\n");
349 #endif
350
351 /*
352 * Test open_sync if available
353 */
354 printf(LABEL_FORMAT, "open_sync");
355 fflush(stdout);
356
357 #ifdef OPEN_SYNC_FLAG
358 if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1)
359 {
360 printf(NA_FORMAT, "n/a*\n");
361 fs_warning = true;
362 }
363 else
364 {
365 START_TIMER;
366 for (ops = 0; alarm_triggered == false; ops++)
367 {
368 for (writes = 0; writes < writes_per_op; writes++)
369 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
370
371 /*
372 * This can generate write failures if the filesystem has
373 * a large block size, e.g. 4k, and there is no support
374 * for O_DIRECT writes smaller than the file system block
375 * size, e.g. XFS.
376 */
377 die("write failed");
378 if (lseek(tmpfile, 0, SEEK_SET) == -1)
379 die("seek failed");
380 }
381 STOP_TIMER;
382 close(tmpfile);
383 }
384 #else
385 printf(NA_FORMAT, "n/a\n");
386 #endif
387
388 if (fs_warning)
389 {
390 printf("* This file system and its mount options do not support direct\n");
391 printf("I/O, e.g. ext4 in journaled mode.\n");
392 }
393 }
394
/*
 * Run test_open_sync() once per write size, from a single 16kB write down
 * to sixteen 1kB writes, after printing the section heading.
 */
static void
test_open_syncs(void)
{
    static const struct
    {
        const char *label;      /* text shown in the results column */
        int         size_kb;    /* size of each individual write, in kB */
    }           cases[] = {
        {" 1 * 16kB open_sync write", 16},
        {" 2 * 8kB open_sync writes", 8},
        {" 4 * 4kB open_sync writes", 4},
        {" 8 * 2kB open_sync writes", 2},
        {"16 * 1kB open_sync writes", 1}
    };
    int         i;

    printf("\nCompare open_sync with different write sizes:\n");
    printf("(This is designed to compare the cost of writing 16kB in different write\n"
           "open_sync sizes.)\n");

    for (i = 0; i < (int) (sizeof(cases) / sizeof(cases[0])); i++)
        test_open_sync(cases[i].label, cases[i].size_kb);
}
408
409 /*
410 * Test open_sync with different size files
411 */
412 static void
test_open_sync(const char * msg,int writes_size)413 test_open_sync(const char *msg, int writes_size)
414 {
415 #ifdef OPEN_SYNC_FLAG
416 int tmpfile,
417 ops,
418 writes;
419 #endif
420
421 printf(LABEL_FORMAT, msg);
422 fflush(stdout);
423
424 #ifdef OPEN_SYNC_FLAG
425 if ((tmpfile = open(filename, O_RDWR | OPEN_SYNC_FLAG | PG_O_DIRECT | PG_BINARY, 0)) == -1)
426 printf(NA_FORMAT, "n/a*\n");
427 else
428 {
429 START_TIMER;
430 for (ops = 0; alarm_triggered == false; ops++)
431 {
432 for (writes = 0; writes < 16 / writes_size; writes++)
433 if (write(tmpfile, buf, writes_size * 1024) !=
434 writes_size * 1024)
435 die("write failed");
436 if (lseek(tmpfile, 0, SEEK_SET) == -1)
437 die("seek failed");
438 }
439 STOP_TIMER;
440 close(tmpfile);
441 }
442 #else
443 printf(NA_FORMAT, "n/a\n");
444 #endif
445 }
446
447 static void
test_file_descriptor_sync(void)448 test_file_descriptor_sync(void)
449 {
450 int tmpfile,
451 ops;
452
453 /*
454 * Test whether fsync can sync data written on a different descriptor for
455 * the same file. This checks the efficiency of multi-process fsyncs
456 * against the same file. Possibly this should be done with writethrough
457 * on platforms which support it.
458 */
459 printf("\nTest if fsync on non-write file descriptor is honored:\n");
460 printf("(If the times are similar, fsync() can sync data written on a different\n"
461 "descriptor.)\n");
462
463 /*
464 * first write, fsync and close, which is the normal behavior without
465 * multiple descriptors
466 */
467 printf(LABEL_FORMAT, "write, fsync, close");
468 fflush(stdout);
469
470 START_TIMER;
471 for (ops = 0; alarm_triggered == false; ops++)
472 {
473 if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
474 die("could not open output file");
475 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
476 die("write failed");
477 if (fsync(tmpfile) != 0)
478 die("fsync failed");
479 close(tmpfile);
480
481 /*
482 * open and close the file again to be consistent with the following
483 * test
484 */
485 if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
486 die("could not open output file");
487 close(tmpfile);
488 }
489 STOP_TIMER;
490
491 /*
492 * Now open, write, close, open again and fsync This simulates processes
493 * fsyncing each other's writes.
494 */
495 printf(LABEL_FORMAT, "write, close, fsync");
496 fflush(stdout);
497
498 START_TIMER;
499 for (ops = 0; alarm_triggered == false; ops++)
500 {
501 if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
502 die("could not open output file");
503 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
504 die("write failed");
505 close(tmpfile);
506 /* reopen file */
507 if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
508 die("could not open output file");
509 if (fsync(tmpfile) != 0)
510 die("fsync failed");
511 close(tmpfile);
512 }
513 STOP_TIMER;
514 }
515
516 static void
test_non_sync(void)517 test_non_sync(void)
518 {
519 int tmpfile,
520 ops;
521
522 /*
523 * Test a simple write without fsync
524 */
525 printf("\nNon-sync'ed %dkB writes:\n", XLOG_BLCKSZ_K);
526 printf(LABEL_FORMAT, "write");
527 fflush(stdout);
528
529 START_TIMER;
530 for (ops = 0; alarm_triggered == false; ops++)
531 {
532 if ((tmpfile = open(filename, O_RDWR | PG_BINARY, 0)) == -1)
533 die("could not open output file");
534 if (write(tmpfile, buf, XLOG_BLCKSZ) != XLOG_BLCKSZ)
535 die("write failed");
536 close(tmpfile);
537 }
538 STOP_TIMER;
539 }
540
541 static void
signal_cleanup(int signum)542 signal_cleanup(int signum)
543 {
544 /* Delete the file if it exists. Ignore errors */
545 if (needs_unlink)
546 unlink(filename);
547 /* Finish incomplete line on stdout */
548 puts("");
549 exit(signum);
550 }
551
#ifdef HAVE_FSYNC_WRITETHROUGH

/*
 * Flush fd using the platform's write-through primitive: _commit() on
 * WIN32, fcntl(F_FULLFSYNC) where that command is defined.
 *
 * Returns 0 on success and -1 on failure.  When neither primitive is
 * available, fails with errno set to ENOSYS.
 */
static int
pg_fsync_writethrough(int fd)
{
#ifdef WIN32
    return _commit(fd);
#elif defined(F_FULLFSYNC)
    if (fcntl(fd, F_FULLFSYNC, 0) == -1)
        return -1;
    return 0;
#else
    errno = ENOSYS;
    return -1;
#endif
}
#endif
567
568 /*
569 * print out the writes per second for tests
570 */
571 static void
print_elapse(struct timeval start_t,struct timeval stop_t,int ops)572 print_elapse(struct timeval start_t, struct timeval stop_t, int ops)
573 {
574 double total_time = (stop_t.tv_sec - start_t.tv_sec) +
575 (stop_t.tv_usec - start_t.tv_usec) * 0.000001;
576 double per_second = ops / total_time;
577 double avg_op_time_us = (total_time / ops) * USECS_SEC;
578
579 printf(OPS_FORMAT "\n", per_second, avg_op_time_us);
580 }
581
582 #ifndef WIN32
583 static void
process_alarm(int sig)584 process_alarm(int sig)
585 {
586 alarm_triggered = true;
587 }
588 #else
589 static DWORD WINAPI
process_alarm(LPVOID param)590 process_alarm(LPVOID param)
591 {
592 /* WIN32 doesn't support alarm, so we create a thread and sleep here */
593 Sleep(secs_per_test * 1000);
594 alarm_triggered = true;
595 ExitThread(0);
596 }
597 #endif
598
/*
 * Report a fatal error and exit with status 1.
 *
 * str names the operation that failed; the text for the current errno is
 * appended after a colon.
 */
static void
die(const char *str)
{
    const char *reason = strerror(errno);

    fprintf(stderr, "%s: %s\n", str, reason);
    exit(1);
}
605