1 /*-
2 * Public Domain 2014-2018 MongoDB, Inc.
3 * Public Domain 2008-2014 WiredTiger, Inc.
4 *
5 * This is free and unencumbered software released into the public domain.
6 *
7 * Anyone is free to copy, modify, publish, use, compile, sell, or
8 * distribute this software, either in source code form or as a compiled
9 * binary, for any purpose, commercial or non-commercial, and by any
10 * means.
11 *
12 * In jurisdictions that recognize copyright laws, the author or authors
13 * of this software dedicate any and all copyright interest in the
14 * software to the public domain. We make this dedication for the benefit
15 * of the public at large and to the detriment of our heirs and
16 * successors. We intend this dedication to be an overt act of
17 * relinquishment in perpetuity of all present and future rights to this
18 * software under copyright law.
19 *
20 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21 * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22 * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23 * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
24 * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
25 * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26 * OTHER DEALINGS IN THE SOFTWARE.
27 */
28
29 #include "wt_internal.h"
30
31 /*
32 * __posix_sync --
33 * Underlying support function to flush a file descriptor.
34 *
35 * Fsync calls (or fsync-style calls, for example, fdatasync) are not retried
36 * on failure, and failure halts the system.
37 *
38 * Excerpted from the LWN.net article https://lwn.net/Articles/752063/:
39 * In short, PostgreSQL assumes that a successful call to fsync() indicates
40 * that all data written since the last successful call made it safely to
41 * persistent storage. But that is not what the kernel actually does. When
42 * a buffered I/O write fails due to a hardware-level error, filesystems
43 * will respond differently, but that behavior usually includes discarding
44 * the data in the affected pages and marking them as being clean. So a read
45 * of the blocks that were just written will likely return something other
46 * than the data that was written.
47 *
48 * Given the shared history of UNIX filesystems, and the difficulty of knowing
49 * what specific error will be returned under specific circumstances, we don't
50 * retry fsync-style calls and panic if a flush operation fails.
51 */
52 static int
__posix_sync(WT_SESSION_IMPL * session,int fd,const char * name,const char * func)53 __posix_sync(
54 WT_SESSION_IMPL *session, int fd, const char *name, const char *func)
55 {
56 WT_DECL_RET;
57
58 #if defined(F_FULLFSYNC)
59 /*
60 * OS X fsync documentation:
61 * "Note that while fsync() will flush all data from the host to the
62 * drive (i.e. the "permanent storage device"), the drive itself may
63 * not physically write the data to the platters for quite some time
64 * and it may be written in an out-of-order sequence. For applications
65 * that require tighter guarantees about the integrity of their data,
66 * Mac OS X provides the F_FULLFSYNC fcntl. The F_FULLFSYNC fcntl asks
67 * the drive to flush all buffered data to permanent storage."
68 *
69 * OS X F_FULLFSYNC fcntl documentation:
70 * "This is currently implemented on HFS, MS-DOS (FAT), and Universal
71 * Disk Format (UDF) file systems."
72 *
73 * See comment in __posix_sync(): sync cannot be retried or fail.
74 */
75 static enum { FF_NOTSET, FF_IGNORE, FF_OK } ff_status = FF_NOTSET;
76 switch (ff_status) {
77 case FF_NOTSET:
78 WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
79 if (ret == 0) {
80 ff_status = FF_OK;
81 return (0);
82 }
83
84 /*
85 * If the first F_FULLFSYNC fails, assume the file system
86 * doesn't support it and fallback to fdatasync or fsync.
87 */
88 ff_status = FF_IGNORE;
89 __wt_err(session, ret,
90 "fcntl(F_FULLFSYNC) failed, falling back to fdatasync "
91 "or fsync");
92 break;
93 case FF_IGNORE:
94 break;
95 case FF_OK:
96 WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
97 if (ret == 0)
98 return (0);
99 WT_PANIC_RET(session,
100 ret, "%s: %s: fcntl(F_FULLFSYNC)", name, func);
101 }
102 #endif
103 #if defined(HAVE_FDATASYNC)
104 /* See comment in __posix_sync(): sync cannot be retried or fail. */
105 WT_SYSCALL(fdatasync(fd), ret);
106 if (ret == 0)
107 return (0);
108 WT_PANIC_RET(session, ret, "%s: %s: fdatasync", name, func);
109 #else
110 /* See comment in __posix_sync(): sync cannot be retried or fail. */
111 WT_SYSCALL(fsync(fd), ret);
112 if (ret == 0)
113 return (0);
114 WT_PANIC_RET(session, ret, "%s: %s: fsync", name, func);
115 #endif
116 }
117
118 #ifdef __linux__
119 /*
120 * __posix_directory_sync --
121 * Flush a directory to ensure file creation, remove or rename is durable.
122 */
123 static int
__posix_directory_sync(WT_SESSION_IMPL * session,const char * path)124 __posix_directory_sync(WT_SESSION_IMPL *session, const char *path)
125 {
126 WT_DECL_ITEM(tmp);
127 WT_DECL_RET;
128 int fd, tret;
129 char *dir;
130
131 WT_RET(__wt_scr_alloc(session, 0, &tmp));
132 WT_ERR(__wt_buf_setstr(session, tmp, path));
133
134 /*
135 * This layer should never see a path that doesn't include a trailing
136 * path separator, this code asserts that fact.
137 */
138 dir = tmp->mem;
139 strrchr(dir, '/')[1] = '\0';
140
141 fd = 0; /* -Wconditional-uninitialized */
142 WT_SYSCALL_RETRY((
143 (fd = open(dir, O_RDONLY, 0444)) == -1 ? -1 : 0), ret);
144 if (ret != 0)
145 WT_ERR_MSG(session, ret, "%s: directory-sync: open", dir);
146
147 ret = __posix_sync(session, fd, dir, "directory-sync");
148
149 WT_SYSCALL(close(fd), tret);
150 if (tret != 0) {
151 __wt_err(session, tret, "%s: directory-sync: close", dir);
152 WT_TRET(tret);
153 }
154
155 err: __wt_scr_free(session, &tmp);
156 if (ret == 0)
157 return (ret);
158
159 /* See comment in __posix_sync(): sync cannot be retried or fail. */
160 WT_PANIC_RET(session, ret, "%s: directory-sync", path);
161 }
162 #endif
163
164 /*
165 * __posix_fs_exist --
166 * Return if the file exists.
167 */
168 static int
__posix_fs_exist(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session,const char * name,bool * existp)169 __posix_fs_exist(WT_FILE_SYSTEM *file_system,
170 WT_SESSION *wt_session, const char *name, bool *existp)
171 {
172 struct stat sb;
173 WT_DECL_RET;
174 WT_SESSION_IMPL *session;
175
176 WT_UNUSED(file_system);
177
178 session = (WT_SESSION_IMPL *)wt_session;
179
180 WT_SYSCALL(stat(name, &sb), ret);
181 if (ret == 0) {
182 *existp = true;
183 return (0);
184 }
185 if (ret == ENOENT) {
186 *existp = false;
187 return (0);
188 }
189 WT_RET_MSG(session, ret, "%s: file-exist: stat", name);
190 }
191
192 /*
193 * __posix_fs_remove --
194 * Remove a file.
195 */
196 static int
__posix_fs_remove(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session,const char * name,uint32_t flags)197 __posix_fs_remove(WT_FILE_SYSTEM *file_system,
198 WT_SESSION *wt_session, const char *name, uint32_t flags)
199 {
200 WT_DECL_RET;
201 WT_SESSION_IMPL *session;
202
203 WT_UNUSED(file_system);
204
205 session = (WT_SESSION_IMPL *)wt_session;
206
207 /*
208 * ISO C doesn't require remove return -1 on failure or set errno (note
209 * POSIX 1003.1 extends C with those requirements). Regardless, use the
210 * unlink system call, instead of remove, to simplify error handling;
211 * where we're not doing any special checking for standards compliance,
212 * using unlink may be marginally safer.
213 */
214 WT_SYSCALL(unlink(name), ret);
215 if (ret != 0)
216 WT_RET_MSG(session, ret, "%s: file-remove: unlink", name);
217
218 if (!LF_ISSET(WT_FS_DURABLE))
219 return (0);
220
221 #ifdef __linux__
222 /* Flush the backing directory to guarantee the remove. */
223 WT_RET (__posix_directory_sync(session, name));
224 #endif
225 return (0);
226 }
227
228 /*
229 * __posix_fs_rename --
230 * Rename a file.
231 */
232 static int
__posix_fs_rename(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session,const char * from,const char * to,uint32_t flags)233 __posix_fs_rename(WT_FILE_SYSTEM *file_system,
234 WT_SESSION *wt_session, const char *from, const char *to, uint32_t flags)
235 {
236 WT_DECL_RET;
237 WT_SESSION_IMPL *session;
238
239 WT_UNUSED(file_system);
240
241 session = (WT_SESSION_IMPL *)wt_session;
242
243 /*
244 * ISO C doesn't require rename return -1 on failure or set errno (note
245 * POSIX 1003.1 extends C with those requirements). Be cautious, force
246 * any non-zero return to -1 so we'll check errno. We can still end up
247 * with the wrong errno (if errno is garbage), or the generic WT_ERROR
248 * return (if errno is 0), but we've done the best we can.
249 */
250 WT_SYSCALL(rename(from, to) != 0 ? -1 : 0, ret);
251 if (ret != 0)
252 WT_RET_MSG(
253 session, ret, "%s to %s: file-rename: rename", from, to);
254
255 if (!LF_ISSET(WT_FS_DURABLE))
256 return (0);
257 #ifdef __linux__
258 /*
259 * Flush the backing directory to guarantee the rename. My reading of
260 * POSIX 1003.1 is there's no guarantee flushing only one of the from
261 * or to directories, or flushing a common parent, is sufficient, and
262 * even if POSIX were to make that guarantee, existing filesystems are
263 * known to not provide the guarantee or only provide the guarantee
264 * with specific mount options. Flush both of the from/to directories
265 * until it's a performance problem.
266 */
267 WT_RET(__posix_directory_sync(session, from));
268
269 /*
270 * In almost all cases, we're going to be renaming files in the same
271 * directory, we can at least fast-path that.
272 */
273 {
274 bool same_directory;
275 const char *fp, *tp;
276
277 fp = strrchr(from, '/');
278 tp = strrchr(to, '/');
279 same_directory = (fp == NULL && tp == NULL) ||
280 (fp != NULL && tp != NULL &&
281 fp - from == tp - to && memcmp(from, to, (size_t)(fp - from)) == 0);
282
283 if (!same_directory)
284 WT_RET(__posix_directory_sync(session, to));
285 }
286 #endif
287 return (0);
288 }
289
290 /*
291 * __posix_fs_size --
292 * Get the size of a file in bytes, by file name.
293 */
294 static int
__posix_fs_size(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session,const char * name,wt_off_t * sizep)295 __posix_fs_size(WT_FILE_SYSTEM *file_system,
296 WT_SESSION *wt_session, const char *name, wt_off_t *sizep)
297 {
298 struct stat sb;
299 WT_DECL_RET;
300 WT_SESSION_IMPL *session;
301
302 WT_UNUSED(file_system);
303
304 session = (WT_SESSION_IMPL *)wt_session;
305
306 WT_SYSCALL(stat(name, &sb), ret);
307 if (ret == 0) {
308 *sizep = sb.st_size;
309 return (0);
310 }
311 WT_RET_MSG(session, ret, "%s: file-size: stat", name);
312 }
313
#if defined(HAVE_POSIX_FADVISE)
/*
 * __posix_file_advise --
 *	Pass an access-pattern hint for a range of the file to posix_fadvise.
 */
static int
__posix_file_advise(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
    wt_off_t offset, wt_off_t len, int advice)
{
	WT_DECL_RET;
	WT_FILE_HANDLE_POSIX *pfh;
	WT_SESSION_IMPL *session;

	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
	session = (WT_SESSION_IMPL *)wt_session;

	WT_SYSCALL(posix_fadvise(pfh->fd, offset, len, advice), ret);
	if (ret == 0)
		return (0);

	/* Any failure other than EINVAL is reported as an error. */
	if (ret != EINVAL)
		WT_RET_MSG(session, ret,
		    "%s: handle-advise: posix_fadvise", file_handle->name);

	/*
	 * Treat EINVAL as not-supported, some systems don't support some flags.
	 * Quietly fail, callers expect not-supported failures, and reset the
	 * handle method to prevent future calls.
	 */
	file_handle->fh_advise = NULL;
	return (__wt_set_return(session, ENOTSUP));
}
#endif
349
350 /*
351 * __posix_file_close --
352 * ANSI C close.
353 */
354 static int
__posix_file_close(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session)355 __posix_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
356 {
357 WT_DECL_RET;
358 WT_FILE_HANDLE_POSIX *pfh;
359 WT_SESSION_IMPL *session;
360
361 session = (WT_SESSION_IMPL *)wt_session;
362 pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
363
364 /* Close the file handle. */
365 if (pfh->fd != -1) {
366 WT_SYSCALL(close(pfh->fd), ret);
367 if (ret != 0)
368 __wt_err(session, ret,
369 "%s: handle-close: close", file_handle->name);
370 }
371
372 __wt_free(session, file_handle->name);
373 __wt_free(session, pfh);
374 return (ret);
375 }
376
377 /*
378 * __posix_file_lock --
379 * Lock/unlock a file.
380 */
381 static int
__posix_file_lock(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,bool lock)382 __posix_file_lock(
383 WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, bool lock)
384 {
385 struct flock fl;
386 WT_DECL_RET;
387 WT_FILE_HANDLE_POSIX *pfh;
388 WT_SESSION_IMPL *session;
389
390 session = (WT_SESSION_IMPL *)wt_session;
391 pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
392
393 /*
394 * WiredTiger requires this function be able to acquire locks past
395 * the end of file.
396 *
397 * Note we're using fcntl(2) locking: all fcntl locks associated with a
398 * file for a given process are removed when any file descriptor for the
399 * file is closed by the process, even if a lock was never requested for
400 * that file descriptor.
401 */
402 fl.l_start = 0;
403 fl.l_len = 1;
404 fl.l_type = lock ? F_WRLCK : F_UNLCK;
405 fl.l_whence = SEEK_SET;
406
407 WT_SYSCALL(fcntl(pfh->fd, F_SETLK, &fl) == -1 ? -1 : 0, ret);
408 if (ret == 0)
409 return (0);
410 WT_RET_MSG(session, ret, "%s: handle-lock: fcntl", file_handle->name);
411 }
412
413 /*
414 * __posix_file_read --
415 * POSIX pread.
416 */
417 static int
__posix_file_read(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,wt_off_t offset,size_t len,void * buf)418 __posix_file_read(WT_FILE_HANDLE *file_handle,
419 WT_SESSION *wt_session, wt_off_t offset, size_t len, void *buf)
420 {
421 WT_FILE_HANDLE_POSIX *pfh;
422 WT_SESSION_IMPL *session;
423 size_t chunk;
424 ssize_t nr;
425 uint8_t *addr;
426
427 session = (WT_SESSION_IMPL *)wt_session;
428 pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
429
430 /* Assert direct I/O is aligned and a multiple of the alignment. */
431 WT_ASSERT(session,
432 !pfh->direct_io ||
433 S2C(session)->buffer_alignment == 0 ||
434 (!((uintptr_t)buf &
435 (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
436 len >= S2C(session)->buffer_alignment &&
437 len % S2C(session)->buffer_alignment == 0));
438
439 /* Break reads larger than 1GB into 1GB chunks. */
440 for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
441 chunk = WT_MIN(len, WT_GIGABYTE);
442 if ((nr = pread(pfh->fd, addr, chunk, offset)) <= 0) {
443 if (nr == 0)
444 F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
445 WT_RET_MSG(session,
446 nr == 0 ? WT_ERROR : __wt_errno(),
447 "%s: handle-read: pread: failed to read %"
448 WT_SIZET_FMT " bytes at offset %" PRIuMAX,
449 file_handle->name, chunk, (uintmax_t)offset);
450 }
451 }
452 return (0);
453 }
454
455 /*
456 * __posix_file_size --
457 * Get the size of a file in bytes, by file handle.
458 */
459 static int
__posix_file_size(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,wt_off_t * sizep)460 __posix_file_size(
461 WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t *sizep)
462 {
463 struct stat sb;
464 WT_DECL_RET;
465 WT_FILE_HANDLE_POSIX *pfh;
466 WT_SESSION_IMPL *session;
467
468 session = (WT_SESSION_IMPL *)wt_session;
469 pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
470
471 WT_SYSCALL(fstat(pfh->fd, &sb), ret);
472 if (ret == 0) {
473 *sizep = sb.st_size;
474 return (0);
475 }
476 WT_RET_MSG(session, ret, "%s: handle-size: fstat", file_handle->name);
477 }
478
479 /*
480 * __posix_file_sync --
481 * POSIX fsync.
482 */
483 static int
__posix_file_sync(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session)484 __posix_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
485 {
486 WT_FILE_HANDLE_POSIX *pfh;
487 WT_SESSION_IMPL *session;
488
489 session = (WT_SESSION_IMPL *)wt_session;
490 pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
491
492 return (
493 __posix_sync(session, pfh->fd, file_handle->name, "handle-sync"));
494 }
495
#ifdef HAVE_SYNC_FILE_RANGE
/*
 * __posix_file_sync_nowait --
 *	Start a flush of a file handle with sync_file_range, passing only the
 * SYNC_FILE_RANGE_WRITE flag with an offset and byte count of zero.
 */
static int
__posix_file_sync_nowait(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
{
	WT_DECL_RET;
	WT_FILE_HANDLE_POSIX *pfh;
	WT_SESSION_IMPL *session;

	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
	session = (WT_SESSION_IMPL *)wt_session;

	/* See comment in __posix_sync(): sync cannot be retried or fail. */
	WT_SYSCALL(sync_file_range(pfh->fd,
	    (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE), ret);
	if (ret != 0) {
		WT_PANIC_RET(session, ret,
		    "%s: handle-sync-nowait: sync_file_range",
		    file_handle->name);
	}
	return (0);
}
#endif
521
#ifdef HAVE_FTRUNCATE
/*
 * __posix_file_truncate --
 *	Set a file's length to len bytes with ftruncate.
 */
static int
__posix_file_truncate(
    WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t len)
{
	WT_DECL_RET;
	WT_FILE_HANDLE_POSIX *pfh;
	WT_SESSION_IMPL *session;

	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
	session = (WT_SESSION_IMPL *)wt_session;

	WT_SYSCALL_RETRY(ftruncate(pfh->fd, len), ret);
	if (ret != 0)
		WT_RET_MSG(session, ret,
		    "%s: handle-truncate: ftruncate", file_handle->name);
	return (0);
}
#endif
545
546 /*
547 * __posix_file_write --
548 * POSIX pwrite.
549 */
550 static int
__posix_file_write(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,wt_off_t offset,size_t len,const void * buf)551 __posix_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
552 wt_off_t offset, size_t len, const void *buf)
553 {
554 WT_FILE_HANDLE_POSIX *pfh;
555 WT_SESSION_IMPL *session;
556 size_t chunk;
557 ssize_t nw;
558 const uint8_t *addr;
559
560 session = (WT_SESSION_IMPL *)wt_session;
561 pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
562
563 /* Assert direct I/O is aligned and a multiple of the alignment. */
564 WT_ASSERT(session,
565 !pfh->direct_io ||
566 S2C(session)->buffer_alignment == 0 ||
567 (!((uintptr_t)buf &
568 (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
569 len >= S2C(session)->buffer_alignment &&
570 len % S2C(session)->buffer_alignment == 0));
571
572 /* Break writes larger than 1GB into 1GB chunks. */
573 for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
574 chunk = WT_MIN(len, WT_GIGABYTE);
575 if ((nw = pwrite(pfh->fd, addr, chunk, offset)) < 0)
576 WT_RET_MSG(session, __wt_errno(),
577 "%s: handle-write: pwrite: failed to write %"
578 WT_SIZET_FMT " bytes at offset %" PRIuMAX,
579 file_handle->name, chunk, (uintmax_t)offset);
580 }
581 return (0);
582 }
583
584 /*
585 * __posix_open_file_cloexec --
586 * Prevent child access to file handles.
587 */
588 static inline int
__posix_open_file_cloexec(WT_SESSION_IMPL * session,int fd,const char * name)589 __posix_open_file_cloexec(WT_SESSION_IMPL *session, int fd, const char *name)
590 {
591 #if defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
592 int f;
593
594 /*
595 * Security:
596 * The application may spawn a new process, and we don't want another
597 * process to have access to our file handles. There's an obvious race
598 * between the open and this call, prefer the flag to open if available.
599 */
600 if ((f = fcntl(fd, F_GETFD)) == -1 ||
601 fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1)
602 WT_RET_MSG(session, __wt_errno(),
603 "%s: handle-open: fcntl(FD_CLOEXEC)", name);
604 return (0);
605 #else
606 WT_UNUSED(session);
607 WT_UNUSED(fd);
608 WT_UNUSED(name);
609 return (0);
610 #endif
611 }
612
613 /*
614 * __posix_open_file --
615 * Open a file handle.
616 */
617 static int
__posix_open_file(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session,const char * name,WT_FS_OPEN_FILE_TYPE file_type,uint32_t flags,WT_FILE_HANDLE ** file_handlep)618 __posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
619 const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags,
620 WT_FILE_HANDLE **file_handlep)
621 {
622 WT_CONNECTION_IMPL *conn;
623 WT_DECL_RET;
624 WT_FILE_HANDLE *file_handle;
625 WT_FILE_HANDLE_POSIX *pfh;
626 WT_SESSION_IMPL *session;
627 mode_t mode;
628 int advise_flag, f;
629
630 WT_UNUSED(file_system);
631
632 *file_handlep = NULL;
633
634 session = (WT_SESSION_IMPL *)wt_session;
635 conn = S2C(session);
636
637 WT_RET(__wt_calloc_one(session, &pfh));
638
639 /* Set up error handling. */
640 pfh->fd = -1;
641
642 if (file_type == WT_FS_OPEN_FILE_TYPE_DIRECTORY) {
643 f = O_RDONLY;
644 #ifdef O_CLOEXEC
645 /*
646 * Security:
647 * The application may spawn a new process, and we don't want
648 * another process to have access to our file handles.
649 */
650 f |= O_CLOEXEC;
651 #endif
652 WT_SYSCALL_RETRY((
653 (pfh->fd = open(name, f, 0444)) == -1 ? -1 : 0), ret);
654 if (ret != 0)
655 WT_ERR_MSG(session, ret,
656 "%s: handle-open: open-directory", name);
657 WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name));
658 goto directory_open;
659 }
660
661 f = LF_ISSET(WT_FS_OPEN_READONLY) ? O_RDONLY : O_RDWR;
662 if (LF_ISSET(WT_FS_OPEN_CREATE)) {
663 f |= O_CREAT;
664 if (LF_ISSET(WT_FS_OPEN_EXCLUSIVE))
665 f |= O_EXCL;
666 mode = 0666;
667 } else
668 mode = 0;
669
670 #ifdef O_BINARY
671 /* Windows clones: we always want to treat the file as a binary. */
672 f |= O_BINARY;
673 #endif
674 #ifdef O_CLOEXEC
675 /*
676 * Security:
677 * The application may spawn a new process, and we don't want another
678 * process to have access to our file handles.
679 */
680 f |= O_CLOEXEC;
681 #endif
682 #ifdef O_DIRECT
683 /* Direct I/O. */
684 if (LF_ISSET(WT_FS_OPEN_DIRECTIO)) {
685 f |= O_DIRECT;
686 pfh->direct_io = true;
687 } else
688 pfh->direct_io = false;
689 #endif
690 #ifdef O_NOATIME
691 /* Avoid updating metadata for read-only workloads. */
692 if (file_type == WT_FS_OPEN_FILE_TYPE_DATA)
693 f |= O_NOATIME;
694 #endif
695
696 if (file_type == WT_FS_OPEN_FILE_TYPE_LOG &&
697 FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
698 #ifdef O_DSYNC
699 f |= O_DSYNC;
700 #elif defined(O_SYNC)
701 f |= O_SYNC;
702 #else
703 WT_ERR_MSG(session, ENOTSUP,
704 "unsupported log sync mode configured");
705 #endif
706 }
707
708 /* Create/Open the file. */
709 WT_SYSCALL_RETRY(((pfh->fd = open(name, f, mode)) == -1 ? -1 : 0), ret);
710 if (ret != 0)
711 WT_ERR_MSG(session, ret,
712 pfh->direct_io ?
713 "%s: handle-open: open: failed with direct I/O configured, "
714 "some filesystem types do not support direct I/O" :
715 "%s: handle-open: open", name);
716
717 #ifdef __linux__
718 /*
719 * Durability: some filesystems require a directory sync to be confident
720 * the file will appear.
721 */
722 if (LF_ISSET(WT_FS_OPEN_DURABLE))
723 WT_ERR(__posix_directory_sync(session, name));
724 #endif
725
726 WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name));
727
728 #if defined(HAVE_POSIX_FADVISE)
729 /*
730 * If the user set an access pattern hint, call fadvise now.
731 * Ignore fadvise when doing direct I/O, the kernel cache isn't
732 * interesting.
733 */
734 if (!pfh->direct_io && file_type == WT_FS_OPEN_FILE_TYPE_DATA &&
735 LF_ISSET(WT_FS_OPEN_ACCESS_RAND | WT_FS_OPEN_ACCESS_SEQ)) {
736 advise_flag = 0;
737 if (LF_ISSET(WT_FS_OPEN_ACCESS_RAND))
738 advise_flag = POSIX_FADV_RANDOM;
739 if (LF_ISSET(WT_FS_OPEN_ACCESS_SEQ))
740 advise_flag = POSIX_FADV_SEQUENTIAL;
741 WT_SYSCALL(posix_fadvise(pfh->fd, 0, 0, advise_flag), ret);
742 if (ret != 0)
743 WT_ERR_MSG(session, ret,
744 "%s: handle-open: posix_fadvise", name);
745 }
746 #else
747 WT_UNUSED(advise_flag);
748 #endif
749
750 directory_open:
751 /* Initialize public information. */
752 file_handle = (WT_FILE_HANDLE *)pfh;
753 WT_ERR(__wt_strdup(session, name, &file_handle->name));
754
755 file_handle->close = __posix_file_close;
756 #if defined(HAVE_POSIX_FADVISE)
757 /*
758 * Ignore fadvise when doing direct I/O, the kernel cache isn't
759 * interesting.
760 */
761 if (!pfh->direct_io)
762 file_handle->fh_advise = __posix_file_advise;
763 #endif
764 file_handle->fh_extend = __wt_posix_file_extend;
765 file_handle->fh_lock = __posix_file_lock;
766 #ifdef WORDS_BIGENDIAN
767 /*
768 * The underlying objects are little-endian, mapping objects isn't
769 * currently supported on big-endian systems.
770 */
771 #else
772 file_handle->fh_map = __wt_posix_map;
773 #ifdef HAVE_POSIX_MADVISE
774 file_handle->fh_map_discard = __wt_posix_map_discard;
775 file_handle->fh_map_preload = __wt_posix_map_preload;
776 #endif
777 file_handle->fh_unmap = __wt_posix_unmap;
778 #endif
779 file_handle->fh_read = __posix_file_read;
780 file_handle->fh_size = __posix_file_size;
781 file_handle->fh_sync = __posix_file_sync;
782 #ifdef HAVE_SYNC_FILE_RANGE
783 file_handle->fh_sync_nowait = __posix_file_sync_nowait;
784 #endif
785 #ifdef HAVE_FTRUNCATE
786 file_handle->fh_truncate = __posix_file_truncate;
787 #endif
788 file_handle->fh_write = __posix_file_write;
789
790 *file_handlep = file_handle;
791
792 return (0);
793
794 err: WT_TRET(__posix_file_close((WT_FILE_HANDLE *)pfh, wt_session));
795 return (ret);
796 }
797
798 /*
799 * __posix_terminate --
800 * Terminate a POSIX configuration.
801 */
802 static int
__posix_terminate(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session)803 __posix_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session)
804 {
805 WT_SESSION_IMPL *session;
806
807 session = (WT_SESSION_IMPL *)wt_session;
808
809 __wt_free(session, file_system);
810 return (0);
811 }
812
813 /*
814 * __wt_os_posix --
815 * Initialize a POSIX configuration.
816 */
817 int
__wt_os_posix(WT_SESSION_IMPL * session)818 __wt_os_posix(WT_SESSION_IMPL *session)
819 {
820 WT_CONNECTION_IMPL *conn;
821 WT_FILE_SYSTEM *file_system;
822
823 conn = S2C(session);
824
825 WT_RET(__wt_calloc_one(session, &file_system));
826
827 /* Initialize the POSIX jump table. */
828 file_system->fs_directory_list = __wt_posix_directory_list;
829 file_system->fs_directory_list_single =
830 __wt_posix_directory_list_single;
831 file_system->fs_directory_list_free = __wt_posix_directory_list_free;
832 file_system->fs_exist = __posix_fs_exist;
833 file_system->fs_open_file = __posix_open_file;
834 file_system->fs_remove = __posix_fs_remove;
835 file_system->fs_rename = __posix_fs_rename;
836 file_system->fs_size = __posix_fs_size;
837 file_system->terminate = __posix_terminate;
838
839 /* Switch it into place. */
840 conn->file_system = file_system;
841
842 return (0);
843 }
844