1 /*-
2  * Public Domain 2014-2018 MongoDB, Inc.
3  * Public Domain 2008-2014 WiredTiger, Inc.
4  *
5  * This is free and unencumbered software released into the public domain.
6  *
7  * Anyone is free to copy, modify, publish, use, compile, sell, or
8  * distribute this software, either in source code form or as a compiled
9  * binary, for any purpose, commercial or non-commercial, and by any
10  * means.
11  *
12  * In jurisdictions that recognize copyright laws, the author or authors
13  * of this software dedicate any and all copyright interest in the
14  * software to the public domain. We make this dedication for the benefit
15  * of the public at large and to the detriment of our heirs and
16  * successors. We intend this dedication to be an overt act of
17  * relinquishment in perpetuity of all present and future rights to this
18  * software under copyright law.
19  *
20  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
21  * EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
22  * MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
23  * IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
24  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
25  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
26  * OTHER DEALINGS IN THE SOFTWARE.
27  */
28 
29 #include "wt_internal.h"
30 
31 /*
32  * __posix_sync --
33  *	Underlying support function to flush a file descriptor.
34  *
35  * Fsync calls (or fsync-style calls, for example, fdatasync) are not retried
36  * on failure, and failure halts the system.
37  *
38  * Excerpted from the LWN.net article https://lwn.net/Articles/752063/:
39  * In short, PostgreSQL assumes that a successful call to fsync() indicates
40  * that all data written since the last successful call made it safely to
41  * persistent storage. But that is not what the kernel actually does. When
42  * a buffered I/O write fails due to a hardware-level error, filesystems
43  * will respond differently, but that behavior usually includes discarding
44  * the data in the affected pages and marking them as being clean. So a read
45  * of the blocks that were just written will likely return something other
46  * than the data that was written.
47  *
48  * Given the shared history of UNIX filesystems, and the difficulty of knowing
49  * what specific error will be returned under specific circumstances, we don't
50  * retry fsync-style calls and panic if a flush operation fails.
51  */
52 static int
__posix_sync(WT_SESSION_IMPL * session,int fd,const char * name,const char * func)53 __posix_sync(
54     WT_SESSION_IMPL *session, int fd, const char *name, const char *func)
55 {
56 	WT_DECL_RET;
57 
58 #if defined(F_FULLFSYNC)
59 	/*
60 	 * OS X fsync documentation:
61 	 * "Note that while fsync() will flush all data from the host to the
62 	 * drive (i.e. the "permanent storage device"), the drive itself may
63 	 * not physically write the data to the platters for quite some time
64 	 * and it may be written in an out-of-order sequence. For applications
65 	 * that require tighter guarantees about the integrity of their data,
66 	 * Mac OS X provides the F_FULLFSYNC fcntl. The F_FULLFSYNC fcntl asks
67 	 * the drive to flush all buffered data to permanent storage."
68 	 *
69 	 * OS X F_FULLFSYNC fcntl documentation:
70 	 * "This is currently implemented on HFS, MS-DOS (FAT), and Universal
71 	 * Disk Format (UDF) file systems."
72 	 *
73 	 * See comment in __posix_sync(): sync cannot be retried or fail.
74 	 */
75 	static enum { FF_NOTSET, FF_IGNORE, FF_OK } ff_status = FF_NOTSET;
76 	switch (ff_status) {
77 	case FF_NOTSET:
78 		WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
79 		if (ret == 0) {
80 			ff_status = FF_OK;
81 			return (0);
82 		}
83 
84 		/*
85 		 * If the first F_FULLFSYNC fails, assume the file system
86 		 * doesn't support it and fallback to fdatasync or fsync.
87 		 */
88 		ff_status = FF_IGNORE;
89 		__wt_err(session, ret,
90 		    "fcntl(F_FULLFSYNC) failed, falling back to fdatasync "
91 		    "or fsync");
92 		break;
93 	case FF_IGNORE:
94 		break;
95 	case FF_OK:
96 		WT_SYSCALL(fcntl(fd, F_FULLFSYNC, 0) == -1 ? -1 : 0, ret);
97 		if (ret == 0)
98 			return (0);
99 		WT_PANIC_RET(session,
100 		    ret, "%s: %s: fcntl(F_FULLFSYNC)", name, func);
101 	}
102 #endif
103 #if defined(HAVE_FDATASYNC)
104 	/* See comment in __posix_sync(): sync cannot be retried or fail. */
105 	WT_SYSCALL(fdatasync(fd), ret);
106 	if (ret == 0)
107 		return (0);
108 	WT_PANIC_RET(session, ret, "%s: %s: fdatasync", name, func);
109 #else
110 	/* See comment in __posix_sync(): sync cannot be retried or fail. */
111 	WT_SYSCALL(fsync(fd), ret);
112 	if (ret == 0)
113 		return (0);
114 	WT_PANIC_RET(session, ret, "%s: %s: fsync", name, func);
115 #endif
116 }
117 
118 #ifdef __linux__
119 /*
120  * __posix_directory_sync --
121  *	Flush a directory to ensure file creation, remove or rename is durable.
122  */
123 static int
__posix_directory_sync(WT_SESSION_IMPL * session,const char * path)124 __posix_directory_sync(WT_SESSION_IMPL *session, const char *path)
125 {
126 	WT_DECL_ITEM(tmp);
127 	WT_DECL_RET;
128 	int fd, tret;
129 	char *dir;
130 
131 	WT_RET(__wt_scr_alloc(session, 0, &tmp));
132 	WT_ERR(__wt_buf_setstr(session, tmp, path));
133 
134 	/*
135 	 * This layer should never see a path that doesn't include a trailing
136 	 * path separator, this code asserts that fact.
137 	 */
138 	dir = tmp->mem;
139 	strrchr(dir, '/')[1] = '\0';
140 
141 	fd = 0;				/* -Wconditional-uninitialized */
142 	WT_SYSCALL_RETRY((
143 	    (fd = open(dir, O_RDONLY, 0444)) == -1 ? -1 : 0), ret);
144 	if (ret != 0)
145 		WT_ERR_MSG(session, ret, "%s: directory-sync: open", dir);
146 
147 	ret = __posix_sync(session, fd, dir, "directory-sync");
148 
149 	WT_SYSCALL(close(fd), tret);
150 	if (tret != 0) {
151 		__wt_err(session, tret, "%s: directory-sync: close", dir);
152 		WT_TRET(tret);
153 	}
154 
155 err:	__wt_scr_free(session, &tmp);
156 	if (ret == 0)
157 		return (ret);
158 
159 	/* See comment in __posix_sync(): sync cannot be retried or fail. */
160 	WT_PANIC_RET(session, ret, "%s: directory-sync", path);
161 }
162 #endif
163 
164 /*
165  * __posix_fs_exist --
166  *	Return if the file exists.
167  */
168 static int
__posix_fs_exist(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session,const char * name,bool * existp)169 __posix_fs_exist(WT_FILE_SYSTEM *file_system,
170     WT_SESSION *wt_session, const char *name, bool *existp)
171 {
172 	struct stat sb;
173 	WT_DECL_RET;
174 	WT_SESSION_IMPL *session;
175 
176 	WT_UNUSED(file_system);
177 
178 	session = (WT_SESSION_IMPL *)wt_session;
179 
180 	WT_SYSCALL(stat(name, &sb), ret);
181 	if (ret == 0) {
182 		*existp = true;
183 		return (0);
184 	}
185 	if (ret == ENOENT) {
186 		*existp = false;
187 		return (0);
188 	}
189 	WT_RET_MSG(session, ret, "%s: file-exist: stat", name);
190 }
191 
192 /*
193  * __posix_fs_remove --
194  *	Remove a file.
195  */
196 static int
__posix_fs_remove(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session,const char * name,uint32_t flags)197 __posix_fs_remove(WT_FILE_SYSTEM *file_system,
198     WT_SESSION *wt_session, const char *name, uint32_t flags)
199 {
200 	WT_DECL_RET;
201 	WT_SESSION_IMPL *session;
202 
203 	WT_UNUSED(file_system);
204 
205 	session = (WT_SESSION_IMPL *)wt_session;
206 
207 	/*
208 	 * ISO C doesn't require remove return -1 on failure or set errno (note
209 	 * POSIX 1003.1 extends C with those requirements). Regardless, use the
210 	 * unlink system call, instead of remove, to simplify error handling;
211 	 * where we're not doing any special checking for standards compliance,
212 	 * using unlink may be marginally safer.
213 	 */
214 	WT_SYSCALL(unlink(name), ret);
215 	if (ret != 0)
216 		WT_RET_MSG(session, ret, "%s: file-remove: unlink", name);
217 
218 	if (!LF_ISSET(WT_FS_DURABLE))
219 		return (0);
220 
221 #ifdef __linux__
222 	/* Flush the backing directory to guarantee the remove. */
223 	WT_RET (__posix_directory_sync(session, name));
224 #endif
225 	return (0);
226 }
227 
228 /*
229  * __posix_fs_rename --
230  *	Rename a file.
231  */
232 static int
__posix_fs_rename(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session,const char * from,const char * to,uint32_t flags)233 __posix_fs_rename(WT_FILE_SYSTEM *file_system,
234     WT_SESSION *wt_session, const char *from, const char *to, uint32_t flags)
235 {
236 	WT_DECL_RET;
237 	WT_SESSION_IMPL *session;
238 
239 	WT_UNUSED(file_system);
240 
241 	session = (WT_SESSION_IMPL *)wt_session;
242 
243 	/*
244 	 * ISO C doesn't require rename return -1 on failure or set errno (note
245 	 * POSIX 1003.1 extends C with those requirements). Be cautious, force
246 	 * any non-zero return to -1 so we'll check errno. We can still end up
247 	 * with the wrong errno (if errno is garbage), or the generic WT_ERROR
248 	 * return (if errno is 0), but we've done the best we can.
249 	 */
250 	WT_SYSCALL(rename(from, to) != 0 ? -1 : 0, ret);
251 	if (ret != 0)
252 		WT_RET_MSG(
253 		    session, ret, "%s to %s: file-rename: rename", from, to);
254 
255 	if (!LF_ISSET(WT_FS_DURABLE))
256 		return (0);
257 #ifdef __linux__
258 	/*
259 	 * Flush the backing directory to guarantee the rename. My reading of
260 	 * POSIX 1003.1 is there's no guarantee flushing only one of the from
261 	 * or to directories, or flushing a common parent, is sufficient, and
262 	 * even if POSIX were to make that guarantee, existing filesystems are
263 	 * known to not provide the guarantee or only provide the guarantee
264 	 * with specific mount options. Flush both of the from/to directories
265 	 * until it's a performance problem.
266 	 */
267 	WT_RET(__posix_directory_sync(session, from));
268 
269 	/*
270 	 * In almost all cases, we're going to be renaming files in the same
271 	 * directory, we can at least fast-path that.
272 	 */
273 	{
274 	bool same_directory;
275 	const char *fp, *tp;
276 
277 	fp = strrchr(from, '/');
278 	tp = strrchr(to, '/');
279 	same_directory = (fp == NULL && tp == NULL) ||
280 	    (fp != NULL && tp != NULL &&
281 	    fp - from == tp - to && memcmp(from, to, (size_t)(fp - from)) == 0);
282 
283 	if (!same_directory)
284 		WT_RET(__posix_directory_sync(session, to));
285 	}
286 #endif
287 	return (0);
288 }
289 
290 /*
291  * __posix_fs_size --
292  *	Get the size of a file in bytes, by file name.
293  */
294 static int
__posix_fs_size(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session,const char * name,wt_off_t * sizep)295 __posix_fs_size(WT_FILE_SYSTEM *file_system,
296     WT_SESSION *wt_session, const char *name, wt_off_t *sizep)
297 {
298 	struct stat sb;
299 	WT_DECL_RET;
300 	WT_SESSION_IMPL *session;
301 
302 	WT_UNUSED(file_system);
303 
304 	session = (WT_SESSION_IMPL *)wt_session;
305 
306 	WT_SYSCALL(stat(name, &sb), ret);
307 	if (ret == 0) {
308 		*sizep = sb.st_size;
309 		return (0);
310 	}
311 	WT_RET_MSG(session, ret, "%s: file-size: stat", name);
312 }
313 
314 #if defined(HAVE_POSIX_FADVISE)
315 /*
316  * __posix_file_advise --
317  *	POSIX fadvise.
318  */
319 static int
__posix_file_advise(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,wt_off_t offset,wt_off_t len,int advice)320 __posix_file_advise(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
321     wt_off_t offset, wt_off_t len, int advice)
322 {
323 	WT_DECL_RET;
324 	WT_FILE_HANDLE_POSIX *pfh;
325 	WT_SESSION_IMPL *session;
326 
327 	session = (WT_SESSION_IMPL *)wt_session;
328 	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
329 
330 	WT_SYSCALL(posix_fadvise(pfh->fd, offset, len, advice), ret);
331 	if (ret == 0)
332 		return (0);
333 
334 	/*
335 	 * Treat EINVAL as not-supported, some systems don't support some flags.
336 	 * Quietly fail, callers expect not-supported failures, and reset the
337 	 * handle method to prevent future calls.
338 	 */
339 	if (ret == EINVAL) {
340 		file_handle->fh_advise = NULL;
341 		return (__wt_set_return(session, ENOTSUP));
342 	}
343 
344 	WT_RET_MSG(session, ret,
345 	    "%s: handle-advise: posix_fadvise", file_handle->name);
346 
347 }
348 #endif
349 
350 /*
351  * __posix_file_close --
352  *	ANSI C close.
353  */
354 static int
__posix_file_close(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session)355 __posix_file_close(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
356 {
357 	WT_DECL_RET;
358 	WT_FILE_HANDLE_POSIX *pfh;
359 	WT_SESSION_IMPL *session;
360 
361 	session = (WT_SESSION_IMPL *)wt_session;
362 	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
363 
364 	/* Close the file handle. */
365 	if (pfh->fd != -1) {
366 		WT_SYSCALL(close(pfh->fd), ret);
367 		if (ret != 0)
368 			__wt_err(session, ret,
369 			    "%s: handle-close: close", file_handle->name);
370 	}
371 
372 	__wt_free(session, file_handle->name);
373 	__wt_free(session, pfh);
374 	return (ret);
375 }
376 
377 /*
378  * __posix_file_lock --
379  *	Lock/unlock a file.
380  */
381 static int
__posix_file_lock(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,bool lock)382 __posix_file_lock(
383     WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, bool lock)
384 {
385 	struct flock fl;
386 	WT_DECL_RET;
387 	WT_FILE_HANDLE_POSIX *pfh;
388 	WT_SESSION_IMPL *session;
389 
390 	session = (WT_SESSION_IMPL *)wt_session;
391 	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
392 
393 	/*
394 	 * WiredTiger requires this function be able to acquire locks past
395 	 * the end of file.
396 	 *
397 	 * Note we're using fcntl(2) locking: all fcntl locks associated with a
398 	 * file for a given process are removed when any file descriptor for the
399 	 * file is closed by the process, even if a lock was never requested for
400 	 * that file descriptor.
401 	 */
402 	fl.l_start = 0;
403 	fl.l_len = 1;
404 	fl.l_type = lock ? F_WRLCK : F_UNLCK;
405 	fl.l_whence = SEEK_SET;
406 
407 	WT_SYSCALL(fcntl(pfh->fd, F_SETLK, &fl) == -1 ? -1 : 0, ret);
408 	if (ret == 0)
409 		return (0);
410 	WT_RET_MSG(session, ret, "%s: handle-lock: fcntl", file_handle->name);
411 }
412 
413 /*
414  * __posix_file_read --
415  *	POSIX pread.
416  */
417 static int
__posix_file_read(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,wt_off_t offset,size_t len,void * buf)418 __posix_file_read(WT_FILE_HANDLE *file_handle,
419     WT_SESSION *wt_session, wt_off_t offset, size_t len, void *buf)
420 {
421 	WT_FILE_HANDLE_POSIX *pfh;
422 	WT_SESSION_IMPL *session;
423 	size_t chunk;
424 	ssize_t nr;
425 	uint8_t *addr;
426 
427 	session = (WT_SESSION_IMPL *)wt_session;
428 	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
429 
430 	/* Assert direct I/O is aligned and a multiple of the alignment. */
431 	WT_ASSERT(session,
432 	    !pfh->direct_io ||
433 	    S2C(session)->buffer_alignment == 0 ||
434 	    (!((uintptr_t)buf &
435 	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
436 	    len >= S2C(session)->buffer_alignment &&
437 	    len % S2C(session)->buffer_alignment == 0));
438 
439 	/* Break reads larger than 1GB into 1GB chunks. */
440 	for (addr = buf; len > 0; addr += nr, len -= (size_t)nr, offset += nr) {
441 		chunk = WT_MIN(len, WT_GIGABYTE);
442 		if ((nr = pread(pfh->fd, addr, chunk, offset)) <= 0) {
443 			if (nr == 0)
444 				F_SET(S2C(session), WT_CONN_DATA_CORRUPTION);
445 			WT_RET_MSG(session,
446 			    nr == 0 ? WT_ERROR : __wt_errno(),
447 			    "%s: handle-read: pread: failed to read %"
448 			    WT_SIZET_FMT " bytes at offset %" PRIuMAX,
449 			    file_handle->name, chunk, (uintmax_t)offset);
450 		}
451 	}
452 	return (0);
453 }
454 
455 /*
456  * __posix_file_size --
457  *	Get the size of a file in bytes, by file handle.
458  */
459 static int
__posix_file_size(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,wt_off_t * sizep)460 __posix_file_size(
461     WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t *sizep)
462 {
463 	struct stat sb;
464 	WT_DECL_RET;
465 	WT_FILE_HANDLE_POSIX *pfh;
466 	WT_SESSION_IMPL *session;
467 
468 	session = (WT_SESSION_IMPL *)wt_session;
469 	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
470 
471 	WT_SYSCALL(fstat(pfh->fd, &sb), ret);
472 	if (ret == 0) {
473 		*sizep = sb.st_size;
474 		return (0);
475 	}
476 	WT_RET_MSG(session, ret, "%s: handle-size: fstat", file_handle->name);
477 }
478 
479 /*
480  * __posix_file_sync --
481  *	POSIX fsync.
482  */
483 static int
__posix_file_sync(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session)484 __posix_file_sync(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
485 {
486 	WT_FILE_HANDLE_POSIX *pfh;
487 	WT_SESSION_IMPL *session;
488 
489 	session = (WT_SESSION_IMPL *)wt_session;
490 	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
491 
492 	return (
493 	    __posix_sync(session, pfh->fd, file_handle->name, "handle-sync"));
494 }
495 
496 #ifdef HAVE_SYNC_FILE_RANGE
497 /*
498  * __posix_file_sync_nowait --
499  *	POSIX fsync.
500  */
501 static int
__posix_file_sync_nowait(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session)502 __posix_file_sync_nowait(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session)
503 {
504 	WT_DECL_RET;
505 	WT_FILE_HANDLE_POSIX *pfh;
506 	WT_SESSION_IMPL *session;
507 
508 	session = (WT_SESSION_IMPL *)wt_session;
509 	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
510 
511 	/* See comment in __posix_sync(): sync cannot be retried or fail. */
512 	WT_SYSCALL(sync_file_range(pfh->fd,
513 	    (off64_t)0, (off64_t)0, SYNC_FILE_RANGE_WRITE), ret);
514 	if (ret == 0)
515 		return (0);
516 
517 	WT_PANIC_RET(session, ret,
518 	    "%s: handle-sync-nowait: sync_file_range", file_handle->name);
519 }
520 #endif
521 
522 #ifdef HAVE_FTRUNCATE
523 /*
524  * __posix_file_truncate --
525  *	POSIX ftruncate.
526  */
527 static int
__posix_file_truncate(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,wt_off_t len)528 __posix_file_truncate(
529     WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session, wt_off_t len)
530 {
531 	WT_DECL_RET;
532 	WT_FILE_HANDLE_POSIX *pfh;
533 	WT_SESSION_IMPL *session;
534 
535 	session = (WT_SESSION_IMPL *)wt_session;
536 	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
537 
538 	WT_SYSCALL_RETRY(ftruncate(pfh->fd, len), ret);
539 	if (ret == 0)
540 		return (0);
541 	WT_RET_MSG(session, ret,
542 	    "%s: handle-truncate: ftruncate", file_handle->name);
543 }
544 #endif
545 
546 /*
547  * __posix_file_write --
548  *	POSIX pwrite.
549  */
550 static int
__posix_file_write(WT_FILE_HANDLE * file_handle,WT_SESSION * wt_session,wt_off_t offset,size_t len,const void * buf)551 __posix_file_write(WT_FILE_HANDLE *file_handle, WT_SESSION *wt_session,
552     wt_off_t offset, size_t len, const void *buf)
553 {
554 	WT_FILE_HANDLE_POSIX *pfh;
555 	WT_SESSION_IMPL *session;
556 	size_t chunk;
557 	ssize_t nw;
558 	const uint8_t *addr;
559 
560 	session = (WT_SESSION_IMPL *)wt_session;
561 	pfh = (WT_FILE_HANDLE_POSIX *)file_handle;
562 
563 	/* Assert direct I/O is aligned and a multiple of the alignment. */
564 	WT_ASSERT(session,
565 	    !pfh->direct_io ||
566 	    S2C(session)->buffer_alignment == 0 ||
567 	    (!((uintptr_t)buf &
568 	    (uintptr_t)(S2C(session)->buffer_alignment - 1)) &&
569 	    len >= S2C(session)->buffer_alignment &&
570 	    len % S2C(session)->buffer_alignment == 0));
571 
572 	/* Break writes larger than 1GB into 1GB chunks. */
573 	for (addr = buf; len > 0; addr += nw, len -= (size_t)nw, offset += nw) {
574 		chunk = WT_MIN(len, WT_GIGABYTE);
575 		if ((nw = pwrite(pfh->fd, addr, chunk, offset)) < 0)
576 			WT_RET_MSG(session, __wt_errno(),
577 			    "%s: handle-write: pwrite: failed to write %"
578 			    WT_SIZET_FMT " bytes at offset %" PRIuMAX,
579 			    file_handle->name, chunk, (uintmax_t)offset);
580 	}
581 	return (0);
582 }
583 
584 /*
585  * __posix_open_file_cloexec --
586  *	Prevent child access to file handles.
587  */
588 static inline int
__posix_open_file_cloexec(WT_SESSION_IMPL * session,int fd,const char * name)589 __posix_open_file_cloexec(WT_SESSION_IMPL *session, int fd, const char *name)
590 {
591 #if defined(FD_CLOEXEC) && !defined(O_CLOEXEC)
592 	int f;
593 
594 	/*
595 	 * Security:
596 	 * The application may spawn a new process, and we don't want another
597 	 * process to have access to our file handles. There's an obvious race
598 	 * between the open and this call, prefer the flag to open if available.
599 	 */
600 	if ((f = fcntl(fd, F_GETFD)) == -1 ||
601 	    fcntl(fd, F_SETFD, f | FD_CLOEXEC) == -1)
602 		WT_RET_MSG(session, __wt_errno(),
603 		    "%s: handle-open: fcntl(FD_CLOEXEC)", name);
604 	return (0);
605 #else
606 	WT_UNUSED(session);
607 	WT_UNUSED(fd);
608 	WT_UNUSED(name);
609 	return (0);
610 #endif
611 }
612 
/*
 * __posix_open_file --
 *	Open a file or directory and build the POSIX file handle for it.
 *
 * name: path to open; a copy is stored in the returned handle.
 * file_type: WT_FS_OPEN_FILE_TYPE_DIRECTORY opens the path read-only and
 *	skips all other flag processing; the DATA and LOG types enable the
 *	type-specific open flags below.
 * flags: WT_FS_OPEN_* flags selecting read-only, create, exclusive, direct
 *	I/O, durable and access-pattern behavior.
 * file_handlep: set to NULL on entry and to the new handle on success.
 *
 * Returns 0 on success. On failure the partially built handle is released
 * through __posix_file_close and the error is returned.
 */
static int
__posix_open_file(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session,
    const char *name, WT_FS_OPEN_FILE_TYPE file_type, uint32_t flags,
    WT_FILE_HANDLE **file_handlep)
{
	WT_CONNECTION_IMPL *conn;
	WT_DECL_RET;
	WT_FILE_HANDLE *file_handle;
	WT_FILE_HANDLE_POSIX *pfh;
	WT_SESSION_IMPL *session;
	mode_t mode;
	int advise_flag, f;

	WT_UNUSED(file_system);

	*file_handlep = NULL;

	session = (WT_SESSION_IMPL *)wt_session;
	conn = S2C(session);

	WT_RET(__wt_calloc_one(session, &pfh));

	/*
	 * Set up error handling: a descriptor of -1 tells the close function
	 * called on the error path that there is nothing to close.
	 */
	pfh->fd = -1;

	/* Directories are opened read-only and take none of the flags below. */
	if (file_type == WT_FS_OPEN_FILE_TYPE_DIRECTORY) {
		f = O_RDONLY;
#ifdef O_CLOEXEC
		/*
		 * Security:
		 * The application may spawn a new process, and we don't want
		 * another process to have access to our file handles.
		 */
		f |= O_CLOEXEC;
#endif
		WT_SYSCALL_RETRY((
		    (pfh->fd = open(name, f, 0444)) == -1 ? -1 : 0), ret);
		if (ret != 0)
			WT_ERR_MSG(session, ret,
			    "%s: handle-open: open-directory", name);
		WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name));
		goto directory_open;
	}

	/* Translate the WT_FS_OPEN_* flags into open(2) flags and a mode. */
	f = LF_ISSET(WT_FS_OPEN_READONLY) ? O_RDONLY : O_RDWR;
	if (LF_ISSET(WT_FS_OPEN_CREATE)) {
		f |= O_CREAT;
		if (LF_ISSET(WT_FS_OPEN_EXCLUSIVE))
			f |= O_EXCL;
		mode = 0666;
	} else
		mode = 0;

#ifdef O_BINARY
	/* Windows clones: we always want to treat the file as a binary. */
	f |= O_BINARY;
#endif
#ifdef O_CLOEXEC
	/*
	 * Security:
	 * The application may spawn a new process, and we don't want another
	 * process to have access to our file handles.
	 */
	f |= O_CLOEXEC;
#endif
#ifdef O_DIRECT
	/* Direct I/O. */
	if (LF_ISSET(WT_FS_OPEN_DIRECTIO)) {
		f |= O_DIRECT;
		pfh->direct_io = true;
	} else
		pfh->direct_io = false;
#endif
#ifdef O_NOATIME
	/* Avoid updating metadata for read-only workloads. */
	if (file_type == WT_FS_OPEN_FILE_TYPE_DATA)
		f |= O_NOATIME;
#endif

	/*
	 * Log files opened while the connection is configured for WT_LOG_DSYNC
	 * use O_DSYNC, or O_SYNC if that's all there is; with neither the open
	 * fails with ENOTSUP.
	 */
	if (file_type == WT_FS_OPEN_FILE_TYPE_LOG &&
	    FLD_ISSET(conn->txn_logsync, WT_LOG_DSYNC)) {
#ifdef O_DSYNC
		f |= O_DSYNC;
#elif defined(O_SYNC)
		f |= O_SYNC;
#else
		WT_ERR_MSG(session, ENOTSUP,
		    "unsupported log sync mode configured");
#endif
	}

	/* Create/Open the file. */
	WT_SYSCALL_RETRY(((pfh->fd = open(name, f, mode)) == -1 ? -1 : 0), ret);
	if (ret != 0)
		WT_ERR_MSG(session, ret,
		    pfh->direct_io ?
		    "%s: handle-open: open: failed with direct I/O configured, "
		    "some filesystem types do not support direct I/O" :
		    "%s: handle-open: open", name);

#ifdef __linux__
	/*
	 * Durability: some filesystems require a directory sync to be confident
	 * the file will appear.
	 */
	if (LF_ISSET(WT_FS_OPEN_DURABLE))
		WT_ERR(__posix_directory_sync(session, name));
#endif

	WT_ERR(__posix_open_file_cloexec(session, pfh->fd, name));

#if defined(HAVE_POSIX_FADVISE)
	/*
	 * If the user set an access pattern hint, call fadvise now.
	 * Ignore fadvise when doing direct I/O, the kernel cache isn't
	 * interesting.
	 *
	 * If both hints are set, the sequential hint is the one used because
	 * it is assigned last.
	 */
	if (!pfh->direct_io && file_type == WT_FS_OPEN_FILE_TYPE_DATA &&
	    LF_ISSET(WT_FS_OPEN_ACCESS_RAND | WT_FS_OPEN_ACCESS_SEQ)) {
		advise_flag = 0;
		if (LF_ISSET(WT_FS_OPEN_ACCESS_RAND))
			advise_flag = POSIX_FADV_RANDOM;
		if (LF_ISSET(WT_FS_OPEN_ACCESS_SEQ))
			advise_flag = POSIX_FADV_SEQUENTIAL;
		WT_SYSCALL(posix_fadvise(pfh->fd, 0, 0, advise_flag), ret);
		if (ret != 0)
			WT_ERR_MSG(session, ret,
			    "%s: handle-open: posix_fadvise", name);
	}
#else
	WT_UNUSED(advise_flag);
#endif

directory_open:
	/* Initialize public information. */
	file_handle = (WT_FILE_HANDLE *)pfh;
	WT_ERR(__wt_strdup(session, name, &file_handle->name));

	/*
	 * Fill in the handle's method table; methods that depend on optional
	 * platform features are set only when the feature is available.
	 */
	file_handle->close = __posix_file_close;
#if defined(HAVE_POSIX_FADVISE)
	/*
	 * Ignore fadvise when doing direct I/O, the kernel cache isn't
	 * interesting.
	 */
	if (!pfh->direct_io)
		file_handle->fh_advise = __posix_file_advise;
#endif
	file_handle->fh_extend = __wt_posix_file_extend;
	file_handle->fh_lock = __posix_file_lock;
#ifdef WORDS_BIGENDIAN
	/*
	 * The underlying objects are little-endian, mapping objects isn't
	 * currently supported on big-endian systems.
	 */
#else
	file_handle->fh_map = __wt_posix_map;
#ifdef HAVE_POSIX_MADVISE
	file_handle->fh_map_discard = __wt_posix_map_discard;
	file_handle->fh_map_preload = __wt_posix_map_preload;
#endif
	file_handle->fh_unmap = __wt_posix_unmap;
#endif
	file_handle->fh_read = __posix_file_read;
	file_handle->fh_size = __posix_file_size;
	file_handle->fh_sync = __posix_file_sync;
#ifdef HAVE_SYNC_FILE_RANGE
	file_handle->fh_sync_nowait = __posix_file_sync_nowait;
#endif
#ifdef HAVE_FTRUNCATE
	file_handle->fh_truncate = __posix_file_truncate;
#endif
	file_handle->fh_write = __posix_file_write;

	*file_handlep = file_handle;

	return (0);

	/*
	 * Error path: the close function closes the descriptor if one was
	 * opened and frees the name and the handle; the original error is kept.
	 */
err:	WT_TRET(__posix_file_close((WT_FILE_HANDLE *)pfh, wt_session));
	return (ret);
}
797 
798 /*
799  * __posix_terminate --
800  *	Terminate a POSIX configuration.
801  */
802 static int
__posix_terminate(WT_FILE_SYSTEM * file_system,WT_SESSION * wt_session)803 __posix_terminate(WT_FILE_SYSTEM *file_system, WT_SESSION *wt_session)
804 {
805 	WT_SESSION_IMPL *session;
806 
807 	session = (WT_SESSION_IMPL *)wt_session;
808 
809 	__wt_free(session, file_system);
810 	return (0);
811 }
812 
813 /*
814  * __wt_os_posix --
815  *	Initialize a POSIX configuration.
816  */
817 int
__wt_os_posix(WT_SESSION_IMPL * session)818 __wt_os_posix(WT_SESSION_IMPL *session)
819 {
820 	WT_CONNECTION_IMPL *conn;
821 	WT_FILE_SYSTEM *file_system;
822 
823 	conn = S2C(session);
824 
825 	WT_RET(__wt_calloc_one(session, &file_system));
826 
827 	/* Initialize the POSIX jump table. */
828 	file_system->fs_directory_list = __wt_posix_directory_list;
829 	file_system->fs_directory_list_single =
830 	    __wt_posix_directory_list_single;
831 	file_system->fs_directory_list_free = __wt_posix_directory_list_free;
832 	file_system->fs_exist = __posix_fs_exist;
833 	file_system->fs_open_file = __posix_open_file;
834 	file_system->fs_remove = __posix_fs_remove;
835 	file_system->fs_rename = __posix_fs_rename;
836 	file_system->fs_size = __posix_fs_size;
837 	file_system->terminate = __posix_terminate;
838 
839 	/* Switch it into place. */
840 	conn->file_system = file_system;
841 
842 	return (0);
843 }
844