1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors
9 #include <dirent.h>
10 #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
11 #include <dlfcn.h>
12 #endif
13 #include <errno.h>
14 #include <fcntl.h>
15
16 #if defined(OS_LINUX)
17 #include <linux/fs.h>
18 #endif
19 #include <pthread.h>
20 #include <signal.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/ioctl.h>
25 #include <sys/mman.h>
26 #include <sys/stat.h>
27 #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
28 #include <sys/statfs.h>
29 #include <sys/syscall.h>
30 #include <sys/sysmacros.h>
31 #endif
32 #include <sys/statvfs.h>
33 #include <sys/time.h>
34 #include <sys/types.h>
35 #include <time.h>
36 #include <algorithm>
37 // Get nano time includes
38 #if defined(OS_LINUX) || defined(OS_FREEBSD)
39 #elif defined(__MACH__)
40 #include <Availability.h>
41 #include <mach/clock.h>
42 #include <mach/mach.h>
43 #else
44 #include <chrono>
45 #endif
46 #include <deque>
47 #include <set>
48 #include <vector>
49
50 #include "env/io_posix.h"
51 #include "logging/logging.h"
52 #include "logging/posix_logger.h"
53 #include "monitoring/iostats_context_imp.h"
54 #include "monitoring/thread_status_updater.h"
55 #include "port/port.h"
56 #include "rocksdb/options.h"
57 #include "rocksdb/slice.h"
58 #include "test_util/sync_point.h"
59 #include "util/coding.h"
60 #include "util/compression_context_cache.h"
61 #include "util/random.h"
62 #include "util/string_util.h"
63 #include "util/thread_local.h"
64 #include "util/threadpool_imp.h"
65
// Filesystem magic numbers compared against statfs::f_type in this file.
// Each one is defined here only if no earlier header already supplied it.
#ifndef TMPFS_MAGIC
#define TMPFS_MAGIC 0x01021994
#endif
#ifndef XFS_SUPER_MAGIC
#define XFS_SUPER_MAGIC 0x58465342
#endif
#ifndef EXT4_SUPER_MAGIC
#define EXT4_SUPER_MAGIC 0xEF53
#endif
75
76 namespace ROCKSDB_NAMESPACE {
77
78 namespace {
79
// Permission bits passed to open() when creating DB files: 0644 when
// non-owner access is allowed, 0600 (owner only) otherwise.
inline mode_t GetDBFileMode(bool allow_non_owner_access) {
  if (allow_non_owner_access) {
    return 0644;
  }
  return 0600;
}
83
// Set of pathnames that are currently locked. LockFile() inserts a name
// before opening the file and UnlockFile() erases it again.
static std::set<std::string> lockedFiles;
// Held by LockFile() and UnlockFile() around their accesses to lockedFiles.
static port::Mutex mutex_lockedFiles;
87
// Acquires (lock == true) or releases (lock == false) a write lock covering
// the whole file behind `fd`, using the non-blocking F_SETLK command.
// Returns the raw fcntl() result; errno is cleared before the call so that
// on failure it describes this request.
static int LockOrUnlock(int fd, bool lock) {
  errno = 0;

  struct flock request;
  memset(&request, 0, sizeof(request));
  request.l_whence = SEEK_SET;
  request.l_start = 0;
  request.l_len = 0;  // A zero length means "through the end of the file".
  if (lock) {
    request.l_type = F_WRLCK;
  } else {
    request.l_type = F_UNLCK;
  }

  return fcntl(fd, F_SETLK, &request);
}
100
101 class PosixFileLock : public FileLock {
102 public:
103 int fd_;
104 std::string filename;
105 };
106
cloexec_flags(int flags,const EnvOptions * options)107 int cloexec_flags(int flags, const EnvOptions* options) {
108 // If the system supports opening the file with cloexec enabled,
109 // do so, as this avoids a race condition if a db is opened around
110 // the same time that a child process is forked
111 #ifdef O_CLOEXEC
112 if (options == nullptr || options->set_fd_cloexec) {
113 flags |= O_CLOEXEC;
114 }
115 #endif
116 return flags;
117 }
118
119 class PosixFileSystem : public FileSystem {
120 public:
121 PosixFileSystem();
122
Name() const123 const char* Name() const override { return "Posix File System"; }
124
// Nothing to release explicitly; members clean up after themselves.
~PosixFileSystem() override = default;
126
SetFD_CLOEXEC(int fd,const EnvOptions * options)127 void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
128 if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
129 fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
130 }
131 }
132
NewSequentialFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSSequentialFile> * result,IODebugContext *)133 IOStatus NewSequentialFile(const std::string& fname,
134 const FileOptions& options,
135 std::unique_ptr<FSSequentialFile>* result,
136 IODebugContext* /*dbg*/) override {
137 result->reset();
138 int fd = -1;
139 int flags = cloexec_flags(O_RDONLY, &options);
140 FILE* file = nullptr;
141
142 if (options.use_direct_reads && !options.use_mmap_reads) {
143 #ifdef ROCKSDB_LITE
144 return IOStatus::IOError(fname,
145 "Direct I/O not supported in RocksDB lite");
146 #endif // !ROCKSDB_LITE
147 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
148 flags |= O_DIRECT;
149 #endif
150 }
151
152 do {
153 IOSTATS_TIMER_GUARD(open_nanos);
154 fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
155 } while (fd < 0 && errno == EINTR);
156 if (fd < 0) {
157 return IOError("While opening a file for sequentially reading", fname,
158 errno);
159 }
160
161 SetFD_CLOEXEC(fd, &options);
162
163 if (options.use_direct_reads && !options.use_mmap_reads) {
164 #ifdef OS_MACOSX
165 if (fcntl(fd, F_NOCACHE, 1) == -1) {
166 close(fd);
167 return IOError("While fcntl NoCache", fname, errno);
168 }
169 #endif
170 } else {
171 do {
172 IOSTATS_TIMER_GUARD(open_nanos);
173 file = fdopen(fd, "r");
174 } while (file == nullptr && errno == EINTR);
175 if (file == nullptr) {
176 close(fd);
177 return IOError("While opening file for sequentially read", fname,
178 errno);
179 }
180 }
181 result->reset(new PosixSequentialFile(fname, file, fd, options));
182 return IOStatus::OK();
183 }
184
NewRandomAccessFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSRandomAccessFile> * result,IODebugContext *)185 IOStatus NewRandomAccessFile(const std::string& fname,
186 const FileOptions& options,
187 std::unique_ptr<FSRandomAccessFile>* result,
188 IODebugContext* /*dbg*/) override {
189 result->reset();
190 IOStatus s;
191 int fd;
192 int flags = cloexec_flags(O_RDONLY, &options);
193
194 if (options.use_direct_reads && !options.use_mmap_reads) {
195 #ifdef ROCKSDB_LITE
196 return IOStatus::IOError(fname,
197 "Direct I/O not supported in RocksDB lite");
198 #endif // !ROCKSDB_LITE
199 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
200 flags |= O_DIRECT;
201 TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags);
202 #endif
203 }
204
205 do {
206 IOSTATS_TIMER_GUARD(open_nanos);
207 fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
208 } while (fd < 0 && errno == EINTR);
209 if (fd < 0) {
210 return IOError("While open a file for random read", fname, errno);
211 }
212 SetFD_CLOEXEC(fd, &options);
213
214 if (options.use_mmap_reads && sizeof(void*) >= 8) {
215 // Use of mmap for random reads has been removed because it
216 // kills performance when storage is fast.
217 // Use mmap when virtual address-space is plentiful.
218 uint64_t size;
219 IOOptions opts;
220 s = GetFileSize(fname, opts, &size, nullptr);
221 if (s.ok()) {
222 void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
223 if (base != MAP_FAILED) {
224 result->reset(
225 new PosixMmapReadableFile(fd, fname, base, size, options));
226 } else {
227 s = IOError("while mmap file for read", fname, errno);
228 close(fd);
229 }
230 }
231 } else {
232 if (options.use_direct_reads && !options.use_mmap_reads) {
233 #ifdef OS_MACOSX
234 if (fcntl(fd, F_NOCACHE, 1) == -1) {
235 close(fd);
236 return IOError("while fcntl NoCache", fname, errno);
237 }
238 #endif
239 }
240 result->reset(new PosixRandomAccessFile(fname, fd, options
241 #if defined(ROCKSDB_IOURING_PRESENT)
242 ,
243 thread_local_io_urings_.get()
244 #endif
245 ));
246 }
247 return s;
248 }
249
OpenWritableFile(const std::string & fname,const FileOptions & options,bool reopen,std::unique_ptr<FSWritableFile> * result,IODebugContext *)250 virtual IOStatus OpenWritableFile(const std::string& fname,
251 const FileOptions& options,
252 bool reopen,
253 std::unique_ptr<FSWritableFile>* result,
254 IODebugContext* /*dbg*/) {
255 result->reset();
256 IOStatus s;
257 int fd = -1;
258 int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC);
259 // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
260 if (options.use_direct_writes && !options.use_mmap_writes) {
261 // Note: we should avoid O_APPEND here due to ta the following bug:
262 // POSIX requires that opening a file with the O_APPEND flag should
263 // have no affect on the location at which pwrite() writes data.
264 // However, on Linux, if a file is opened with O_APPEND, pwrite()
265 // appends data to the end of the file, regardless of the value of
266 // offset.
267 // More info here: https://linux.die.net/man/2/pwrite
268 #ifdef ROCKSDB_LITE
269 return IOStatus::IOError(fname,
270 "Direct I/O not supported in RocksDB lite");
271 #endif // ROCKSDB_LITE
272 flags |= O_WRONLY;
273 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
274 flags |= O_DIRECT;
275 #endif
276 TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
277 } else if (options.use_mmap_writes) {
278 // non-direct I/O
279 flags |= O_RDWR;
280 } else {
281 flags |= O_WRONLY;
282 }
283
284 flags = cloexec_flags(flags, &options);
285
286 do {
287 IOSTATS_TIMER_GUARD(open_nanos);
288 fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
289 } while (fd < 0 && errno == EINTR);
290
291 if (fd < 0) {
292 s = IOError("While open a file for appending", fname, errno);
293 return s;
294 }
295 SetFD_CLOEXEC(fd, &options);
296
297 if (options.use_mmap_writes) {
298 if (!checkedDiskForMmap_) {
299 // this will be executed once in the program's lifetime.
300 // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
301 if (!SupportsFastAllocate(fname)) {
302 forceMmapOff_ = true;
303 }
304 checkedDiskForMmap_ = true;
305 }
306 }
307 if (options.use_mmap_writes && !forceMmapOff_) {
308 result->reset(new PosixMmapFile(fname, fd, page_size_, options));
309 } else if (options.use_direct_writes && !options.use_mmap_writes) {
310 #ifdef OS_MACOSX
311 if (fcntl(fd, F_NOCACHE, 1) == -1) {
312 close(fd);
313 s = IOError("While fcntl NoCache an opened file for appending", fname,
314 errno);
315 return s;
316 }
317 #elif defined(OS_SOLARIS)
318 if (directio(fd, DIRECTIO_ON) == -1) {
319 if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON
320 close(fd);
321 s = IOError("While calling directio()", fname, errno);
322 return s;
323 }
324 }
325 #endif
326 result->reset(new PosixWritableFile(fname, fd, options));
327 } else {
328 // disable mmap writes
329 EnvOptions no_mmap_writes_options = options;
330 no_mmap_writes_options.use_mmap_writes = false;
331 result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
332 }
333 return s;
334 }
335
NewWritableFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSWritableFile> * result,IODebugContext * dbg)336 IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
337 std::unique_ptr<FSWritableFile>* result,
338 IODebugContext* dbg) override {
339 return OpenWritableFile(fname, options, false, result, dbg);
340 }
341
ReopenWritableFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSWritableFile> * result,IODebugContext * dbg)342 IOStatus ReopenWritableFile(const std::string& fname,
343 const FileOptions& options,
344 std::unique_ptr<FSWritableFile>* result,
345 IODebugContext* dbg) override {
346 return OpenWritableFile(fname, options, true, result, dbg);
347 }
348
ReuseWritableFile(const std::string & fname,const std::string & old_fname,const FileOptions & options,std::unique_ptr<FSWritableFile> * result,IODebugContext *)349 IOStatus ReuseWritableFile(const std::string& fname,
350 const std::string& old_fname,
351 const FileOptions& options,
352 std::unique_ptr<FSWritableFile>* result,
353 IODebugContext* /*dbg*/) override {
354 result->reset();
355 IOStatus s;
356 int fd = -1;
357
358 int flags = 0;
359 // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
360 if (options.use_direct_writes && !options.use_mmap_writes) {
361 #ifdef ROCKSDB_LITE
362 return IOStatus::IOError(fname,
363 "Direct I/O not supported in RocksDB lite");
364 #endif // !ROCKSDB_LITE
365 flags |= O_WRONLY;
366 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
367 flags |= O_DIRECT;
368 #endif
369 TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
370 } else if (options.use_mmap_writes) {
371 // mmap needs O_RDWR mode
372 flags |= O_RDWR;
373 } else {
374 flags |= O_WRONLY;
375 }
376
377 flags = cloexec_flags(flags, &options);
378
379 do {
380 IOSTATS_TIMER_GUARD(open_nanos);
381 fd = open(old_fname.c_str(), flags,
382 GetDBFileMode(allow_non_owner_access_));
383 } while (fd < 0 && errno == EINTR);
384 if (fd < 0) {
385 s = IOError("while reopen file for write", fname, errno);
386 return s;
387 }
388
389 SetFD_CLOEXEC(fd, &options);
390 // rename into place
391 if (rename(old_fname.c_str(), fname.c_str()) != 0) {
392 s = IOError("while rename file to " + fname, old_fname, errno);
393 close(fd);
394 return s;
395 }
396
397 if (options.use_mmap_writes) {
398 if (!checkedDiskForMmap_) {
399 // this will be executed once in the program's lifetime.
400 // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
401 if (!SupportsFastAllocate(fname)) {
402 forceMmapOff_ = true;
403 }
404 checkedDiskForMmap_ = true;
405 }
406 }
407 if (options.use_mmap_writes && !forceMmapOff_) {
408 result->reset(new PosixMmapFile(fname, fd, page_size_, options));
409 } else if (options.use_direct_writes && !options.use_mmap_writes) {
410 #ifdef OS_MACOSX
411 if (fcntl(fd, F_NOCACHE, 1) == -1) {
412 close(fd);
413 s = IOError("while fcntl NoCache for reopened file for append", fname,
414 errno);
415 return s;
416 }
417 #elif defined(OS_SOLARIS)
418 if (directio(fd, DIRECTIO_ON) == -1) {
419 if (errno != ENOTTY) { // ZFS filesystems don't support DIRECTIO_ON
420 close(fd);
421 s = IOError("while calling directio()", fname, errno);
422 return s;
423 }
424 }
425 #endif
426 result->reset(new PosixWritableFile(fname, fd, options));
427 } else {
428 // disable mmap writes
429 FileOptions no_mmap_writes_options = options;
430 no_mmap_writes_options.use_mmap_writes = false;
431 result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
432 }
433 return s;
434 }
435
NewRandomRWFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSRandomRWFile> * result,IODebugContext *)436 IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
437 std::unique_ptr<FSRandomRWFile>* result,
438 IODebugContext* /*dbg*/) override {
439 int fd = -1;
440 int flags = cloexec_flags(O_RDWR, &options);
441
442 while (fd < 0) {
443 IOSTATS_TIMER_GUARD(open_nanos);
444
445 fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
446 if (fd < 0) {
447 // Error while opening the file
448 if (errno == EINTR) {
449 continue;
450 }
451 return IOError("While open file for random read/write", fname, errno);
452 }
453 }
454
455 SetFD_CLOEXEC(fd, &options);
456 result->reset(new PosixRandomRWFile(fname, fd, options));
457 return IOStatus::OK();
458 }
459
NewMemoryMappedFileBuffer(const std::string & fname,std::unique_ptr<MemoryMappedFileBuffer> * result)460 IOStatus NewMemoryMappedFileBuffer(
461 const std::string& fname,
462 std::unique_ptr<MemoryMappedFileBuffer>* result) override {
463 int fd = -1;
464 IOStatus status;
465 int flags = cloexec_flags(O_RDWR, nullptr);
466
467 while (fd < 0) {
468 IOSTATS_TIMER_GUARD(open_nanos);
469 fd = open(fname.c_str(), flags, 0644);
470 if (fd < 0) {
471 // Error while opening the file
472 if (errno == EINTR) {
473 continue;
474 }
475 status =
476 IOError("While open file for raw mmap buffer access", fname, errno);
477 break;
478 }
479 }
480 uint64_t size;
481 if (status.ok()) {
482 IOOptions opts;
483 status = GetFileSize(fname, opts, &size, nullptr);
484 }
485 void* base = nullptr;
486 if (status.ok()) {
487 base = mmap(nullptr, static_cast<size_t>(size), PROT_READ | PROT_WRITE,
488 MAP_SHARED, fd, 0);
489 if (base == MAP_FAILED) {
490 status = IOError("while mmap file for read", fname, errno);
491 }
492 }
493 if (status.ok()) {
494 result->reset(
495 new PosixMemoryMappedFileBuffer(base, static_cast<size_t>(size)));
496 }
497 if (fd >= 0) {
498 // don't need to keep it open after mmap has been called
499 close(fd);
500 }
501 return status;
502 }
503
NewDirectory(const std::string & name,const IOOptions &,std::unique_ptr<FSDirectory> * result,IODebugContext *)504 IOStatus NewDirectory(const std::string& name, const IOOptions& /*opts*/,
505 std::unique_ptr<FSDirectory>* result,
506 IODebugContext* /*dbg*/) override {
507 result->reset();
508 int fd;
509 int flags = cloexec_flags(0, nullptr);
510 {
511 IOSTATS_TIMER_GUARD(open_nanos);
512 fd = open(name.c_str(), flags);
513 }
514 if (fd < 0) {
515 return IOError("While open directory", name, errno);
516 } else {
517 result->reset(new PosixDirectory(fd));
518 }
519 return IOStatus::OK();
520 }
521
NewLogger(const std::string &,const IOOptions &,std::shared_ptr<ROCKSDB_NAMESPACE::Logger> *,IODebugContext *)522 IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*opts*/,
523 std::shared_ptr<ROCKSDB_NAMESPACE::Logger>* /*ptr*/,
524 IODebugContext* /*dbg*/) override {
525 return IOStatus::NotSupported();
526 }
527
FileExists(const std::string & fname,const IOOptions &,IODebugContext *)528 IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/,
529 IODebugContext* /*dbg*/) override {
530 int result = access(fname.c_str(), F_OK);
531
532 if (result == 0) {
533 return IOStatus::OK();
534 }
535
536 int err = errno;
537 switch (err) {
538 case EACCES:
539 case ELOOP:
540 case ENAMETOOLONG:
541 case ENOENT:
542 case ENOTDIR:
543 return IOStatus::NotFound();
544 default:
545 assert(err == EIO || err == ENOMEM);
546 return IOStatus::IOError("Unexpected error(" + ToString(err) +
547 ") accessing file `" + fname + "' ");
548 }
549 }
550
GetChildren(const std::string & dir,const IOOptions &,std::vector<std::string> * result,IODebugContext *)551 IOStatus GetChildren(const std::string& dir, const IOOptions& /*opts*/,
552 std::vector<std::string>* result,
553 IODebugContext* /*dbg*/) override {
554 result->clear();
555 DIR* d = opendir(dir.c_str());
556 if (d == nullptr) {
557 switch (errno) {
558 case EACCES:
559 case ENOENT:
560 case ENOTDIR:
561 return IOStatus::NotFound();
562 default:
563 return IOError("While opendir", dir, errno);
564 }
565 }
566 struct dirent* entry;
567 while ((entry = readdir(d)) != nullptr) {
568 result->push_back(entry->d_name);
569 }
570 closedir(d);
571 return IOStatus::OK();
572 }
573
DeleteFile(const std::string & fname,const IOOptions &,IODebugContext *)574 IOStatus DeleteFile(const std::string& fname, const IOOptions& /*opts*/,
575 IODebugContext* /*dbg*/) override {
576 IOStatus result;
577 if (unlink(fname.c_str()) != 0) {
578 result = IOError("while unlink() file", fname, errno);
579 }
580 return result;
581 }
582
CreateDir(const std::string & name,const IOOptions &,IODebugContext *)583 IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/,
584 IODebugContext* /*dbg*/) override {
585 IOStatus result;
586 if (mkdir(name.c_str(), 0755) != 0) {
587 result = IOError("While mkdir", name, errno);
588 }
589 return result;
590 }
591
CreateDirIfMissing(const std::string & name,const IOOptions &,IODebugContext *)592 IOStatus CreateDirIfMissing(const std::string& name,
593 const IOOptions& /*opts*/,
594 IODebugContext* /*dbg*/) override {
595 IOStatus result;
596 if (mkdir(name.c_str(), 0755) != 0) {
597 if (errno != EEXIST) {
598 result = IOError("While mkdir if missing", name, errno);
599 } else if (!DirExists(name)) { // Check that name is actually a
600 // directory.
601 // Message is taken from mkdir
602 result =
603 IOStatus::IOError("`" + name + "' exists but is not a directory");
604 }
605 }
606 return result;
607 }
608
DeleteDir(const std::string & name,const IOOptions &,IODebugContext *)609 IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/,
610 IODebugContext* /*dbg*/) override {
611 IOStatus result;
612 if (rmdir(name.c_str()) != 0) {
613 result = IOError("file rmdir", name, errno);
614 }
615 return result;
616 }
617
GetFileSize(const std::string & fname,const IOOptions &,uint64_t * size,IODebugContext *)618 IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
619 uint64_t* size, IODebugContext* /*dbg*/) override {
620 IOStatus s;
621 struct stat sbuf;
622 if (stat(fname.c_str(), &sbuf) != 0) {
623 *size = 0;
624 s = IOError("while stat a file for size", fname, errno);
625 } else {
626 *size = sbuf.st_size;
627 }
628 return s;
629 }
630
GetFileModificationTime(const std::string & fname,const IOOptions &,uint64_t * file_mtime,IODebugContext *)631 IOStatus GetFileModificationTime(const std::string& fname,
632 const IOOptions& /*opts*/,
633 uint64_t* file_mtime,
634 IODebugContext* /*dbg*/) override {
635 struct stat s;
636 if (stat(fname.c_str(), &s) != 0) {
637 return IOError("while stat a file for modification time", fname, errno);
638 }
639 *file_mtime = static_cast<uint64_t>(s.st_mtime);
640 return IOStatus::OK();
641 }
642
RenameFile(const std::string & src,const std::string & target,const IOOptions &,IODebugContext *)643 IOStatus RenameFile(const std::string& src, const std::string& target,
644 const IOOptions& /*opts*/,
645 IODebugContext* /*dbg*/) override {
646 IOStatus result;
647 if (rename(src.c_str(), target.c_str()) != 0) {
648 result = IOError("While renaming a file to " + target, src, errno);
649 }
650 return result;
651 }
652
LinkFile(const std::string & src,const std::string & target,const IOOptions &,IODebugContext *)653 IOStatus LinkFile(const std::string& src, const std::string& target,
654 const IOOptions& /*opts*/,
655 IODebugContext* /*dbg*/) override {
656 IOStatus result;
657 if (link(src.c_str(), target.c_str()) != 0) {
658 if (errno == EXDEV) {
659 return IOStatus::NotSupported("No cross FS links allowed");
660 }
661 result = IOError("while link file to " + target, src, errno);
662 }
663 return result;
664 }
665
NumFileLinks(const std::string & fname,const IOOptions &,uint64_t * count,IODebugContext *)666 IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/,
667 uint64_t* count, IODebugContext* /*dbg*/) override {
668 struct stat s;
669 if (stat(fname.c_str(), &s) != 0) {
670 return IOError("while stat a file for num file links", fname, errno);
671 }
672 *count = static_cast<uint64_t>(s.st_nlink);
673 return IOStatus::OK();
674 }
675
AreFilesSame(const std::string & first,const std::string & second,const IOOptions &,bool * res,IODebugContext *)676 IOStatus AreFilesSame(const std::string& first, const std::string& second,
677 const IOOptions& /*opts*/, bool* res,
678 IODebugContext* /*dbg*/) override {
679 struct stat statbuf[2];
680 if (stat(first.c_str(), &statbuf[0]) != 0) {
681 return IOError("stat file", first, errno);
682 }
683 if (stat(second.c_str(), &statbuf[1]) != 0) {
684 return IOError("stat file", second, errno);
685 }
686
687 if (major(statbuf[0].st_dev) != major(statbuf[1].st_dev) ||
688 minor(statbuf[0].st_dev) != minor(statbuf[1].st_dev) ||
689 statbuf[0].st_ino != statbuf[1].st_ino) {
690 *res = false;
691 } else {
692 *res = true;
693 }
694 return IOStatus::OK();
695 }
696
LockFile(const std::string & fname,const IOOptions &,FileLock ** lock,IODebugContext *)697 IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/,
698 FileLock** lock, IODebugContext* /*dbg*/) override {
699 *lock = nullptr;
700 IOStatus result;
701
702 mutex_lockedFiles.Lock();
703 // If it already exists in the lockedFiles set, then it is already locked,
704 // and fail this lock attempt. Otherwise, insert it into lockedFiles.
705 // This check is needed because fcntl() does not detect lock conflict
706 // if the fcntl is issued by the same thread that earlier acquired
707 // this lock.
708 // We must do this check *before* opening the file:
709 // Otherwise, we will open a new file descriptor. Locks are associated with
710 // a process, not a file descriptor and when *any* file descriptor is
711 // closed, all locks the process holds for that *file* are released
712 if (lockedFiles.insert(fname).second == false) {
713 mutex_lockedFiles.Unlock();
714 errno = ENOLCK;
715 return IOError("lock ", fname, errno);
716 }
717
718 int fd;
719 int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr);
720
721 {
722 IOSTATS_TIMER_GUARD(open_nanos);
723 fd = open(fname.c_str(), flags, 0644);
724 }
725 if (fd < 0) {
726 result = IOError("while open a file for lock", fname, errno);
727 } else if (LockOrUnlock(fd, true) == -1) {
728 // if there is an error in locking, then remove the pathname from
729 // lockedfiles
730 lockedFiles.erase(fname);
731 result = IOError("While lock file", fname, errno);
732 close(fd);
733 } else {
734 SetFD_CLOEXEC(fd, nullptr);
735 PosixFileLock* my_lock = new PosixFileLock;
736 my_lock->fd_ = fd;
737 my_lock->filename = fname;
738 *lock = my_lock;
739 }
740
741 mutex_lockedFiles.Unlock();
742 return result;
743 }
744
UnlockFile(FileLock * lock,const IOOptions &,IODebugContext *)745 IOStatus UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
746 IODebugContext* /*dbg*/) override {
747 PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
748 IOStatus result;
749 mutex_lockedFiles.Lock();
750 // If we are unlocking, then verify that we had locked it earlier,
751 // it should already exist in lockedFiles. Remove it from lockedFiles.
752 if (lockedFiles.erase(my_lock->filename) != 1) {
753 errno = ENOLCK;
754 result = IOError("unlock", my_lock->filename, errno);
755 } else if (LockOrUnlock(my_lock->fd_, false) == -1) {
756 result = IOError("unlock", my_lock->filename, errno);
757 }
758 close(my_lock->fd_);
759 delete my_lock;
760 mutex_lockedFiles.Unlock();
761 return result;
762 }
763
GetAbsolutePath(const std::string & db_path,const IOOptions &,std::string * output_path,IODebugContext *)764 IOStatus GetAbsolutePath(const std::string& db_path,
765 const IOOptions& /*opts*/, std::string* output_path,
766 IODebugContext* /*dbg*/) override {
767 if (!db_path.empty() && db_path[0] == '/') {
768 *output_path = db_path;
769 return IOStatus::OK();
770 }
771
772 char the_path[256];
773 char* ret = getcwd(the_path, 256);
774 if (ret == nullptr) {
775 return IOStatus::IOError(strerror(errno));
776 }
777
778 *output_path = ret;
779 return IOStatus::OK();
780 }
781
GetTestDirectory(const IOOptions &,std::string * result,IODebugContext *)782 IOStatus GetTestDirectory(const IOOptions& /*opts*/, std::string* result,
783 IODebugContext* /*dbg*/) override {
784 const char* env = getenv("TEST_TMPDIR");
785 if (env && env[0] != '\0') {
786 *result = env;
787 } else {
788 char buf[100];
789 snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
790 *result = buf;
791 }
792 // Directory may already exist
793 {
794 IOOptions opts;
795 CreateDir(*result, opts, nullptr);
796 }
797 return IOStatus::OK();
798 }
799
GetFreeSpace(const std::string & fname,const IOOptions &,uint64_t * free_space,IODebugContext *)800 IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/,
801 uint64_t* free_space,
802 IODebugContext* /*dbg*/) override {
803 struct statvfs sbuf;
804
805 if (statvfs(fname.c_str(), &sbuf) < 0) {
806 return IOError("While doing statvfs", fname, errno);
807 }
808
809 *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree);
810 return IOStatus::OK();
811 }
812
// Returns a copy of `file_options` adjusted for writing log files: mmap and
// direct writes are turned off, and bytes_per_sync and
// writable_file_max_buffer_size are taken from `db_options`.
FileOptions OptimizeForLogWrite(const FileOptions& file_options,
                                const DBOptions& db_options) const override {
  FileOptions tuned = file_options;
  tuned.use_direct_writes = false;
  tuned.use_mmap_writes = false;
  tuned.writable_file_max_buffer_size =
      db_options.writable_file_max_buffer_size;
  tuned.bytes_per_sync = db_options.wal_bytes_per_sync;
  // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
  // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
  // test and make this false
  tuned.fallocate_with_keep_size = true;
  return tuned;
}
827
// Returns a copy of `file_options` adjusted for writing the manifest: mmap
// and direct writes are turned off and fallocate_with_keep_size is on.
FileOptions OptimizeForManifestWrite(
    const FileOptions& file_options) const override {
  FileOptions tuned = file_options;
  tuned.fallocate_with_keep_size = true;
  tuned.use_direct_writes = false;
  tuned.use_mmap_writes = false;
  return tuned;
}
836
837 private:
838 bool checkedDiskForMmap_;
839 bool forceMmapOff_; // do we override Env options?
840
841 // Returns true iff the named directory exists and is a directory.
DirExists(const std::string & dname)842 virtual bool DirExists(const std::string& dname) {
843 struct stat statbuf;
844 if (stat(dname.c_str(), &statbuf) == 0) {
845 return S_ISDIR(statbuf.st_mode);
846 }
847 return false; // stat() failed return false
848 }
849
// Reports whether the filesystem holding `path` is one of ext4, XFS or
// tmpfs, judged by the statfs() f_type magic number. Always false when
// ROCKSDB_FALLOCATE_PRESENT is not defined or when statfs() fails.
bool SupportsFastAllocate(const std::string& path) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  struct statfs fs_info;
  if (statfs(path.c_str(), &fs_info) != 0) {
    return false;
  }
  switch (fs_info.f_type) {
    case EXT4_SUPER_MAGIC:
    case XFS_SUPER_MAGIC:
    case TMPFS_MAGIC:
      return true;
    default:
      return false;
  }
#else
  (void)path;
  return false;
#endif
}
871
872 #if defined(ROCKSDB_IOURING_PRESENT)
873 // io_uring instance
874 std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
875 #endif
876
877 size_t page_size_;
878
879 // If true, allow non owner read access for db files. Otherwise, non-owner
880 // has no access to db files.
881 bool allow_non_owner_access_;
882 };
883
PosixFileSystem()884 PosixFileSystem::PosixFileSystem()
885 : checkedDiskForMmap_(false),
886 forceMmapOff_(false),
887 page_size_(getpagesize()),
888 allow_non_owner_access_(true) {
889 #if defined(ROCKSDB_IOURING_PRESENT)
890 // Test whether IOUring is supported, and if it does, create a managing
891 // object for thread local point so that in the future thread-local
892 // io_uring can be created.
893 struct io_uring* new_io_uring = CreateIOUring();
894 if (new_io_uring != nullptr) {
895 thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
896 delete new_io_uring;
897 }
898 #endif
899 }
900
901 } // namespace
902
903 //
904 // Default Posix FileSystem
905 //
Default()906 std::shared_ptr<FileSystem> FileSystem::Default() {
907 static PosixFileSystem default_fs;
908 static std::shared_ptr<PosixFileSystem> default_fs_ptr(
909 &default_fs, [](PosixFileSystem*) {});
910 return default_fs_ptr;
911 }
912
913 } // namespace ROCKSDB_NAMESPACE
914