1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors
9 #include <dirent.h>
10 #ifndef ROCKSDB_NO_DYNAMIC_EXTENSION
11 #include <dlfcn.h>
12 #endif
13 #include <errno.h>
14 #include <fcntl.h>
15 
16 #if defined(OS_LINUX)
17 #include <linux/fs.h>
18 #endif
19 #include <pthread.h>
20 #include <signal.h>
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/ioctl.h>
25 #include <sys/mman.h>
26 #include <sys/stat.h>
27 #if defined(OS_LINUX) || defined(OS_SOLARIS) || defined(OS_ANDROID)
28 #include <sys/statfs.h>
29 #include <sys/syscall.h>
30 #include <sys/sysmacros.h>
31 #endif
32 #include <sys/statvfs.h>
33 #include <sys/time.h>
34 #include <sys/types.h>
35 #include <time.h>
36 #include <algorithm>
37 // Get nano time includes
38 #if defined(OS_LINUX) || defined(OS_FREEBSD)
39 #elif defined(__MACH__)
40 #include <Availability.h>
41 #include <mach/clock.h>
42 #include <mach/mach.h>
43 #else
44 #include <chrono>
45 #endif
46 #include <deque>
47 #include <set>
48 #include <vector>
49 
50 #include "env/io_posix.h"
51 #include "logging/logging.h"
52 #include "logging/posix_logger.h"
53 #include "monitoring/iostats_context_imp.h"
54 #include "monitoring/thread_status_updater.h"
55 #include "port/port.h"
56 #include "rocksdb/options.h"
57 #include "rocksdb/slice.h"
58 #include "test_util/sync_point.h"
59 #include "util/coding.h"
60 #include "util/compression_context_cache.h"
61 #include "util/random.h"
62 #include "util/string_util.h"
63 #include "util/thread_local.h"
64 #include "util/threadpool_imp.h"
65 
// Fallback definitions of filesystem magic numbers for builds whose system
// headers do not provide them. SupportsFastAllocate() compares these values
// against the f_type field reported by statfs().
#if !defined(TMPFS_MAGIC)
#define TMPFS_MAGIC 0x01021994
#endif
#if !defined(XFS_SUPER_MAGIC)
#define XFS_SUPER_MAGIC 0x58465342
#endif
#if !defined(EXT4_SUPER_MAGIC)
#define EXT4_SUPER_MAGIC 0xEF53
#endif
75 
76 namespace ROCKSDB_NAMESPACE {
77 
78 namespace {
79 
// Returns the permission bits passed to open() for DB files: 0644 when
// non-owner access is allowed, 0600 (owner only) otherwise.
inline mode_t GetDBFileMode(bool allow_non_owner_access) {
  if (allow_non_owner_access) {
    return 0644;
  }
  return 0600;
}
83 
// Set of pathnames currently locked through PosixFileSystem::LockFile().
// LockFile() and UnlockFile() hold mutex_lockedFiles while they read or
// modify the set.
static std::set<std::string> lockedFiles;
static port::Mutex mutex_lockedFiles;
87 
// Acquires (lock == true) or releases (lock == false) a write lock on the
// whole file referred to by `fd`, using fcntl(F_SETLK).
// errno is cleared before the call. Returns the fcntl() result, i.e. -1 on
// failure with errno describing the error.
static int LockOrUnlock(int fd, bool lock) {
  errno = 0;
  struct flock request;
  memset(&request, 0, sizeof(request));
  request.l_whence = SEEK_SET;
  request.l_start = 0;
  request.l_len = 0;  // A zero length covers the entire file
  if (lock) {
    request.l_type = F_WRLCK;
  } else {
    request.l_type = F_UNLCK;
  }
  return fcntl(fd, F_SETLK, &request);
}
100 
// Lock handle handed out by PosixFileSystem::LockFile() and consumed by
// PosixFileSystem::UnlockFile().
class PosixFileLock : public FileLock {
 public:
  // Descriptor on which the fcntl() lock was taken; UnlockFile() closes it.
  int fd_;
  // Pathname that was locked; it is the key stored in lockedFiles.
  std::string filename;
};
106 
cloexec_flags(int flags,const EnvOptions * options)107 int cloexec_flags(int flags, const EnvOptions* options) {
108   // If the system supports opening the file with cloexec enabled,
109   // do so, as this avoids a race condition if a db is opened around
110   // the same time that a child process is forked
111 #ifdef O_CLOEXEC
112   if (options == nullptr || options->set_fd_cloexec) {
113     flags |= O_CLOEXEC;
114   }
115 #endif
116   return flags;
117 }
118 
119 class PosixFileSystem : public FileSystem {
120  public:
121   PosixFileSystem();
122 
Name() const123   const char* Name() const override { return "Posix File System"; }
124 
~PosixFileSystem()125   ~PosixFileSystem() override {}
126 
SetFD_CLOEXEC(int fd,const EnvOptions * options)127   void SetFD_CLOEXEC(int fd, const EnvOptions* options) {
128     if ((options == nullptr || options->set_fd_cloexec) && fd > 0) {
129       fcntl(fd, F_SETFD, fcntl(fd, F_GETFD) | FD_CLOEXEC);
130     }
131   }
132 
NewSequentialFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSSequentialFile> * result,IODebugContext *)133   IOStatus NewSequentialFile(const std::string& fname,
134                              const FileOptions& options,
135                              std::unique_ptr<FSSequentialFile>* result,
136                              IODebugContext* /*dbg*/) override {
137     result->reset();
138     int fd = -1;
139     int flags = cloexec_flags(O_RDONLY, &options);
140     FILE* file = nullptr;
141 
142     if (options.use_direct_reads && !options.use_mmap_reads) {
143 #ifdef ROCKSDB_LITE
144       return IOStatus::IOError(fname,
145                                "Direct I/O not supported in RocksDB lite");
146 #endif  // !ROCKSDB_LITE
147 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
148       flags |= O_DIRECT;
149 #endif
150     }
151 
152     do {
153       IOSTATS_TIMER_GUARD(open_nanos);
154       fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
155     } while (fd < 0 && errno == EINTR);
156     if (fd < 0) {
157       return IOError("While opening a file for sequentially reading", fname,
158                      errno);
159     }
160 
161     SetFD_CLOEXEC(fd, &options);
162 
163     if (options.use_direct_reads && !options.use_mmap_reads) {
164 #ifdef OS_MACOSX
165       if (fcntl(fd, F_NOCACHE, 1) == -1) {
166         close(fd);
167         return IOError("While fcntl NoCache", fname, errno);
168       }
169 #endif
170     } else {
171       do {
172         IOSTATS_TIMER_GUARD(open_nanos);
173         file = fdopen(fd, "r");
174       } while (file == nullptr && errno == EINTR);
175       if (file == nullptr) {
176         close(fd);
177         return IOError("While opening file for sequentially read", fname,
178                        errno);
179       }
180     }
181     result->reset(new PosixSequentialFile(fname, file, fd, options));
182     return IOStatus::OK();
183   }
184 
NewRandomAccessFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSRandomAccessFile> * result,IODebugContext *)185   IOStatus NewRandomAccessFile(const std::string& fname,
186                                const FileOptions& options,
187                                std::unique_ptr<FSRandomAccessFile>* result,
188                                IODebugContext* /*dbg*/) override {
189     result->reset();
190     IOStatus s;
191     int fd;
192     int flags = cloexec_flags(O_RDONLY, &options);
193 
194     if (options.use_direct_reads && !options.use_mmap_reads) {
195 #ifdef ROCKSDB_LITE
196       return IOStatus::IOError(fname,
197                                "Direct I/O not supported in RocksDB lite");
198 #endif  // !ROCKSDB_LITE
199 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
200       flags |= O_DIRECT;
201       TEST_SYNC_POINT_CALLBACK("NewRandomAccessFile:O_DIRECT", &flags);
202 #endif
203     }
204 
205     do {
206       IOSTATS_TIMER_GUARD(open_nanos);
207       fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
208     } while (fd < 0 && errno == EINTR);
209     if (fd < 0) {
210       return IOError("While open a file for random read", fname, errno);
211     }
212     SetFD_CLOEXEC(fd, &options);
213 
214     if (options.use_mmap_reads && sizeof(void*) >= 8) {
215       // Use of mmap for random reads has been removed because it
216       // kills performance when storage is fast.
217       // Use mmap when virtual address-space is plentiful.
218       uint64_t size;
219       IOOptions opts;
220       s = GetFileSize(fname, opts, &size, nullptr);
221       if (s.ok()) {
222         void* base = mmap(nullptr, size, PROT_READ, MAP_SHARED, fd, 0);
223         if (base != MAP_FAILED) {
224           result->reset(
225               new PosixMmapReadableFile(fd, fname, base, size, options));
226         } else {
227           s = IOError("while mmap file for read", fname, errno);
228           close(fd);
229         }
230       }
231     } else {
232       if (options.use_direct_reads && !options.use_mmap_reads) {
233 #ifdef OS_MACOSX
234         if (fcntl(fd, F_NOCACHE, 1) == -1) {
235           close(fd);
236           return IOError("while fcntl NoCache", fname, errno);
237         }
238 #endif
239       }
240       result->reset(new PosixRandomAccessFile(fname, fd, options
241 #if defined(ROCKSDB_IOURING_PRESENT)
242                                               ,
243                                               thread_local_io_urings_.get()
244 #endif
245                                                   ));
246     }
247     return s;
248   }
249 
OpenWritableFile(const std::string & fname,const FileOptions & options,bool reopen,std::unique_ptr<FSWritableFile> * result,IODebugContext *)250   virtual IOStatus OpenWritableFile(const std::string& fname,
251                                     const FileOptions& options,
252                                     bool reopen,
253                                     std::unique_ptr<FSWritableFile>* result,
254                                     IODebugContext* /*dbg*/) {
255     result->reset();
256     IOStatus s;
257     int fd = -1;
258     int flags = (reopen) ? (O_CREAT | O_APPEND) : (O_CREAT | O_TRUNC);
259     // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
260     if (options.use_direct_writes && !options.use_mmap_writes) {
261       // Note: we should avoid O_APPEND here due to ta the following bug:
262       // POSIX requires that opening a file with the O_APPEND flag should
263       // have no affect on the location at which pwrite() writes data.
264       // However, on Linux, if a file is opened with O_APPEND, pwrite()
265       // appends data to the end of the file, regardless of the value of
266       // offset.
267       // More info here: https://linux.die.net/man/2/pwrite
268 #ifdef ROCKSDB_LITE
269       return IOStatus::IOError(fname,
270                                "Direct I/O not supported in RocksDB lite");
271 #endif  // ROCKSDB_LITE
272       flags |= O_WRONLY;
273 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
274       flags |= O_DIRECT;
275 #endif
276       TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
277     } else if (options.use_mmap_writes) {
278       // non-direct I/O
279       flags |= O_RDWR;
280     } else {
281       flags |= O_WRONLY;
282     }
283 
284     flags = cloexec_flags(flags, &options);
285 
286     do {
287       IOSTATS_TIMER_GUARD(open_nanos);
288       fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
289     } while (fd < 0 && errno == EINTR);
290 
291     if (fd < 0) {
292       s = IOError("While open a file for appending", fname, errno);
293       return s;
294     }
295     SetFD_CLOEXEC(fd, &options);
296 
297     if (options.use_mmap_writes) {
298       if (!checkedDiskForMmap_) {
299         // this will be executed once in the program's lifetime.
300         // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
301         if (!SupportsFastAllocate(fname)) {
302           forceMmapOff_ = true;
303         }
304         checkedDiskForMmap_ = true;
305       }
306     }
307     if (options.use_mmap_writes && !forceMmapOff_) {
308       result->reset(new PosixMmapFile(fname, fd, page_size_, options));
309     } else if (options.use_direct_writes && !options.use_mmap_writes) {
310 #ifdef OS_MACOSX
311       if (fcntl(fd, F_NOCACHE, 1) == -1) {
312         close(fd);
313         s = IOError("While fcntl NoCache an opened file for appending", fname,
314                     errno);
315         return s;
316       }
317 #elif defined(OS_SOLARIS)
318       if (directio(fd, DIRECTIO_ON) == -1) {
319         if (errno != ENOTTY) {  // ZFS filesystems don't support DIRECTIO_ON
320           close(fd);
321           s = IOError("While calling directio()", fname, errno);
322           return s;
323         }
324       }
325 #endif
326       result->reset(new PosixWritableFile(fname, fd, options));
327     } else {
328       // disable mmap writes
329       EnvOptions no_mmap_writes_options = options;
330       no_mmap_writes_options.use_mmap_writes = false;
331       result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
332     }
333     return s;
334   }
335 
NewWritableFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSWritableFile> * result,IODebugContext * dbg)336   IOStatus NewWritableFile(const std::string& fname, const FileOptions& options,
337                            std::unique_ptr<FSWritableFile>* result,
338                            IODebugContext* dbg) override {
339     return OpenWritableFile(fname, options, false, result, dbg);
340   }
341 
ReopenWritableFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSWritableFile> * result,IODebugContext * dbg)342   IOStatus ReopenWritableFile(const std::string& fname,
343                               const FileOptions& options,
344                               std::unique_ptr<FSWritableFile>* result,
345                               IODebugContext* dbg) override {
346     return OpenWritableFile(fname, options, true, result, dbg);
347   }
348 
ReuseWritableFile(const std::string & fname,const std::string & old_fname,const FileOptions & options,std::unique_ptr<FSWritableFile> * result,IODebugContext *)349   IOStatus ReuseWritableFile(const std::string& fname,
350                              const std::string& old_fname,
351                              const FileOptions& options,
352                              std::unique_ptr<FSWritableFile>* result,
353                              IODebugContext* /*dbg*/) override {
354     result->reset();
355     IOStatus s;
356     int fd = -1;
357 
358     int flags = 0;
359     // Direct IO mode with O_DIRECT flag or F_NOCAHCE (MAC OSX)
360     if (options.use_direct_writes && !options.use_mmap_writes) {
361 #ifdef ROCKSDB_LITE
362       return IOStatus::IOError(fname,
363                                "Direct I/O not supported in RocksDB lite");
364 #endif  // !ROCKSDB_LITE
365       flags |= O_WRONLY;
366 #if !defined(OS_MACOSX) && !defined(OS_OPENBSD) && !defined(OS_SOLARIS)
367       flags |= O_DIRECT;
368 #endif
369       TEST_SYNC_POINT_CALLBACK("NewWritableFile:O_DIRECT", &flags);
370     } else if (options.use_mmap_writes) {
371       // mmap needs O_RDWR mode
372       flags |= O_RDWR;
373     } else {
374       flags |= O_WRONLY;
375     }
376 
377     flags = cloexec_flags(flags, &options);
378 
379     do {
380       IOSTATS_TIMER_GUARD(open_nanos);
381       fd = open(old_fname.c_str(), flags,
382                 GetDBFileMode(allow_non_owner_access_));
383     } while (fd < 0 && errno == EINTR);
384     if (fd < 0) {
385       s = IOError("while reopen file for write", fname, errno);
386       return s;
387     }
388 
389     SetFD_CLOEXEC(fd, &options);
390     // rename into place
391     if (rename(old_fname.c_str(), fname.c_str()) != 0) {
392       s = IOError("while rename file to " + fname, old_fname, errno);
393       close(fd);
394       return s;
395     }
396 
397     if (options.use_mmap_writes) {
398       if (!checkedDiskForMmap_) {
399         // this will be executed once in the program's lifetime.
400         // do not use mmapWrite on non ext-3/xfs/tmpfs systems.
401         if (!SupportsFastAllocate(fname)) {
402           forceMmapOff_ = true;
403         }
404         checkedDiskForMmap_ = true;
405       }
406     }
407     if (options.use_mmap_writes && !forceMmapOff_) {
408       result->reset(new PosixMmapFile(fname, fd, page_size_, options));
409     } else if (options.use_direct_writes && !options.use_mmap_writes) {
410 #ifdef OS_MACOSX
411       if (fcntl(fd, F_NOCACHE, 1) == -1) {
412         close(fd);
413         s = IOError("while fcntl NoCache for reopened file for append", fname,
414                     errno);
415         return s;
416       }
417 #elif defined(OS_SOLARIS)
418       if (directio(fd, DIRECTIO_ON) == -1) {
419         if (errno != ENOTTY) {  // ZFS filesystems don't support DIRECTIO_ON
420           close(fd);
421           s = IOError("while calling directio()", fname, errno);
422           return s;
423         }
424       }
425 #endif
426       result->reset(new PosixWritableFile(fname, fd, options));
427     } else {
428       // disable mmap writes
429       FileOptions no_mmap_writes_options = options;
430       no_mmap_writes_options.use_mmap_writes = false;
431       result->reset(new PosixWritableFile(fname, fd, no_mmap_writes_options));
432     }
433     return s;
434   }
435 
NewRandomRWFile(const std::string & fname,const FileOptions & options,std::unique_ptr<FSRandomRWFile> * result,IODebugContext *)436   IOStatus NewRandomRWFile(const std::string& fname, const FileOptions& options,
437                            std::unique_ptr<FSRandomRWFile>* result,
438                            IODebugContext* /*dbg*/) override {
439     int fd = -1;
440     int flags = cloexec_flags(O_RDWR, &options);
441 
442     while (fd < 0) {
443       IOSTATS_TIMER_GUARD(open_nanos);
444 
445       fd = open(fname.c_str(), flags, GetDBFileMode(allow_non_owner_access_));
446       if (fd < 0) {
447         // Error while opening the file
448         if (errno == EINTR) {
449           continue;
450         }
451         return IOError("While open file for random read/write", fname, errno);
452       }
453     }
454 
455     SetFD_CLOEXEC(fd, &options);
456     result->reset(new PosixRandomRWFile(fname, fd, options));
457     return IOStatus::OK();
458   }
459 
NewMemoryMappedFileBuffer(const std::string & fname,std::unique_ptr<MemoryMappedFileBuffer> * result)460   IOStatus NewMemoryMappedFileBuffer(
461       const std::string& fname,
462       std::unique_ptr<MemoryMappedFileBuffer>* result) override {
463     int fd = -1;
464     IOStatus status;
465     int flags = cloexec_flags(O_RDWR, nullptr);
466 
467     while (fd < 0) {
468       IOSTATS_TIMER_GUARD(open_nanos);
469       fd = open(fname.c_str(), flags, 0644);
470       if (fd < 0) {
471         // Error while opening the file
472         if (errno == EINTR) {
473           continue;
474         }
475         status =
476             IOError("While open file for raw mmap buffer access", fname, errno);
477         break;
478       }
479     }
480     uint64_t size;
481     if (status.ok()) {
482       IOOptions opts;
483       status = GetFileSize(fname, opts, &size, nullptr);
484     }
485     void* base = nullptr;
486     if (status.ok()) {
487       base = mmap(nullptr, static_cast<size_t>(size), PROT_READ | PROT_WRITE,
488                   MAP_SHARED, fd, 0);
489       if (base == MAP_FAILED) {
490         status = IOError("while mmap file for read", fname, errno);
491       }
492     }
493     if (status.ok()) {
494       result->reset(
495           new PosixMemoryMappedFileBuffer(base, static_cast<size_t>(size)));
496     }
497     if (fd >= 0) {
498       // don't need to keep it open after mmap has been called
499       close(fd);
500     }
501     return status;
502   }
503 
NewDirectory(const std::string & name,const IOOptions &,std::unique_ptr<FSDirectory> * result,IODebugContext *)504   IOStatus NewDirectory(const std::string& name, const IOOptions& /*opts*/,
505                         std::unique_ptr<FSDirectory>* result,
506                         IODebugContext* /*dbg*/) override {
507     result->reset();
508     int fd;
509     int flags = cloexec_flags(0, nullptr);
510     {
511       IOSTATS_TIMER_GUARD(open_nanos);
512       fd = open(name.c_str(), flags);
513     }
514     if (fd < 0) {
515       return IOError("While open directory", name, errno);
516     } else {
517       result->reset(new PosixDirectory(fd));
518     }
519     return IOStatus::OK();
520   }
521 
NewLogger(const std::string &,const IOOptions &,std::shared_ptr<ROCKSDB_NAMESPACE::Logger> *,IODebugContext *)522   IOStatus NewLogger(const std::string& /*fname*/, const IOOptions& /*opts*/,
523                      std::shared_ptr<ROCKSDB_NAMESPACE::Logger>* /*ptr*/,
524                      IODebugContext* /*dbg*/) override {
525     return IOStatus::NotSupported();
526   }
527 
FileExists(const std::string & fname,const IOOptions &,IODebugContext *)528   IOStatus FileExists(const std::string& fname, const IOOptions& /*opts*/,
529                       IODebugContext* /*dbg*/) override {
530     int result = access(fname.c_str(), F_OK);
531 
532     if (result == 0) {
533       return IOStatus::OK();
534     }
535 
536     int err = errno;
537     switch (err) {
538       case EACCES:
539       case ELOOP:
540       case ENAMETOOLONG:
541       case ENOENT:
542       case ENOTDIR:
543         return IOStatus::NotFound();
544       default:
545         assert(err == EIO || err == ENOMEM);
546         return IOStatus::IOError("Unexpected error(" + ToString(err) +
547                                  ") accessing file `" + fname + "' ");
548     }
549   }
550 
GetChildren(const std::string & dir,const IOOptions &,std::vector<std::string> * result,IODebugContext *)551   IOStatus GetChildren(const std::string& dir, const IOOptions& /*opts*/,
552                        std::vector<std::string>* result,
553                        IODebugContext* /*dbg*/) override {
554     result->clear();
555     DIR* d = opendir(dir.c_str());
556     if (d == nullptr) {
557       switch (errno) {
558         case EACCES:
559         case ENOENT:
560         case ENOTDIR:
561           return IOStatus::NotFound();
562         default:
563           return IOError("While opendir", dir, errno);
564       }
565     }
566     struct dirent* entry;
567     while ((entry = readdir(d)) != nullptr) {
568       result->push_back(entry->d_name);
569     }
570     closedir(d);
571     return IOStatus::OK();
572   }
573 
DeleteFile(const std::string & fname,const IOOptions &,IODebugContext *)574   IOStatus DeleteFile(const std::string& fname, const IOOptions& /*opts*/,
575                       IODebugContext* /*dbg*/) override {
576     IOStatus result;
577     if (unlink(fname.c_str()) != 0) {
578       result = IOError("while unlink() file", fname, errno);
579     }
580     return result;
581   }
582 
CreateDir(const std::string & name,const IOOptions &,IODebugContext *)583   IOStatus CreateDir(const std::string& name, const IOOptions& /*opts*/,
584                      IODebugContext* /*dbg*/) override {
585     IOStatus result;
586     if (mkdir(name.c_str(), 0755) != 0) {
587       result = IOError("While mkdir", name, errno);
588     }
589     return result;
590   }
591 
CreateDirIfMissing(const std::string & name,const IOOptions &,IODebugContext *)592   IOStatus CreateDirIfMissing(const std::string& name,
593                               const IOOptions& /*opts*/,
594                               IODebugContext* /*dbg*/) override {
595     IOStatus result;
596     if (mkdir(name.c_str(), 0755) != 0) {
597       if (errno != EEXIST) {
598         result = IOError("While mkdir if missing", name, errno);
599       } else if (!DirExists(name)) {  // Check that name is actually a
600                                       // directory.
601         // Message is taken from mkdir
602         result =
603             IOStatus::IOError("`" + name + "' exists but is not a directory");
604       }
605     }
606     return result;
607   }
608 
DeleteDir(const std::string & name,const IOOptions &,IODebugContext *)609   IOStatus DeleteDir(const std::string& name, const IOOptions& /*opts*/,
610                      IODebugContext* /*dbg*/) override {
611     IOStatus result;
612     if (rmdir(name.c_str()) != 0) {
613       result = IOError("file rmdir", name, errno);
614     }
615     return result;
616   }
617 
GetFileSize(const std::string & fname,const IOOptions &,uint64_t * size,IODebugContext *)618   IOStatus GetFileSize(const std::string& fname, const IOOptions& /*opts*/,
619                        uint64_t* size, IODebugContext* /*dbg*/) override {
620     IOStatus s;
621     struct stat sbuf;
622     if (stat(fname.c_str(), &sbuf) != 0) {
623       *size = 0;
624       s = IOError("while stat a file for size", fname, errno);
625     } else {
626       *size = sbuf.st_size;
627     }
628     return s;
629   }
630 
GetFileModificationTime(const std::string & fname,const IOOptions &,uint64_t * file_mtime,IODebugContext *)631   IOStatus GetFileModificationTime(const std::string& fname,
632                                    const IOOptions& /*opts*/,
633                                    uint64_t* file_mtime,
634                                    IODebugContext* /*dbg*/) override {
635     struct stat s;
636     if (stat(fname.c_str(), &s) != 0) {
637       return IOError("while stat a file for modification time", fname, errno);
638     }
639     *file_mtime = static_cast<uint64_t>(s.st_mtime);
640     return IOStatus::OK();
641   }
642 
RenameFile(const std::string & src,const std::string & target,const IOOptions &,IODebugContext *)643   IOStatus RenameFile(const std::string& src, const std::string& target,
644                       const IOOptions& /*opts*/,
645                       IODebugContext* /*dbg*/) override {
646     IOStatus result;
647     if (rename(src.c_str(), target.c_str()) != 0) {
648       result = IOError("While renaming a file to " + target, src, errno);
649     }
650     return result;
651   }
652 
LinkFile(const std::string & src,const std::string & target,const IOOptions &,IODebugContext *)653   IOStatus LinkFile(const std::string& src, const std::string& target,
654                     const IOOptions& /*opts*/,
655                     IODebugContext* /*dbg*/) override {
656     IOStatus result;
657     if (link(src.c_str(), target.c_str()) != 0) {
658       if (errno == EXDEV) {
659         return IOStatus::NotSupported("No cross FS links allowed");
660       }
661       result = IOError("while link file to " + target, src, errno);
662     }
663     return result;
664   }
665 
NumFileLinks(const std::string & fname,const IOOptions &,uint64_t * count,IODebugContext *)666   IOStatus NumFileLinks(const std::string& fname, const IOOptions& /*opts*/,
667                         uint64_t* count, IODebugContext* /*dbg*/) override {
668     struct stat s;
669     if (stat(fname.c_str(), &s) != 0) {
670       return IOError("while stat a file for num file links", fname, errno);
671     }
672     *count = static_cast<uint64_t>(s.st_nlink);
673     return IOStatus::OK();
674   }
675 
AreFilesSame(const std::string & first,const std::string & second,const IOOptions &,bool * res,IODebugContext *)676   IOStatus AreFilesSame(const std::string& first, const std::string& second,
677                         const IOOptions& /*opts*/, bool* res,
678                         IODebugContext* /*dbg*/) override {
679     struct stat statbuf[2];
680     if (stat(first.c_str(), &statbuf[0]) != 0) {
681       return IOError("stat file", first, errno);
682     }
683     if (stat(second.c_str(), &statbuf[1]) != 0) {
684       return IOError("stat file", second, errno);
685     }
686 
687     if (major(statbuf[0].st_dev) != major(statbuf[1].st_dev) ||
688         minor(statbuf[0].st_dev) != minor(statbuf[1].st_dev) ||
689         statbuf[0].st_ino != statbuf[1].st_ino) {
690       *res = false;
691     } else {
692       *res = true;
693     }
694     return IOStatus::OK();
695   }
696 
LockFile(const std::string & fname,const IOOptions &,FileLock ** lock,IODebugContext *)697   IOStatus LockFile(const std::string& fname, const IOOptions& /*opts*/,
698                     FileLock** lock, IODebugContext* /*dbg*/) override {
699     *lock = nullptr;
700     IOStatus result;
701 
702     mutex_lockedFiles.Lock();
703     // If it already exists in the lockedFiles set, then it is already locked,
704     // and fail this lock attempt. Otherwise, insert it into lockedFiles.
705     // This check is needed because fcntl() does not detect lock conflict
706     // if the fcntl is issued by the same thread that earlier acquired
707     // this lock.
708     // We must do this check *before* opening the file:
709     // Otherwise, we will open a new file descriptor. Locks are associated with
710     // a process, not a file descriptor and when *any* file descriptor is
711     // closed, all locks the process holds for that *file* are released
712     if (lockedFiles.insert(fname).second == false) {
713       mutex_lockedFiles.Unlock();
714       errno = ENOLCK;
715       return IOError("lock ", fname, errno);
716     }
717 
718     int fd;
719     int flags = cloexec_flags(O_RDWR | O_CREAT, nullptr);
720 
721     {
722       IOSTATS_TIMER_GUARD(open_nanos);
723       fd = open(fname.c_str(), flags, 0644);
724     }
725     if (fd < 0) {
726       result = IOError("while open a file for lock", fname, errno);
727     } else if (LockOrUnlock(fd, true) == -1) {
728       // if there is an error in locking, then remove the pathname from
729       // lockedfiles
730       lockedFiles.erase(fname);
731       result = IOError("While lock file", fname, errno);
732       close(fd);
733     } else {
734       SetFD_CLOEXEC(fd, nullptr);
735       PosixFileLock* my_lock = new PosixFileLock;
736       my_lock->fd_ = fd;
737       my_lock->filename = fname;
738       *lock = my_lock;
739     }
740 
741     mutex_lockedFiles.Unlock();
742     return result;
743   }
744 
UnlockFile(FileLock * lock,const IOOptions &,IODebugContext *)745   IOStatus UnlockFile(FileLock* lock, const IOOptions& /*opts*/,
746                       IODebugContext* /*dbg*/) override {
747     PosixFileLock* my_lock = reinterpret_cast<PosixFileLock*>(lock);
748     IOStatus result;
749     mutex_lockedFiles.Lock();
750     // If we are unlocking, then verify that we had locked it earlier,
751     // it should already exist in lockedFiles. Remove it from lockedFiles.
752     if (lockedFiles.erase(my_lock->filename) != 1) {
753       errno = ENOLCK;
754       result = IOError("unlock", my_lock->filename, errno);
755     } else if (LockOrUnlock(my_lock->fd_, false) == -1) {
756       result = IOError("unlock", my_lock->filename, errno);
757     }
758     close(my_lock->fd_);
759     delete my_lock;
760     mutex_lockedFiles.Unlock();
761     return result;
762   }
763 
GetAbsolutePath(const std::string & db_path,const IOOptions &,std::string * output_path,IODebugContext *)764   IOStatus GetAbsolutePath(const std::string& db_path,
765                            const IOOptions& /*opts*/, std::string* output_path,
766                            IODebugContext* /*dbg*/) override {
767     if (!db_path.empty() && db_path[0] == '/') {
768       *output_path = db_path;
769       return IOStatus::OK();
770     }
771 
772     char the_path[256];
773     char* ret = getcwd(the_path, 256);
774     if (ret == nullptr) {
775       return IOStatus::IOError(strerror(errno));
776     }
777 
778     *output_path = ret;
779     return IOStatus::OK();
780   }
781 
GetTestDirectory(const IOOptions &,std::string * result,IODebugContext *)782   IOStatus GetTestDirectory(const IOOptions& /*opts*/, std::string* result,
783                             IODebugContext* /*dbg*/) override {
784     const char* env = getenv("TEST_TMPDIR");
785     if (env && env[0] != '\0') {
786       *result = env;
787     } else {
788       char buf[100];
789       snprintf(buf, sizeof(buf), "/tmp/rocksdbtest-%d", int(geteuid()));
790       *result = buf;
791     }
792     // Directory may already exist
793     {
794       IOOptions opts;
795       CreateDir(*result, opts, nullptr);
796     }
797     return IOStatus::OK();
798   }
799 
GetFreeSpace(const std::string & fname,const IOOptions &,uint64_t * free_space,IODebugContext *)800   IOStatus GetFreeSpace(const std::string& fname, const IOOptions& /*opts*/,
801                         uint64_t* free_space,
802                         IODebugContext* /*dbg*/) override {
803     struct statvfs sbuf;
804 
805     if (statvfs(fname.c_str(), &sbuf) < 0) {
806       return IOError("While doing statvfs", fname, errno);
807     }
808 
809     *free_space = ((uint64_t)sbuf.f_bsize * sbuf.f_bfree);
810     return IOStatus::OK();
811   }
812 
OptimizeForLogWrite(const FileOptions & file_options,const DBOptions & db_options) const813   FileOptions OptimizeForLogWrite(const FileOptions& file_options,
814                                  const DBOptions& db_options) const override {
815     FileOptions optimized = file_options;
816     optimized.use_mmap_writes = false;
817     optimized.use_direct_writes = false;
818     optimized.bytes_per_sync = db_options.wal_bytes_per_sync;
819     // TODO(icanadi) it's faster if fallocate_with_keep_size is false, but it
820     // breaks TransactionLogIteratorStallAtLastRecord unit test. Fix the unit
821     // test and make this false
822     optimized.fallocate_with_keep_size = true;
823     optimized.writable_file_max_buffer_size =
824         db_options.writable_file_max_buffer_size;
825     return optimized;
826   }
827 
OptimizeForManifestWrite(const FileOptions & file_options) const828   FileOptions OptimizeForManifestWrite(
829       const FileOptions& file_options) const override {
830     FileOptions optimized = file_options;
831     optimized.use_mmap_writes = false;
832     optimized.use_direct_writes = false;
833     optimized.fallocate_with_keep_size = true;
834     return optimized;
835   }
836 
837  private:
  // Set to true once OpenWritableFile()/ReuseWritableFile() have run the
  // SupportsFastAllocate() check for an mmap-write request; the check is not
  // repeated afterwards.
  bool checkedDiskForMmap_;
  // Set to true when that check found no fast-allocate support; mmap writes
  // requested through the options are then replaced by PosixWritableFile.
  bool forceMmapOff_;  // do we override Env options?
840 
841   // Returns true iff the named directory exists and is a directory.
DirExists(const std::string & dname)842   virtual bool DirExists(const std::string& dname) {
843     struct stat statbuf;
844     if (stat(dname.c_str(), &statbuf) == 0) {
845       return S_ISDIR(statbuf.st_mode);
846     }
847     return false;  // stat() failed return false
848   }
849 
SupportsFastAllocate(const std::string & path)850   bool SupportsFastAllocate(const std::string& path) {
851 #ifdef ROCKSDB_FALLOCATE_PRESENT
852     struct statfs s;
853     if (statfs(path.c_str(), &s)) {
854       return false;
855     }
856     switch (s.f_type) {
857       case EXT4_SUPER_MAGIC:
858         return true;
859       case XFS_SUPER_MAGIC:
860         return true;
861       case TMPFS_MAGIC:
862         return true;
863       default:
864         return false;
865     }
866 #else
867     (void)path;
868     return false;
869 #endif
870   }
871 
#if defined(ROCKSDB_IOURING_PRESENT)
  // io_uring instance
  // Only allocated by the constructor when its CreateIOUring() probe
  // succeeds; NewRandomAccessFile() passes the raw pointer (possibly null)
  // to PosixRandomAccessFile.
  std::unique_ptr<ThreadLocalPtr> thread_local_io_urings_;
#endif

  // Value of getpagesize() captured by the constructor; passed to
  // PosixMmapFile when an mmap-backed writable file is created.
  size_t page_size_;

  // If true, allow non owner read access for db files. Otherwise, non-owner
  //  has no access to db files.
  bool allow_non_owner_access_;
882 };
883 
PosixFileSystem()884 PosixFileSystem::PosixFileSystem()
885     : checkedDiskForMmap_(false),
886       forceMmapOff_(false),
887       page_size_(getpagesize()),
888       allow_non_owner_access_(true) {
889 #if defined(ROCKSDB_IOURING_PRESENT)
890   // Test whether IOUring is supported, and if it does, create a managing
891   // object for thread local point so that in the future thread-local
892   // io_uring can be created.
893   struct io_uring* new_io_uring = CreateIOUring();
894   if (new_io_uring != nullptr) {
895     thread_local_io_urings_.reset(new ThreadLocalPtr(DeleteIOUring));
896     delete new_io_uring;
897   }
898 #endif
899 }
900 
901 }  // namespace
902 
903 //
904 // Default Posix FileSystem
905 //
Default()906 std::shared_ptr<FileSystem> FileSystem::Default() {
907   static PosixFileSystem default_fs;
908   static std::shared_ptr<PosixFileSystem> default_fs_ptr(
909       &default_fs, [](PosixFileSystem*) {});
910   return default_fs_ptr;
911 }
912 
913 }  // namespace ROCKSDB_NAMESPACE
914