1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #ifdef ROCKSDB_LIB_IO_POSIX
11 #include "env/io_posix.h"
12 #include <errno.h>
13 #include <fcntl.h>
14 #include <algorithm>
15 #if defined(OS_LINUX)
16 #include <linux/fs.h>
17 #ifndef FALLOC_FL_KEEP_SIZE
18 #include <linux/falloc.h>
19 #endif
20 #endif
21 #include <stdio.h>
22 #include <stdlib.h>
23 #include <string.h>
24 #include <sys/ioctl.h>
25 #include <sys/mman.h>
26 #include <sys/stat.h>
27 #include <sys/types.h>
28 #ifdef OS_LINUX
29 #include <sys/statfs.h>
30 #include <sys/syscall.h>
31 #include <sys/sysmacros.h>
32 #endif
33 #include "monitoring/iostats_context_imp.h"
34 #include "port/port.h"
35 #include "rocksdb/slice.h"
36 #include "test_util/sync_point.h"
37 #include "util/autovector.h"
38 #include "util/coding.h"
39 #include "util/string_util.h"
40 
41 #if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
42 #define F_LINUX_SPECIFIC_BASE 1024
43 #define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
44 #endif
45 
46 namespace ROCKSDB_NAMESPACE {
47 
// A thin wrapper around posix_fadvise(). On platforms without fadvise
// support the hint is silently dropped and success (0) is reported.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifndef OS_LINUX
  // Not supported here: ignore the hint and pretend it succeeded.
  (void)fd;
  (void)offset;
  (void)len;
  (void)advice;
  return 0;
#else
  return posix_fadvise(fd, offset, len, advice);
#endif
}
61 
62 namespace {
63 
64 // On MacOS (and probably *BSD), the posix write and pwrite calls do not support
65 // buffers larger than 2^31-1 bytes. These two wrappers fix this issue by
66 // cutting the buffer in 1GB chunks. We use this chunk size to be sure to keep
67 // the writes aligned.
68 
// Writes `nbyte` bytes from `buf` to `fd`, looping until everything has
// been written or an unrecoverable error occurs. Each write() call is
// capped at 1GB because write()/pwrite() on MacOS (and likely *BSD)
// reject buffers larger than 2^31-1 bytes; the 1GB cap also keeps the
// chunks aligned. Returns true on success, false on any error other
// than EINTR (which is retried).
bool PosixWrite(int fd, const char* buf, size_t nbyte) {
  constexpr size_t kMaxChunk = 1UL << 30;  // 1GB per write() call

  const char* cursor = buf;
  size_t remaining = nbyte;

  while (remaining > 0) {
    const size_t chunk = std::min(remaining, kMaxChunk);
    const ssize_t written = write(fd, cursor, chunk);
    if (written < 0) {
      // A signal interrupted the call: retry the same chunk.
      if (errno == EINTR) {
        continue;
      }
      return false;
    }
    cursor += written;
    remaining -= written;
  }
  return true;
}
90 
// Positioned counterpart of PosixWrite(): writes `nbyte` bytes from
// `buf` to `fd` starting at `offset`, chunked at 1GB to stay within the
// 2^31-1 byte limit of pwrite() on MacOS/*BSD. Returns true on success,
// false on any error other than EINTR (which is retried).
bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
  constexpr size_t kMaxChunk = 1UL << 30;  // 1GB per pwrite() call

  const char* cursor = buf;
  size_t remaining = nbyte;

  while (remaining > 0) {
    const size_t chunk = std::min(remaining, kMaxChunk);
    const ssize_t written = pwrite(fd, cursor, chunk, offset);
    if (written < 0) {
      // A signal interrupted the call: retry at the same offset.
      if (errno == EINTR) {
        continue;
      }
      return false;
    }
    cursor += written;
    offset += written;
    remaining -= written;
  }

  return true;
}
114 
// Best-effort query of the logical block size of the device backing `fd`,
// read from /sys/dev/block/<major>:<minor>/queue/logical_block_size on
// Linux. Falls back to kDefaultPageSize whenever the device cannot be
// resolved, the sysfs read fails, or the reported size is not a power of
// two. On non-Linux platforms, always returns kDefaultPageSize.
size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
#ifdef OS_LINUX
  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return kDefaultPageSize;
  }
  if (major(buf.st_dev) == 0) {
    // Unnamed devices (e.g. non-device mounts), reserved as null device number.
    // These don't have an entry in /sys/dev/block/. Return a sensible default.
    return kDefaultPageSize;
  }

  // Reading queue/logical_block_size does not require special permissions.
  const int kBufferSize = 100;
  char path[kBufferSize];
  char real_path[PATH_MAX + 1];
  snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev),
           minor(buf.st_dev));
  // The sysfs entry is a symlink to the device directory; resolve it so we
  // can inspect the path components below.
  if (realpath(path, real_path) == nullptr) {
    return kDefaultPageSize;
  }
  std::string device_dir(real_path);
  if (!device_dir.empty() && device_dir.back() == '/') {
    device_dir.pop_back();
  }
  // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda
  // and nvme0n1 have it.
  // $ ls -al '/sys/dev/block/8:3'
  // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 ->
  // ../../block/sda/sda3
  // $ ls -al '/sys/dev/block/259:4'
  // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 ->
  // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1
  size_t parent_end = device_dir.rfind('/', device_dir.length() - 1);
  if (parent_end == std::string::npos) {
    return kDefaultPageSize;
  }
  size_t parent_begin = device_dir.rfind('/', parent_end - 1);
  if (parent_begin == std::string::npos) {
    return kDefaultPageSize;
  }
  // `parent` is the second-to-last path component, `child` the last one.
  std::string parent =
      device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1);
  std::string child = device_dir.substr(parent_end + 1, std::string::npos);
  // If the path points at a partition (not directly under "block/", and not
  // an nvme name without a partition suffix), step up to the parent device,
  // which is the one that has the `queue/` subdirectory.
  if (parent != "block" &&
      (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) {
    device_dir = device_dir.substr(0, parent_end);
  }
  std::string fname = device_dir + "/queue/logical_block_size";
  FILE* fp;
  size_t size = 0;
  fp = fopen(fname.c_str(), "r");
  if (fp != nullptr) {
    char* line = nullptr;
    size_t len = 0;
    if (getline(&line, &len, fp) != -1) {
      sscanf(line, "%zu", &size);
    }
    // getline() allocates the buffer; we must free it ourselves.
    free(line);
    fclose(fp);
  }
  // Only trust a non-zero power-of-two value; anything else falls through
  // to the default below.
  if (size != 0 && (size & (size - 1)) == 0) {
    return size;
  }
#endif
  return kDefaultPageSize;
}
183 
184 #ifdef ROCKSDB_RANGESYNC_PRESENT
185 
186 #if !defined(ZFS_SUPER_MAGIC)
187 // The magic number for ZFS was not exposed until recently. It should be fixed
188 // forever so we can just copy the magic number here.
189 #define ZFS_SUPER_MAGIC 0x2fc12fc1
190 #endif
191 
// Returns true if `sync_file_range` can be used on the file behind `fd`.
// Probes at runtime because the compile-time check alone is insufficient:
// some filesystems (ZFS) and platforms (WSL) accept the symbol but do not
// implement the semantics RocksDB relies on.
bool IsSyncFileRangeSupported(int fd) {
  // The approach taken in this function is to build a blacklist of cases where
  // we know `sync_file_range` definitely will not work properly despite passing
  // the compile-time check (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or
  // if any of the checks fail in unexpected ways, we allow `sync_file_range` to
  // be used. This way should minimize risk of impacting existing use cases.
  struct statfs buf;
  int ret = fstatfs(fd, &buf);
  // Probe failures are unexpected; surface them in debug builds but fall
  // through gracefully in release builds.
  assert(ret == 0);
  if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) {
    // Testing on ZFS showed the writeback did not happen asynchronously when
    // `sync_file_range` was called, even though it returned success. Avoid it
    // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`,
    // even though this'll incur extra I/O for metadata.
    return false;
  }

  // A zero-length range is a harmless no-op probe for ENOSYS.
  ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */);
  assert(!(ret == -1 && errno != ENOSYS));
  if (ret == -1 && errno == ENOSYS) {
    // `sync_file_range` is not implemented on all platforms even if
    // compile-time checks pass and a supported filesystem is in-use. For
    // example, using ext4 on WSL (Windows Subsystem for Linux),
    // `sync_file_range()` returns `ENOSYS`
    // ("Function not implemented").
    return false;
  }
  // None of the cases on the blacklist matched, so allow `sync_file_range` use.
  return true;
}
222 
223 #undef ZFS_SUPER_MAGIC
224 
225 #endif  // ROCKSDB_RANGESYNC_PRESENT
226 
227 }  // anonymous namespace
228 
229 /*
230  * DirectIOHelper
231  */
232 #ifndef NDEBUG
233 namespace {
234 
// True iff `off` is a whole multiple of `sector_size`.
bool IsSectorAligned(const size_t off, size_t sector_size) {
  const size_t remainder = off % sector_size;
  return remainder == 0;
}
238 
// True iff the address held in `ptr` is a whole multiple of `sector_size`.
bool IsSectorAligned(const void* ptr, size_t sector_size) {
  const uintptr_t addr = reinterpret_cast<uintptr_t>(ptr);
  return addr % sector_size == 0;
}
242 
243 }  // namespace
244 #endif
245 
246 /*
247  * PosixSequentialFile
248  */
// Takes ownership of both `file` (used for buffered reads) and `fd`
// (used for direct I/O); the destructor closes whichever mode is active.
PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file,
                                         int fd, const EnvOptions& options)
    : filename_(fname),
      file_(file),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(GetLogicalBufferSize(fd_)) {
  // Direct I/O and mmap reads are mutually exclusive.
  assert(!options.use_direct_reads || !options.use_mmap_reads);
}
258 
PosixSequentialFile::~PosixSequentialFile() {
  if (!use_direct_io()) {
    // Buffered mode: fclose() also closes the underlying descriptor.
    assert(file_);
    fclose(file_);
  } else {
    // Direct I/O mode: only the raw fd was opened.
    // NOTE(review): this asserts fd_ != 0, although fd 0 is technically a
    // valid descriptor — confirm the intent was fd_ >= 0.
    assert(fd_);
    close(fd_);
  }
}
268 
// Buffered sequential read of up to `n` bytes into `scratch`; `*result`
// is set to the bytes actually read. EOF is not an error. Only valid in
// non-direct-I/O mode (direct I/O callers use PositionedRead()).
IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/,
                                   Slice* result, char* scratch,
                                   IODebugContext* /*dbg*/) {
  assert(result != nullptr && !use_direct_io());
  IOStatus s;
  size_t r = 0;
  do {
    // Retry reads that returned nothing because a signal interrupted them.
    r = fread_unlocked(scratch, 1, n, file_);
  } while (r == 0 && ferror(file_) && errno == EINTR);
  *result = Slice(scratch, r);
  if (r < n) {
    if (feof(file_)) {
      // We leave status as ok if we hit the end of the file
      // We also clear the error so that the reads can continue
      // if a new data is written to the file
      clearerr(file_);
    } else {
      // A partial read with an error: return a non-ok status
      s = IOError("While reading file sequentially", filename_, errno);
    }
  }
  return s;
}
292 
// Direct-I/O positioned read: offset, length and the scratch buffer must
// all be aligned to GetRequiredBufferAlignment(). Loops over pread()
// until `n` bytes are read, a short (end-of-file) read occurs, or an
// error other than EINTR is hit. `*result` reports the bytes read.
IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n,
                                             const IOOptions& /*opts*/,
                                             Slice* result, char* scratch,
                                             IODebugContext* /*dbg*/) {
  assert(use_direct_io());
  assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));

  IOStatus s;
  ssize_t r = -1;
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
    if (r <= 0) {
      if (r == -1 && errno == EINTR) {
        // Interrupted by a signal: retry the same pread.
        continue;
      }
      break;
    }
    ptr += r;
    offset += r;
    left -= r;
    if (r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
      // Bytes reads don't fill sectors. Should only happen at the end
      // of the file.
      break;
    }
  }
  if (r < 0) {
    // An error: return a non-ok status
    // NOTE(review): `offset` has already been advanced past the completed
    // reads, so the message reports where the failing pread started, not
    // the offset the caller passed in.
    s = IOError(
        "While pread " + ToString(n) + " bytes from offset " + ToString(offset),
        filename_, errno);
  }
  *result = Slice(scratch, (r < 0) ? 0 : n - left);
  return s;
}
332 
// Advances the buffered stream position by `n` bytes without reading.
// NOTE(review): the cast narrows uint64_t to long; skips larger than
// LONG_MAX would be truncated on ILP32 platforms — confirm callers
// never pass such values.
IOStatus PosixSequentialFile::Skip(uint64_t n) {
  if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
    return IOError("While fseek to skip " + ToString(n) + " bytes", filename_,
                   errno);
  }
  return IOStatus::OK();
}
340 
// Drops the OS page cache for the given byte range (Linux only). A no-op
// on other platforms and under direct I/O, where the page cache is
// bypassed anyway.
IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  if (!use_direct_io()) {
    // free OS pages
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret != 0) {
      return IOError("While fadvise NotNeeded offset " + ToString(offset) +
                         " len " + ToString(length),
                     filename_, errno);
    }
  }
  return IOStatus::OK();
#endif
}
359 
360 /*
361  * PosixRandomAccessFile
362  */
363 #if defined(OS_LINUX)
// Linux: builds a unique file id from (device, inode, FS generation
// number), each encoded as a varint64 into `id`. Returns the encoded
// length, or 0 if the id cannot be produced (buffer too small, fstat or
// ioctl failure).
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  // Worst case each of the three fields occupies a full varint64.
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return 0;
  }

  long version = 0;
  result = ioctl(fd, FS_IOC_GETVERSION, &version);
  TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
  if (result == -1) {
    return 0;
  }
  uint64_t uversion = (uint64_t)version;

  char* rid = id;
  rid = EncodeVarint64(rid, buf.st_dev);
  rid = EncodeVarint64(rid, buf.st_ino);
  rid = EncodeVarint64(rid, uversion);
  assert(rid >= id);
  return static_cast<size_t>(rid - id);
}
390 #endif
391 
392 #if defined(OS_MACOSX) || defined(OS_AIX)
// MacOS/AIX: like the Linux variant above, but uses st_gen (the inode
// generation number from stat) instead of the FS_IOC_GETVERSION ioctl.
// Returns the encoded length, or 0 on failure.
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  // Worst case each of the three fields occupies a full varint64.
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return 0;
  }

  char* rid = id;
  rid = EncodeVarint64(rid, buf.st_dev);
  rid = EncodeVarint64(rid, buf.st_ino);
  rid = EncodeVarint64(rid, buf.st_gen);
  assert(rid >= id);
  return static_cast<size_t>(rid - id);
}
411 #endif
412 /*
413  * PosixRandomAccessFile
414  *
415  * pread() based random-access
416  */
// Takes ownership of `fd`. When io_uring support is compiled in,
// `thread_local_io_urings` supplies (lazily created) per-thread rings
// used by MultiRead().
PosixRandomAccessFile::PosixRandomAccessFile(
    const std::string& fname, int fd, const EnvOptions& options
#if defined(ROCKSDB_IOURING_PRESENT)
    ,
    ThreadLocalPtr* thread_local_io_urings
#endif
    )
    : filename_(fname),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(GetLogicalBufferSize(fd_))
#if defined(ROCKSDB_IOURING_PRESENT)
      ,
      thread_local_io_urings_(thread_local_io_urings)
#endif
{
  // Direct I/O and mmap reads are mutually exclusive.
  assert(!options.use_direct_reads || !options.use_mmap_reads);
  // NOTE(review): this appears to forbid mmap reads on builds with 64-bit
  // pointers (sizeof(void*) == 8) — confirm the intended direction.
  assert(!options.use_mmap_reads || sizeof(void*) < 8);
}
436 
// The file descriptor is the only resource this class owns.
PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }
438 
// Reads up to `n` bytes at `offset` into `scratch`; `*result` reports
// the bytes actually read. Under direct I/O, offset, length and buffer
// must be aligned to GetRequiredBufferAlignment(). Loops over pread()
// until done, a short read (EOF), or a non-EINTR error.
IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
                                     const IOOptions& /*opts*/, Slice* result,
                                     char* scratch,
                                     IODebugContext* /*dbg*/) const {
  if (use_direct_io()) {
    assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
    assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));
  }
  IOStatus s;
  ssize_t r = -1;
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
    if (r <= 0) {
      if (r == -1 && errno == EINTR) {
        // Interrupted by a signal: retry the same pread.
        continue;
      }
      break;
    }
    ptr += r;
    offset += r;
    left -= r;
    if (use_direct_io() &&
        r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
      // Bytes reads don't fill sectors. Should only happen at the end
      // of the file.
      break;
    }
  }
  if (r < 0) {
    // An error: return a non-ok status
    // NOTE(review): `offset` has been advanced past completed reads, so
    // the message reports where the failing pread started.
    s = IOError(
        "While pread offset " + ToString(offset) + " len " + ToString(n),
        filename_, errno);
  }
  *result = Slice(scratch, (r < 0) ? 0 : n - left);
  return s;
}
479 
// Services a batch of read requests. When io_uring is available, the
// requests are submitted in batches of at most kIoUringDepth through a
// per-thread ring; partial completions are resubmitted until each request
// is fully read, errors out, or hits EOF. Without io_uring (or when ring
// creation fails) this falls back to the base class's serialized reads.
IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs,
                                          size_t num_reqs,
                                          const IOOptions& options,
                                          IODebugContext* dbg) {
#if defined(ROCKSDB_IOURING_PRESENT)
  // Fetch (or lazily create and cache) this thread's io_uring instance.
  struct io_uring* iu = nullptr;
  if (thread_local_io_urings_) {
    iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get());
    if (iu == nullptr) {
      iu = CreateIOUring();
      if (iu != nullptr) {
        thread_local_io_urings_->Reset(iu);
      }
    }
  }

  // Init failed, platform doesn't support io_uring. Fall back to
  // serialized reads
  if (iu == nullptr) {
    return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
  }

  // Pairs a request with its iovec and a running count of bytes already
  // read, so partially-completed requests can be resubmitted.
  struct WrappedReadRequest {
    FSReadRequest* req;
    struct iovec iov;
    size_t finished_len;
    explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {}
  };

  autovector<WrappedReadRequest, 32> req_wraps;
  autovector<WrappedReadRequest*, 4> incomplete_rq_list;

  for (size_t i = 0; i < num_reqs; i++) {
    req_wraps.emplace_back(&reqs[i]);
  }

  // Loop until every request has been submitted and no partial
  // completions remain to be resubmitted.
  size_t reqs_off = 0;
  while (num_reqs > reqs_off || !incomplete_rq_list.empty()) {
    size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size();

    // If requests exceed depth, split it into batches
    if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth;

    assert(incomplete_rq_list.size() <= this_reqs);
    for (size_t i = 0; i < this_reqs; i++) {
      WrappedReadRequest* rep_to_submit;
      // Resubmit partially-completed requests before starting new ones.
      if (i < incomplete_rq_list.size()) {
        rep_to_submit = incomplete_rq_list[i];
      } else {
        rep_to_submit = &req_wraps[reqs_off++];
      }
      assert(rep_to_submit->req->len > rep_to_submit->finished_len);
      // Point the iovec at the unread tail of the request's buffer.
      rep_to_submit->iov.iov_base =
          rep_to_submit->req->scratch + rep_to_submit->finished_len;
      rep_to_submit->iov.iov_len =
          rep_to_submit->req->len - rep_to_submit->finished_len;

      struct io_uring_sqe* sqe;
      sqe = io_uring_get_sqe(iu);
      io_uring_prep_readv(
          sqe, fd_, &rep_to_submit->iov, 1,
          rep_to_submit->req->offset + rep_to_submit->finished_len);
      // Stash the wrapper pointer so the completion can be matched below.
      io_uring_sqe_set_data(sqe, rep_to_submit);
    }
    incomplete_rq_list.clear();

    ssize_t ret =
        io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs));
    if (static_cast<size_t>(ret) != this_reqs) {
      fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs);
    }
    assert(static_cast<size_t>(ret) == this_reqs);

    for (size_t i = 0; i < this_reqs; i++) {
      struct io_uring_cqe* cqe;
      WrappedReadRequest* req_wrap;

      // We could use the peek variant here, but this seems safer in terms
      // of our initial wait not reaping all completions
      ret = io_uring_wait_cqe(iu, &cqe);
      assert(!ret);

      req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe));
      FSReadRequest* req = req_wrap->req;
      if (cqe->res < 0) {
        req->result = Slice(req->scratch, 0);
        req->status = IOError("Req failed", filename_, cqe->res);
      } else {
        size_t bytes_read = static_cast<size_t>(cqe->res);
        TEST_SYNC_POINT_CALLBACK(
            "PosixRandomAccessFile::MultiRead:io_uring_result", &bytes_read);
        if (bytes_read == req_wrap->iov.iov_len) {
          // Fully satisfied.
          req->result = Slice(req->scratch, req->len);
          req->status = IOStatus::OK();
        } else if (bytes_read == 0) {
          // cqe->res == 0 can means EOF, or can mean partial results. See
          // comment
          // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435
          // Fall back to pread in this case.
          Slice tmp_slice;
          req->status =
              Read(req->offset + req_wrap->finished_len,
                   req->len - req_wrap->finished_len, options, &tmp_slice,
                   req->scratch + req_wrap->finished_len, dbg);
          req->result =
              Slice(req->scratch, req_wrap->finished_len + tmp_slice.size());
        } else if (bytes_read < req_wrap->iov.iov_len) {
          // Partial read: remember progress and resubmit in the next batch.
          assert(bytes_read > 0);
          assert(bytes_read + req_wrap->finished_len < req->len);
          req_wrap->finished_len += bytes_read;
          incomplete_rq_list.push_back(req_wrap);
        } else {
          // Kernel reported more bytes than we asked for: treat as an error.
          req->result = Slice(req->scratch, 0);
          req->status = IOError("Req returned more bytes than requested",
                                filename_, cqe->res);
        }
      }
      io_uring_cqe_seen(iu, cqe);
    }
  }
  return IOStatus::OK();
#else
  return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg);
#endif
}
605 
// Asks the OS to read the given range into the page cache ahead of time.
// A no-op under direct I/O (the page cache is bypassed) and on platforms
// with neither readahead() nor F_RDADVISE.
IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n,
                                         const IOOptions& /*opts*/,
                                         IODebugContext* /*dbg*/) {
  IOStatus s;
  if (!use_direct_io()) {
    ssize_t r = 0;
#ifdef OS_LINUX
    r = readahead(fd_, offset, n);
#endif
#ifdef OS_MACOSX
    // MacOS has no readahead(); use the fcntl read-advisory instead.
    radvisory advice;
    advice.ra_offset = static_cast<off_t>(offset);
    advice.ra_count = static_cast<int>(n);
    r = fcntl(fd_, F_RDADVISE, &advice);
#endif
    if (r == -1) {
      s = IOError("While prefetching offset " + ToString(offset) + " len " +
                      ToString(n),
                  filename_, errno);
    }
  }
  return s;
}
629 
630 #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX)
// Delegates to the platform-specific PosixHelper implementation above.
size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
}
634 #endif
635 
636 void PosixRandomAccessFile::Hint(AccessPattern pattern) {
637   if (use_direct_io()) {
638     return;
639   }
640   switch (pattern) {
641     case kNormal:
642       Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL);
643       break;
644     case kRandom:
645       Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM);
646       break;
647     case kSequential:
648       Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL);
649       break;
650     case kWillNeed:
651       Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED);
652       break;
653     case kWontNeed:
654       Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED);
655       break;
656     default:
657       assert(false);
658       break;
659   }
660 }
661 
// Drops the OS page cache for the given byte range (Linux only). A no-op
// on other platforms and under direct I/O.
IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
  if (use_direct_io()) {
    return IOStatus::OK();
  }
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  // free OS pages
  int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
  if (ret == 0) {
    return IOStatus::OK();
  }
  return IOError("While fadvise NotNeeded offset " + ToString(offset) +
                     " len " + ToString(length),
                 filename_, errno);
#endif
}
681 
682 /*
683  * PosixMmapReadableFile
684  *
685  * mmap() based random-access
686  */
// Takes ownership of `fd` and the mapping: base[0,length-1] must contain
// the mmapped contents of the file. The destructor unmaps the region and
// closes the fd.
PosixMmapReadableFile::PosixMmapReadableFile(const int fd,
                                             const std::string& fname,
                                             void* base, size_t length,
                                             const EnvOptions& options)
    : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) {
#ifdef NDEBUG
  (void)options;
#endif
  fd_ = fd_ + 0;  // suppress the warning for used variables
  assert(options.use_mmap_reads);
  assert(!options.use_direct_reads);
}
700 
PosixMmapReadableFile::~PosixMmapReadableFile() {
  int ret = munmap(mmapped_region_, length_);
  if (ret != 0) {
    // Destructors cannot return an error; just report the failure.
    // NOTE(review): message goes to stdout — stderr would be more usual.
    fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n",
            mmapped_region_, length_);
  }
  close(fd_);
}
709 
// Zero-copy read: `*result` points directly into the mapped region, so
// `scratch` is unused. Requests past EOF fail; requests overlapping EOF
// are truncated to the mapped length.
IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n,
                                     const IOOptions& /*opts*/, Slice* result,
                                     char* /*scratch*/,
                                     IODebugContext* /*dbg*/) const {
  IOStatus s;
  if (offset > length_) {
    *result = Slice();
    return IOError("While mmap read offset " + ToString(offset) +
                       " larger than file length " + ToString(length_),
                   filename_, EINVAL);
  } else if (offset + n > length_) {
    // Clamp reads that extend beyond the mapping.
    n = static_cast<size_t>(length_ - offset);
  }
  *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n);
  return s;
}
726 
727 IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
728 #ifndef OS_LINUX
729   (void)offset;
730   (void)length;
731   return IOStatus::OK();
732 #else
733   // free OS pages
734   int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
735   if (ret == 0) {
736     return IOStatus::OK();
737   }
738   return IOError("While fadvise not needed. Offset " + ToString(offset) +
739                      " len" + ToString(length),
740                  filename_, errno);
741 #endif
742 }
743 
744 /*
745  * PosixMmapFile
746  *
747  * We preallocate up to an extra megabyte and use memcpy to append new
748  * data to the file.  This is safe since we either properly close the
749  * file before reading from it, or for log files, the reading code
750  * knows enough to skip zero suffixes.
751  */
// Unmaps the current write window (if any), advancing file_offset_ past
// it and clearing all window pointers. Also grows the next mapping size,
// doubling up to a 1MB cap, so large appends need fewer remaps.
IOStatus PosixMmapFile::UnmapCurrentRegion() {
  TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
  if (base_ != nullptr) {
    int munmap_status = munmap(base_, limit_ - base_);
    if (munmap_status != 0) {
      return IOError("While munmap", filename_, munmap_status);
    }
    // The whole window now counts as written-out file content.
    file_offset_ += limit_ - base_;
    base_ = nullptr;
    limit_ = nullptr;
    last_sync_ = nullptr;
    dst_ = nullptr;

    // Increase the amount we map the next time, but capped at 1MB
    if (map_size_ < (1 << 20)) {
      map_size_ *= 2;
    }
  }
  return IOStatus::OK();
}
772 
773 IOStatus PosixMmapFile::MapNewRegion() {
774 #ifdef ROCKSDB_FALLOCATE_PRESENT
775   assert(base_ == nullptr);
776   TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds);
777   // we can't fallocate with FALLOC_FL_KEEP_SIZE here
778   if (allow_fallocate_) {
779     IOSTATS_TIMER_GUARD(allocate_nanos);
780     int alloc_status = fallocate(fd_, 0, file_offset_, map_size_);
781     if (alloc_status != 0) {
782       // fallback to posix_fallocate
783       alloc_status = posix_fallocate(fd_, file_offset_, map_size_);
784     }
785     if (alloc_status != 0) {
786       return IOStatus::IOError("Error allocating space to file : " + filename_ +
787                                "Error : " + strerror(alloc_status));
788     }
789   }
790 
791   TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds);
792   void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_,
793                    file_offset_);
794   if (ptr == MAP_FAILED) {
795     return IOStatus::IOError("MMap failed on " + filename_);
796   }
797   TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds);
798 
799   base_ = reinterpret_cast<char*>(ptr);
800   limit_ = base_ + map_size_;
801   dst_ = base_;
802   last_sync_ = base_;
803   return IOStatus::OK();
804 #else
805   return IOStatus::NotSupported("This platform doesn't support fallocate()");
806 #endif
807 }
808 
// Synchronously flushes the not-yet-synced portion of the mapped window
// ([last_sync_, dst_)) to disk, rounded out to page boundaries. A no-op
// when nothing new has been appended since the last sync.
IOStatus PosixMmapFile::Msync() {
  if (dst_ == last_sync_) {
    return IOStatus::OK();
  }
  // Find the beginnings of the pages that contain the first and last
  // bytes to be synced.
  size_t p1 = TruncateToPageBoundary(last_sync_ - base_);
  size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1);
  // Mark synced before the call; a failed msync returns an error anyway.
  last_sync_ = dst_;
  TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds);
  if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) {
    return IOError("While msync", filename_, errno);
  }
  return IOStatus::OK();
}
824 
// Takes ownership of `fd`. `page_size` must be a power of two; the
// initial mapping size is 64KB rounded up to a page multiple (it doubles
// on each remap, capped at 1MB — see UnmapCurrentRegion()).
PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size,
                             const EnvOptions& options)
    : filename_(fname),
      fd_(fd),
      page_size_(page_size),
      map_size_(Roundup(65536, page_size)),
      base_(nullptr),
      limit_(nullptr),
      dst_(nullptr),
      last_sync_(nullptr),
      file_offset_(0) {
#ifdef ROCKSDB_FALLOCATE_PRESENT
  allow_fallocate_ = options.allow_fallocate;
  fallocate_with_keep_size_ = options.fallocate_with_keep_size;
#else
  (void)options;
#endif
  // Power-of-two check: required by TruncateToPageBoundary().
  assert((page_size & (page_size - 1)) == 0);
  assert(options.use_mmap_writes);
  assert(!options.use_direct_writes);
}
846 
PosixMmapFile::~PosixMmapFile() {
  // Close() sets fd_ to -1, so this only runs if the caller never closed.
  if (fd_ >= 0) {
    PosixMmapFile::Close(IOOptions(), nullptr);
  }
}
852 
853 IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/,
854                                IODebugContext* /*dbg*/) {
855   const char* src = data.data();
856   size_t left = data.size();
857   while (left > 0) {
858     assert(base_ <= dst_);
859     assert(dst_ <= limit_);
860     size_t avail = limit_ - dst_;
861     if (avail == 0) {
862       IOStatus s = UnmapCurrentRegion();
863       if (!s.ok()) {
864         return s;
865       }
866       s = MapNewRegion();
867       if (!s.ok()) {
868         return s;
869       }
870       TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds);
871     }
872 
873     size_t n = (left <= avail) ? left : avail;
874     assert(dst_);
875     memcpy(dst_, src, n);
876     dst_ += n;
877     src += n;
878     left -= n;
879   }
880   return IOStatus::OK();
881 }
882 
// Unmaps the current window, truncates the file back to the bytes
// actually written (the mapping over-extends the file), and closes the
// fd. Reports the first error encountered but always releases resources.
IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  IOStatus s;
  // Bytes of the current window that were mapped but never written.
  size_t unused = limit_ - dst_;

  s = UnmapCurrentRegion();
  if (!s.ok()) {
    s = IOError("While closing mmapped file", filename_, errno);
  } else if (unused > 0) {
    // Trim the extra space at the end of the file
    if (ftruncate(fd_, file_offset_ - unused) < 0) {
      s = IOError("While ftruncating mmaped file", filename_, errno);
    }
  }

  if (close(fd_) < 0) {
    // Keep the earlier error if there was one.
    if (s.ok()) {
      s = IOError("While closing mmapped file", filename_, errno);
    }
  }

  // Mark closed so the destructor does not try to close again.
  fd_ = -1;
  base_ = nullptr;
  limit_ = nullptr;
  return s;
}
909 
// Appended data already lives in the shared mapping, so there is no
// user-space buffer to flush.
IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  return IOStatus::OK();
}
914 
// Flushes file data (not necessarily metadata) to stable storage:
// fdatasync() for previously unmapped regions, then msync() for the
// dirty part of the current window.
IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/,
                             IODebugContext* /*dbg*/) {
  if (fdatasync(fd_) < 0) {
    return IOError("While fdatasync mmapped file", filename_, errno);
  }

  return Msync();
}
923 
924 /**
925  * Flush data as well as metadata to stable storage.
926  */
IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/,
                              IODebugContext* /*dbg*/) {
  // fsync() (unlike Sync()'s fdatasync()) also flushes file metadata;
  // then msync() flushes the dirty part of the current mapped window.
  if (fsync(fd_) < 0) {
    return IOError("While fsync mmaped file", filename_, errno);
  }

  return Msync();
}
935 
936 /**
937  * Get the size of valid data in the file. This will not match the
938  * size that is returned from the filesystem because we use mmap
939  * to extend file by map_size every time.
940  */
uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/,
                                    IODebugContext* /*dbg*/) {
  // Logical size = bytes in fully-written regions (file_offset_) plus the
  // written prefix of the current window; the on-disk size is larger.
  size_t used = dst_ - base_;
  return file_offset_ + used;
}
946 
947 IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) {
948 #ifndef OS_LINUX
949   (void)offset;
950   (void)length;
951   return IOStatus::OK();
952 #else
953   // free OS pages
954   int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
955   if (ret == 0) {
956     return IOStatus::OK();
957   }
958   return IOError("While fadvise NotNeeded mmapped file", filename_, errno);
959 #endif
960 }
961 
#ifdef ROCKSDB_FALLOCATE_PRESENT
IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len,
                                 const IOOptions& /*opts*/,
                                 IODebugContext* /*dbg*/) {
  // fallocate() takes off_t arguments; callers must stay within its range.
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds);
  if (allow_fallocate_) {
    const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
    if (fallocate(fd_, mode, static_cast<off_t>(offset),
                  static_cast<off_t>(len)) != 0) {
      return IOError("While fallocate offset " + ToString(offset) + " len " +
                         ToString(len),
                     filename_, errno);
    }
  }
  // Either fallocate succeeded or preallocation is disabled.
  return IOStatus::OK();
}
#endif
984 
985 /*
986  * PosixWritableFile
987  *
988  * Use posix write to write data to a file.
989  */
990 PosixWritableFile::PosixWritableFile(const std::string& fname, int fd,
991                                      const EnvOptions& options)
992     : FSWritableFile(options),
993       filename_(fname),
994       use_direct_io_(options.use_direct_writes),
995       fd_(fd),
996       filesize_(0),
997       logical_sector_size_(GetLogicalBufferSize(fd_)) {
998 #ifdef ROCKSDB_FALLOCATE_PRESENT
999   allow_fallocate_ = options.allow_fallocate;
1000   fallocate_with_keep_size_ = options.fallocate_with_keep_size;
1001 #endif
1002 #ifdef ROCKSDB_RANGESYNC_PRESENT
1003   sync_file_range_supported_ = IsSyncFileRangeSupported(fd_);
1004 #endif  // ROCKSDB_RANGESYNC_PRESENT
1005   assert(!options.use_mmap_writes);
1006 }
1007 
PosixWritableFile::~PosixWritableFile() {
  // Best-effort close if the caller never called Close(). The qualified call
  // deliberately avoids virtual dispatch from within a destructor.
  if (fd_ >= 0) {
    PosixWritableFile::Close(IOOptions(), nullptr);
  }
}
1013 
1014 IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/,
1015                                    IODebugContext* /*dbg*/) {
1016   if (use_direct_io()) {
1017     assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
1018     assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
1019   }
1020   const char* src = data.data();
1021   size_t nbytes = data.size();
1022 
1023   if (!PosixWrite(fd_, src, nbytes)) {
1024     return IOError("While appending to file", filename_, errno);
1025   }
1026 
1027   filesize_ += nbytes;
1028   return IOStatus::OK();
1029 }
1030 
1031 IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset,
1032                                              const IOOptions& /*opts*/,
1033                                              IODebugContext* /*dbg*/) {
1034   if (use_direct_io()) {
1035     assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
1036     assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment()));
1037     assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment()));
1038   }
1039   assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
1040   const char* src = data.data();
1041   size_t nbytes = data.size();
1042   if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
1043     return IOError("While pwrite to file at offset " + ToString(offset),
1044                    filename_, errno);
1045   }
1046   filesize_ = offset + nbytes;
1047   return IOStatus::OK();
1048 }
1049 
1050 IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/,
1051                                      IODebugContext* /*dbg*/) {
1052   IOStatus s;
1053   int r = ftruncate(fd_, size);
1054   if (r < 0) {
1055     s = IOError("While ftruncate file to size " + ToString(size), filename_,
1056                 errno);
1057   } else {
1058     filesize_ = size;
1059   }
1060   return s;
1061 }
1062 
IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/,
                                  IODebugContext* /*dbg*/) {
  IOStatus s;

  size_t block_size;
  size_t last_allocated_block;
  GetPreallocationStatus(&block_size, &last_allocated_block);
  if (last_allocated_block > 0) {
    // trim the extra space preallocated at the end of the file
    // NOTE(ljin): we probably don't want to surface failure as an IOError,
    // but it will be nice to log these errors.
    int dummy __attribute__((__unused__));
    dummy = ftruncate(fd_, filesize_);
#if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE) && \
    !defined(TRAVIS)
    // in some file systems, ftruncate only trims trailing space if the
    // new file size is smaller than the current size. Calling fallocate
    // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused
    // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following
    // filesystems:
    //   XFS (since Linux 2.6.38)
    //   ext4 (since Linux 3.0)
    //   Btrfs (since Linux 3.7)
    //   tmpfs (since Linux 3.5)
    // We ignore error since failure of this operation does not affect
    // correctness.
    // TRAVIS - this code does not work on TRAVIS filesystems.
    // the FALLOC_FL_KEEP_SIZE option is expected to not change the size
    // of the file, but it does. Simple strace report will show that.
    // While we work with Travis-CI team to figure out if this is a
    // quirk of Docker/AUFS, we will comment this out.
    struct stat file_stats;
    int result = fstat(fd_, &file_stats);
    // After ftruncate, we check whether ftruncate has the correct behavior.
    // If not, we should hack it with FALLOC_FL_PUNCH_HOLE
    // Compare the file size rounded up to whole blocks against the number of
    // blocks actually allocated (st_blocks is in 512-byte units); a mismatch
    // means preallocated blocks survived the ftruncate above.
    if (result == 0 &&
        (file_stats.st_size + file_stats.st_blksize - 1) /
                file_stats.st_blksize !=
            file_stats.st_blocks / (file_stats.st_blksize / 512)) {
      IOSTATS_TIMER_GUARD(allocate_nanos);
      if (allow_fallocate_) {
        // Punch a hole over the preallocated tail; return value is
        // intentionally ignored (see comment above).
        fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_,
                  block_size * last_allocated_block - filesize_);
      }
    }
#endif
  }

  if (close(fd_) < 0) {
    s = IOError("While closing file after writing", filename_, errno);
  }
  // Invalidate fd_ unconditionally so the destructor does not close again.
  fd_ = -1;
  return s;
}
1117 
1118 // write out the cached data to the OS cache
1119 IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/,
1120                                   IODebugContext* /*dbg*/) {
1121   return IOStatus::OK();
1122 }
1123 
1124 IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/,
1125                                  IODebugContext* /*dbg*/) {
1126   if (fdatasync(fd_) < 0) {
1127     return IOError("While fdatasync", filename_, errno);
1128   }
1129   return IOStatus::OK();
1130 }
1131 
1132 IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/,
1133                                   IODebugContext* /*dbg*/) {
1134   if (fsync(fd_) < 0) {
1135     return IOError("While fsync", filename_, errno);
1136   }
1137   return IOStatus::OK();
1138 }
1139 
// Sync()/Fsync() here only pass the immutable fd_ to fdatasync()/fsync(),
// so they can safely run concurrently with writes from another thread.
bool PosixWritableFile::IsSyncThreadSafe() const { return true; }
1141 
// Returns the cached logical size (maintained by Append/PositionedAppend/
// Truncate) instead of querying the filesystem.
uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/,
                                        IODebugContext* /*dbg*/) {
  return filesize_;
}
1146 
void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) {
#ifdef OS_LINUX
// Suppress Valgrind "Unimplemented functionality" error.
#ifndef ROCKSDB_VALGRIND_RUN
  // Skip the syscall if the requested hint is already in effect.
  if (hint == write_hint_) {
    return;
  }
  // F_SET_RW_HINT advises the kernel about the expected write lifetime of
  // this file's data. Only remember the hint if the kernel accepted it.
  if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) {
    write_hint_ = hint;
  }
#else
  (void)hint;
#endif  // ROCKSDB_VALGRIND_RUN
#else
  (void)hint;
#endif  // OS_LINUX
}
1164 
1165 IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) {
1166   if (use_direct_io()) {
1167     return IOStatus::OK();
1168   }
1169 #ifndef OS_LINUX
1170   (void)offset;
1171   (void)length;
1172   return IOStatus::OK();
1173 #else
1174   // free OS pages
1175   int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
1176   if (ret == 0) {
1177     return IOStatus::OK();
1178   }
1179   return IOError("While fadvise NotNeeded", filename_, errno);
1180 #endif
1181 }
1182 
#ifdef ROCKSDB_FALLOCATE_PRESENT
IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len,
                                     const IOOptions& /*opts*/,
                                     IODebugContext* /*dbg*/) {
  // fallocate() takes off_t arguments; callers must stay within its range.
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds);
  IOSTATS_TIMER_GUARD(allocate_nanos);
  if (allow_fallocate_) {
    const int mode = fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0;
    if (fallocate(fd_, mode, static_cast<off_t>(offset),
                  static_cast<off_t>(len)) != 0) {
      return IOError("While fallocate offset " + ToString(offset) + " len " +
                         ToString(len),
                     filename_, errno);
    }
  }
  // Either fallocate succeeded or preallocation is disabled.
  return IOStatus::OK();
}
#endif
1206 
IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes,
                                      const IOOptions& opts,
                                      IODebugContext* dbg) {
#ifdef ROCKSDB_RANGESYNC_PRESENT
  // sync_file_range() takes off_t parameters; callers must stay in range.
  assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max()));
  // sync_file_range_supported_ is probed once at construction time.
  if (sync_file_range_supported_) {
    int ret;
    if (strict_bytes_per_sync_) {
      // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length
      // that spans all bytes written so far tells `sync_file_range` to wait for
      // any outstanding writeback requests to finish before issuing a new one.
      ret =
          sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes),
                          SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE);
    } else {
      // Fire-and-forget writeback of just the requested range.
      ret = sync_file_range(fd_, static_cast<off_t>(offset),
                            static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE);
    }
    if (ret != 0) {
      return IOError("While sync_file_range returned " + ToString(ret),
                     filename_, errno);
    }
    return IOStatus::OK();
  }
#endif  // ROCKSDB_RANGESYNC_PRESENT
  // Fall back to the generic implementation when sync_file_range is
  // unavailable or unsupported on this file descriptor.
  return FSWritableFile::RangeSync(offset, nbytes, opts, dbg);
}
1235 
#ifdef OS_LINUX
// Writes a unique identifier for this file into `id` (at most `max_size`
// bytes) by delegating to PosixHelper::GetUniqueIdFromFile; returns the
// number of bytes written.
size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const {
  return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size);
}
#endif
1241 
1242 /*
1243  * PosixRandomRWFile
1244  */
1245 
1246 PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd,
1247                                      const EnvOptions& /*options*/)
1248     : filename_(fname), fd_(fd) {}
1249 
1250 PosixRandomRWFile::~PosixRandomRWFile() {
1251   if (fd_ >= 0) {
1252     Close(IOOptions(), nullptr);
1253   }
1254 }
1255 
1256 IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data,
1257                                   const IOOptions& /*opts*/,
1258                                   IODebugContext* /*dbg*/) {
1259   const char* src = data.data();
1260   size_t nbytes = data.size();
1261   if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) {
1262     return IOError(
1263         "While write random read/write file at offset " + ToString(offset),
1264         filename_, errno);
1265   }
1266 
1267   return IOStatus::OK();
1268 }
1269 
1270 IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n,
1271                                  const IOOptions& /*opts*/, Slice* result,
1272                                  char* scratch, IODebugContext* /*dbg*/) const {
1273   size_t left = n;
1274   char* ptr = scratch;
1275   while (left > 0) {
1276     ssize_t done = pread(fd_, ptr, left, offset);
1277     if (done < 0) {
1278       // error while reading from file
1279       if (errno == EINTR) {
1280         // read was interrupted, try again.
1281         continue;
1282       }
1283       return IOError("While reading random read/write file offset " +
1284                          ToString(offset) + " len " + ToString(n),
1285                      filename_, errno);
1286     } else if (done == 0) {
1287       // Nothing more to read
1288       break;
1289     }
1290 
1291     // Read `done` bytes
1292     ptr += done;
1293     offset += done;
1294     left -= done;
1295   }
1296 
1297   *result = Slice(scratch, n - left);
1298   return IOStatus::OK();
1299 }
1300 
1301 IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/,
1302                                   IODebugContext* /*dbg*/) {
1303   return IOStatus::OK();
1304 }
1305 
1306 IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/,
1307                                  IODebugContext* /*dbg*/) {
1308   if (fdatasync(fd_) < 0) {
1309     return IOError("While fdatasync random read/write file", filename_, errno);
1310   }
1311   return IOStatus::OK();
1312 }
1313 
1314 IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/,
1315                                   IODebugContext* /*dbg*/) {
1316   if (fsync(fd_) < 0) {
1317     return IOError("While fsync random read/write file", filename_, errno);
1318   }
1319   return IOStatus::OK();
1320 }
1321 
1322 IOStatus PosixRandomRWFile::Close(const IOOptions& /*opts*/,
1323                                   IODebugContext* /*dbg*/) {
1324   if (close(fd_) < 0) {
1325     return IOError("While close random read/write file", filename_, errno);
1326   }
1327   fd_ = -1;
1328   return IOStatus::OK();
1329 }
1330 
PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() {
  // TODO should have error handling though not much we can do...
  // Release the mapping; a munmap() failure is silently ignored (see TODO).
  munmap(this->base_, length_);
}
1335 
1336 /*
1337  * PosixDirectory
1338  */
1339 
1340 PosixDirectory::~PosixDirectory() { close(fd_); }
1341 
1342 IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/,
1343                                IODebugContext* /*dbg*/) {
1344 #ifndef OS_AIX
1345   if (fsync(fd_) == -1) {
1346     return IOError("While fsync", "a directory", errno);
1347   }
1348 #endif
1349   return IOStatus::OK();
1350 }
1351 }  // namespace ROCKSDB_NAMESPACE
1352 #endif
1353