// Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
// This source code is licensed under both the GPLv2 (found in the
// COPYING file in the root directory) and Apache 2.0 License
// (found in the LICENSE.Apache file in the root directory).
//
// Copyright (c) 2011 The LevelDB Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file. See the AUTHORS file for names of contributors.

#ifdef ROCKSDB_LIB_IO_POSIX
#include "env/io_posix.h"
#include <errno.h>
#include <fcntl.h>
#include <algorithm>
#if defined(OS_LINUX)
#include <linux/fs.h>
#ifndef FALLOC_FL_KEEP_SIZE
#include <linux/falloc.h>
#endif
#endif
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/types.h>
#ifdef OS_LINUX
#include <sys/statfs.h>
#include <sys/syscall.h>
#include <sys/sysmacros.h>
#endif
#include "monitoring/iostats_context_imp.h"
#include "port/port.h"
#include "rocksdb/slice.h"
#include "test_util/sync_point.h"
#include "util/autovector.h"
#include "util/coding.h"
#include "util/string_util.h"

// Older kernel headers may not declare F_SET_RW_HINT; define it from the
// Linux-specific fcntl base so write-lifetime hints still compile.
#if defined(OS_LINUX) && !defined(F_SET_RW_HINT)
#define F_LINUX_SPECIFIC_BASE 1024
#define F_SET_RW_HINT (F_LINUX_SPECIFIC_BASE + 12)
#endif

namespace ROCKSDB_NAMESPACE {

// A wrapper for fadvise, if the platform doesn't support fadvise,
// it will simply return 0.
int Fadvise(int fd, off_t offset, size_t len, int advice) {
#ifdef OS_LINUX
  return posix_fadvise(fd, offset, len, advice);
#else
  (void)fd;
  (void)offset;
  (void)len;
  (void)advice;
  return 0;  // simply do nothing.
#endif
}

namespace {

// On MacOS (and probably *BSD), the posix write and pwrite calls do not
// support buffers larger than 2^31-1 bytes. These two wrappers fix this issue
// by cutting the buffer in 1GB chunks. We use this chunk size to be sure to
// keep the writes aligned.

// Write `nbyte` bytes from `buf` to `fd`, retrying on EINTR and looping over
// short writes. Returns false (with errno set) on any non-EINTR error.
bool PosixWrite(int fd, const char* buf, size_t nbyte) {
  const size_t kLimit1Gb = 1UL << 30;

  const char* src = buf;
  size_t left = nbyte;

  while (left != 0) {
    size_t bytes_to_write = std::min(left, kLimit1Gb);

    ssize_t done = write(fd, src, bytes_to_write);
    if (done < 0) {
      if (errno == EINTR) {
        // Interrupted by a signal before any data was written; retry.
        continue;
      }
      return false;
    }
    left -= done;
    src += done;
  }
  return true;
}

// Positioned variant of PosixWrite: pwrite `nbyte` bytes at `offset`,
// retrying on EINTR and advancing the offset across short writes.
bool PosixPositionedWrite(int fd, const char* buf, size_t nbyte, off_t offset) {
  const size_t kLimit1Gb = 1UL << 30;

  const char* src = buf;
  size_t left = nbyte;

  while (left != 0) {
    size_t bytes_to_write = std::min(left, kLimit1Gb);

    ssize_t done = pwrite(fd, src, bytes_to_write, offset);
    if (done < 0) {
      if (errno == EINTR) {
        // Interrupted by a signal; retry at the same offset.
        continue;
      }
      return false;
    }
    left -= done;
    offset += done;
    src += done;
  }

  return true;
}

// Best-effort query of the device's logical block size via sysfs; falls back
// to kDefaultPageSize anywhere the lookup is not possible.
size_t GetLogicalBufferSize(int __attribute__((__unused__)) fd) {
#ifdef OS_LINUX
  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return kDefaultPageSize;
  }
  if (major(buf.st_dev) == 0) {
    // Unnamed devices (e.g. non-device mounts), reserved as null device number.
    // These don't have an entry in /sys/dev/block/. Return a sensible default.
    return kDefaultPageSize;
  }

  // Reading queue/logical_block_size does not require special permissions.
129 const int kBufferSize = 100; 130 char path[kBufferSize]; 131 char real_path[PATH_MAX + 1]; 132 snprintf(path, kBufferSize, "/sys/dev/block/%u:%u", major(buf.st_dev), 133 minor(buf.st_dev)); 134 if (realpath(path, real_path) == nullptr) { 135 return kDefaultPageSize; 136 } 137 std::string device_dir(real_path); 138 if (!device_dir.empty() && device_dir.back() == '/') { 139 device_dir.pop_back(); 140 } 141 // NOTE: sda3 and nvme0n1p1 do not have a `queue/` subdir, only the parent sda 142 // and nvme0n1 have it. 143 // $ ls -al '/sys/dev/block/8:3' 144 // lrwxrwxrwx. 1 root root 0 Jun 26 01:38 /sys/dev/block/8:3 -> 145 // ../../block/sda/sda3 146 // $ ls -al '/sys/dev/block/259:4' 147 // lrwxrwxrwx 1 root root 0 Jan 31 16:04 /sys/dev/block/259:4 -> 148 // ../../devices/pci0000:17/0000:17:00.0/0000:18:00.0/nvme/nvme0/nvme0n1/nvme0n1p1 149 size_t parent_end = device_dir.rfind('/', device_dir.length() - 1); 150 if (parent_end == std::string::npos) { 151 return kDefaultPageSize; 152 } 153 size_t parent_begin = device_dir.rfind('/', parent_end - 1); 154 if (parent_begin == std::string::npos) { 155 return kDefaultPageSize; 156 } 157 std::string parent = 158 device_dir.substr(parent_begin + 1, parent_end - parent_begin - 1); 159 std::string child = device_dir.substr(parent_end + 1, std::string::npos); 160 if (parent != "block" && 161 (child.compare(0, 4, "nvme") || child.find('p') != std::string::npos)) { 162 device_dir = device_dir.substr(0, parent_end); 163 } 164 std::string fname = device_dir + "/queue/logical_block_size"; 165 FILE* fp; 166 size_t size = 0; 167 fp = fopen(fname.c_str(), "r"); 168 if (fp != nullptr) { 169 char* line = nullptr; 170 size_t len = 0; 171 if (getline(&line, &len, fp) != -1) { 172 sscanf(line, "%zu", &size); 173 } 174 free(line); 175 fclose(fp); 176 } 177 if (size != 0 && (size & (size - 1)) == 0) { 178 return size; 179 } 180 #endif 181 return kDefaultPageSize; 182 } 183 184 #ifdef ROCKSDB_RANGESYNC_PRESENT 185 186 #if 
!defined(ZFS_SUPER_MAGIC) 187 // The magic number for ZFS was not exposed until recently. It should be fixed 188 // forever so we can just copy the magic number here. 189 #define ZFS_SUPER_MAGIC 0x2fc12fc1 190 #endif 191 192 bool IsSyncFileRangeSupported(int fd) { 193 // The approach taken in this function is to build a blacklist of cases where 194 // we know `sync_file_range` definitely will not work properly despite passing 195 // the compile-time check (`ROCKSDB_RANGESYNC_PRESENT`). If we are unsure, or 196 // if any of the checks fail in unexpected ways, we allow `sync_file_range` to 197 // be used. This way should minimize risk of impacting existing use cases. 198 struct statfs buf; 199 int ret = fstatfs(fd, &buf); 200 assert(ret == 0); 201 if (ret == 0 && buf.f_type == ZFS_SUPER_MAGIC) { 202 // Testing on ZFS showed the writeback did not happen asynchronously when 203 // `sync_file_range` was called, even though it returned success. Avoid it 204 // and use `fdatasync` instead to preserve the contract of `bytes_per_sync`, 205 // even though this'll incur extra I/O for metadata. 206 return false; 207 } 208 209 ret = sync_file_range(fd, 0 /* offset */, 0 /* nbytes */, 0 /* flags */); 210 assert(!(ret == -1 && errno != ENOSYS)); 211 if (ret == -1 && errno == ENOSYS) { 212 // `sync_file_range` is not implemented on all platforms even if 213 // compile-time checks pass and a supported filesystem is in-use. For 214 // example, using ext4 on WSL (Windows Subsystem for Linux), 215 // `sync_file_range()` returns `ENOSYS` 216 // ("Function not implemented"). 217 return false; 218 } 219 // None of the cases on the blacklist matched, so allow `sync_file_range` use. 
220 return true; 221 } 222 223 #undef ZFS_SUPER_MAGIC 224 225 #endif // ROCKSDB_RANGESYNC_PRESENT 226 227 } // anonymous namespace 228 229 /* 230 * DirectIOHelper 231 */ 232 #ifndef NDEBUG 233 namespace { 234 235 bool IsSectorAligned(const size_t off, size_t sector_size) { 236 return off % sector_size == 0; 237 } 238 239 bool IsSectorAligned(const void* ptr, size_t sector_size) { 240 return uintptr_t(ptr) % sector_size == 0; 241 } 242 243 } // namespace 244 #endif 245 246 /* 247 * PosixSequentialFile 248 */ 249 PosixSequentialFile::PosixSequentialFile(const std::string& fname, FILE* file, 250 int fd, const EnvOptions& options) 251 : filename_(fname), 252 file_(file), 253 fd_(fd), 254 use_direct_io_(options.use_direct_reads), 255 logical_sector_size_(GetLogicalBufferSize(fd_)) { 256 assert(!options.use_direct_reads || !options.use_mmap_reads); 257 } 258 259 PosixSequentialFile::~PosixSequentialFile() { 260 if (!use_direct_io()) { 261 assert(file_); 262 fclose(file_); 263 } else { 264 assert(fd_); 265 close(fd_); 266 } 267 } 268 269 IOStatus PosixSequentialFile::Read(size_t n, const IOOptions& /*opts*/, 270 Slice* result, char* scratch, 271 IODebugContext* /*dbg*/) { 272 assert(result != nullptr && !use_direct_io()); 273 IOStatus s; 274 size_t r = 0; 275 do { 276 r = fread_unlocked(scratch, 1, n, file_); 277 } while (r == 0 && ferror(file_) && errno == EINTR); 278 *result = Slice(scratch, r); 279 if (r < n) { 280 if (feof(file_)) { 281 // We leave status as ok if we hit the end of the file 282 // We also clear the error so that the reads can continue 283 // if a new data is written to the file 284 clearerr(file_); 285 } else { 286 // A partial read with an error: return a non-ok status 287 s = IOError("While reading file sequentially", filename_, errno); 288 } 289 } 290 return s; 291 } 292 293 IOStatus PosixSequentialFile::PositionedRead(uint64_t offset, size_t n, 294 const IOOptions& /*opts*/, 295 Slice* result, char* scratch, 296 IODebugContext* /*dbg*/) { 297 
  assert(use_direct_io());
  assert(IsSectorAligned(offset, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(n, GetRequiredBufferAlignment()));
  assert(IsSectorAligned(scratch, GetRequiredBufferAlignment()));

  IOStatus s;
  ssize_t r = -1;
  size_t left = n;
  char* ptr = scratch;
  while (left > 0) {
    r = pread(fd_, ptr, left, static_cast<off_t>(offset));
    if (r <= 0) {
      if (r == -1 && errno == EINTR) {
        // Interrupted before any data was read; retry.
        continue;
      }
      break;
    }
    ptr += r;
    offset += r;
    left -= r;
    if (r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) {
      // Bytes reads don't fill sectors. Should only happen at the end
      // of the file.
      break;
    }
  }
  if (r < 0) {
    // An error: return a non-ok status
    s = IOError(
        "While pread " + ToString(n) + " bytes from offset " + ToString(offset),
        filename_, errno);
  }
  *result = Slice(scratch, (r < 0) ? 0 : n - left);
  return s;
}

IOStatus PosixSequentialFile::Skip(uint64_t n) {
  if (fseek(file_, static_cast<long int>(n), SEEK_CUR)) {
    return IOError("While fseek to skip " + ToString(n) + " bytes", filename_,
                   errno);
  }
  return IOStatus::OK();
}

IOStatus PosixSequentialFile::InvalidateCache(size_t offset, size_t length) {
#ifndef OS_LINUX
  (void)offset;
  (void)length;
  return IOStatus::OK();
#else
  if (!use_direct_io()) {
    // free OS pages
    int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED);
    if (ret != 0) {
      return IOError("While fadvise NotNeeded offset " + ToString(offset) +
                         " len " + ToString(length),
                     filename_, errno);
    }
  }
  return IOStatus::OK();
#endif
}

/*
 * PosixRandomAccessFile
 */
#if defined(OS_LINUX)
// Encodes (device, inode, FS version) as three varint64s into `id`; returns
// the encoded length, or 0 if the id cannot be determined.
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  // Need room for three varint64s.
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return 0;
  }

  long version = 0;
  result = ioctl(fd, FS_IOC_GETVERSION, &version);
  TEST_SYNC_POINT_CALLBACK("GetUniqueIdFromFile:FS_IOC_GETVERSION", &result);
  if (result == -1) {
    return 0;
  }
  uint64_t uversion = (uint64_t)version;

  char* rid = id;
  rid = EncodeVarint64(rid, buf.st_dev);
  rid = EncodeVarint64(rid, buf.st_ino);
  rid = EncodeVarint64(rid, uversion);
  assert(rid >= id);
  return static_cast<size_t>(rid - id);
}
#endif

#if defined(OS_MACOSX) || defined(OS_AIX)
// Mac/AIX variant: uses st_gen (generation number) in place of FS version.
size_t PosixHelper::GetUniqueIdFromFile(int fd, char* id, size_t max_size) {
  if (max_size < kMaxVarint64Length * 3) {
    return 0;
  }

  struct stat buf;
  int result = fstat(fd, &buf);
  if (result == -1) {
    return 0;
  }

  char* rid = id;
  rid = EncodeVarint64(rid, buf.st_dev);
  rid = EncodeVarint64(rid, buf.st_ino);
  rid = EncodeVarint64(rid, buf.st_gen);
  assert(rid >= id);
  return static_cast<size_t>(rid - id);
}
#endif
/*
 * PosixRandomAccessFile
 *
 * pread() based random-access
 */
PosixRandomAccessFile::PosixRandomAccessFile(
    const std::string& fname, int fd, const EnvOptions& options
#if defined(ROCKSDB_IOURING_PRESENT)
    ,
    ThreadLocalPtr* thread_local_io_urings
#endif
    )
    : filename_(fname),
      fd_(fd),
      use_direct_io_(options.use_direct_reads),
      logical_sector_size_(GetLogicalBufferSize(fd_))
#if defined(ROCKSDB_IOURING_PRESENT)
      ,
      thread_local_io_urings_(thread_local_io_urings)
#endif
{
  assert(!options.use_direct_reads || !options.use_mmap_reads);
  assert(!options.use_mmap_reads || sizeof(void*) < 8);
}

PosixRandomAccessFile::~PosixRandomAccessFile() { close(fd_); }

IOStatus PosixRandomAccessFile::Read(uint64_t offset, size_t n,
                                     const IOOptions& /*opts*/, Slice* result,
                                     char* scratch,
                                     IODebugContext* /*dbg*/) const {
  if
(use_direct_io()) { 444 assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); 445 assert(IsSectorAligned(n, GetRequiredBufferAlignment())); 446 assert(IsSectorAligned(scratch, GetRequiredBufferAlignment())); 447 } 448 IOStatus s; 449 ssize_t r = -1; 450 size_t left = n; 451 char* ptr = scratch; 452 while (left > 0) { 453 r = pread(fd_, ptr, left, static_cast<off_t>(offset)); 454 if (r <= 0) { 455 if (r == -1 && errno == EINTR) { 456 continue; 457 } 458 break; 459 } 460 ptr += r; 461 offset += r; 462 left -= r; 463 if (use_direct_io() && 464 r % static_cast<ssize_t>(GetRequiredBufferAlignment()) != 0) { 465 // Bytes reads don't fill sectors. Should only happen at the end 466 // of the file. 467 break; 468 } 469 } 470 if (r < 0) { 471 // An error: return a non-ok status 472 s = IOError( 473 "While pread offset " + ToString(offset) + " len " + ToString(n), 474 filename_, errno); 475 } 476 *result = Slice(scratch, (r < 0) ? 0 : n - left); 477 return s; 478 } 479 480 IOStatus PosixRandomAccessFile::MultiRead(FSReadRequest* reqs, 481 size_t num_reqs, 482 const IOOptions& options, 483 IODebugContext* dbg) { 484 #if defined(ROCKSDB_IOURING_PRESENT) 485 struct io_uring* iu = nullptr; 486 if (thread_local_io_urings_) { 487 iu = static_cast<struct io_uring*>(thread_local_io_urings_->Get()); 488 if (iu == nullptr) { 489 iu = CreateIOUring(); 490 if (iu != nullptr) { 491 thread_local_io_urings_->Reset(iu); 492 } 493 } 494 } 495 496 // Init failed, platform doesn't support io_uring. 
Fall back to 497 // serialized reads 498 if (iu == nullptr) { 499 return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); 500 } 501 502 struct WrappedReadRequest { 503 FSReadRequest* req; 504 struct iovec iov; 505 size_t finished_len; 506 explicit WrappedReadRequest(FSReadRequest* r) : req(r), finished_len(0) {} 507 }; 508 509 autovector<WrappedReadRequest, 32> req_wraps; 510 autovector<WrappedReadRequest*, 4> incomplete_rq_list; 511 512 for (size_t i = 0; i < num_reqs; i++) { 513 req_wraps.emplace_back(&reqs[i]); 514 } 515 516 size_t reqs_off = 0; 517 while (num_reqs > reqs_off || !incomplete_rq_list.empty()) { 518 size_t this_reqs = (num_reqs - reqs_off) + incomplete_rq_list.size(); 519 520 // If requests exceed depth, split it into batches 521 if (this_reqs > kIoUringDepth) this_reqs = kIoUringDepth; 522 523 assert(incomplete_rq_list.size() <= this_reqs); 524 for (size_t i = 0; i < this_reqs; i++) { 525 WrappedReadRequest* rep_to_submit; 526 if (i < incomplete_rq_list.size()) { 527 rep_to_submit = incomplete_rq_list[i]; 528 } else { 529 rep_to_submit = &req_wraps[reqs_off++]; 530 } 531 assert(rep_to_submit->req->len > rep_to_submit->finished_len); 532 rep_to_submit->iov.iov_base = 533 rep_to_submit->req->scratch + rep_to_submit->finished_len; 534 rep_to_submit->iov.iov_len = 535 rep_to_submit->req->len - rep_to_submit->finished_len; 536 537 struct io_uring_sqe* sqe; 538 sqe = io_uring_get_sqe(iu); 539 io_uring_prep_readv( 540 sqe, fd_, &rep_to_submit->iov, 1, 541 rep_to_submit->req->offset + rep_to_submit->finished_len); 542 io_uring_sqe_set_data(sqe, rep_to_submit); 543 } 544 incomplete_rq_list.clear(); 545 546 ssize_t ret = 547 io_uring_submit_and_wait(iu, static_cast<unsigned int>(this_reqs)); 548 if (static_cast<size_t>(ret) != this_reqs) { 549 fprintf(stderr, "ret = %ld this_reqs: %ld\n", (long)ret, (long)this_reqs); 550 } 551 assert(static_cast<size_t>(ret) == this_reqs); 552 553 for (size_t i = 0; i < this_reqs; i++) { 554 struct io_uring_cqe* 
cqe; 555 WrappedReadRequest* req_wrap; 556 557 // We could use the peek variant here, but this seems safer in terms 558 // of our initial wait not reaping all completions 559 ret = io_uring_wait_cqe(iu, &cqe); 560 assert(!ret); 561 562 req_wrap = static_cast<WrappedReadRequest*>(io_uring_cqe_get_data(cqe)); 563 FSReadRequest* req = req_wrap->req; 564 if (cqe->res < 0) { 565 req->result = Slice(req->scratch, 0); 566 req->status = IOError("Req failed", filename_, cqe->res); 567 } else { 568 size_t bytes_read = static_cast<size_t>(cqe->res); 569 TEST_SYNC_POINT_CALLBACK( 570 "PosixRandomAccessFile::MultiRead:io_uring_result", &bytes_read); 571 if (bytes_read == req_wrap->iov.iov_len) { 572 req->result = Slice(req->scratch, req->len); 573 req->status = IOStatus::OK(); 574 } else if (bytes_read == 0) { 575 // cqe->res == 0 can means EOF, or can mean partial results. See 576 // comment 577 // https://github.com/facebook/rocksdb/pull/6441#issuecomment-589843435 578 // Fall back to pread in this case. 
579 Slice tmp_slice; 580 req->status = 581 Read(req->offset + req_wrap->finished_len, 582 req->len - req_wrap->finished_len, options, &tmp_slice, 583 req->scratch + req_wrap->finished_len, dbg); 584 req->result = 585 Slice(req->scratch, req_wrap->finished_len + tmp_slice.size()); 586 } else if (bytes_read < req_wrap->iov.iov_len) { 587 assert(bytes_read > 0); 588 assert(bytes_read + req_wrap->finished_len < req->len); 589 req_wrap->finished_len += bytes_read; 590 incomplete_rq_list.push_back(req_wrap); 591 } else { 592 req->result = Slice(req->scratch, 0); 593 req->status = IOError("Req returned more bytes than requested", 594 filename_, cqe->res); 595 } 596 } 597 io_uring_cqe_seen(iu, cqe); 598 } 599 } 600 return IOStatus::OK(); 601 #else 602 return FSRandomAccessFile::MultiRead(reqs, num_reqs, options, dbg); 603 #endif 604 } 605 606 IOStatus PosixRandomAccessFile::Prefetch(uint64_t offset, size_t n, 607 const IOOptions& /*opts*/, 608 IODebugContext* /*dbg*/) { 609 IOStatus s; 610 if (!use_direct_io()) { 611 ssize_t r = 0; 612 #ifdef OS_LINUX 613 r = readahead(fd_, offset, n); 614 #endif 615 #ifdef OS_MACOSX 616 radvisory advice; 617 advice.ra_offset = static_cast<off_t>(offset); 618 advice.ra_count = static_cast<int>(n); 619 r = fcntl(fd_, F_RDADVISE, &advice); 620 #endif 621 if (r == -1) { 622 s = IOError("While prefetching offset " + ToString(offset) + " len " + 623 ToString(n), 624 filename_, errno); 625 } 626 } 627 return s; 628 } 629 630 #if defined(OS_LINUX) || defined(OS_MACOSX) || defined(OS_AIX) 631 size_t PosixRandomAccessFile::GetUniqueId(char* id, size_t max_size) const { 632 return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size); 633 } 634 #endif 635 636 void PosixRandomAccessFile::Hint(AccessPattern pattern) { 637 if (use_direct_io()) { 638 return; 639 } 640 switch (pattern) { 641 case kNormal: 642 Fadvise(fd_, 0, 0, POSIX_FADV_NORMAL); 643 break; 644 case kRandom: 645 Fadvise(fd_, 0, 0, POSIX_FADV_RANDOM); 646 break; 647 case kSequential: 648 
Fadvise(fd_, 0, 0, POSIX_FADV_SEQUENTIAL); 649 break; 650 case kWillNeed: 651 Fadvise(fd_, 0, 0, POSIX_FADV_WILLNEED); 652 break; 653 case kWontNeed: 654 Fadvise(fd_, 0, 0, POSIX_FADV_DONTNEED); 655 break; 656 default: 657 assert(false); 658 break; 659 } 660 } 661 662 IOStatus PosixRandomAccessFile::InvalidateCache(size_t offset, size_t length) { 663 if (use_direct_io()) { 664 return IOStatus::OK(); 665 } 666 #ifndef OS_LINUX 667 (void)offset; 668 (void)length; 669 return IOStatus::OK(); 670 #else 671 // free OS pages 672 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); 673 if (ret == 0) { 674 return IOStatus::OK(); 675 } 676 return IOError("While fadvise NotNeeded offset " + ToString(offset) + 677 " len " + ToString(length), 678 filename_, errno); 679 #endif 680 } 681 682 /* 683 * PosixMmapReadableFile 684 * 685 * mmap() based random-access 686 */ 687 // base[0,length-1] contains the mmapped contents of the file. 688 PosixMmapReadableFile::PosixMmapReadableFile(const int fd, 689 const std::string& fname, 690 void* base, size_t length, 691 const EnvOptions& options) 692 : fd_(fd), filename_(fname), mmapped_region_(base), length_(length) { 693 #ifdef NDEBUG 694 (void)options; 695 #endif 696 fd_ = fd_ + 0; // suppress the warning for used variables 697 assert(options.use_mmap_reads); 698 assert(!options.use_direct_reads); 699 } 700 701 PosixMmapReadableFile::~PosixMmapReadableFile() { 702 int ret = munmap(mmapped_region_, length_); 703 if (ret != 0) { 704 fprintf(stdout, "failed to munmap %p length %" ROCKSDB_PRIszt " \n", 705 mmapped_region_, length_); 706 } 707 close(fd_); 708 } 709 710 IOStatus PosixMmapReadableFile::Read(uint64_t offset, size_t n, 711 const IOOptions& /*opts*/, Slice* result, 712 char* /*scratch*/, 713 IODebugContext* /*dbg*/) const { 714 IOStatus s; 715 if (offset > length_) { 716 *result = Slice(); 717 return IOError("While mmap read offset " + ToString(offset) + 718 " larger than file length " + ToString(length_), 719 filename_, 
EINVAL); 720 } else if (offset + n > length_) { 721 n = static_cast<size_t>(length_ - offset); 722 } 723 *result = Slice(reinterpret_cast<char*>(mmapped_region_) + offset, n); 724 return s; 725 } 726 727 IOStatus PosixMmapReadableFile::InvalidateCache(size_t offset, size_t length) { 728 #ifndef OS_LINUX 729 (void)offset; 730 (void)length; 731 return IOStatus::OK(); 732 #else 733 // free OS pages 734 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); 735 if (ret == 0) { 736 return IOStatus::OK(); 737 } 738 return IOError("While fadvise not needed. Offset " + ToString(offset) + 739 " len" + ToString(length), 740 filename_, errno); 741 #endif 742 } 743 744 /* 745 * PosixMmapFile 746 * 747 * We preallocate up to an extra megabyte and use memcpy to append new 748 * data to the file. This is safe since we either properly close the 749 * file before reading from it, or for log files, the reading code 750 * knows enough to skip zero suffixes. 751 */ 752 IOStatus PosixMmapFile::UnmapCurrentRegion() { 753 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); 754 if (base_ != nullptr) { 755 int munmap_status = munmap(base_, limit_ - base_); 756 if (munmap_status != 0) { 757 return IOError("While munmap", filename_, munmap_status); 758 } 759 file_offset_ += limit_ - base_; 760 base_ = nullptr; 761 limit_ = nullptr; 762 last_sync_ = nullptr; 763 dst_ = nullptr; 764 765 // Increase the amount we map the next time, but capped at 1MB 766 if (map_size_ < (1 << 20)) { 767 map_size_ *= 2; 768 } 769 } 770 return IOStatus::OK(); 771 } 772 773 IOStatus PosixMmapFile::MapNewRegion() { 774 #ifdef ROCKSDB_FALLOCATE_PRESENT 775 assert(base_ == nullptr); 776 TEST_KILL_RANDOM("PosixMmapFile::UnmapCurrentRegion:0", rocksdb_kill_odds); 777 // we can't fallocate with FALLOC_FL_KEEP_SIZE here 778 if (allow_fallocate_) { 779 IOSTATS_TIMER_GUARD(allocate_nanos); 780 int alloc_status = fallocate(fd_, 0, file_offset_, map_size_); 781 if (alloc_status != 0) { 782 // 
fallback to posix_fallocate 783 alloc_status = posix_fallocate(fd_, file_offset_, map_size_); 784 } 785 if (alloc_status != 0) { 786 return IOStatus::IOError("Error allocating space to file : " + filename_ + 787 "Error : " + strerror(alloc_status)); 788 } 789 } 790 791 TEST_KILL_RANDOM("PosixMmapFile::Append:1", rocksdb_kill_odds); 792 void* ptr = mmap(nullptr, map_size_, PROT_READ | PROT_WRITE, MAP_SHARED, fd_, 793 file_offset_); 794 if (ptr == MAP_FAILED) { 795 return IOStatus::IOError("MMap failed on " + filename_); 796 } 797 TEST_KILL_RANDOM("PosixMmapFile::Append:2", rocksdb_kill_odds); 798 799 base_ = reinterpret_cast<char*>(ptr); 800 limit_ = base_ + map_size_; 801 dst_ = base_; 802 last_sync_ = base_; 803 return IOStatus::OK(); 804 #else 805 return IOStatus::NotSupported("This platform doesn't support fallocate()"); 806 #endif 807 } 808 809 IOStatus PosixMmapFile::Msync() { 810 if (dst_ == last_sync_) { 811 return IOStatus::OK(); 812 } 813 // Find the beginnings of the pages that contain the first and last 814 // bytes to be synced. 
815 size_t p1 = TruncateToPageBoundary(last_sync_ - base_); 816 size_t p2 = TruncateToPageBoundary(dst_ - base_ - 1); 817 last_sync_ = dst_; 818 TEST_KILL_RANDOM("PosixMmapFile::Msync:0", rocksdb_kill_odds); 819 if (msync(base_ + p1, p2 - p1 + page_size_, MS_SYNC) < 0) { 820 return IOError("While msync", filename_, errno); 821 } 822 return IOStatus::OK(); 823 } 824 825 PosixMmapFile::PosixMmapFile(const std::string& fname, int fd, size_t page_size, 826 const EnvOptions& options) 827 : filename_(fname), 828 fd_(fd), 829 page_size_(page_size), 830 map_size_(Roundup(65536, page_size)), 831 base_(nullptr), 832 limit_(nullptr), 833 dst_(nullptr), 834 last_sync_(nullptr), 835 file_offset_(0) { 836 #ifdef ROCKSDB_FALLOCATE_PRESENT 837 allow_fallocate_ = options.allow_fallocate; 838 fallocate_with_keep_size_ = options.fallocate_with_keep_size; 839 #else 840 (void)options; 841 #endif 842 assert((page_size & (page_size - 1)) == 0); 843 assert(options.use_mmap_writes); 844 assert(!options.use_direct_writes); 845 } 846 847 PosixMmapFile::~PosixMmapFile() { 848 if (fd_ >= 0) { 849 PosixMmapFile::Close(IOOptions(), nullptr); 850 } 851 } 852 853 IOStatus PosixMmapFile::Append(const Slice& data, const IOOptions& /*opts*/, 854 IODebugContext* /*dbg*/) { 855 const char* src = data.data(); 856 size_t left = data.size(); 857 while (left > 0) { 858 assert(base_ <= dst_); 859 assert(dst_ <= limit_); 860 size_t avail = limit_ - dst_; 861 if (avail == 0) { 862 IOStatus s = UnmapCurrentRegion(); 863 if (!s.ok()) { 864 return s; 865 } 866 s = MapNewRegion(); 867 if (!s.ok()) { 868 return s; 869 } 870 TEST_KILL_RANDOM("PosixMmapFile::Append:0", rocksdb_kill_odds); 871 } 872 873 size_t n = (left <= avail) ? 
left : avail; 874 assert(dst_); 875 memcpy(dst_, src, n); 876 dst_ += n; 877 src += n; 878 left -= n; 879 } 880 return IOStatus::OK(); 881 } 882 883 IOStatus PosixMmapFile::Close(const IOOptions& /*opts*/, 884 IODebugContext* /*dbg*/) { 885 IOStatus s; 886 size_t unused = limit_ - dst_; 887 888 s = UnmapCurrentRegion(); 889 if (!s.ok()) { 890 s = IOError("While closing mmapped file", filename_, errno); 891 } else if (unused > 0) { 892 // Trim the extra space at the end of the file 893 if (ftruncate(fd_, file_offset_ - unused) < 0) { 894 s = IOError("While ftruncating mmaped file", filename_, errno); 895 } 896 } 897 898 if (close(fd_) < 0) { 899 if (s.ok()) { 900 s = IOError("While closing mmapped file", filename_, errno); 901 } 902 } 903 904 fd_ = -1; 905 base_ = nullptr; 906 limit_ = nullptr; 907 return s; 908 } 909 910 IOStatus PosixMmapFile::Flush(const IOOptions& /*opts*/, 911 IODebugContext* /*dbg*/) { 912 return IOStatus::OK(); 913 } 914 915 IOStatus PosixMmapFile::Sync(const IOOptions& /*opts*/, 916 IODebugContext* /*dbg*/) { 917 if (fdatasync(fd_) < 0) { 918 return IOError("While fdatasync mmapped file", filename_, errno); 919 } 920 921 return Msync(); 922 } 923 924 /** 925 * Flush data as well as metadata to stable storage. 926 */ 927 IOStatus PosixMmapFile::Fsync(const IOOptions& /*opts*/, 928 IODebugContext* /*dbg*/) { 929 if (fsync(fd_) < 0) { 930 return IOError("While fsync mmaped file", filename_, errno); 931 } 932 933 return Msync(); 934 } 935 936 /** 937 * Get the size of valid data in the file. This will not match the 938 * size that is returned from the filesystem because we use mmap 939 * to extend file by map_size every time. 
940 */ 941 uint64_t PosixMmapFile::GetFileSize(const IOOptions& /*opts*/, 942 IODebugContext* /*dbg*/) { 943 size_t used = dst_ - base_; 944 return file_offset_ + used; 945 } 946 947 IOStatus PosixMmapFile::InvalidateCache(size_t offset, size_t length) { 948 #ifndef OS_LINUX 949 (void)offset; 950 (void)length; 951 return IOStatus::OK(); 952 #else 953 // free OS pages 954 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); 955 if (ret == 0) { 956 return IOStatus::OK(); 957 } 958 return IOError("While fadvise NotNeeded mmapped file", filename_, errno); 959 #endif 960 } 961 962 #ifdef ROCKSDB_FALLOCATE_PRESENT 963 IOStatus PosixMmapFile::Allocate(uint64_t offset, uint64_t len, 964 const IOOptions& /*opts*/, 965 IODebugContext* /*dbg*/) { 966 assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); 967 assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); 968 TEST_KILL_RANDOM("PosixMmapFile::Allocate:0", rocksdb_kill_odds); 969 int alloc_status = 0; 970 if (allow_fallocate_) { 971 alloc_status = 972 fallocate(fd_, fallocate_with_keep_size_ ? FALLOC_FL_KEEP_SIZE : 0, 973 static_cast<off_t>(offset), static_cast<off_t>(len)); 974 } 975 if (alloc_status == 0) { 976 return IOStatus::OK(); 977 } else { 978 return IOError( 979 "While fallocate offset " + ToString(offset) + " len " + ToString(len), 980 filename_, errno); 981 } 982 } 983 #endif 984 985 /* 986 * PosixWritableFile 987 * 988 * Use posix write to write data to a file. 
989 */ 990 PosixWritableFile::PosixWritableFile(const std::string& fname, int fd, 991 const EnvOptions& options) 992 : FSWritableFile(options), 993 filename_(fname), 994 use_direct_io_(options.use_direct_writes), 995 fd_(fd), 996 filesize_(0), 997 logical_sector_size_(GetLogicalBufferSize(fd_)) { 998 #ifdef ROCKSDB_FALLOCATE_PRESENT 999 allow_fallocate_ = options.allow_fallocate; 1000 fallocate_with_keep_size_ = options.fallocate_with_keep_size; 1001 #endif 1002 #ifdef ROCKSDB_RANGESYNC_PRESENT 1003 sync_file_range_supported_ = IsSyncFileRangeSupported(fd_); 1004 #endif // ROCKSDB_RANGESYNC_PRESENT 1005 assert(!options.use_mmap_writes); 1006 } 1007 1008 PosixWritableFile::~PosixWritableFile() { 1009 if (fd_ >= 0) { 1010 PosixWritableFile::Close(IOOptions(), nullptr); 1011 } 1012 } 1013 1014 IOStatus PosixWritableFile::Append(const Slice& data, const IOOptions& /*opts*/, 1015 IODebugContext* /*dbg*/) { 1016 if (use_direct_io()) { 1017 assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); 1018 assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); 1019 } 1020 const char* src = data.data(); 1021 size_t nbytes = data.size(); 1022 1023 if (!PosixWrite(fd_, src, nbytes)) { 1024 return IOError("While appending to file", filename_, errno); 1025 } 1026 1027 filesize_ += nbytes; 1028 return IOStatus::OK(); 1029 } 1030 1031 IOStatus PosixWritableFile::PositionedAppend(const Slice& data, uint64_t offset, 1032 const IOOptions& /*opts*/, 1033 IODebugContext* /*dbg*/) { 1034 if (use_direct_io()) { 1035 assert(IsSectorAligned(offset, GetRequiredBufferAlignment())); 1036 assert(IsSectorAligned(data.size(), GetRequiredBufferAlignment())); 1037 assert(IsSectorAligned(data.data(), GetRequiredBufferAlignment())); 1038 } 1039 assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); 1040 const char* src = data.data(); 1041 size_t nbytes = data.size(); 1042 if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) { 1043 return 
IOError("While pwrite to file at offset " + ToString(offset), 1044 filename_, errno); 1045 } 1046 filesize_ = offset + nbytes; 1047 return IOStatus::OK(); 1048 } 1049 1050 IOStatus PosixWritableFile::Truncate(uint64_t size, const IOOptions& /*opts*/, 1051 IODebugContext* /*dbg*/) { 1052 IOStatus s; 1053 int r = ftruncate(fd_, size); 1054 if (r < 0) { 1055 s = IOError("While ftruncate file to size " + ToString(size), filename_, 1056 errno); 1057 } else { 1058 filesize_ = size; 1059 } 1060 return s; 1061 } 1062 1063 IOStatus PosixWritableFile::Close(const IOOptions& /*opts*/, 1064 IODebugContext* /*dbg*/) { 1065 IOStatus s; 1066 1067 size_t block_size; 1068 size_t last_allocated_block; 1069 GetPreallocationStatus(&block_size, &last_allocated_block); 1070 if (last_allocated_block > 0) { 1071 // trim the extra space preallocated at the end of the file 1072 // NOTE(ljin): we probably don't want to surface failure as an IOError, 1073 // but it will be nice to log these errors. 1074 int dummy __attribute__((__unused__)); 1075 dummy = ftruncate(fd_, filesize_); 1076 #if defined(ROCKSDB_FALLOCATE_PRESENT) && defined(FALLOC_FL_PUNCH_HOLE) && \ 1077 !defined(TRAVIS) 1078 // in some file systems, ftruncate only trims trailing space if the 1079 // new file size is smaller than the current size. Calling fallocate 1080 // with FALLOC_FL_PUNCH_HOLE flag to explicitly release these unused 1081 // blocks. FALLOC_FL_PUNCH_HOLE is supported on at least the following 1082 // filesystems: 1083 // XFS (since Linux 2.6.38) 1084 // ext4 (since Linux 3.0) 1085 // Btrfs (since Linux 3.7) 1086 // tmpfs (since Linux 3.5) 1087 // We ignore error since failure of this operation does not affect 1088 // correctness. 1089 // TRAVIS - this code does not work on TRAVIS filesystems. 1090 // the FALLOC_FL_KEEP_SIZE option is expected to not change the size 1091 // of the file, but it does. Simple strace report will show that. 
1092 // While we work with Travis-CI team to figure out if this is a 1093 // quirk of Docker/AUFS, we will comment this out. 1094 struct stat file_stats; 1095 int result = fstat(fd_, &file_stats); 1096 // After ftruncate, we check whether ftruncate has the correct behavior. 1097 // If not, we should hack it with FALLOC_FL_PUNCH_HOLE 1098 if (result == 0 && 1099 (file_stats.st_size + file_stats.st_blksize - 1) / 1100 file_stats.st_blksize != 1101 file_stats.st_blocks / (file_stats.st_blksize / 512)) { 1102 IOSTATS_TIMER_GUARD(allocate_nanos); 1103 if (allow_fallocate_) { 1104 fallocate(fd_, FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE, filesize_, 1105 block_size * last_allocated_block - filesize_); 1106 } 1107 } 1108 #endif 1109 } 1110 1111 if (close(fd_) < 0) { 1112 s = IOError("While closing file after writing", filename_, errno); 1113 } 1114 fd_ = -1; 1115 return s; 1116 } 1117 1118 // write out the cached data to the OS cache 1119 IOStatus PosixWritableFile::Flush(const IOOptions& /*opts*/, 1120 IODebugContext* /*dbg*/) { 1121 return IOStatus::OK(); 1122 } 1123 1124 IOStatus PosixWritableFile::Sync(const IOOptions& /*opts*/, 1125 IODebugContext* /*dbg*/) { 1126 if (fdatasync(fd_) < 0) { 1127 return IOError("While fdatasync", filename_, errno); 1128 } 1129 return IOStatus::OK(); 1130 } 1131 1132 IOStatus PosixWritableFile::Fsync(const IOOptions& /*opts*/, 1133 IODebugContext* /*dbg*/) { 1134 if (fsync(fd_) < 0) { 1135 return IOError("While fsync", filename_, errno); 1136 } 1137 return IOStatus::OK(); 1138 } 1139 1140 bool PosixWritableFile::IsSyncThreadSafe() const { return true; } 1141 1142 uint64_t PosixWritableFile::GetFileSize(const IOOptions& /*opts*/, 1143 IODebugContext* /*dbg*/) { 1144 return filesize_; 1145 } 1146 1147 void PosixWritableFile::SetWriteLifeTimeHint(Env::WriteLifeTimeHint hint) { 1148 #ifdef OS_LINUX 1149 // Suppress Valgrind "Unimplemented functionality" error. 
1150 #ifndef ROCKSDB_VALGRIND_RUN 1151 if (hint == write_hint_) { 1152 return; 1153 } 1154 if (fcntl(fd_, F_SET_RW_HINT, &hint) == 0) { 1155 write_hint_ = hint; 1156 } 1157 #else 1158 (void)hint; 1159 #endif // ROCKSDB_VALGRIND_RUN 1160 #else 1161 (void)hint; 1162 #endif // OS_LINUX 1163 } 1164 1165 IOStatus PosixWritableFile::InvalidateCache(size_t offset, size_t length) { 1166 if (use_direct_io()) { 1167 return IOStatus::OK(); 1168 } 1169 #ifndef OS_LINUX 1170 (void)offset; 1171 (void)length; 1172 return IOStatus::OK(); 1173 #else 1174 // free OS pages 1175 int ret = Fadvise(fd_, offset, length, POSIX_FADV_DONTNEED); 1176 if (ret == 0) { 1177 return IOStatus::OK(); 1178 } 1179 return IOError("While fadvise NotNeeded", filename_, errno); 1180 #endif 1181 } 1182 1183 #ifdef ROCKSDB_FALLOCATE_PRESENT 1184 IOStatus PosixWritableFile::Allocate(uint64_t offset, uint64_t len, 1185 const IOOptions& /*opts*/, 1186 IODebugContext* /*dbg*/) { 1187 assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); 1188 assert(len <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); 1189 TEST_KILL_RANDOM("PosixWritableFile::Allocate:0", rocksdb_kill_odds); 1190 IOSTATS_TIMER_GUARD(allocate_nanos); 1191 int alloc_status = 0; 1192 if (allow_fallocate_) { 1193 alloc_status = 1194 fallocate(fd_, fallocate_with_keep_size_ ? 
FALLOC_FL_KEEP_SIZE : 0, 1195 static_cast<off_t>(offset), static_cast<off_t>(len)); 1196 } 1197 if (alloc_status == 0) { 1198 return IOStatus::OK(); 1199 } else { 1200 return IOError( 1201 "While fallocate offset " + ToString(offset) + " len " + ToString(len), 1202 filename_, errno); 1203 } 1204 } 1205 #endif 1206 1207 IOStatus PosixWritableFile::RangeSync(uint64_t offset, uint64_t nbytes, 1208 const IOOptions& opts, 1209 IODebugContext* dbg) { 1210 #ifdef ROCKSDB_RANGESYNC_PRESENT 1211 assert(offset <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); 1212 assert(nbytes <= static_cast<uint64_t>(std::numeric_limits<off_t>::max())); 1213 if (sync_file_range_supported_) { 1214 int ret; 1215 if (strict_bytes_per_sync_) { 1216 // Specifying `SYNC_FILE_RANGE_WAIT_BEFORE` together with an offset/length 1217 // that spans all bytes written so far tells `sync_file_range` to wait for 1218 // any outstanding writeback requests to finish before issuing a new one. 1219 ret = 1220 sync_file_range(fd_, 0, static_cast<off_t>(offset + nbytes), 1221 SYNC_FILE_RANGE_WAIT_BEFORE | SYNC_FILE_RANGE_WRITE); 1222 } else { 1223 ret = sync_file_range(fd_, static_cast<off_t>(offset), 1224 static_cast<off_t>(nbytes), SYNC_FILE_RANGE_WRITE); 1225 } 1226 if (ret != 0) { 1227 return IOError("While sync_file_range returned " + ToString(ret), 1228 filename_, errno); 1229 } 1230 return IOStatus::OK(); 1231 } 1232 #endif // ROCKSDB_RANGESYNC_PRESENT 1233 return FSWritableFile::RangeSync(offset, nbytes, opts, dbg); 1234 } 1235 1236 #ifdef OS_LINUX 1237 size_t PosixWritableFile::GetUniqueId(char* id, size_t max_size) const { 1238 return PosixHelper::GetUniqueIdFromFile(fd_, id, max_size); 1239 } 1240 #endif 1241 1242 /* 1243 * PosixRandomRWFile 1244 */ 1245 1246 PosixRandomRWFile::PosixRandomRWFile(const std::string& fname, int fd, 1247 const EnvOptions& /*options*/) 1248 : filename_(fname), fd_(fd) {} 1249 1250 PosixRandomRWFile::~PosixRandomRWFile() { 1251 if (fd_ >= 0) { 1252 
Close(IOOptions(), nullptr); 1253 } 1254 } 1255 1256 IOStatus PosixRandomRWFile::Write(uint64_t offset, const Slice& data, 1257 const IOOptions& /*opts*/, 1258 IODebugContext* /*dbg*/) { 1259 const char* src = data.data(); 1260 size_t nbytes = data.size(); 1261 if (!PosixPositionedWrite(fd_, src, nbytes, static_cast<off_t>(offset))) { 1262 return IOError( 1263 "While write random read/write file at offset " + ToString(offset), 1264 filename_, errno); 1265 } 1266 1267 return IOStatus::OK(); 1268 } 1269 1270 IOStatus PosixRandomRWFile::Read(uint64_t offset, size_t n, 1271 const IOOptions& /*opts*/, Slice* result, 1272 char* scratch, IODebugContext* /*dbg*/) const { 1273 size_t left = n; 1274 char* ptr = scratch; 1275 while (left > 0) { 1276 ssize_t done = pread(fd_, ptr, left, offset); 1277 if (done < 0) { 1278 // error while reading from file 1279 if (errno == EINTR) { 1280 // read was interrupted, try again. 1281 continue; 1282 } 1283 return IOError("While reading random read/write file offset " + 1284 ToString(offset) + " len " + ToString(n), 1285 filename_, errno); 1286 } else if (done == 0) { 1287 // Nothing more to read 1288 break; 1289 } 1290 1291 // Read `done` bytes 1292 ptr += done; 1293 offset += done; 1294 left -= done; 1295 } 1296 1297 *result = Slice(scratch, n - left); 1298 return IOStatus::OK(); 1299 } 1300 1301 IOStatus PosixRandomRWFile::Flush(const IOOptions& /*opts*/, 1302 IODebugContext* /*dbg*/) { 1303 return IOStatus::OK(); 1304 } 1305 1306 IOStatus PosixRandomRWFile::Sync(const IOOptions& /*opts*/, 1307 IODebugContext* /*dbg*/) { 1308 if (fdatasync(fd_) < 0) { 1309 return IOError("While fdatasync random read/write file", filename_, errno); 1310 } 1311 return IOStatus::OK(); 1312 } 1313 1314 IOStatus PosixRandomRWFile::Fsync(const IOOptions& /*opts*/, 1315 IODebugContext* /*dbg*/) { 1316 if (fsync(fd_) < 0) { 1317 return IOError("While fsync random read/write file", filename_, errno); 1318 } 1319 return IOStatus::OK(); 1320 } 1321 1322 IOStatus 
PosixRandomRWFile::Close(const IOOptions& /*opts*/, 1323 IODebugContext* /*dbg*/) { 1324 if (close(fd_) < 0) { 1325 return IOError("While close random read/write file", filename_, errno); 1326 } 1327 fd_ = -1; 1328 return IOStatus::OK(); 1329 } 1330 1331 PosixMemoryMappedFileBuffer::~PosixMemoryMappedFileBuffer() { 1332 // TODO should have error handling though not much we can do... 1333 munmap(this->base_, length_); 1334 } 1335 1336 /* 1337 * PosixDirectory 1338 */ 1339 1340 PosixDirectory::~PosixDirectory() { close(fd_); } 1341 1342 IOStatus PosixDirectory::Fsync(const IOOptions& /*opts*/, 1343 IODebugContext* /*dbg*/) { 1344 #ifndef OS_AIX 1345 if (fsync(fd_) == -1) { 1346 return IOError("While fsync", "a directory", errno); 1347 } 1348 #endif 1349 return IOStatus::OK(); 1350 } 1351 } // namespace ROCKSDB_NAMESPACE 1352 #endif 1353