1 // Copyright (c) 2011-present, Facebook, Inc. All rights reserved.
2 // This source code is licensed under both the GPLv2 (found in the
3 // COPYING file in the root directory) and Apache 2.0 License
4 // (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9
10 #include "port/win/io_win.h"
11
12 #include "monitoring/iostats_context_imp.h"
13 #include "test_util/sync_point.h"
14 #include "util/aligned_buffer.h"
15 #include "util/coding.h"
16
17 namespace ROCKSDB_NAMESPACE {
18 namespace port {
19
20 /*
21 * DirectIOHelper
22 */
namespace {

// Sector size used by the alignment checks on the direct (unbuffered) I/O
// paths in this file.
const size_t kSectorSize = 512;

// Returns true iff `alignment` is a non-zero power of two.
// Zero is rejected explicitly: the bit trick alone would accept it because
// 0 & (0 - 1) == 0.
inline bool IsPowerOfTwo(const size_t alignment) {
  return alignment != 0 && (alignment & (alignment - 1)) == 0;
}

// Returns true iff `off` is a multiple of kSectorSize.
inline bool IsSectorAligned(const size_t off) {
  return (off & (kSectorSize - 1)) == 0;
}

// Returns true iff the address held by `ptr` is a multiple of `alignment`.
// `alignment` must be a power of two for the mask test to be meaningful.
inline bool IsAligned(size_t alignment, const void* ptr) {
  return (reinterpret_cast<uintptr_t>(ptr) & (alignment - 1)) == 0;
}

}  // namespace
42
43
GetWindowsErrSz(DWORD err)44 std::string GetWindowsErrSz(DWORD err) {
45 LPSTR lpMsgBuf;
46 FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
47 FORMAT_MESSAGE_IGNORE_INSERTS,
48 NULL, err,
49 0, // Default language
50 reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
51
52 std::string Err = lpMsgBuf;
53 LocalFree(lpMsgBuf);
54 return Err;
55 }
56
57 // We preserve the original name of this interface to denote the original idea
58 // behind it.
59 // All reads happen by a specified offset and pwrite interface does not change
60 // the position of the file pointer. Judging from the man page and errno it does
61 // execute
62 // lseek atomically to return the position of the file back where it was.
63 // WriteFile() does not
64 // have this capability. Therefore, for both pread and pwrite the pointer is
65 // advanced to the next position
66 // which is fine for writes because they are (should be) sequential.
67 // Because all the reads/writes happen by the specified offset, the caller in
68 // theory should not
69 // rely on the current file offset.
pwrite(const WinFileData * file_data,const Slice & data,uint64_t offset,size_t & bytes_written)70 Status pwrite(const WinFileData* file_data, const Slice& data,
71 uint64_t offset, size_t& bytes_written) {
72
73 Status s;
74 bytes_written = 0;
75
76 size_t num_bytes = data.size();
77 if (num_bytes > std::numeric_limits<DWORD>::max()) {
78 // May happen in 64-bit builds where size_t is 64-bits but
79 // long is still 32-bit, but that's the API here at the moment
80 return Status::InvalidArgument("num_bytes is too large for a single write: " +
81 file_data->GetName());
82 }
83
84 OVERLAPPED overlapped = { 0 };
85 ULARGE_INTEGER offsetUnion;
86 offsetUnion.QuadPart = offset;
87
88 overlapped.Offset = offsetUnion.LowPart;
89 overlapped.OffsetHigh = offsetUnion.HighPart;
90
91 DWORD bytesWritten = 0;
92
93 if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast<DWORD>(num_bytes),
94 &bytesWritten, &overlapped)) {
95 auto lastError = GetLastError();
96 s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
97 lastError);
98 } else {
99 bytes_written = bytesWritten;
100 }
101
102 return s;
103 }
104
105 // See comments for pwrite above
pread(const WinFileData * file_data,char * src,size_t num_bytes,uint64_t offset,size_t & bytes_read)106 Status pread(const WinFileData* file_data, char* src, size_t num_bytes,
107 uint64_t offset, size_t& bytes_read) {
108
109 Status s;
110 bytes_read = 0;
111
112 if (num_bytes > std::numeric_limits<DWORD>::max()) {
113 return Status::InvalidArgument("num_bytes is too large for a single read: " +
114 file_data->GetName());
115 }
116
117 OVERLAPPED overlapped = { 0 };
118 ULARGE_INTEGER offsetUnion;
119 offsetUnion.QuadPart = offset;
120
121 overlapped.Offset = offsetUnion.LowPart;
122 overlapped.OffsetHigh = offsetUnion.HighPart;
123
124 DWORD bytesRead = 0;
125
126 if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast<DWORD>(num_bytes),
127 &bytesRead, &overlapped)) {
128 auto lastError = GetLastError();
129 // EOF is OK with zero bytes read
130 if (lastError != ERROR_HANDLE_EOF) {
131 s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
132 lastError);
133 }
134 } else {
135 bytes_read = bytesRead;
136 }
137
138 return s;
139 }
140
141 // SetFileInformationByHandle() is capable of fast pre-allocates.
142 // However, this does not change the file end position unless the file is
143 // truncated and the pre-allocated space is not considered filled with zeros.
fallocate(const std::string & filename,HANDLE hFile,uint64_t to_size)144 Status fallocate(const std::string& filename, HANDLE hFile,
145 uint64_t to_size) {
146 Status status;
147
148 FILE_ALLOCATION_INFO alloc_info;
149 alloc_info.AllocationSize.QuadPart = to_size;
150
151 if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
152 sizeof(FILE_ALLOCATION_INFO))) {
153 auto lastError = GetLastError();
154 status = IOErrorFromWindowsError(
155 "Failed to pre-allocate space: " + filename, lastError);
156 }
157
158 return status;
159 }
160
ftruncate(const std::string & filename,HANDLE hFile,uint64_t toSize)161 Status ftruncate(const std::string& filename, HANDLE hFile,
162 uint64_t toSize) {
163 Status status;
164
165 FILE_END_OF_FILE_INFO end_of_file;
166 end_of_file.EndOfFile.QuadPart = toSize;
167
168 if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
169 sizeof(FILE_END_OF_FILE_INFO))) {
170 auto lastError = GetLastError();
171 status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
172 lastError);
173 }
174
175 return status;
176 }
177
GetUniqueIdFromFile(HANDLE,char *,size_t)178 size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
179 size_t /*max_size*/) {
180 // Returning 0 is safe as it causes the table reader to generate a unique ID.
181 // This is suboptimal for performance as it prevents multiple table readers
182 // for the same file from sharing cached blocks. For example, if users have
183 // a low value for `max_open_files`, there can be many table readers opened
184 // for the same file.
185 //
186 // TODO: this is a temporarily solution as it is safe but not optimal for
187 // performance. For more details see discussion in
188 // https://github.com/facebook/rocksdb/pull/5844.
189 return 0;
190 }
191
192 ////////////////////////////////////////////////////////////////////////////////////////////////////
193 // WinMmapReadableFile
194
WinMmapReadableFile(const std::string & fileName,HANDLE hFile,HANDLE hMap,const void * mapped_region,size_t length)195 WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
196 HANDLE hFile, HANDLE hMap,
197 const void* mapped_region,
198 size_t length)
199 : WinFileData(fileName, hFile, false /* use_direct_io */),
200 hMap_(hMap),
201 mapped_region_(mapped_region),
202 length_(length) {}
203
~WinMmapReadableFile()204 WinMmapReadableFile::~WinMmapReadableFile() {
205 BOOL ret __attribute__((__unused__));
206 ret = ::UnmapViewOfFile(mapped_region_);
207 assert(ret);
208
209 ret = ::CloseHandle(hMap_);
210 assert(ret);
211 }
212
Read(uint64_t offset,size_t n,Slice * result,char * scratch) const213 Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
214 char* scratch) const {
215 Status s;
216
217 if (offset > length_) {
218 *result = Slice();
219 return IOError(filename_, EINVAL);
220 } else if (offset + n > length_) {
221 n = length_ - static_cast<size_t>(offset);
222 }
223 *result =
224 Slice(reinterpret_cast<const char*>(mapped_region_)+offset, n);
225 return s;
226 }
227
InvalidateCache(size_t offset,size_t length)228 Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
229 return Status::OK();
230 }
231
GetUniqueId(char * id,size_t max_size) const232 size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
233 return GetUniqueIdFromFile(hFile_, id, max_size);
234 }
235
236 ///////////////////////////////////////////////////////////////////////////////
237 /// WinMmapFile
238
239
240 // Can only truncate or reserve to a sector size aligned if
241 // used on files that are opened with Unbuffered I/O
TruncateFile(uint64_t toSize)242 Status WinMmapFile::TruncateFile(uint64_t toSize) {
243 return ftruncate(filename_, hFile_, toSize);
244 }
245
UnmapCurrentRegion()246 Status WinMmapFile::UnmapCurrentRegion() {
247 Status status;
248
249 if (mapped_begin_ != nullptr) {
250 if (!::UnmapViewOfFile(mapped_begin_)) {
251 status = IOErrorFromWindowsError(
252 "Failed to unmap file view: " + filename_, GetLastError());
253 }
254
255 // Move on to the next portion of the file
256 file_offset_ += view_size_;
257
258 // UnmapView automatically sends data to disk but not the metadata
259 // which is good and provides some equivalent of fdatasync() on Linux
260 // therefore, we donot need separate flag for metadata
261 mapped_begin_ = nullptr;
262 mapped_end_ = nullptr;
263 dst_ = nullptr;
264
265 last_sync_ = nullptr;
266 pending_sync_ = false;
267 }
268
269 return status;
270 }
271
MapNewRegion()272 Status WinMmapFile::MapNewRegion() {
273
274 Status status;
275
276 assert(mapped_begin_ == nullptr);
277
278 size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
279
280 if (minDiskSize > reserved_size_) {
281 status = Allocate(file_offset_, view_size_);
282 if (!status.ok()) {
283 return status;
284 }
285 }
286
287 // Need to remap
288 if (hMap_ == NULL || reserved_size_ > mapping_size_) {
289
290 if (hMap_ != NULL) {
291 // Unmap the previous one
292 BOOL ret __attribute__((__unused__));
293 ret = ::CloseHandle(hMap_);
294 assert(ret);
295 hMap_ = NULL;
296 }
297
298 ULARGE_INTEGER mappingSize;
299 mappingSize.QuadPart = reserved_size_;
300
301 hMap_ = CreateFileMappingA(
302 hFile_,
303 NULL, // Security attributes
304 PAGE_READWRITE, // There is not a write only mode for mapping
305 mappingSize.HighPart, // Enable mapping the whole file but the actual
306 // amount mapped is determined by MapViewOfFile
307 mappingSize.LowPart,
308 NULL); // Mapping name
309
310 if (NULL == hMap_) {
311 return IOErrorFromWindowsError(
312 "WindowsMmapFile failed to create file mapping for: " + filename_,
313 GetLastError());
314 }
315
316 mapping_size_ = reserved_size_;
317 }
318
319 ULARGE_INTEGER offset;
320 offset.QuadPart = file_offset_;
321
322 // View must begin at the granularity aligned offset
323 mapped_begin_ = reinterpret_cast<char*>(
324 MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
325 view_size_, NULL));
326
327 if (!mapped_begin_) {
328 status = IOErrorFromWindowsError(
329 "WindowsMmapFile failed to map file view: " + filename_,
330 GetLastError());
331 } else {
332 mapped_end_ = mapped_begin_ + view_size_;
333 dst_ = mapped_begin_;
334 last_sync_ = mapped_begin_;
335 pending_sync_ = false;
336 }
337 return status;
338 }
339
PreallocateInternal(uint64_t spaceToReserve)340 Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
341 return fallocate(filename_, hFile_, spaceToReserve);
342 }
343
WinMmapFile(const std::string & fname,HANDLE hFile,size_t page_size,size_t allocation_granularity,const EnvOptions & options)344 WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
345 size_t page_size, size_t allocation_granularity,
346 const EnvOptions& options)
347 : WinFileData(fname, hFile, false),
348 WritableFile(options),
349 hMap_(NULL),
350 page_size_(page_size),
351 allocation_granularity_(allocation_granularity),
352 reserved_size_(0),
353 mapping_size_(0),
354 view_size_(0),
355 mapped_begin_(nullptr),
356 mapped_end_(nullptr),
357 dst_(nullptr),
358 last_sync_(nullptr),
359 file_offset_(0),
360 pending_sync_(false) {
361 // Allocation granularity must be obtained from GetSystemInfo() and must be
362 // a power of two.
363 assert(allocation_granularity > 0);
364 assert((allocation_granularity & (allocation_granularity - 1)) == 0);
365
366 assert(page_size > 0);
367 assert((page_size & (page_size - 1)) == 0);
368
369 // Only for memory mapped writes
370 assert(options.use_mmap_writes);
371
372 // View size must be both the multiple of allocation_granularity AND the
373 // page size and the granularity is usually a multiple of a page size.
374 const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
375 view_size_ = Roundup(viewSize, allocation_granularity_);
376 }
377
~WinMmapFile()378 WinMmapFile::~WinMmapFile() {
379 if (hFile_) {
380 this->Close();
381 }
382 }
383
Append(const Slice & data)384 Status WinMmapFile::Append(const Slice& data) {
385 const char* src = data.data();
386 size_t left = data.size();
387
388 while (left > 0) {
389 assert(mapped_begin_ <= dst_);
390 size_t avail = mapped_end_ - dst_;
391
392 if (avail == 0) {
393 Status s = UnmapCurrentRegion();
394 if (s.ok()) {
395 s = MapNewRegion();
396 }
397
398 if (!s.ok()) {
399 return s;
400 }
401 } else {
402 size_t n = std::min(left, avail);
403 memcpy(dst_, src, n);
404 dst_ += n;
405 src += n;
406 left -= n;
407 pending_sync_ = true;
408 }
409 }
410
411 // Now make sure that the last partial page is padded with zeros if needed
412 size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
413 if (bytesToPad > 0) {
414 memset(dst_, 0, bytesToPad);
415 }
416
417 return Status::OK();
418 }
419
420 // Means Close() will properly take care of truncate
421 // and it does not need any additional information
Truncate(uint64_t size)422 Status WinMmapFile::Truncate(uint64_t size) {
423 return Status::OK();
424 }
425
Close()426 Status WinMmapFile::Close() {
427 Status s;
428
429 assert(NULL != hFile_);
430
431 // We truncate to the precise size so no
432 // uninitialized data at the end. SetEndOfFile
433 // which we use does not write zeros and it is good.
434 uint64_t targetSize = GetFileSize();
435
436 if (mapped_begin_ != nullptr) {
437 // Sync before unmapping to make sure everything
438 // is on disk and there is not a lazy writing
439 // so we are deterministic with the tests
440 Sync();
441 s = UnmapCurrentRegion();
442 }
443
444 if (NULL != hMap_) {
445 BOOL ret = ::CloseHandle(hMap_);
446 if (!ret && s.ok()) {
447 auto lastError = GetLastError();
448 s = IOErrorFromWindowsError(
449 "Failed to Close mapping for file: " + filename_, lastError);
450 }
451
452 hMap_ = NULL;
453 }
454
455 if (hFile_ != NULL) {
456
457 TruncateFile(targetSize);
458
459 BOOL ret = ::CloseHandle(hFile_);
460 hFile_ = NULL;
461
462 if (!ret && s.ok()) {
463 auto lastError = GetLastError();
464 s = IOErrorFromWindowsError(
465 "Failed to close file map handle: " + filename_, lastError);
466 }
467 }
468
469 return s;
470 }
471
Flush()472 Status WinMmapFile::Flush() { return Status::OK(); }
473
474 // Flush only data
Sync()475 Status WinMmapFile::Sync() {
476 Status s;
477
478 // Some writes occurred since last sync
479 if (dst_ > last_sync_) {
480 assert(mapped_begin_);
481 assert(dst_);
482 assert(dst_ > mapped_begin_);
483 assert(dst_ < mapped_end_);
484
485 size_t page_begin =
486 TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
487 size_t page_end =
488 TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
489
490 // Flush only the amount of that is a multiple of pages
491 if (!::FlushViewOfFile(mapped_begin_ + page_begin,
492 (page_end - page_begin) + page_size_)) {
493 s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
494 GetLastError());
495 } else {
496 last_sync_ = dst_;
497 }
498 }
499
500 return s;
501 }
502
503 /**
504 * Flush data as well as metadata to stable storage.
505 */
Fsync()506 Status WinMmapFile::Fsync() {
507 Status s = Sync();
508
509 // Flush metadata
510 if (s.ok() && pending_sync_) {
511 if (!::FlushFileBuffers(hFile_)) {
512 s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
513 GetLastError());
514 }
515 pending_sync_ = false;
516 }
517
518 return s;
519 }
520
521 /**
522 * Get the size of valid data in the file. This will not match the
523 * size that is returned from the filesystem because we use mmap
524 * to extend file by map_size every time.
525 */
GetFileSize()526 uint64_t WinMmapFile::GetFileSize() {
527 size_t used = dst_ - mapped_begin_;
528 return file_offset_ + used;
529 }
530
InvalidateCache(size_t offset,size_t length)531 Status WinMmapFile::InvalidateCache(size_t offset, size_t length) {
532 return Status::OK();
533 }
534
Allocate(uint64_t offset,uint64_t len)535 Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) {
536 Status status;
537 TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);
538
539 // Make sure that we reserve an aligned amount of space
540 // since the reservation block size is driven outside so we want
541 // to check if we are ok with reservation here
542 size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len), view_size_);
543 // Nothing to do
544 if (spaceToReserve <= reserved_size_) {
545 return status;
546 }
547
548 IOSTATS_TIMER_GUARD(allocate_nanos);
549 status = PreallocateInternal(spaceToReserve);
550 if (status.ok()) {
551 reserved_size_ = spaceToReserve;
552 }
553 return status;
554 }
555
GetUniqueId(char * id,size_t max_size) const556 size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
557 return GetUniqueIdFromFile(hFile_, id, max_size);
558 }
559
560 //////////////////////////////////////////////////////////////////////////////////
561 // WinSequentialFile
562
WinSequentialFile(const std::string & fname,HANDLE f,const EnvOptions & options)563 WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
564 const EnvOptions& options)
565 : WinFileData(fname, f, options.use_direct_reads) {}
566
~WinSequentialFile()567 WinSequentialFile::~WinSequentialFile() {
568 assert(hFile_ != INVALID_HANDLE_VALUE);
569 }
570
Read(size_t n,Slice * result,char * scratch)571 Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) {
572 Status s;
573 size_t r = 0;
574
575 assert(result != nullptr);
576 if (WinFileData::use_direct_io()) {
577 return Status::NotSupported("Read() does not support direct_io");
578 }
579
580 // Windows ReadFile API accepts a DWORD.
581 // While it is possible to read in a loop if n is too big
582 // it is an unlikely case.
583 if (n > std::numeric_limits<DWORD>::max()) {
584 return Status::InvalidArgument("n is too big for a single ReadFile: "
585 + filename_);
586 }
587
588 DWORD bytesToRead = static_cast<DWORD>(n); //cast is safe due to the check above
589 DWORD bytesRead = 0;
590 BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
591 if (ret != FALSE) {
592 r = bytesRead;
593 } else {
594 auto lastError = GetLastError();
595 if (lastError != ERROR_HANDLE_EOF) {
596 s = IOErrorFromWindowsError("ReadFile failed: " + filename_,
597 lastError);
598 }
599 }
600
601 *result = Slice(scratch, r);
602 return s;
603 }
604
PositionedReadInternal(char * src,size_t numBytes,uint64_t offset,size_t & bytes_read) const605 Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
606 uint64_t offset, size_t& bytes_read) const {
607 return pread(this, src, numBytes, offset, bytes_read);
608 }
609
PositionedRead(uint64_t offset,size_t n,Slice * result,char * scratch)610 Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result,
611 char* scratch) {
612
613 Status s;
614
615 if (!WinFileData::use_direct_io()) {
616 return Status::NotSupported("This function is only used for direct_io");
617 }
618
619 if (!IsSectorAligned(static_cast<size_t>(offset)) ||
620 !IsSectorAligned(n)) {
621 return Status::InvalidArgument(
622 "WinSequentialFile::PositionedRead: offset is not properly aligned");
623 }
624
625 size_t bytes_read = 0; // out param
626 s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset, bytes_read);
627 *result = Slice(scratch, bytes_read);
628 return s;
629 }
630
631
Skip(uint64_t n)632 Status WinSequentialFile::Skip(uint64_t n) {
633 // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit
634 // integer. As such it is a highly unlikley case to have n so large.
635 if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
636 return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" +
637 filename_);
638 }
639
640 LARGE_INTEGER li;
641 li.QuadPart = static_cast<LONGLONG>(n); //cast is safe due to the check above
642 BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
643 if (ret == FALSE) {
644 auto lastError = GetLastError();
645 return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
646 lastError);
647 }
648 return Status::OK();
649 }
650
InvalidateCache(size_t offset,size_t length)651 Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
652 return Status::OK();
653 }
654
655 //////////////////////////////////////////////////////////////////////////////////////////////////
/// WinRandomAccessImpl
657
658 inline
PositionedReadInternal(char * src,size_t numBytes,uint64_t offset,size_t & bytes_read) const659 Status WinRandomAccessImpl::PositionedReadInternal(char* src,
660 size_t numBytes,
661 uint64_t offset,
662 size_t& bytes_read) const {
663 return pread(file_base_, src, numBytes, offset, bytes_read);
664 }
665
666 inline
WinRandomAccessImpl(WinFileData * file_base,size_t alignment,const EnvOptions & options)667 WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
668 size_t alignment,
669 const EnvOptions& options) :
670 file_base_(file_base),
671 alignment_(alignment) {
672
673 assert(!options.use_mmap_reads);
674 }
675
676 inline
ReadImpl(uint64_t offset,size_t n,Slice * result,char * scratch) const677 Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
678 char* scratch) const {
679
680 Status s;
681
682 // Check buffer alignment
683 if (file_base_->use_direct_io()) {
684 if (!IsSectorAligned(static_cast<size_t>(offset)) ||
685 !IsAligned(alignment_, scratch)) {
686 return Status::InvalidArgument(
687 "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned");
688 }
689 }
690
691 if (n == 0) {
692 *result = Slice(scratch, 0);
693 return s;
694 }
695
696 size_t bytes_read = 0;
697 s = PositionedReadInternal(scratch, n, offset, bytes_read);
698 *result = Slice(scratch, bytes_read);
699 return s;
700 }
701
702 ///////////////////////////////////////////////////////////////////////////////////////////////////
703 /// WinRandomAccessFile
704
WinRandomAccessFile(const std::string & fname,HANDLE hFile,size_t alignment,const EnvOptions & options)705 WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
706 size_t alignment,
707 const EnvOptions& options)
708 : WinFileData(fname, hFile, options.use_direct_reads),
709 WinRandomAccessImpl(this, alignment, options) {}
710
~WinRandomAccessFile()711 WinRandomAccessFile::~WinRandomAccessFile() {
712 }
713
Read(uint64_t offset,size_t n,Slice * result,char * scratch) const714 Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
715 char* scratch) const {
716 return ReadImpl(offset, n, result, scratch);
717 }
718
InvalidateCache(size_t offset,size_t length)719 Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
720 return Status::OK();
721 }
722
GetUniqueId(char * id,size_t max_size) const723 size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
724 return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
725 }
726
GetRequiredBufferAlignment() const727 size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
728 return GetAlignment();
729 }
730
731 /////////////////////////////////////////////////////////////////////////////
732 // WinWritableImpl
733 //
734
735 inline
PreallocateInternal(uint64_t spaceToReserve)736 Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
737 return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve);
738 }
739
740 inline
WinWritableImpl(WinFileData * file_data,size_t alignment)741 WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment)
742 : file_data_(file_data),
743 alignment_(alignment),
744 next_write_offset_(0),
745 reservedsize_(0) {
746
747 // Query current position in case ReopenWritableFile is called
748 // This position is only important for buffered writes
749 // for unbuffered writes we explicitely specify the position.
750 LARGE_INTEGER zero_move;
751 zero_move.QuadPart = 0; // Do not move
752 LARGE_INTEGER pos;
753 pos.QuadPart = 0;
754 BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
755 FILE_CURRENT);
756 // Querying no supped to fail
757 if (ret != 0) {
758 next_write_offset_ = pos.QuadPart;
759 } else {
760 assert(false);
761 }
762 }
763
764 inline
AppendImpl(const Slice & data)765 Status WinWritableImpl::AppendImpl(const Slice& data) {
766
767 Status s;
768
769 if (data.size() > std::numeric_limits<DWORD>::max()) {
770 return Status::InvalidArgument("data is too long for a single write" +
771 file_data_->GetName());
772 }
773
774 size_t bytes_written = 0; // out param
775
776 if (file_data_->use_direct_io()) {
777 // With no offset specified we are appending
778 // to the end of the file
779 assert(IsSectorAligned(next_write_offset_));
780 if (!IsSectorAligned(data.size()) ||
781 !IsAligned(static_cast<size_t>(GetAlignement()), data.data())) {
782 s = Status::InvalidArgument(
783 "WriteData must be page aligned, size must be sector aligned");
784 } else {
785 s = pwrite(file_data_, data, next_write_offset_, bytes_written);
786 }
787 } else {
788
789 DWORD bytesWritten = 0;
790 if (!WriteFile(file_data_->GetFileHandle(), data.data(),
791 static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
792 auto lastError = GetLastError();
793 s = IOErrorFromWindowsError(
794 "Failed to WriteFile: " + file_data_->GetName(),
795 lastError);
796 } else {
797 bytes_written = bytesWritten;
798 }
799 }
800
801 if(s.ok()) {
802 if (bytes_written == data.size()) {
803 // This matters for direct_io cases where
804 // we rely on the fact that next_write_offset_
805 // is sector aligned
806 next_write_offset_ += bytes_written;
807 } else {
808 s = Status::IOError("Failed to write all bytes: " +
809 file_data_->GetName());
810 }
811 }
812
813 return s;
814 }
815
816 inline
PositionedAppendImpl(const Slice & data,uint64_t offset)817 Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) {
818
819 if(file_data_->use_direct_io()) {
820 if (!IsSectorAligned(static_cast<size_t>(offset)) ||
821 !IsSectorAligned(data.size()) ||
822 !IsAligned(static_cast<size_t>(GetAlignement()), data.data())) {
823 return Status::InvalidArgument(
824 "Data and offset must be page aligned, size must be sector aligned");
825 }
826 }
827
828 size_t bytes_written = 0;
829 Status s = pwrite(file_data_, data, offset, bytes_written);
830
831 if(s.ok()) {
832 if (bytes_written == data.size()) {
833 // For sequential write this would be simple
834 // size extension by data.size()
835 uint64_t write_end = offset + bytes_written;
836 if (write_end >= next_write_offset_) {
837 next_write_offset_ = write_end;
838 }
839 } else {
840 s = Status::IOError("Failed to write all of the requested data: " +
841 file_data_->GetName());
842 }
843 }
844 return s;
845 }
846
847 inline
TruncateImpl(uint64_t size)848 Status WinWritableImpl::TruncateImpl(uint64_t size) {
849
850 // It is tempting to check for the size for sector alignment
851 // but truncation may come at the end and there is not a requirement
852 // for this to be sector aligned so long as we do not attempt to write
853 // after that. The interface docs state that the behavior is undefined
854 // in that case.
855 Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(),
856 size);
857
858 if (s.ok()) {
859 next_write_offset_ = size;
860 }
861 return s;
862 }
863
864 inline
CloseImpl()865 Status WinWritableImpl::CloseImpl() {
866
867 Status s;
868
869 auto hFile = file_data_->GetFileHandle();
870 assert(INVALID_HANDLE_VALUE != hFile);
871
872 if (!::FlushFileBuffers(hFile)) {
873 auto lastError = GetLastError();
874 s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " +
875 file_data_->GetName(),
876 lastError);
877 }
878
879 if(!file_data_->CloseFile() && s.ok()) {
880 auto lastError = GetLastError();
881 s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(),
882 lastError);
883 }
884 return s;
885 }
886
887 inline
SyncImpl()888 Status WinWritableImpl::SyncImpl() {
889 Status s;
890 if (!::FlushFileBuffers (file_data_->GetFileHandle())) {
891 auto lastError = GetLastError();
892 s = IOErrorFromWindowsError(
893 "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError);
894 }
895 return s;
896 }
897
898
899 inline
AllocateImpl(uint64_t offset,uint64_t len)900 Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
901 Status status;
902 TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds);
903
904 // Make sure that we reserve an aligned amount of space
905 // since the reservation block size is driven outside so we want
906 // to check if we are ok with reservation here
907 size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len), static_cast<size_t>(alignment_));
908 // Nothing to do
909 if (spaceToReserve <= reservedsize_) {
910 return status;
911 }
912
913 IOSTATS_TIMER_GUARD(allocate_nanos);
914 status = PreallocateInternal(spaceToReserve);
915 if (status.ok()) {
916 reservedsize_ = spaceToReserve;
917 }
918 return status;
919 }
920
921
922 ////////////////////////////////////////////////////////////////////////////////
923 /// WinWritableFile
924
WinWritableFile(const std::string & fname,HANDLE hFile,size_t alignment,size_t,const EnvOptions & options)925 WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
926 size_t alignment, size_t /* capacity */,
927 const EnvOptions& options)
928 : WinFileData(fname, hFile, options.use_direct_writes),
929 WinWritableImpl(this, alignment),
930 WritableFile(options) {
931 assert(!options.use_mmap_writes);
932 }
933
~WinWritableFile()934 WinWritableFile::~WinWritableFile() {
935 }
936
937 // Indicates if the class makes use of direct I/O
use_direct_io() const938 bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); }
939
GetRequiredBufferAlignment() const940 size_t WinWritableFile::GetRequiredBufferAlignment() const {
941 return static_cast<size_t>(GetAlignement());
942 }
943
Append(const Slice & data)944 Status WinWritableFile::Append(const Slice& data) {
945 return AppendImpl(data);
946 }
947
PositionedAppend(const Slice & data,uint64_t offset)948 Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
949 return PositionedAppendImpl(data, offset);
950 }
951
952 // Need to implement this so the file is truncated correctly
953 // when buffered and unbuffered mode
Truncate(uint64_t size)954 Status WinWritableFile::Truncate(uint64_t size) {
955 return TruncateImpl(size);
956 }
957
Close()958 Status WinWritableFile::Close() {
959 return CloseImpl();
960 }
961
962 // write out the cached data to the OS cache
963 // This is now taken care of the WritableFileWriter
Flush()964 Status WinWritableFile::Flush() {
965 return Status::OK();
966 }
967
Sync()968 Status WinWritableFile::Sync() {
969 return SyncImpl();
970 }
971
Fsync()972 Status WinWritableFile::Fsync() { return SyncImpl(); }
973
IsSyncThreadSafe() const974 bool WinWritableFile::IsSyncThreadSafe() const { return true; }
975
GetFileSize()976 uint64_t WinWritableFile::GetFileSize() {
977 return GetFileNextWriteOffset();
978 }
979
Allocate(uint64_t offset,uint64_t len)980 Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) {
981 return AllocateImpl(offset, len);
982 }
983
GetUniqueId(char * id,size_t max_size) const984 size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
985 return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
986 }
987
988 /////////////////////////////////////////////////////////////////////////
989 /// WinRandomRWFile
990
WinRandomRWFile(const std::string & fname,HANDLE hFile,size_t alignment,const EnvOptions & options)991 WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
992 size_t alignment, const EnvOptions& options)
993 : WinFileData(fname, hFile,
994 options.use_direct_reads && options.use_direct_writes),
995 WinRandomAccessImpl(this, alignment, options),
996 WinWritableImpl(this, alignment) {}
997
use_direct_io() const998 bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); }
999
GetRequiredBufferAlignment() const1000 size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
1001 return static_cast<size_t>(GetAlignement());
1002 }
1003
Write(uint64_t offset,const Slice & data)1004 Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) {
1005 return PositionedAppendImpl(data, offset);
1006 }
1007
Read(uint64_t offset,size_t n,Slice * result,char * scratch) const1008 Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
1009 char* scratch) const {
1010 return ReadImpl(offset, n, result, scratch);
1011 }
1012
Flush()1013 Status WinRandomRWFile::Flush() {
1014 return Status::OK();
1015 }
1016
Sync()1017 Status WinRandomRWFile::Sync() {
1018 return SyncImpl();
1019 }
1020
Close()1021 Status WinRandomRWFile::Close() {
1022 return CloseImpl();
1023 }
1024
1025 //////////////////////////////////////////////////////////////////////////
/// WinMemoryMappedBuffer
~WinMemoryMappedBuffer()1027 WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
1028 BOOL ret
1029 #if defined(_MSC_VER)
1030 = FALSE;
1031 #else
1032 __attribute__((__unused__));
1033 #endif
1034 if (base_ != nullptr) {
1035 ret = ::UnmapViewOfFile(base_);
1036 assert(ret);
1037 base_ = nullptr;
1038 }
1039 if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
1040 ret = ::CloseHandle(map_handle_);
1041 assert(ret);
1042 map_handle_ = NULL;
1043 }
1044 if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
1045 ret = ::CloseHandle(file_handle_);
1046 assert(ret);
1047 file_handle_ = NULL;
1048 }
1049 }
1050
1051 //////////////////////////////////////////////////////////////////////////
1052 /// WinDirectory
1053
Fsync()1054 Status WinDirectory::Fsync() { return Status::OK(); }
1055
GetUniqueId(char * id,size_t max_size) const1056 size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
1057 return GetUniqueIdFromFile(handle_, id, max_size);
1058 }
1059 //////////////////////////////////////////////////////////////////////////
1060 /// WinFileLock
1061
~WinFileLock()1062 WinFileLock::~WinFileLock() {
1063 BOOL ret __attribute__((__unused__));
1064 ret = ::CloseHandle(hFile_);
1065 assert(ret);
1066 }
1067
1068 }
1069 } // namespace ROCKSDB_NAMESPACE
1070