1 //  Copyright (c) 2011-present, Facebook, Inc.  All rights reserved.
2 //  This source code is licensed under both the GPLv2 (found in the
3 //  COPYING file in the root directory) and Apache 2.0 License
4 //  (found in the LICENSE.Apache file in the root directory).
5 //
6 // Copyright (c) 2011 The LevelDB Authors. All rights reserved.
7 // Use of this source code is governed by a BSD-style license that can be
8 // found in the LICENSE file. See the AUTHORS file for names of contributors.
9 
10 #include "port/win/io_win.h"
11 
12 #include "monitoring/iostats_context_imp.h"
13 #include "test_util/sync_point.h"
14 #include "util/aligned_buffer.h"
15 #include "util/coding.h"
16 
17 namespace ROCKSDB_NAMESPACE {
18 namespace port {
19 
20 /*
21 * DirectIOHelper
22 */
23 namespace {
24 
// Granularity, in bytes, against which IsSectorAligned() validates values.
constexpr size_t kSectorSize = 512;
26 
// Returns true when `alignment` is a power of two, i.e. has exactly one bit
// set. Zero is rejected explicitly: it passes the `x & (x - 1)` test
// (0 & SIZE_MAX == 0) even though it is not a power of two.
inline bool IsPowerOfTwo(const size_t alignment) {
  return alignment != 0 && (alignment & (alignment - 1)) == 0;
}
31 
32 inline
IsSectorAligned(const size_t off)33 bool IsSectorAligned(const size_t off) {
34   return (off & (kSectorSize - 1)) == 0;
35 }
36 
// Reports whether the address stored in `ptr` has none of the bits of
// (alignment - 1) set.
inline bool IsAligned(size_t alignment, const void* ptr) {
  const uintptr_t address = reinterpret_cast<uintptr_t>(ptr);
  const uintptr_t low_bits_mask = alignment - 1;
  return (address & low_bits_mask) == 0;
}
41 }
42 
43 
GetWindowsErrSz(DWORD err)44 std::string GetWindowsErrSz(DWORD err) {
45   LPSTR lpMsgBuf;
46   FormatMessageA(FORMAT_MESSAGE_ALLOCATE_BUFFER | FORMAT_MESSAGE_FROM_SYSTEM |
47     FORMAT_MESSAGE_IGNORE_INSERTS,
48     NULL, err,
49     0,  // Default language
50     reinterpret_cast<LPSTR>(&lpMsgBuf), 0, NULL);
51 
52   std::string Err = lpMsgBuf;
53   LocalFree(lpMsgBuf);
54   return Err;
55 }
56 
// We preserve the original names of these interfaces (pread/pwrite) to denote
// the original idea behind them.
// All reads happen at a specified offset, and the pwrite interface does not
// change the position of the file pointer. Judging from the man page and
// errno, it executes lseek atomically to return the file position back to
// where it was. WriteFile() does not have this capability. Therefore, for
// both pread and pwrite the file pointer is advanced to the next position,
// which is fine for writes because they are (should be) sequential.
// Because all the reads/writes happen at the specified offset, the caller
// should, in theory, not rely on the current file offset.
pwrite(const WinFileData * file_data,const Slice & data,uint64_t offset,size_t & bytes_written)70 Status pwrite(const WinFileData* file_data, const Slice& data,
71   uint64_t offset, size_t& bytes_written) {
72 
73   Status s;
74   bytes_written = 0;
75 
76   size_t num_bytes = data.size();
77   if (num_bytes > std::numeric_limits<DWORD>::max()) {
78     // May happen in 64-bit builds where size_t is 64-bits but
79     // long is still 32-bit, but that's the API here at the moment
80     return Status::InvalidArgument("num_bytes is too large for a single write: " +
81           file_data->GetName());
82   }
83 
84   OVERLAPPED overlapped = { 0 };
85   ULARGE_INTEGER offsetUnion;
86   offsetUnion.QuadPart = offset;
87 
88   overlapped.Offset = offsetUnion.LowPart;
89   overlapped.OffsetHigh = offsetUnion.HighPart;
90 
91   DWORD bytesWritten = 0;
92 
93   if (FALSE == WriteFile(file_data->GetFileHandle(), data.data(), static_cast<DWORD>(num_bytes),
94     &bytesWritten, &overlapped)) {
95     auto lastError = GetLastError();
96     s = IOErrorFromWindowsError("WriteFile failed: " + file_data->GetName(),
97       lastError);
98   } else {
99     bytes_written = bytesWritten;
100   }
101 
102   return s;
103 }
104 
105 // See comments for pwrite above
pread(const WinFileData * file_data,char * src,size_t num_bytes,uint64_t offset,size_t & bytes_read)106 Status pread(const WinFileData* file_data, char* src, size_t num_bytes,
107   uint64_t offset, size_t& bytes_read) {
108 
109   Status s;
110   bytes_read = 0;
111 
112   if (num_bytes > std::numeric_limits<DWORD>::max()) {
113     return Status::InvalidArgument("num_bytes is too large for a single read: " +
114       file_data->GetName());
115   }
116 
117   OVERLAPPED overlapped = { 0 };
118   ULARGE_INTEGER offsetUnion;
119   offsetUnion.QuadPart = offset;
120 
121   overlapped.Offset = offsetUnion.LowPart;
122   overlapped.OffsetHigh = offsetUnion.HighPart;
123 
124   DWORD bytesRead = 0;
125 
126   if (FALSE == ReadFile(file_data->GetFileHandle(), src, static_cast<DWORD>(num_bytes),
127     &bytesRead, &overlapped)) {
128     auto lastError = GetLastError();
129     // EOF is OK with zero bytes read
130     if (lastError != ERROR_HANDLE_EOF) {
131       s = IOErrorFromWindowsError("ReadFile failed: " + file_data->GetName(),
132         lastError);
133     }
134   } else {
135     bytes_read = bytesRead;
136   }
137 
138   return s;
139 }
140 
141 // SetFileInformationByHandle() is capable of fast pre-allocates.
142 // However, this does not change the file end position unless the file is
143 // truncated and the pre-allocated space is not considered filled with zeros.
fallocate(const std::string & filename,HANDLE hFile,uint64_t to_size)144 Status fallocate(const std::string& filename, HANDLE hFile,
145   uint64_t to_size) {
146   Status status;
147 
148   FILE_ALLOCATION_INFO alloc_info;
149   alloc_info.AllocationSize.QuadPart = to_size;
150 
151   if (!SetFileInformationByHandle(hFile, FileAllocationInfo, &alloc_info,
152     sizeof(FILE_ALLOCATION_INFO))) {
153     auto lastError = GetLastError();
154     status = IOErrorFromWindowsError(
155       "Failed to pre-allocate space: " + filename, lastError);
156   }
157 
158   return status;
159 }
160 
ftruncate(const std::string & filename,HANDLE hFile,uint64_t toSize)161 Status ftruncate(const std::string& filename, HANDLE hFile,
162   uint64_t toSize) {
163   Status status;
164 
165   FILE_END_OF_FILE_INFO end_of_file;
166   end_of_file.EndOfFile.QuadPart = toSize;
167 
168   if (!SetFileInformationByHandle(hFile, FileEndOfFileInfo, &end_of_file,
169     sizeof(FILE_END_OF_FILE_INFO))) {
170     auto lastError = GetLastError();
171     status = IOErrorFromWindowsError("Failed to Set end of file: " + filename,
172       lastError);
173   }
174 
175   return status;
176 }
177 
GetUniqueIdFromFile(HANDLE,char *,size_t)178 size_t GetUniqueIdFromFile(HANDLE /*hFile*/, char* /*id*/,
179                            size_t /*max_size*/) {
180   // Returning 0 is safe as it causes the table reader to generate a unique ID.
181   // This is suboptimal for performance as it prevents multiple table readers
182   // for the same file from sharing cached blocks. For example, if users have
183   // a low value for `max_open_files`, there can be many table readers opened
184   // for the same file.
185   //
186   // TODO: this is a temporarily solution as it is safe but not optimal for
187   // performance. For more details see discussion in
188   // https://github.com/facebook/rocksdb/pull/5844.
189   return 0;
190 }
191 
192 ////////////////////////////////////////////////////////////////////////////////////////////////////
193 // WinMmapReadableFile
194 
WinMmapReadableFile(const std::string & fileName,HANDLE hFile,HANDLE hMap,const void * mapped_region,size_t length)195 WinMmapReadableFile::WinMmapReadableFile(const std::string& fileName,
196                                          HANDLE hFile, HANDLE hMap,
197                                          const void* mapped_region,
198                                          size_t length)
199     : WinFileData(fileName, hFile, false /* use_direct_io */),
200       hMap_(hMap),
201       mapped_region_(mapped_region),
202       length_(length) {}
203 
~WinMmapReadableFile()204 WinMmapReadableFile::~WinMmapReadableFile() {
205   BOOL ret __attribute__((__unused__));
206   ret = ::UnmapViewOfFile(mapped_region_);
207   assert(ret);
208 
209   ret = ::CloseHandle(hMap_);
210   assert(ret);
211 }
212 
Read(uint64_t offset,size_t n,Slice * result,char * scratch) const213 Status WinMmapReadableFile::Read(uint64_t offset, size_t n, Slice* result,
214   char* scratch) const {
215   Status s;
216 
217   if (offset > length_) {
218     *result = Slice();
219     return IOError(filename_, EINVAL);
220   } else if (offset + n > length_) {
221     n = length_ - static_cast<size_t>(offset);
222   }
223   *result =
224     Slice(reinterpret_cast<const char*>(mapped_region_)+offset, n);
225   return s;
226 }
227 
InvalidateCache(size_t offset,size_t length)228 Status WinMmapReadableFile::InvalidateCache(size_t offset, size_t length) {
229   return Status::OK();
230 }
231 
GetUniqueId(char * id,size_t max_size) const232 size_t WinMmapReadableFile::GetUniqueId(char* id, size_t max_size) const {
233   return GetUniqueIdFromFile(hFile_, id, max_size);
234 }
235 
236 ///////////////////////////////////////////////////////////////////////////////
237 /// WinMmapFile
238 
239 
240 // Can only truncate or reserve to a sector size aligned if
241 // used on files that are opened with Unbuffered I/O
TruncateFile(uint64_t toSize)242 Status WinMmapFile::TruncateFile(uint64_t toSize) {
243   return ftruncate(filename_, hFile_, toSize);
244 }
245 
UnmapCurrentRegion()246 Status WinMmapFile::UnmapCurrentRegion() {
247   Status status;
248 
249   if (mapped_begin_ != nullptr) {
250     if (!::UnmapViewOfFile(mapped_begin_)) {
251       status = IOErrorFromWindowsError(
252         "Failed to unmap file view: " + filename_, GetLastError());
253     }
254 
255     // Move on to the next portion of the file
256     file_offset_ += view_size_;
257 
258     // UnmapView automatically sends data to disk but not the metadata
259     // which is good and provides some equivalent of fdatasync() on Linux
260     // therefore, we donot need separate flag for metadata
261     mapped_begin_ = nullptr;
262     mapped_end_ = nullptr;
263     dst_ = nullptr;
264 
265     last_sync_ = nullptr;
266     pending_sync_ = false;
267   }
268 
269   return status;
270 }
271 
MapNewRegion()272 Status WinMmapFile::MapNewRegion() {
273 
274   Status status;
275 
276   assert(mapped_begin_ == nullptr);
277 
278   size_t minDiskSize = static_cast<size_t>(file_offset_) + view_size_;
279 
280   if (minDiskSize > reserved_size_) {
281     status = Allocate(file_offset_, view_size_);
282     if (!status.ok()) {
283       return status;
284     }
285   }
286 
287   // Need to remap
288   if (hMap_ == NULL || reserved_size_ > mapping_size_) {
289 
290     if (hMap_ != NULL) {
291       // Unmap the previous one
292       BOOL ret __attribute__((__unused__));
293       ret = ::CloseHandle(hMap_);
294       assert(ret);
295       hMap_ = NULL;
296     }
297 
298     ULARGE_INTEGER mappingSize;
299     mappingSize.QuadPart = reserved_size_;
300 
301     hMap_ = CreateFileMappingA(
302       hFile_,
303       NULL,                  // Security attributes
304       PAGE_READWRITE,        // There is not a write only mode for mapping
305       mappingSize.HighPart,  // Enable mapping the whole file but the actual
306       // amount mapped is determined by MapViewOfFile
307       mappingSize.LowPart,
308       NULL);  // Mapping name
309 
310     if (NULL == hMap_) {
311       return IOErrorFromWindowsError(
312         "WindowsMmapFile failed to create file mapping for: " + filename_,
313         GetLastError());
314     }
315 
316     mapping_size_ = reserved_size_;
317   }
318 
319   ULARGE_INTEGER offset;
320   offset.QuadPart = file_offset_;
321 
322   // View must begin at the granularity aligned offset
323   mapped_begin_ = reinterpret_cast<char*>(
324     MapViewOfFileEx(hMap_, FILE_MAP_WRITE, offset.HighPart, offset.LowPart,
325     view_size_, NULL));
326 
327   if (!mapped_begin_) {
328     status = IOErrorFromWindowsError(
329       "WindowsMmapFile failed to map file view: " + filename_,
330       GetLastError());
331   } else {
332     mapped_end_ = mapped_begin_ + view_size_;
333     dst_ = mapped_begin_;
334     last_sync_ = mapped_begin_;
335     pending_sync_ = false;
336   }
337   return status;
338 }
339 
PreallocateInternal(uint64_t spaceToReserve)340 Status WinMmapFile::PreallocateInternal(uint64_t spaceToReserve) {
341   return fallocate(filename_, hFile_, spaceToReserve);
342 }
343 
WinMmapFile(const std::string & fname,HANDLE hFile,size_t page_size,size_t allocation_granularity,const EnvOptions & options)344 WinMmapFile::WinMmapFile(const std::string& fname, HANDLE hFile,
345                          size_t page_size, size_t allocation_granularity,
346                          const EnvOptions& options)
347     : WinFileData(fname, hFile, false),
348       WritableFile(options),
349       hMap_(NULL),
350       page_size_(page_size),
351       allocation_granularity_(allocation_granularity),
352       reserved_size_(0),
353       mapping_size_(0),
354       view_size_(0),
355       mapped_begin_(nullptr),
356       mapped_end_(nullptr),
357       dst_(nullptr),
358       last_sync_(nullptr),
359       file_offset_(0),
360       pending_sync_(false) {
361   // Allocation granularity must be obtained from GetSystemInfo() and must be
362   // a power of two.
363   assert(allocation_granularity > 0);
364   assert((allocation_granularity & (allocation_granularity - 1)) == 0);
365 
366   assert(page_size > 0);
367   assert((page_size & (page_size - 1)) == 0);
368 
369   // Only for memory mapped writes
370   assert(options.use_mmap_writes);
371 
372   // View size must be both the multiple of allocation_granularity AND the
373   // page size and the granularity is usually a multiple of a page size.
374   const size_t viewSize = 32 * 1024; // 32Kb similar to the Windows File Cache in buffered mode
375   view_size_ = Roundup(viewSize, allocation_granularity_);
376 }
377 
~WinMmapFile()378 WinMmapFile::~WinMmapFile() {
379   if (hFile_) {
380     this->Close();
381   }
382 }
383 
Append(const Slice & data)384 Status WinMmapFile::Append(const Slice& data) {
385   const char* src = data.data();
386   size_t left = data.size();
387 
388   while (left > 0) {
389     assert(mapped_begin_ <= dst_);
390     size_t avail = mapped_end_ - dst_;
391 
392     if (avail == 0) {
393       Status s = UnmapCurrentRegion();
394       if (s.ok()) {
395         s = MapNewRegion();
396       }
397 
398       if (!s.ok()) {
399         return s;
400       }
401     } else {
402       size_t n = std::min(left, avail);
403       memcpy(dst_, src, n);
404       dst_ += n;
405       src += n;
406       left -= n;
407       pending_sync_ = true;
408     }
409   }
410 
411   // Now make sure that the last partial page is padded with zeros if needed
412   size_t bytesToPad = Roundup(size_t(dst_), page_size_) - size_t(dst_);
413   if (bytesToPad > 0) {
414     memset(dst_, 0, bytesToPad);
415   }
416 
417   return Status::OK();
418 }
419 
420 // Means Close() will properly take care of truncate
421 // and it does not need any additional information
Truncate(uint64_t size)422 Status WinMmapFile::Truncate(uint64_t size) {
423   return Status::OK();
424 }
425 
Close()426 Status WinMmapFile::Close() {
427   Status s;
428 
429   assert(NULL != hFile_);
430 
431   // We truncate to the precise size so no
432   // uninitialized data at the end. SetEndOfFile
433   // which we use does not write zeros and it is good.
434   uint64_t targetSize = GetFileSize();
435 
436   if (mapped_begin_ != nullptr) {
437     // Sync before unmapping to make sure everything
438     // is on disk and there is not a lazy writing
439     // so we are deterministic with the tests
440     Sync();
441     s = UnmapCurrentRegion();
442   }
443 
444   if (NULL != hMap_) {
445     BOOL ret = ::CloseHandle(hMap_);
446     if (!ret && s.ok()) {
447       auto lastError = GetLastError();
448       s = IOErrorFromWindowsError(
449         "Failed to Close mapping for file: " + filename_, lastError);
450     }
451 
452     hMap_ = NULL;
453   }
454 
455   if (hFile_ != NULL) {
456 
457     TruncateFile(targetSize);
458 
459     BOOL ret = ::CloseHandle(hFile_);
460     hFile_ = NULL;
461 
462     if (!ret && s.ok()) {
463       auto lastError = GetLastError();
464       s = IOErrorFromWindowsError(
465         "Failed to close file map handle: " + filename_, lastError);
466     }
467   }
468 
469   return s;
470 }
471 
Flush()472 Status WinMmapFile::Flush() { return Status::OK(); }
473 
474 // Flush only data
Sync()475 Status WinMmapFile::Sync() {
476   Status s;
477 
478   // Some writes occurred since last sync
479   if (dst_ > last_sync_) {
480     assert(mapped_begin_);
481     assert(dst_);
482     assert(dst_ > mapped_begin_);
483     assert(dst_ < mapped_end_);
484 
485     size_t page_begin =
486       TruncateToPageBoundary(page_size_, last_sync_ - mapped_begin_);
487     size_t page_end =
488       TruncateToPageBoundary(page_size_, dst_ - mapped_begin_ - 1);
489 
490     // Flush only the amount of that is a multiple of pages
491     if (!::FlushViewOfFile(mapped_begin_ + page_begin,
492       (page_end - page_begin) + page_size_)) {
493       s = IOErrorFromWindowsError("Failed to FlushViewOfFile: " + filename_,
494         GetLastError());
495     } else {
496       last_sync_ = dst_;
497     }
498   }
499 
500   return s;
501 }
502 
503 /**
504 * Flush data as well as metadata to stable storage.
505 */
Fsync()506 Status WinMmapFile::Fsync() {
507   Status s = Sync();
508 
509   // Flush metadata
510   if (s.ok() && pending_sync_) {
511     if (!::FlushFileBuffers(hFile_)) {
512       s = IOErrorFromWindowsError("Failed to FlushFileBuffers: " + filename_,
513         GetLastError());
514     }
515     pending_sync_ = false;
516   }
517 
518   return s;
519 }
520 
521 /**
522 * Get the size of valid data in the file. This will not match the
523 * size that is returned from the filesystem because we use mmap
524 * to extend file by map_size every time.
525 */
GetFileSize()526 uint64_t WinMmapFile::GetFileSize() {
527   size_t used = dst_ - mapped_begin_;
528   return file_offset_ + used;
529 }
530 
InvalidateCache(size_t offset,size_t length)531 Status WinMmapFile::InvalidateCache(size_t offset, size_t length) {
532   return Status::OK();
533 }
534 
Allocate(uint64_t offset,uint64_t len)535 Status WinMmapFile::Allocate(uint64_t offset, uint64_t len) {
536   Status status;
537   TEST_KILL_RANDOM("WinMmapFile::Allocate", rocksdb_kill_odds);
538 
539   // Make sure that we reserve an aligned amount of space
540   // since the reservation block size is driven outside so we want
541   // to check if we are ok with reservation here
542   size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len), view_size_);
543   // Nothing to do
544   if (spaceToReserve <= reserved_size_) {
545     return status;
546   }
547 
548   IOSTATS_TIMER_GUARD(allocate_nanos);
549   status = PreallocateInternal(spaceToReserve);
550   if (status.ok()) {
551     reserved_size_ = spaceToReserve;
552   }
553   return status;
554 }
555 
GetUniqueId(char * id,size_t max_size) const556 size_t WinMmapFile::GetUniqueId(char* id, size_t max_size) const {
557   return GetUniqueIdFromFile(hFile_, id, max_size);
558 }
559 
560 //////////////////////////////////////////////////////////////////////////////////
561 // WinSequentialFile
562 
WinSequentialFile(const std::string & fname,HANDLE f,const EnvOptions & options)563 WinSequentialFile::WinSequentialFile(const std::string& fname, HANDLE f,
564                                      const EnvOptions& options)
565     : WinFileData(fname, f, options.use_direct_reads) {}
566 
~WinSequentialFile()567 WinSequentialFile::~WinSequentialFile() {
568   assert(hFile_ != INVALID_HANDLE_VALUE);
569 }
570 
Read(size_t n,Slice * result,char * scratch)571 Status WinSequentialFile::Read(size_t n, Slice* result, char* scratch) {
572   Status s;
573   size_t r = 0;
574 
575   assert(result != nullptr);
576   if (WinFileData::use_direct_io()) {
577     return Status::NotSupported("Read() does not support direct_io");
578   }
579 
580   // Windows ReadFile API accepts a DWORD.
581   // While it is possible to read in a loop if n is too big
582   // it is an unlikely case.
583   if (n > std::numeric_limits<DWORD>::max()) {
584     return Status::InvalidArgument("n is too big for a single ReadFile: "
585       + filename_);
586   }
587 
588   DWORD bytesToRead = static_cast<DWORD>(n); //cast is safe due to the check above
589   DWORD bytesRead = 0;
590   BOOL ret = ReadFile(hFile_, scratch, bytesToRead, &bytesRead, NULL);
591   if (ret != FALSE) {
592     r = bytesRead;
593   } else {
594     auto lastError = GetLastError();
595     if (lastError != ERROR_HANDLE_EOF) {
596       s = IOErrorFromWindowsError("ReadFile failed: " + filename_,
597         lastError);
598     }
599   }
600 
601   *result = Slice(scratch, r);
602   return s;
603 }
604 
PositionedReadInternal(char * src,size_t numBytes,uint64_t offset,size_t & bytes_read) const605 Status WinSequentialFile::PositionedReadInternal(char* src, size_t numBytes,
606   uint64_t offset, size_t& bytes_read) const {
607   return pread(this, src, numBytes, offset, bytes_read);
608 }
609 
PositionedRead(uint64_t offset,size_t n,Slice * result,char * scratch)610 Status WinSequentialFile::PositionedRead(uint64_t offset, size_t n, Slice* result,
611   char* scratch) {
612 
613   Status s;
614 
615   if (!WinFileData::use_direct_io()) {
616     return Status::NotSupported("This function is only used for direct_io");
617   }
618 
619   if (!IsSectorAligned(static_cast<size_t>(offset)) ||
620       !IsSectorAligned(n)) {
621       return Status::InvalidArgument(
622         "WinSequentialFile::PositionedRead: offset is not properly aligned");
623   }
624 
625   size_t bytes_read = 0; // out param
626   s = PositionedReadInternal(scratch, static_cast<size_t>(n), offset, bytes_read);
627   *result = Slice(scratch, bytes_read);
628   return s;
629 }
630 
631 
Skip(uint64_t n)632 Status WinSequentialFile::Skip(uint64_t n) {
633   // Can't handle more than signed max as SetFilePointerEx accepts a signed 64-bit
634   // integer. As such it is a highly unlikley case to have n so large.
635   if (n > static_cast<uint64_t>(std::numeric_limits<LONGLONG>::max())) {
636     return Status::InvalidArgument("n is too large for a single SetFilePointerEx() call" +
637       filename_);
638   }
639 
640   LARGE_INTEGER li;
641   li.QuadPart = static_cast<LONGLONG>(n); //cast is safe due to the check above
642   BOOL ret = SetFilePointerEx(hFile_, li, NULL, FILE_CURRENT);
643   if (ret == FALSE) {
644     auto lastError = GetLastError();
645     return IOErrorFromWindowsError("Skip SetFilePointerEx():" + filename_,
646       lastError);
647   }
648   return Status::OK();
649 }
650 
InvalidateCache(size_t offset,size_t length)651 Status WinSequentialFile::InvalidateCache(size_t offset, size_t length) {
652   return Status::OK();
653 }
654 
655 //////////////////////////////////////////////////////////////////////////////////////////////////
656 /// WinRandomAccessBase
657 
658 inline
PositionedReadInternal(char * src,size_t numBytes,uint64_t offset,size_t & bytes_read) const659 Status WinRandomAccessImpl::PositionedReadInternal(char* src,
660   size_t numBytes,
661   uint64_t offset,
662   size_t& bytes_read) const {
663   return pread(file_base_, src, numBytes, offset, bytes_read);
664 }
665 
666 inline
WinRandomAccessImpl(WinFileData * file_base,size_t alignment,const EnvOptions & options)667 WinRandomAccessImpl::WinRandomAccessImpl(WinFileData* file_base,
668   size_t alignment,
669   const EnvOptions& options) :
670     file_base_(file_base),
671     alignment_(alignment) {
672 
673   assert(!options.use_mmap_reads);
674 }
675 
676 inline
ReadImpl(uint64_t offset,size_t n,Slice * result,char * scratch) const677 Status WinRandomAccessImpl::ReadImpl(uint64_t offset, size_t n, Slice* result,
678   char* scratch) const {
679 
680   Status s;
681 
682   // Check buffer alignment
683   if (file_base_->use_direct_io()) {
684     if (!IsSectorAligned(static_cast<size_t>(offset)) ||
685         !IsAligned(alignment_, scratch)) {
686       return Status::InvalidArgument(
687         "WinRandomAccessImpl::ReadImpl: offset or scratch is not properly aligned");
688     }
689   }
690 
691   if (n == 0) {
692     *result = Slice(scratch, 0);
693     return s;
694   }
695 
696   size_t bytes_read = 0;
697   s = PositionedReadInternal(scratch, n, offset, bytes_read);
698   *result = Slice(scratch, bytes_read);
699   return s;
700 }
701 
702 ///////////////////////////////////////////////////////////////////////////////////////////////////
703 /// WinRandomAccessFile
704 
WinRandomAccessFile(const std::string & fname,HANDLE hFile,size_t alignment,const EnvOptions & options)705 WinRandomAccessFile::WinRandomAccessFile(const std::string& fname, HANDLE hFile,
706                                          size_t alignment,
707                                          const EnvOptions& options)
708     : WinFileData(fname, hFile, options.use_direct_reads),
709       WinRandomAccessImpl(this, alignment, options) {}
710 
~WinRandomAccessFile()711 WinRandomAccessFile::~WinRandomAccessFile() {
712 }
713 
Read(uint64_t offset,size_t n,Slice * result,char * scratch) const714 Status WinRandomAccessFile::Read(uint64_t offset, size_t n, Slice* result,
715   char* scratch) const {
716   return ReadImpl(offset, n, result, scratch);
717 }
718 
InvalidateCache(size_t offset,size_t length)719 Status WinRandomAccessFile::InvalidateCache(size_t offset, size_t length) {
720   return Status::OK();
721 }
722 
GetUniqueId(char * id,size_t max_size) const723 size_t WinRandomAccessFile::GetUniqueId(char* id, size_t max_size) const {
724   return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
725 }
726 
GetRequiredBufferAlignment() const727 size_t WinRandomAccessFile::GetRequiredBufferAlignment() const {
728   return GetAlignment();
729 }
730 
731 /////////////////////////////////////////////////////////////////////////////
732 // WinWritableImpl
733 //
734 
735 inline
PreallocateInternal(uint64_t spaceToReserve)736 Status WinWritableImpl::PreallocateInternal(uint64_t spaceToReserve) {
737   return fallocate(file_data_->GetName(), file_data_->GetFileHandle(), spaceToReserve);
738 }
739 
740 inline
WinWritableImpl(WinFileData * file_data,size_t alignment)741 WinWritableImpl::WinWritableImpl(WinFileData* file_data, size_t alignment)
742   : file_data_(file_data),
743   alignment_(alignment),
744   next_write_offset_(0),
745   reservedsize_(0) {
746 
747   // Query current position in case ReopenWritableFile is called
748   // This position is only important for buffered writes
749   // for unbuffered writes we explicitely specify the position.
750   LARGE_INTEGER zero_move;
751   zero_move.QuadPart = 0; // Do not move
752   LARGE_INTEGER pos;
753   pos.QuadPart = 0;
754   BOOL ret = SetFilePointerEx(file_data_->GetFileHandle(), zero_move, &pos,
755       FILE_CURRENT);
756   // Querying no supped to fail
757   if (ret != 0) {
758     next_write_offset_ = pos.QuadPart;
759   } else {
760     assert(false);
761   }
762 }
763 
764 inline
AppendImpl(const Slice & data)765 Status WinWritableImpl::AppendImpl(const Slice& data) {
766 
767   Status s;
768 
769   if (data.size() > std::numeric_limits<DWORD>::max()) {
770     return Status::InvalidArgument("data is too long for a single write" +
771       file_data_->GetName());
772   }
773 
774   size_t bytes_written = 0; // out param
775 
776   if (file_data_->use_direct_io()) {
777     // With no offset specified we are appending
778     // to the end of the file
779     assert(IsSectorAligned(next_write_offset_));
780     if (!IsSectorAligned(data.size()) ||
781         !IsAligned(static_cast<size_t>(GetAlignement()), data.data())) {
782       s = Status::InvalidArgument(
783         "WriteData must be page aligned, size must be sector aligned");
784     } else {
785       s = pwrite(file_data_, data, next_write_offset_, bytes_written);
786     }
787   } else {
788 
789     DWORD bytesWritten = 0;
790     if (!WriteFile(file_data_->GetFileHandle(), data.data(),
791       static_cast<DWORD>(data.size()), &bytesWritten, NULL)) {
792       auto lastError = GetLastError();
793       s = IOErrorFromWindowsError(
794         "Failed to WriteFile: " + file_data_->GetName(),
795         lastError);
796     } else {
797       bytes_written = bytesWritten;
798     }
799   }
800 
801   if(s.ok()) {
802     if (bytes_written == data.size()) {
803       // This matters for direct_io cases where
804       // we rely on the fact that next_write_offset_
805       // is sector aligned
806       next_write_offset_ += bytes_written;
807     } else {
808       s = Status::IOError("Failed to write all bytes: " +
809         file_data_->GetName());
810     }
811   }
812 
813   return s;
814 }
815 
816 inline
PositionedAppendImpl(const Slice & data,uint64_t offset)817 Status WinWritableImpl::PositionedAppendImpl(const Slice& data, uint64_t offset) {
818 
819   if(file_data_->use_direct_io()) {
820     if (!IsSectorAligned(static_cast<size_t>(offset)) ||
821         !IsSectorAligned(data.size()) ||
822         !IsAligned(static_cast<size_t>(GetAlignement()), data.data())) {
823       return Status::InvalidArgument(
824         "Data and offset must be page aligned, size must be sector aligned");
825     }
826   }
827 
828   size_t bytes_written = 0;
829   Status s = pwrite(file_data_, data, offset, bytes_written);
830 
831   if(s.ok()) {
832     if (bytes_written == data.size()) {
833       // For sequential write this would be simple
834       // size extension by data.size()
835       uint64_t write_end = offset + bytes_written;
836       if (write_end >= next_write_offset_) {
837         next_write_offset_ = write_end;
838       }
839     } else {
840       s = Status::IOError("Failed to write all of the requested data: " +
841         file_data_->GetName());
842     }
843   }
844   return s;
845 }
846 
847 inline
TruncateImpl(uint64_t size)848 Status WinWritableImpl::TruncateImpl(uint64_t size) {
849 
850   // It is tempting to check for the size for sector alignment
851   // but truncation may come at the end and there is not a requirement
852   // for this to be sector aligned so long as we do not attempt to write
853   // after that. The interface docs state that the behavior is undefined
854   // in that case.
855   Status s = ftruncate(file_data_->GetName(), file_data_->GetFileHandle(),
856     size);
857 
858   if (s.ok()) {
859     next_write_offset_ = size;
860   }
861   return s;
862 }
863 
864 inline
CloseImpl()865 Status WinWritableImpl::CloseImpl() {
866 
867   Status s;
868 
869   auto hFile = file_data_->GetFileHandle();
870   assert(INVALID_HANDLE_VALUE != hFile);
871 
872   if (!::FlushFileBuffers(hFile)) {
873     auto lastError = GetLastError();
874     s = IOErrorFromWindowsError("FlushFileBuffers failed at Close() for: " +
875       file_data_->GetName(),
876       lastError);
877   }
878 
879   if(!file_data_->CloseFile() && s.ok()) {
880     auto lastError = GetLastError();
881     s = IOErrorFromWindowsError("CloseHandle failed for: " + file_data_->GetName(),
882       lastError);
883   }
884   return s;
885 }
886 
887 inline
SyncImpl()888 Status WinWritableImpl::SyncImpl() {
889   Status s;
890   if (!::FlushFileBuffers (file_data_->GetFileHandle())) {
891     auto lastError = GetLastError();
892     s = IOErrorFromWindowsError(
893         "FlushFileBuffers failed at Sync() for: " + file_data_->GetName(), lastError);
894   }
895   return s;
896 }
897 
898 
899 inline
AllocateImpl(uint64_t offset,uint64_t len)900 Status WinWritableImpl::AllocateImpl(uint64_t offset, uint64_t len) {
901   Status status;
902   TEST_KILL_RANDOM("WinWritableFile::Allocate", rocksdb_kill_odds);
903 
904   // Make sure that we reserve an aligned amount of space
905   // since the reservation block size is driven outside so we want
906   // to check if we are ok with reservation here
907   size_t spaceToReserve = Roundup(static_cast<size_t>(offset + len), static_cast<size_t>(alignment_));
908   // Nothing to do
909   if (spaceToReserve <= reservedsize_) {
910     return status;
911   }
912 
913   IOSTATS_TIMER_GUARD(allocate_nanos);
914   status = PreallocateInternal(spaceToReserve);
915   if (status.ok()) {
916     reservedsize_ = spaceToReserve;
917   }
918   return status;
919 }
920 
921 
922 ////////////////////////////////////////////////////////////////////////////////
923 /// WinWritableFile
924 
WinWritableFile(const std::string & fname,HANDLE hFile,size_t alignment,size_t,const EnvOptions & options)925 WinWritableFile::WinWritableFile(const std::string& fname, HANDLE hFile,
926                                  size_t alignment, size_t /* capacity */,
927                                  const EnvOptions& options)
928     : WinFileData(fname, hFile, options.use_direct_writes),
929       WinWritableImpl(this, alignment),
930       WritableFile(options) {
931   assert(!options.use_mmap_writes);
932 }
933 
~WinWritableFile()934 WinWritableFile::~WinWritableFile() {
935 }
936 
937 // Indicates if the class makes use of direct I/O
use_direct_io() const938 bool WinWritableFile::use_direct_io() const { return WinFileData::use_direct_io(); }
939 
GetRequiredBufferAlignment() const940 size_t WinWritableFile::GetRequiredBufferAlignment() const {
941   return static_cast<size_t>(GetAlignement());
942 }
943 
Append(const Slice & data)944 Status WinWritableFile::Append(const Slice& data) {
945   return AppendImpl(data);
946 }
947 
PositionedAppend(const Slice & data,uint64_t offset)948 Status WinWritableFile::PositionedAppend(const Slice& data, uint64_t offset) {
949   return PositionedAppendImpl(data, offset);
950 }
951 
952 // Need to implement this so the file is truncated correctly
953 // when buffered and unbuffered mode
Truncate(uint64_t size)954 Status WinWritableFile::Truncate(uint64_t size) {
955   return TruncateImpl(size);
956 }
957 
Close()958 Status WinWritableFile::Close() {
959   return CloseImpl();
960 }
961 
962   // write out the cached data to the OS cache
963   // This is now taken care of the WritableFileWriter
Flush()964 Status WinWritableFile::Flush() {
965   return Status::OK();
966 }
967 
Sync()968 Status WinWritableFile::Sync() {
969   return SyncImpl();
970 }
971 
Fsync()972 Status WinWritableFile::Fsync() { return SyncImpl(); }
973 
IsSyncThreadSafe() const974 bool WinWritableFile::IsSyncThreadSafe() const { return true; }
975 
GetFileSize()976 uint64_t WinWritableFile::GetFileSize() {
977   return GetFileNextWriteOffset();
978 }
979 
Allocate(uint64_t offset,uint64_t len)980 Status WinWritableFile::Allocate(uint64_t offset, uint64_t len) {
981   return AllocateImpl(offset, len);
982 }
983 
GetUniqueId(char * id,size_t max_size) const984 size_t WinWritableFile::GetUniqueId(char* id, size_t max_size) const {
985   return GetUniqueIdFromFile(GetFileHandle(), id, max_size);
986 }
987 
988 /////////////////////////////////////////////////////////////////////////
989 /// WinRandomRWFile
990 
WinRandomRWFile(const std::string & fname,HANDLE hFile,size_t alignment,const EnvOptions & options)991 WinRandomRWFile::WinRandomRWFile(const std::string& fname, HANDLE hFile,
992                                  size_t alignment, const EnvOptions& options)
993     : WinFileData(fname, hFile,
994                   options.use_direct_reads && options.use_direct_writes),
995       WinRandomAccessImpl(this, alignment, options),
996       WinWritableImpl(this, alignment) {}
997 
use_direct_io() const998 bool WinRandomRWFile::use_direct_io() const { return WinFileData::use_direct_io(); }
999 
GetRequiredBufferAlignment() const1000 size_t WinRandomRWFile::GetRequiredBufferAlignment() const {
1001   return static_cast<size_t>(GetAlignement());
1002 }
1003 
Write(uint64_t offset,const Slice & data)1004 Status WinRandomRWFile::Write(uint64_t offset, const Slice & data) {
1005   return PositionedAppendImpl(data, offset);
1006 }
1007 
Read(uint64_t offset,size_t n,Slice * result,char * scratch) const1008 Status WinRandomRWFile::Read(uint64_t offset, size_t n, Slice* result,
1009                              char* scratch) const {
1010   return ReadImpl(offset, n, result, scratch);
1011 }
1012 
Flush()1013 Status WinRandomRWFile::Flush() {
1014   return Status::OK();
1015 }
1016 
Sync()1017 Status WinRandomRWFile::Sync() {
1018   return SyncImpl();
1019 }
1020 
Close()1021 Status WinRandomRWFile::Close() {
1022   return CloseImpl();
1023 }
1024 
1025 //////////////////////////////////////////////////////////////////////////
/// WinMemoryMappedBuffer
~WinMemoryMappedBuffer()1027 WinMemoryMappedBuffer::~WinMemoryMappedBuffer() {
1028   BOOL ret
1029 #if defined(_MSC_VER)
1030     = FALSE;
1031 #else
1032     __attribute__((__unused__));
1033 #endif
1034   if (base_ != nullptr) {
1035     ret = ::UnmapViewOfFile(base_);
1036     assert(ret);
1037     base_ = nullptr;
1038   }
1039   if (map_handle_ != NULL && map_handle_ != INVALID_HANDLE_VALUE) {
1040     ret = ::CloseHandle(map_handle_);
1041     assert(ret);
1042     map_handle_ = NULL;
1043   }
1044   if (file_handle_ != NULL && file_handle_ != INVALID_HANDLE_VALUE) {
1045     ret = ::CloseHandle(file_handle_);
1046     assert(ret);
1047     file_handle_ = NULL;
1048   }
1049 }
1050 
1051 //////////////////////////////////////////////////////////////////////////
1052 /// WinDirectory
1053 
Fsync()1054 Status WinDirectory::Fsync() { return Status::OK(); }
1055 
GetUniqueId(char * id,size_t max_size) const1056 size_t WinDirectory::GetUniqueId(char* id, size_t max_size) const {
1057   return GetUniqueIdFromFile(handle_, id, max_size);
1058 }
1059 //////////////////////////////////////////////////////////////////////////
1060 /// WinFileLock
1061 
~WinFileLock()1062 WinFileLock::~WinFileLock() {
1063   BOOL ret __attribute__((__unused__));
1064   ret = ::CloseHandle(hFile_);
1065   assert(ret);
1066 }
1067 
1068 }
1069 }  // namespace ROCKSDB_NAMESPACE
1070