1 /*
2  * Copyright (c) Facebook, Inc. and its affiliates.
3  *
4  * Licensed under the Apache License, Version 2.0 (the "License");
5  * you may not use this file except in compliance with the License.
6  * You may obtain a copy of the License at
7  *
8  *     http://www.apache.org/licenses/LICENSE-2.0
9  *
10  * Unless required by applicable law or agreed to in writing, software
11  * distributed under the License is distributed on an "AS IS" BASIS,
12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13  * See the License for the specific language governing permissions and
14  * limitations under the License.
15  */
16 
17 #include <folly/system/MemoryMapping.h>
18 
19 #include <algorithm>
20 #include <cerrno>
21 #include <utility>
22 
23 #include <fmt/core.h>
24 #include <glog/logging.h>
25 
26 #include <folly/Portability.h>
27 #include <folly/portability/GFlags.h>
28 #include <folly/portability/SysMman.h>
29 #include <folly/portability/SysSyscall.h>
30 
31 #ifdef __linux__
32 #include <folly/experimental/io/HugePages.h> // @manual
33 #endif
34 
35 #include <fcntl.h>
36 #include <sys/types.h>
37 
38 #include <system_error>
39 
40 static constexpr ssize_t kDefaultMlockChunkSize = !folly::kMscVer
41     // Linux implementations of unmap/mlock/munlock take a kernel
42     // semaphore and block other threads from doing other memory
43     // operations. Split the operations in chunks.
44     ? (1 << 20) // 1MB
45     // MSVC doesn't have this problem, and calling munmap many times
46     // with the same address is a bad idea with the windows implementation.
47     : (-1);
48 
49 DEFINE_int64(
50     mlock_chunk_size,
51     kDefaultMlockChunkSize,
52     "Maximum bytes to mlock/munlock/munmap at once "
53     "(will be rounded up to PAGESIZE). Ignored if negative.");
54 
55 namespace folly {
56 
57 namespace {
58 
59 enum mmap_flags : int {
60 #ifdef MAP_POPULATE
61   populate = MAP_POPULATE,
62 #else
63   populate = 0,
64 #endif
65 };
66 
67 } // namespace
68 
MemoryMapping(MemoryMapping && other)69 MemoryMapping::MemoryMapping(MemoryMapping&& other) noexcept {
70   swap(other);
71 }
72 
MemoryMapping(File file,off_t offset,off_t length,Options options)73 MemoryMapping::MemoryMapping(
74     File file, off_t offset, off_t length, Options options)
75     : file_(std::move(file)), options_(options) {
76   CHECK(file_);
77   init(offset, length);
78 }
79 
MemoryMapping(const char * name,off_t offset,off_t length,Options options)80 MemoryMapping::MemoryMapping(
81     const char* name, off_t offset, off_t length, Options options)
82     : MemoryMapping(
83           File(name, options.writable ? O_RDWR : O_RDONLY),
84           offset,
85           length,
86           options) {}
87 
MemoryMapping(int fd,off_t offset,off_t length,Options options)88 MemoryMapping::MemoryMapping(
89     int fd, off_t offset, off_t length, Options options)
90     : MemoryMapping(File(fd), offset, length, options) {}
91 
MemoryMapping(AnonymousType,off_t length,Options options)92 MemoryMapping::MemoryMapping(AnonymousType, off_t length, Options options)
93     : options_(options) {
94   init(0, length);
95 }
96 
97 namespace {
98 
99 #ifdef __linux__
getDeviceOptions(dev_t device,off_t & pageSize,bool & autoExtend)100 void getDeviceOptions(dev_t device, off_t& pageSize, bool& autoExtend) {
101   auto ps = getHugePageSizeForDevice(device);
102   if (ps) {
103     pageSize = ps->size;
104     autoExtend = true;
105   }
106 }
107 #else
108 inline void getDeviceOptions(dev_t, off_t&, bool&) {}
109 #endif
110 
111 } // namespace
112 
init(off_t offset,off_t length)113 void MemoryMapping::init(off_t offset, off_t length) {
114   const bool grow = options_.grow;
115   const bool anon = !file_;
116   CHECK(!(grow && anon));
117 
118   off_t& pageSize = options_.pageSize;
119 
120   struct stat st;
121 
122   // On Linux, hugetlbfs file systems don't require ftruncate() to grow the
123   // file, and (on kernels before 2.6.24) don't even allow it. Also, the file
124   // size is always a multiple of the page size.
125   bool autoExtend = false;
126 
127   if (!anon) {
128     // Stat the file
129     CHECK_ERR(fstat(file_.fd(), &st));
130 
131     if (pageSize == 0) {
132       getDeviceOptions(st.st_dev, pageSize, autoExtend);
133     }
134   } else {
135     DCHECK(!file_);
136     DCHECK_EQ(offset, 0);
137     CHECK_EQ(pageSize, 0);
138     CHECK_GE(length, 0);
139   }
140 
141   if (pageSize == 0) {
142     pageSize = off_t(sysconf(_SC_PAGESIZE));
143   }
144 
145   CHECK_GT(pageSize, 0);
146   CHECK_EQ(pageSize & (pageSize - 1), 0); // power of two
147   CHECK_GE(offset, 0);
148 
149   // Round down the start of the mapped region
150   off_t skipStart = offset % pageSize;
151   offset -= skipStart;
152 
153   mapLength_ = length;
154   if (mapLength_ != -1) {
155     mapLength_ += skipStart;
156 
157     // Round up the end of the mapped region
158     mapLength_ = (mapLength_ + pageSize - 1) / pageSize * pageSize;
159   }
160 
161   off_t remaining = anon ? length : st.st_size - offset;
162 
163   if (mapLength_ == -1) {
164     length = mapLength_ = remaining;
165   } else {
166     if (length > remaining) {
167       if (grow) {
168         if (!autoExtend) {
169           PCHECK(0 == ftruncate(file_.fd(), offset + length))
170               << "ftruncate() failed, couldn't grow file to "
171               << offset + length;
172           remaining = length;
173         } else {
174           // Extend mapping to multiple of page size, don't use ftruncate
175           remaining = mapLength_;
176         }
177       } else {
178         length = remaining;
179       }
180     }
181     if (mapLength_ > remaining) {
182       mapLength_ = remaining;
183     }
184   }
185 
186   if (length == 0) {
187     mapLength_ = 0;
188     mapStart_ = nullptr;
189   } else {
190     int flags = options_.shared ? MAP_SHARED : MAP_PRIVATE;
191     if (anon) {
192       flags |= MAP_ANONYMOUS;
193     }
194     if (options_.prefault) {
195       flags |= mmap_flags::populate;
196     }
197 
198     // The standard doesn't actually require PROT_NONE to be zero...
199     int prot = PROT_NONE;
200     if (options_.readable || options_.writable) {
201       prot =
202           ((options_.readable ? PROT_READ : 0) |
203            (options_.writable ? PROT_WRITE : 0));
204     }
205 
206     auto start = static_cast<unsigned char*>(mmap(
207         options_.address, size_t(mapLength_), prot, flags, file_.fd(), offset));
208     PCHECK(start != MAP_FAILED)
209         << " offset=" << offset << " length=" << mapLength_;
210     mapStart_ = start;
211     data_.reset(start + skipStart, size_t(length));
212   }
213 }
214 
215 namespace {
216 
memOpChunkSize(off_t length,off_t pageSize)217 off_t memOpChunkSize(off_t length, off_t pageSize) {
218   off_t chunkSize = length;
219   if (FLAGS_mlock_chunk_size <= 0) {
220     return chunkSize;
221   }
222 
223   chunkSize = off_t(FLAGS_mlock_chunk_size);
224   off_t r = chunkSize % pageSize;
225   if (r) {
226     chunkSize += (pageSize - r);
227   }
228   return chunkSize;
229 }
230 
231 /**
232  * Run @op in chunks over the buffer @mem of @bufSize length.
233  *
234  * Return:
235  * - success: true + amountSucceeded == bufSize (op success on whole buffer)
236  * - failure: false + amountSucceeded == nr bytes on which op succeeded.
237  */
238 template <typename Op>
memOpInChunks(Op op,void * mem,size_t bufSize,off_t pageSize,size_t & amountSucceeded)239 bool memOpInChunks(
240     Op op, void* mem, size_t bufSize, off_t pageSize, size_t& amountSucceeded) {
241   // Linux' unmap/mlock/munlock take a kernel semaphore and block other threads
242   // from doing other memory operations. If the size of the buffer is big the
243   // semaphore can be down for seconds (for benchmarks see
244   // http://kostja-osipov.livejournal.com/42963.html).  Doing the operations in
245   // chunks breaks the locking into intervals and lets other threads do memory
246   // operations of their own.
247 
248   auto chunkSize = size_t(memOpChunkSize(off_t(bufSize), pageSize));
249 
250   auto addr = static_cast<char*>(mem);
251   amountSucceeded = 0;
252 
253   while (amountSucceeded < bufSize) {
254     size_t size = std::min(chunkSize, bufSize - amountSucceeded);
255     if (op(addr + amountSucceeded, size) != 0) {
256       return false;
257     }
258     amountSucceeded += size;
259   }
260 
261   return true;
262 }
263 
264 } // namespace
265 
mlock2wrapper(FOLLY_MAYBE_UNUSED const void * addr,FOLLY_MAYBE_UNUSED size_t len,MemoryMapping::LockFlags flags)266 int mlock2wrapper(
267     FOLLY_MAYBE_UNUSED const void* addr,
268     FOLLY_MAYBE_UNUSED size_t len,
269     MemoryMapping::LockFlags flags) {
270   int intFlags = 0;
271   if (flags.lockOnFault) {
272     // MLOCK_ONFAULT, only available in non-portable headers.
273     intFlags |= 0x01;
274   }
275 
276 #if defined(__GLIBC__) && !defined(__APPLE__)
277 #if __GLIBC_PREREQ(2, 27)
278   return mlock2(addr, len, intFlags);
279 #elif defined(SYS_mlock2)
280   // SYS_mlock2 is defined in Linux headers since 4.4
281   return syscall(SYS_mlock2, addr, len, intFlags);
282 #else // !__GLIBC_PREREQ(2, 27) && !defined(SYS_mlock2)
283   errno = ENOSYS;
284   return -1;
285 #endif
286 #else // !defined(__GLIBC__) || defined(__APPLE__)
287   errno = ENOSYS;
288   return -1;
289 #endif
290 }
291 
mlock(LockMode mode,LockFlags flags)292 bool MemoryMapping::mlock(LockMode mode, LockFlags flags) {
293   size_t amountSucceeded = 0;
294   locked_ = memOpInChunks(
295       [flags](void* addr, size_t len) -> int {
296         // If no flags are set, mlock2() behaves exactly the same as
297         // mlock(). Prefer the portable variant.
298         return flags == LockFlags{} ? ::mlock(addr, len)
299                                     : mlock2wrapper(addr, len, flags);
300       },
301       mapStart_,
302       size_t(mapLength_),
303       options_.pageSize,
304       amountSucceeded);
305   if (locked_) {
306     return true;
307   }
308 
309   auto msg = fmt::format("mlock({}) failed at {}", mapLength_, amountSucceeded);
310   if (mode == LockMode::TRY_LOCK && errno == EPERM) {
311     PLOG(WARNING) << msg;
312   } else if (mode == LockMode::TRY_LOCK && errno == ENOMEM) {
313     VLOG(1) << msg;
314   } else {
315     PLOG(FATAL) << msg;
316   }
317 
318   // only part of the buffer was mlocked, unlock it back
319   if (!memOpInChunks(
320           ::munlock,
321           mapStart_,
322           amountSucceeded,
323           options_.pageSize,
324           amountSucceeded)) {
325     PLOG(WARNING) << "munlock()";
326   }
327 
328   return false;
329 }
330 
munlock(bool dontneed)331 void MemoryMapping::munlock(bool dontneed) {
332   if (!locked_) {
333     return;
334   }
335 
336   size_t amountSucceeded = 0;
337   if (!memOpInChunks(
338           ::munlock,
339           mapStart_,
340           size_t(mapLength_),
341           options_.pageSize,
342           amountSucceeded)) {
343     PLOG(WARNING) << "munlock()";
344   }
345   if (mapLength_ && dontneed &&
346       ::madvise(mapStart_, size_t(mapLength_), MADV_DONTNEED)) {
347     PLOG(WARNING) << "madvise()";
348   }
349   locked_ = false;
350 }
351 
hintLinearScan()352 void MemoryMapping::hintLinearScan() {
353   advise(MADV_SEQUENTIAL);
354 }
355 
~MemoryMapping()356 MemoryMapping::~MemoryMapping() {
357   if (mapLength_) {
358     size_t amountSucceeded = 0;
359     if (!memOpInChunks(
360             ::munmap,
361             mapStart_,
362             size_t(mapLength_),
363             options_.pageSize,
364             amountSucceeded)) {
365       PLOG(FATAL) << fmt::format(
366           "munmap({}) failed at {}", mapLength_, amountSucceeded);
367     }
368   }
369 }
370 
advise(int advice) const371 void MemoryMapping::advise(int advice) const {
372   advise(advice, 0, size_t(mapLength_));
373 }
374 
advise(int advice,size_t offset,size_t length) const375 void MemoryMapping::advise(int advice, size_t offset, size_t length) const {
376   CHECK_LE(offset + length, size_t(mapLength_))
377       << " offset: " << offset << " length: " << length
378       << " mapLength_: " << mapLength_;
379 
380   // Include the entire start page: round down to page boundary.
381   const auto offMisalign = offset % options_.pageSize;
382   offset -= offMisalign;
383   length += offMisalign;
384 
385   // Round the last page down to page boundary.
386   if (offset + length != size_t(mapLength_)) {
387     length -= length % options_.pageSize;
388   }
389 
390   if (length == 0) {
391     return;
392   }
393 
394   char* mapStart = static_cast<char*>(mapStart_) + offset;
395   PLOG_IF(WARNING, ::madvise(mapStart, length, advice)) << "madvise";
396 }
397 
operator =(MemoryMapping && other)398 MemoryMapping& MemoryMapping::operator=(MemoryMapping&& other) {
399   swap(other);
400   return *this;
401 }
402 
swap(MemoryMapping & other)403 void MemoryMapping::swap(MemoryMapping& other) noexcept {
404   using std::swap;
405   swap(this->file_, other.file_);
406   swap(this->mapStart_, other.mapStart_);
407   swap(this->mapLength_, other.mapLength_);
408   swap(this->options_, other.options_);
409   swap(this->locked_, other.locked_);
410   swap(this->data_, other.data_);
411 }
412 
swap(MemoryMapping & a,MemoryMapping & b)413 void swap(MemoryMapping& a, MemoryMapping& b) noexcept {
414   a.swap(b);
415 }
416 
alignedForwardMemcpy(void * dst,const void * src,size_t size)417 void alignedForwardMemcpy(void* dst, const void* src, size_t size) {
418   assert(reinterpret_cast<uintptr_t>(src) % alignof(unsigned long) == 0);
419   assert(reinterpret_cast<uintptr_t>(dst) % alignof(unsigned long) == 0);
420 
421   auto srcl = static_cast<const unsigned long*>(src);
422   auto dstl = static_cast<unsigned long*>(dst);
423 
424   while (size >= sizeof(unsigned long)) {
425     *dstl++ = *srcl++;
426     size -= sizeof(unsigned long);
427   }
428 
429   auto srcc = reinterpret_cast<const unsigned char*>(srcl);
430   auto dstc = reinterpret_cast<unsigned char*>(dstl);
431 
432   while (size != 0) {
433     *dstc++ = *srcc++;
434     --size;
435   }
436 }
437 
mmapFileCopy(const char * src,const char * dest,mode_t mode)438 void mmapFileCopy(const char* src, const char* dest, mode_t mode) {
439   MemoryMapping srcMap(src);
440   srcMap.hintLinearScan();
441 
442   MemoryMapping destMap(
443       File(dest, O_RDWR | O_CREAT | O_TRUNC, mode),
444       0,
445       off_t(srcMap.range().size()),
446       MemoryMapping::writable());
447 
448   alignedForwardMemcpy(
449       destMap.writableRange().data(),
450       srcMap.range().data(),
451       srcMap.range().size());
452 }
453 
operator ==(const LockFlags & other) const454 bool MemoryMapping::LockFlags::operator==(const LockFlags& other) const {
455   return lockOnFault == other.lockOnFault;
456 }
457 
458 } // namespace folly
459