1 /******************************************************************************
2  *
3  * Name:     cpl_userfaultfd.cpp
4  * Project:  CPL - Common Portability Library
5  * Purpose:  Use userfaultfd and VSIL to service page faults
6  * Author:   James McClain, <james.mcclain@gmail.com>
7  *
8  ******************************************************************************
9  * Copyright (c) 2018, Dr. James McClain <james.mcclain@gmail.com>
10  *
11  * Permission is hereby granted, free of charge, to any person obtaining a
12  * copy of this software and associated documentation files (the "Software"),
13  * to deal in the Software without restriction, including without limitation
14  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
15  * and/or sell copies of the Software, and to permit persons to whom the
16  * Software is furnished to do so, subject to the following conditions:
17  *
18  * The above copyright notice and this permission notice shall be included
19  * in all copies or substantial portions of the Software.
20  *
21  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
22  * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
23  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
24  * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
25  * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
26  * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
27  * DEALINGS IN THE SOFTWARE.
28  ****************************************************************************/
29 
30 #ifdef ENABLE_UFFD
31 
#include <atomic>
#include <cinttypes>
#include <cstdlib>
#include <cstring>
#include <string>

#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <linux/userfaultfd.h>

#include "cpl_conv.h"
#include "cpl_error.h"
#include "cpl_userfaultfd.h"
#include "cpl_string.h"
#include "cpl_vsi.h"
#include "cpl_multiproc.h"
58 
59 
// Failure sentinel that the mmap() results in this file are compared against:
// the pointer value (void *) -1.
#define BAD_MMAP (reinterpret_cast<void *>(-1))
// Capacity of cpl_uffd_context::uffd_msgs, and therefore the largest number
// of userfaultfd messages requested from a single read() on the descriptor.
#define MAX_MESSAGES (0x100)

// Forward declarations of the file-local helpers defined further down.
static int64_t get_page_limit();
static void cpl_uffd_fault_handler(void * ptr);
static void signal_handler(int signal);
static void uffd_cleanup(void * ptr);
67 
68 struct cpl_uffd_context {
69   bool keep_going = false;
70 
71   int uffd = -1;
72   struct uffdio_register uffdio_register = {};
73   struct uffd_msg uffd_msgs[MAX_MESSAGES];
74 
75   std::string filename = std::string("");
76 
77   int64_t page_limit = -1;
78   int64_t pages_used = 0;
79 
80   off_t  file_size = 0;
81   off_t  page_size = 0;
82   void * page_ptr = nullptr;
83   size_t vma_size = 0;
84   void * vma_ptr = nullptr;
85   CPLJoinableThread* thread = nullptr;
86 };
87 
88 
89 static void uffd_cleanup(void * ptr)
90 {
91   struct cpl_uffd_context * ctx = static_cast<struct cpl_uffd_context *>(ptr);
92 
93   if (!ctx) return;
94 
95   // Signal shutdown
96   ctx->keep_going = false;
97   if( ctx->thread )
98   {
99       CPLJoinThread(ctx->thread);
100       ctx->thread = nullptr;
101   }
102 
103   if (ctx->uffd != -1) {
104     ioctl(ctx->uffd, UFFDIO_UNREGISTER, &ctx->uffdio_register);
105     close(ctx->uffd);
106     ctx->uffd = -1;
107   }
108   if (ctx->page_ptr && ctx->page_size)
109     munmap(ctx->page_ptr, ctx->page_size);
110   if (ctx->vma_ptr && ctx->vma_size)
111     munmap(ctx->vma_ptr, ctx->vma_size);
112   ctx->page_ptr = nullptr;
113   ctx->vma_ptr = nullptr;
114   ctx->page_size = 0;
115   ctx->vma_size = 0;
116   ctx->pages_used = 0;
117   ctx->page_limit = 0;
118 
119   delete ctx;
120 
121   return;
122 }
123 
// File-wide lock, shared by every fault-handler thread started from this
// file.  It is held around the page-limit check and the page eviction in
// cpl_uffd_fault_handler().  The pragmas silence
// -Wzero-as-null-pointer-constant for the PTHREAD_MUTEX_INITIALIZER expansion.
#ifdef HAVE_GCC_WARNING_ZERO_AS_NULL_POINTER_CONSTANT
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant"
#endif
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
#ifdef HAVE_GCC_WARNING_ZERO_AS_NULL_POINTER_CONSTANT
#pragma GCC diagnostic pop
#endif
132 
133 static int64_t get_page_limit()
134 {
135   int64_t retval;
136   const char * variable = CPLGetConfigOption(GDAL_UFFD_LIMIT, nullptr);
137 
138   if (variable && sscanf(variable, "%" PRId64, &retval))
139     return retval;
140   else
141     return -1;
142 }
143 
144 static void cpl_uffd_fault_handler(void * ptr)
145 {
146   struct cpl_uffd_context * ctx = static_cast<struct cpl_uffd_context *>(ptr);
147   struct uffdio_copy uffdio_copy;
148   struct pollfd pollfd;
149 
150   // Setup pollfd structure
151   pollfd.fd = ctx->uffd;
152   pollfd.events = POLLIN;
153 
154   // Open asset for reading
155   VSILFILE * file = VSIFOpenL(ctx->filename.c_str(), "rb");
156 
157   if (!file) return;
158 
159   // Loop until told to stop
160   while(ctx->keep_going) {
161     uintptr_t fault_addr;
162     uint64_t offset;
163     off_t bytes_needed;
164     ssize_t bytes_read;
165 
166     // Poll for event
167     if (poll(&pollfd, 1, 16) == -1) break; // 60Hz when no demand
168     if ((pollfd.revents & POLLERR) || (pollfd.revents & POLLNVAL)) break;
169     if (!(pollfd.revents & POLLIN)) continue;
170 
171     // Read page fault events
172     bytes_read = static_cast<ssize_t>(read(ctx->uffd, ctx->uffd_msgs, MAX_MESSAGES*sizeof(uffd_msg)));
173     if (bytes_read < 1) {
174       if (errno == EWOULDBLOCK) continue;
175       else break;
176     }
177 
178     // If too many pages are in use, evict all pages (evict them from
179     // RAM and swap, not just to swap).  It is impossible to control
180     // which/when threads access the VMA, so access to the VMA has to
181     // forbidden while the activity is in progress.
182     //
183     // That is done by (1) installing special handlers for SIGSEGV and
184     // SIGBUS, (2) mprotecting the VMA so that any threads accessing
185     // it receive either SIGSEGV or SIGBUS (which one is apparently a
186     // function of the C library, at least on one non-Linux GNU
187     // system[1]), (3) unregistering the VMA from userfaultfd,
188     // remapping the VMA to evict the pages, registering the VMA
189     // again, (4) making the VMA accessible again, and finally (5)
190     // restoring the previous signal-handling behavior.
191     //
192     // [1] https://lists.debian.org/debian-bsd/2011/05/msg00032.html
193     if (ctx->page_limit > 0) {
194         pthread_mutex_lock(&mutex);
195         if (ctx->pages_used > ctx->page_limit) {
196             struct sigaction segv;
197             struct sigaction old_segv;
198             struct sigaction bus;
199             struct sigaction old_bus;
200 
201             memset(&segv, 0, sizeof(segv));
202             memset(&old_segv, 0, sizeof(old_segv));
203             memset(&bus, 0, sizeof(bus));
204             memset(&old_bus, 0, sizeof(old_bus));
205 
206             // Step 1 from the block comment above
207             segv.sa_handler = signal_handler;
208             bus.sa_handler = signal_handler;
209             if (sigaction(SIGSEGV, &segv, &old_segv) == -1) {
210                 CPLError(CE_Failure, CPLE_AppDefined,
211                         "cpl_uffd_fault_handler: sigaction(SIGSEGV) failed");
212                 pthread_mutex_unlock(&mutex);
213                 break;
214             }
215             if (sigaction(SIGBUS, &bus, &old_bus) == -1) {
216                 CPLError(CE_Failure, CPLE_AppDefined,
217                         "cpl_uffd_fault_handler: sigaction(SIGBUS) failed");
218                 pthread_mutex_unlock(&mutex);
219                 break;
220             }
221 
222             // WARNING: LACK OF THREAD-SAFETY.
223             //
224             // For example, if a user program (or another part of the
225             // library) installs a SIGSEGV or SIGBUS handler from another
226             // thread after this one has installed its handlers but before
227             // this one uninstalls its handlers, the intervening handler
228             // will be eliminated.  There are other examples, as well, but
229             // there can only be a problems with other threads because the
230             // faulting thread is blocked here.
231             //
232             // This implies that one should not use cpl_virtualmem.h API
233             // while other threads are actively generating faults that use
234             // this mechanism.
235             //
236             // Having multiple active threads that use this mechanism but
237             // with no changes to signal-handling in other threads is NOT a
238             // problem.
239 
240             // Step 2
241             if (mprotect(ctx->vma_ptr, ctx->vma_size, PROT_NONE) == -1) {
242                 CPLError(CE_Failure, CPLE_AppDefined,
243                         "cpl_uffd_fault_handler: mprotect() failed");
244                 pthread_mutex_unlock(&mutex);
245                 break;
246             }
247 
248             // Step 3
249             if (ioctl(ctx->uffd, UFFDIO_UNREGISTER, &ctx->uffdio_register)) {
250                 CPLError(CE_Failure, CPLE_AppDefined,
251                         "cpl_uffd_fault_handler: ioctl(UFFDIO_UNREGISTER) failed");
252                 pthread_mutex_unlock(&mutex);
253                 break;
254             }
255             ctx->vma_ptr = mmap(ctx->vma_ptr, ctx->vma_size, PROT_NONE,
256                                 MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
257             if (ctx->vma_ptr == BAD_MMAP) {
258                 CPLError(CE_Failure, CPLE_AppDefined,
259                         "cpl_uffd_fault_handler: mmap() failed");
260                 ctx->vma_ptr = nullptr;
261                 pthread_mutex_unlock(&mutex);
262                 break;
263             }
264             ctx->pages_used = 0;
265             if (ioctl(ctx->uffd, UFFDIO_REGISTER, &ctx->uffdio_register)) {
266                 CPLError(CE_Failure, CPLE_AppDefined,
267                         "cpl_uffd_fault_handler: ioctl(UFFDIO_REGISTER) failed");
268                 pthread_mutex_unlock(&mutex);
269                 break;
270             }
271 
272             // Step 4.  Problem: A thread might attempt to read here (before
273             // the mprotect) and receive a SIGSEGV or SIGBUS.
274             if (mprotect(ctx->vma_ptr, ctx->vma_size, PROT_READ) == -1) {
275                 CPLError(CE_Failure, CPLE_AppDefined,
276                         "cpl_uffd_fault_handler: mprotect() failed");
277                 pthread_mutex_unlock(&mutex);
278                 break;
279             }
280 
281             // Step 5.  Solution: Cannot unregister special handlers before
282             // any such threads have been handled by them, so sleep for
283             // 1/100th of a second.
284             // Coverity complains about sleeping under a mutex
285             // coverity[sleep]
286             usleep(10000);
287             if (sigaction(SIGSEGV, &old_segv, nullptr) == -1) {
288                 CPLError(CE_Failure, CPLE_AppDefined,
289                         "cpl_uffd_fault_handler: sigaction(SIGSEGV) failed");
290                 pthread_mutex_unlock(&mutex);
291                 break;
292             }
293             if (sigaction(SIGBUS, &old_bus, nullptr) == -1) {
294                 CPLError(CE_Failure, CPLE_AppDefined,
295                         "cpl_uffd_fault_handler: sigaction(SIGBUS) failed");
296                 pthread_mutex_unlock(&mutex);
297                 break;
298             }
299         }
300         pthread_mutex_unlock(&mutex);
301     }
302 
303     // Handle page fault events
304     for (int i = 0; i < static_cast<int>(bytes_read/sizeof(uffd_msg)); ++i) {
305       fault_addr = ctx->uffd_msgs[i].arg.pagefault.address & ~(ctx->page_size-1);
306       offset = static_cast<uint64_t>(fault_addr) - reinterpret_cast<uint64_t>(ctx->vma_ptr);
307       bytes_needed = static_cast<off_t>(ctx->file_size - offset);
308       if (bytes_needed > ctx->page_size) bytes_needed = ctx->page_size;
309 
310       // Copy data into page
311       if (VSIFSeekL(file, offset, SEEK_SET)) break;
312       if (VSIFReadL(ctx->page_ptr, bytes_needed, 1, file) != 1) break;
313       ctx->pages_used++;
314 
315       // Use the page to fulfill the page fault
316       uffdio_copy.src = reinterpret_cast<uintptr_t>(ctx->page_ptr);
317       uffdio_copy.dst = fault_addr;
318       uffdio_copy.len = static_cast<uintptr_t>(ctx->page_size);
319       uffdio_copy.mode = 0;
320       uffdio_copy.copy = 0;
321       if (ioctl(ctx->uffd, UFFDIO_COPY, &uffdio_copy) == -1) break;
322     }
323   } // end of while loop
324 
325   // Return resources
326   VSIFCloseL(file);
327 }
328 
/*
 * Temporary SIGSEGV/SIGBUS handler installed by the fault-handler thread
 * while pages are being evicted.  It only gives up the CPU and returns;
 * any other signal number is ignored.
 * NOTE(review): presumably the interrupted access is retried after the
 * handler returns - confirm against the platform's signal semantics.
 */
static void signal_handler(int signal)
{
  switch (signal) {
    case SIGSEGV:
    case SIGBUS:
      sched_yield();
      break;
    default:
      break;
  }
}
335 
336 bool CPLIsUserFaultMappingSupported()
337 {
338   // Check the Linux kernel version.  Linux 4.3 or newer is needed for
339   // userfaultfd.
340   int major = 0, minor = 0;
341   struct utsname utsname;
342 
343   if (uname(&utsname)) return false;
344   sscanf(utsname.release, "%d.%d", &major, &minor);
345   if (major < 4) return false;
346   if (major == 4 && minor < 3) return false;
347 
348   static int nEnableUserFaultFD = -1;
349   if( nEnableUserFaultFD < 0 )
350   {
351       nEnableUserFaultFD =
352         CPLTestBool(CPLGetConfigOption("CPL_ENABLE_USERFAULTFD", "YES"));
353   }
354 
355   return nEnableUserFaultFD != FALSE;
356 }
357 
358 /*
359  * Returns nullptr on failure, a valid pointer on success.
360  */
361 cpl_uffd_context* CPLCreateUserFaultMapping(const char * pszFilename, void ** ppVma, uint64_t * pnVmaSize)
362 {
363   VSIStatBufL statbuf;
364   struct cpl_uffd_context * ctx = nullptr;
365 
366   if( !CPLIsUserFaultMappingSupported() )
367   {
368       CPLError(CE_Failure, CPLE_NotSupported,
369                "CPLCreateUserFaultMapping(): Linux kernel 4.3 or newer needed");
370       return nullptr;
371   }
372 
373   // Get the size of the asset
374   if (VSIStatL(pszFilename, &statbuf)) return nullptr;
375 
376   // Setup the `cpl_uffd_context` struct
377   ctx = new cpl_uffd_context();
378   ctx->keep_going = true;
379   ctx->filename = std::string(pszFilename);
380   ctx->page_limit = get_page_limit();
381   ctx->pages_used = 0;
382   ctx->file_size = static_cast<off_t>(statbuf.st_size);
383   ctx->page_size = static_cast<off_t>(sysconf(_SC_PAGESIZE));
384   ctx->vma_size = static_cast<size_t>(((statbuf.st_size/ctx->page_size)+1) * ctx->page_size);
385   if (ctx->vma_size < static_cast<size_t>(statbuf.st_size)) { // Check for overflow
386     uffd_cleanup(ctx);
387     CPLError(CE_Failure, CPLE_AppDefined,
388              "CPLCreateUserFaultMapping(): File too large for architecture");
389     return nullptr;
390   }
391 
392   // If the mmap failed, free resources and return
393   ctx->vma_ptr = mmap(nullptr, ctx->vma_size, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
394   if (ctx->vma_ptr == BAD_MMAP) {
395     ctx->vma_ptr = nullptr;
396     uffd_cleanup(ctx);
397     CPLError(CE_Failure, CPLE_AppDefined,
398              "CPLCreateUserFaultMapping(): mmap() failed");
399     return nullptr;
400   }
401 
402   // Attempt to acquire a scratch page to use to fulfill requests.
403   ctx->page_ptr = mmap(nullptr, static_cast<size_t>(ctx->page_size), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
404   if (ctx->page_ptr == BAD_MMAP) {
405     ctx->page_ptr = nullptr;
406     uffd_cleanup(ctx);
407     CPLError(CE_Failure, CPLE_AppDefined,
408              "CPLCreateUserFaultMapping(): mmap() failed");
409     return nullptr;
410   }
411 
412   // Get userfaultfd
413   if ((ctx->uffd = static_cast<int>(syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK))) == -1) {
414     ctx->uffd = -1;
415     uffd_cleanup(ctx);
416     CPLError(CE_Failure, CPLE_AppDefined,
417              "CPLCreateUserFaultMapping(): syscall(__NR_userfaultfd) failed");
418     return nullptr;
419   }
420 
421   // Query API
422   {
423     struct uffdio_api uffdio_api = {};
424 
425     uffdio_api.api = UFFD_API;
426     uffdio_api.features = 0;
427 
428     if (ioctl(ctx->uffd, UFFDIO_API, &uffdio_api) == -1) {
429       uffd_cleanup(ctx);
430       CPLError(CE_Failure, CPLE_AppDefined,
431                "CPLCreateUserFaultMapping(): ioctl(UFFDIO_API) failed");
432       return nullptr;
433     }
434   }
435 
436   // Register memory range
437   ctx->uffdio_register.range.start = reinterpret_cast<uintptr_t>(ctx->vma_ptr);
438   ctx->uffdio_register.range.len = ctx->vma_size;
439   ctx->uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
440 
441   if (ioctl(ctx->uffd, UFFDIO_REGISTER, &ctx->uffdio_register) == -1) {
442     uffd_cleanup(ctx);
443     CPLError(CE_Failure, CPLE_AppDefined,
444              "CPLCreateUserFaultMapping(): ioctl(UFFDIO_REGISTER) failed");
445     return nullptr;
446   }
447 
448   // Start handler thread
449   ctx->thread = CPLCreateJoinableThread(cpl_uffd_fault_handler, ctx);
450   if( ctx->thread == nullptr )
451   {
452       CPLError(CE_Failure, CPLE_AppDefined,
453                "CPLCreateUserFaultMapping(): CPLCreateJoinableThread() failed");
454       uffd_cleanup(ctx);
455       return nullptr;
456   }
457 
458   *ppVma = ctx->vma_ptr;
459   *pnVmaSize = ctx->vma_size;
460   return ctx;
461 }
462 
463 void CPLDeleteUserFaultMapping(cpl_uffd_context * ctx)
464 {
465   if (ctx)
466   {
467       uffd_cleanup(ctx);
468   }
469 }
470 
471 #endif // ENABLE_UFFD
472