/******************************************************************************
 *
 * Name:     cpl_userfaultfd.cpp
 * Project:  CPL - Common Portability Library
 * Purpose:  Use userfaultfd and VSIL to service page faults
 * Author:   James McClain, <james.mcclain@gmail.com>
 *
 ******************************************************************************
 * Copyright (c) 2018, Dr. James McClain <james.mcclain@gmail.com>
 *
 * Permission is hereby granted, free of charge, to any person obtaining a
 * copy of this software and associated documentation files (the "Software"),
 * to deal in the Software without restriction, including without limitation
 * the rights to use, copy, modify, merge, publish, distribute, sublicense,
 * and/or sell copies of the Software, and to permit persons to whom the
 * Software is furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included
 * in all copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS
 * OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL
 * THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING
 * FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER
 * DEALINGS IN THE SOFTWARE.
 ****************************************************************************/

#ifdef ENABLE_UFFD

#include <atomic>
#include <cerrno>
#include <cinttypes>
#include <cstdio>
#include <cstdlib>
#include <cstring>
#include <string>

#include <fcntl.h>
#include <poll.h>
#include <pthread.h>
#include <sched.h>
#include <signal.h>
#include <unistd.h>

#include <sys/ioctl.h>
#include <sys/mman.h>
#include <sys/stat.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/utsname.h>
#include <linux/userfaultfd.h>

#include "cpl_conv.h"
#include "cpl_error.h"
#include "cpl_userfaultfd.h"
#include "cpl_string.h"
#include "cpl_vsi.h"
#include "cpl_multiproc.h"


#define BAD_MMAP (reinterpret_cast<void *>(-1))
#define MAX_MESSAGES (0x100)

static int64_t get_page_limit();
static void cpl_uffd_fault_handler(void * ptr);
static void signal_handler(int signal);
static void uffd_cleanup(void * ptr);

/*
 * State shared between the thread that owns a mapping and the thread that
 * services its page faults.
 */
struct cpl_uffd_context {
    // Written by the owning thread and read by the handler thread, hence
    // atomic.
    std::atomic<bool> keep_going{false};

    int uffd = -1;
    struct uffdio_register uffdio_register = {};
    struct uffd_msg uffd_msgs[MAX_MESSAGES];

    std::string filename = std::string("");

    int64_t page_limit = -1;
    int64_t pages_used = 0;

    off_t file_size = 0;
    off_t page_size = 0;
    void * page_ptr = nullptr;      // scratch page used as UFFDIO_COPY source
    size_t vma_size = 0;
    void * vma_ptr = nullptr;       // the userfaultfd-registered region
    CPLJoinableThread* thread = nullptr;
};


/*
 * Stops the handler thread (if any), releases the userfaultfd descriptor and
 * both mappings, then deletes the context.  A null pointer is ignored.
 */
static void uffd_cleanup(void * ptr)
{
    struct cpl_uffd_context * ctx = static_cast<struct cpl_uffd_context *>(ptr);

    if (!ctx) return;

    // Signal shutdown
    ctx->keep_going = false;
    if( ctx->thread )
    {
        CPLJoinThread(ctx->thread);
        ctx->thread = nullptr;
    }

    if (ctx->uffd != -1) {
        ioctl(ctx->uffd, UFFDIO_UNREGISTER, &ctx->uffdio_register);
        close(ctx->uffd);
        ctx->uffd = -1;
    }
    if (ctx->page_ptr && ctx->page_size)
        munmap(ctx->page_ptr, static_cast<size_t>(ctx->page_size));
    if (ctx->vma_ptr && ctx->vma_size)
        munmap(ctx->vma_ptr, ctx->vma_size);
    ctx->page_ptr = nullptr;
    ctx->vma_ptr = nullptr;
    ctx->page_size = 0;
    ctx->vma_size = 0;
    ctx->pages_used = 0;
    ctx->page_limit = 0;

    delete ctx;

    return;
}

#ifdef HAVE_GCC_WARNING_ZERO_AS_NULL_POINTER_CONSTANT
#pragma GCC diagnostic push
#pragma GCC diagnostic ignored "-Wzero-as-null-pointer-constant"
#endif
static pthread_mutex_t mutex = PTHREAD_MUTEX_INITIALIZER;
#ifdef HAVE_GCC_WARNING_ZERO_AS_NULL_POINTER_CONSTANT
#pragma GCC diagnostic pop
#endif

/*
 * Reads the page limit from the GDAL_UFFD_LIMIT configuration option.
 * Returns -1 when the option is unset or does not parse as an integer.
 */
static int64_t get_page_limit()
{
    int64_t retval = -1;
    const char * variable = CPLGetConfigOption(GDAL_UFFD_LIMIT, nullptr);

    // sscanf() returns EOF (non-zero) on empty input, so a successful
    // conversion has to be tested for explicitly.
    if (variable && sscanf(variable, "%" PRId64, &retval) == 1)
        return retval;
    else
        return -1;
}

/*
 * Steps 2-4 of the eviction (see cpl_uffd_evict_pages()): make the VMA
 * inaccessible, unregister it, replace it with a fresh anonymous mapping at
 * the same address, register it again and make it readable again.
 * Returns false (after emitting a CPLError) on the first failing step.
 */
static bool cpl_uffd_remap_vma(struct cpl_uffd_context * ctx)
{
    // Step 2
    if (mprotect(ctx->vma_ptr, ctx->vma_size, PROT_NONE) == -1) {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "cpl_uffd_fault_handler: mprotect() failed");
        return false;
    }

    // Step 3
    if (ioctl(ctx->uffd, UFFDIO_UNREGISTER, &ctx->uffdio_register)) {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "cpl_uffd_fault_handler: ioctl(UFFDIO_UNREGISTER) failed");
        return false;
    }
    ctx->vma_ptr = mmap(ctx->vma_ptr, ctx->vma_size, PROT_NONE,
                        MAP_PRIVATE|MAP_ANONYMOUS|MAP_FIXED, -1, 0);
    if (ctx->vma_ptr == BAD_MMAP) {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "cpl_uffd_fault_handler: mmap() failed");
        ctx->vma_ptr = nullptr;
        return false;
    }
    ctx->pages_used = 0;
    if (ioctl(ctx->uffd, UFFDIO_REGISTER, &ctx->uffdio_register)) {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "cpl_uffd_fault_handler: ioctl(UFFDIO_REGISTER) failed");
        return false;
    }

    // Step 4.  Problem: A thread might attempt to read here (before
    // the mprotect) and receive a SIGSEGV or SIGBUS.
    if (mprotect(ctx->vma_ptr, ctx->vma_size, PROT_READ) == -1) {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "cpl_uffd_fault_handler: mprotect() failed");
        return false;
    }

    return true;
}

/*
 * Evicts all pages of the VMA (from RAM and swap, not just to swap).
 * Must be called with `mutex` held.  Returns false on failure.
 *
 * It is impossible to control which/when threads access the VMA, so access
 * to the VMA has to be forbidden while the activity is in progress.
 *
 * That is done by (1) installing special handlers for SIGSEGV and
 * SIGBUS, (2) mprotecting the VMA so that any threads accessing
 * it receive either SIGSEGV or SIGBUS (which one is apparently a
 * function of the C library, at least on one non-Linux GNU
 * system[1]), (3) unregistering the VMA from userfaultfd,
 * remapping the VMA to evict the pages, registering the VMA
 * again, (4) making the VMA accessible again, and finally (5)
 * restoring the previous signal-handling behavior.
 *
 * The previous signal handlers are restored on every path, including
 * failures, so that the temporary handler is never left installed
 * process-wide.
 *
 * [1] https://lists.debian.org/debian-bsd/2011/05/msg00032.html
 *
 * WARNING: LACK OF THREAD-SAFETY.
 *
 * For example, if a user program (or another part of the
 * library) installs a SIGSEGV or SIGBUS handler from another
 * thread after this one has installed its handlers but before
 * this one uninstalls its handlers, the intervening handler
 * will be eliminated.  There are other examples, as well, but
 * there can only be problems with other threads because the
 * faulting thread is blocked here.
 *
 * This implies that one should not use cpl_virtualmem.h API
 * while other threads are actively generating faults that use
 * this mechanism.
 *
 * Having multiple active threads that use this mechanism but
 * with no changes to signal-handling in other threads is NOT a
 * problem.
 */
static bool cpl_uffd_evict_pages(struct cpl_uffd_context * ctx)
{
    struct sigaction segv;
    struct sigaction old_segv;
    struct sigaction bus;
    struct sigaction old_bus;

    memset(&segv, 0, sizeof(segv));
    memset(&old_segv, 0, sizeof(old_segv));
    memset(&bus, 0, sizeof(bus));
    memset(&old_bus, 0, sizeof(old_bus));

    // Step 1 from the block comment above
    segv.sa_handler = signal_handler;
    bus.sa_handler = signal_handler;
    if (sigaction(SIGSEGV, &segv, &old_segv) == -1) {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "cpl_uffd_fault_handler: sigaction(SIGSEGV) failed");
        return false;
    }
    if (sigaction(SIGBUS, &bus, &old_bus) == -1) {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "cpl_uffd_fault_handler: sigaction(SIGBUS) failed");
        // Do not leave the SIGSEGV handler installed behind us.
        sigaction(SIGSEGV, &old_segv, nullptr);
        return false;
    }

    // Steps 2, 3 and 4
    bool ok = cpl_uffd_remap_vma(ctx);

    // Step 5.  Solution: Cannot unregister special handlers before
    // any such threads have been handled by them, so sleep for
    // 1/100th of a second.
    if (ok) {
        // Coverity complains about sleeping under a mutex
        // coverity[sleep]
        usleep(10000);
    }
    if (sigaction(SIGSEGV, &old_segv, nullptr) == -1) {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "cpl_uffd_fault_handler: sigaction(SIGSEGV) failed");
        ok = false;
    }
    if (sigaction(SIGBUS, &old_bus, nullptr) == -1) {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "cpl_uffd_fault_handler: sigaction(SIGBUS) failed");
        ok = false;
    }

    return ok;
}

/*
 * Thread entry point.  Polls the userfaultfd descriptor of the context
 * passed in `ptr` and resolves each reported page fault by reading the
 * corresponding page of the file through VSIL and installing it with
 * UFFDIO_COPY.  Runs until `keep_going` is cleared or an unrecoverable
 * error occurs.
 */
static void cpl_uffd_fault_handler(void * ptr)
{
    struct cpl_uffd_context * ctx = static_cast<struct cpl_uffd_context *>(ptr);
    struct uffdio_copy uffdio_copy;
    struct pollfd pollfd;

    memset(&uffdio_copy, 0, sizeof(uffdio_copy));

    // Setup pollfd structure
    pollfd.fd = ctx->uffd;
    pollfd.events = POLLIN;
    pollfd.revents = 0;

    // Open asset for reading
    VSILFILE * file = VSIFOpenL(ctx->filename.c_str(), "rb");

    if (!file) return;

    // Loop until told to stop
    while(ctx->keep_going) {
        // Poll for event
        if (poll(&pollfd, 1, 16) == -1) { // 60Hz when no demand
            // A signal delivered to this thread must not end fault handling.
            if (errno == EINTR) continue;
            break;
        }
        if (pollfd.revents & (POLLERR | POLLNVAL)) break;
        if (!(pollfd.revents & POLLIN)) continue;

        // Read page fault events
        const ssize_t bytes_read = static_cast<ssize_t>(
            read(ctx->uffd, ctx->uffd_msgs, MAX_MESSAGES*sizeof(uffd_msg)));
        if (bytes_read < 1) {
            // errno is only meaningful when read() reported an error.
            if (bytes_read == -1 &&
                (errno == EWOULDBLOCK || errno == EINTR)) continue;
            else break;
        }

        // If too many pages are in use, evict all pages.  See
        // cpl_uffd_evict_pages() for the details.
        if (ctx->page_limit > 0) {
            pthread_mutex_lock(&mutex);
            const bool ok = ctx->pages_used <= ctx->page_limit ||
                            cpl_uffd_evict_pages(ctx);
            pthread_mutex_unlock(&mutex);
            if (!ok) break;
        }

        // Handle page fault events
        const size_t message_count =
            static_cast<size_t>(bytes_read) / sizeof(uffd_msg);
        for (size_t i = 0; i < message_count; ++i) {
            // Round the faulting address down to the start of its page.
            const uintptr_t fault_addr = static_cast<uintptr_t>(
                ctx->uffd_msgs[i].arg.pagefault.address &
                ~static_cast<uint64_t>(ctx->page_size - 1));
            const uint64_t offset = static_cast<uint64_t>(fault_addr) -
                                    reinterpret_cast<uint64_t>(ctx->vma_ptr);

            // Number of file bytes that belong to this page (may be zero
            // for the trailing page of the VMA).
            off_t bytes_needed = 0;
            if (offset < static_cast<uint64_t>(ctx->file_size))
                bytes_needed = ctx->file_size - static_cast<off_t>(offset);
            if (bytes_needed > ctx->page_size) bytes_needed = ctx->page_size;

            // The scratch page is reused for every fault: clear whatever
            // part of it the file does not cover so that no bytes of a
            // previously served page leak past the end of the file.
            if (bytes_needed < ctx->page_size) {
                memset(static_cast<char *>(ctx->page_ptr) + bytes_needed, 0,
                       static_cast<size_t>(ctx->page_size - bytes_needed));
            }

            // Copy data into page
            if (bytes_needed > 0) {
                if (VSIFSeekL(file, offset, SEEK_SET) != 0 ||
                    VSIFReadL(ctx->page_ptr,
                              static_cast<size_t>(bytes_needed), 1, file) != 1) {
                    CPLError(CE_Failure, CPLE_FileIO,
                             "cpl_uffd_fault_handler: failed to read page at "
                             "offset %" PRIu64, offset);
                    break;
                }
            }

            // Use the page to fulfill the page fault
            uffdio_copy.src = reinterpret_cast<uintptr_t>(ctx->page_ptr);
            uffdio_copy.dst = fault_addr;
            uffdio_copy.len = static_cast<uintptr_t>(ctx->page_size);
            uffdio_copy.mode = 0;
            uffdio_copy.copy = 0;
            if (ioctl(ctx->uffd, UFFDIO_COPY, &uffdio_copy) == -1) {
                // Several threads faulting on the same page yield several
                // messages; the page is already present for all but the
                // first, which is not an error.
                if (errno == EEXIST) continue;
                break;
            }
            ctx->pages_used++;
        }
    } // end of while loop

    // Return resources
    VSIFCloseL(file);
}

/*
 * Temporary SIGSEGV/SIGBUS handler installed while pages are being evicted:
 * yields the processor and returns.
 */
static void signal_handler(int signal)
{
    if (signal == SIGSEGV || signal == SIGBUS)
        sched_yield();
    return;
}

/*
 * Returns true when the running kernel reports version 4.3 or newer and the
 * CPL_ENABLE_USERFAULTFD configuration option (default YES) is not disabled.
 */
bool CPLIsUserFaultMappingSupported()
{
    // Check the Linux kernel version.  Linux 4.3 or newer is needed for
    // userfaultfd.
    int major = 0, minor = 0;
    struct utsname utsname;

    if (uname(&utsname)) return false;
    sscanf(utsname.release, "%d.%d", &major, &minor);
    if (major < 4) return false;
    if (major == 4 && minor < 3) return false;

    static int nEnableUserFaultFD = -1;
    if( nEnableUserFaultFD < 0 )
    {
        nEnableUserFaultFD =
            CPLTestBool(CPLGetConfigOption("CPL_ENABLE_USERFAULTFD", "YES"));
    }

    return nEnableUserFaultFD != FALSE;
}

/*
 * Creates a read-only memory region backed on demand by the contents of
 * pszFilename.  On success the start and size of the region are stored in
 * *ppVma and *pnVmaSize.
 *
 * Returns nullptr on failure, a valid pointer on success.  The returned
 * context must be released with CPLDeleteUserFaultMapping().
 */
cpl_uffd_context* CPLCreateUserFaultMapping(const char * pszFilename, void ** ppVma, uint64_t * pnVmaSize)
{
    VSIStatBufL statbuf;
    struct cpl_uffd_context * ctx = nullptr;

    if( !CPLIsUserFaultMappingSupported() )
    {
        CPLError(CE_Failure, CPLE_NotSupported,
                 "CPLCreateUserFaultMapping(): Linux kernel 4.3 or newer needed");
        return nullptr;
    }

    // Get the size of the asset
    if (VSIStatL(pszFilename, &statbuf)) return nullptr;

    // Setup the `cpl_uffd_context` struct
    ctx = new cpl_uffd_context();
    ctx->keep_going = true;
    ctx->filename = std::string(pszFilename);
    ctx->page_limit = get_page_limit();
    ctx->pages_used = 0;
    ctx->file_size = static_cast<off_t>(statbuf.st_size);
    ctx->page_size = static_cast<off_t>(sysconf(_SC_PAGESIZE));
    if (ctx->page_size <= 0) { // sysconf() failed; avoid dividing by it
        ctx->page_size = 0;
        uffd_cleanup(ctx);
        CPLError(CE_Failure, CPLE_AppDefined,
                 "CPLCreateUserFaultMapping(): sysconf(_SC_PAGESIZE) failed");
        return nullptr;
    }
    ctx->vma_size = static_cast<size_t>(((statbuf.st_size/ctx->page_size)+1) * ctx->page_size);
    if (ctx->vma_size < static_cast<size_t>(statbuf.st_size)) { // Check for overflow
        uffd_cleanup(ctx);
        CPLError(CE_Failure, CPLE_AppDefined,
                 "CPLCreateUserFaultMapping(): File too large for architecture");
        return nullptr;
    }

    // If the mmap failed, free resources and return
    ctx->vma_ptr = mmap(nullptr, ctx->vma_size, PROT_READ, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (ctx->vma_ptr == BAD_MMAP) {
        ctx->vma_ptr = nullptr;
        uffd_cleanup(ctx);
        CPLError(CE_Failure, CPLE_AppDefined,
                 "CPLCreateUserFaultMapping(): mmap() failed");
        return nullptr;
    }

    // Attempt to acquire a scratch page to use to fulfill requests.
    ctx->page_ptr = mmap(nullptr, static_cast<size_t>(ctx->page_size), PROT_READ|PROT_WRITE, MAP_PRIVATE|MAP_ANONYMOUS, -1, 0);
    if (ctx->page_ptr == BAD_MMAP) {
        ctx->page_ptr = nullptr;
        uffd_cleanup(ctx);
        CPLError(CE_Failure, CPLE_AppDefined,
                 "CPLCreateUserFaultMapping(): mmap() failed");
        return nullptr;
    }

    // Get userfaultfd
    if ((ctx->uffd = static_cast<int>(syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK))) == -1) {
        ctx->uffd = -1;
        uffd_cleanup(ctx);
        CPLError(CE_Failure, CPLE_AppDefined,
                 "CPLCreateUserFaultMapping(): syscall(__NR_userfaultfd) failed");
        return nullptr;
    }

    // Query API
    {
        struct uffdio_api uffdio_api = {};

        uffdio_api.api = UFFD_API;
        uffdio_api.features = 0;

        if (ioctl(ctx->uffd, UFFDIO_API, &uffdio_api) == -1) {
            uffd_cleanup(ctx);
            CPLError(CE_Failure, CPLE_AppDefined,
                     "CPLCreateUserFaultMapping(): ioctl(UFFDIO_API) failed");
            return nullptr;
        }
    }

    // Register memory range
    ctx->uffdio_register.range.start = reinterpret_cast<uintptr_t>(ctx->vma_ptr);
    ctx->uffdio_register.range.len = ctx->vma_size;
    ctx->uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;

    if (ioctl(ctx->uffd, UFFDIO_REGISTER, &ctx->uffdio_register) == -1) {
        uffd_cleanup(ctx);
        CPLError(CE_Failure, CPLE_AppDefined,
                 "CPLCreateUserFaultMapping(): ioctl(UFFDIO_REGISTER) failed");
        return nullptr;
    }

    // Start handler thread
    ctx->thread = CPLCreateJoinableThread(cpl_uffd_fault_handler, ctx);
    if( ctx->thread == nullptr )
    {
        CPLError(CE_Failure, CPLE_AppDefined,
                 "CPLCreateUserFaultMapping(): CPLCreateJoinableThread() failed");
        uffd_cleanup(ctx);
        return nullptr;
    }

    *ppVma = ctx->vma_ptr;
    *pnVmaSize = ctx->vma_size;
    return ctx;
}

/*
 * Releases a mapping created by CPLCreateUserFaultMapping().  A null pointer
 * is ignored.
 */
void CPLDeleteUserFaultMapping(cpl_uffd_context * ctx)
{
    if (ctx)
    {
        uffd_cleanup(ctx);
    }
}

#endif // ENABLE_UFFD