xref: /qemu/util/userfaultfd.c (revision c40c0463)
10e9b5cd6SAndrey Gruzdev /*
20e9b5cd6SAndrey Gruzdev  * Linux UFFD-WP support
30e9b5cd6SAndrey Gruzdev  *
40e9b5cd6SAndrey Gruzdev  * Copyright Virtuozzo GmbH, 2020
50e9b5cd6SAndrey Gruzdev  *
60e9b5cd6SAndrey Gruzdev  * Authors:
70e9b5cd6SAndrey Gruzdev  *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
80e9b5cd6SAndrey Gruzdev  *
90e9b5cd6SAndrey Gruzdev  * This work is licensed under the terms of the GNU GPL, version 2 or
100e9b5cd6SAndrey Gruzdev  * later.  See the COPYING file in the top-level directory.
110e9b5cd6SAndrey Gruzdev  */
120e9b5cd6SAndrey Gruzdev 
130e9b5cd6SAndrey Gruzdev #include "qemu/osdep.h"
140e9b5cd6SAndrey Gruzdev #include "qemu/bitops.h"
150e9b5cd6SAndrey Gruzdev #include "qemu/error-report.h"
160e9b5cd6SAndrey Gruzdev #include "qemu/userfaultfd.h"
170e9b5cd6SAndrey Gruzdev #include "trace.h"
180e9b5cd6SAndrey Gruzdev #include <poll.h>
190e9b5cd6SAndrey Gruzdev #include <sys/syscall.h>
200e9b5cd6SAndrey Gruzdev #include <sys/ioctl.h>
21c40c0463SPeter Xu #include <fcntl.h>
22c40c0463SPeter Xu 
/* Which mechanism uffd_open() uses to obtain a userfaultfd descriptor. */
typedef enum {
    UFFD_UNINITIALIZED = 0, /* detection has not run yet (static default) */
    UFFD_USE_DEV_PATH = 1,  /* ioctl on an opened /dev/userfaultfd */
    UFFD_USE_SYSCALL = 2,   /* direct userfaultfd system call */
} uffd_open_mode;
280e9b5cd6SAndrey Gruzdev 
29d5890ea0SPeter Xu int uffd_open(int flags)
30d5890ea0SPeter Xu {
31d5890ea0SPeter Xu #if defined(__NR_userfaultfd)
32c40c0463SPeter Xu     static uffd_open_mode open_mode;
33c40c0463SPeter Xu     static int uffd_dev;
34c40c0463SPeter Xu 
35c40c0463SPeter Xu     /* Detect how to generate uffd desc when run the 1st time */
36c40c0463SPeter Xu     if (open_mode == UFFD_UNINITIALIZED) {
37c40c0463SPeter Xu         /*
38c40c0463SPeter Xu          * Make /dev/userfaultfd the default approach because it has better
39c40c0463SPeter Xu          * permission controls, meanwhile allows kernel faults without any
40c40c0463SPeter Xu          * privilege requirement (e.g. SYS_CAP_PTRACE).
41c40c0463SPeter Xu          */
42c40c0463SPeter Xu         uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
43c40c0463SPeter Xu         if (uffd_dev >= 0) {
44c40c0463SPeter Xu             open_mode = UFFD_USE_DEV_PATH;
45c40c0463SPeter Xu         } else {
46c40c0463SPeter Xu             /* Fallback to the system call */
47c40c0463SPeter Xu             open_mode = UFFD_USE_SYSCALL;
48c40c0463SPeter Xu         }
49c40c0463SPeter Xu         trace_uffd_detect_open_mode(open_mode);
50c40c0463SPeter Xu     }
51c40c0463SPeter Xu 
52c40c0463SPeter Xu     if (open_mode == UFFD_USE_DEV_PATH) {
53c40c0463SPeter Xu         assert(uffd_dev >= 0);
54c40c0463SPeter Xu         return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
55c40c0463SPeter Xu     }
56c40c0463SPeter Xu 
57d5890ea0SPeter Xu     return syscall(__NR_userfaultfd, flags);
58d5890ea0SPeter Xu #else
59d5890ea0SPeter Xu     return -EINVAL;
60d5890ea0SPeter Xu #endif
61d5890ea0SPeter Xu }
62d5890ea0SPeter Xu 
630e9b5cd6SAndrey Gruzdev /**
640e9b5cd6SAndrey Gruzdev  * uffd_query_features: query UFFD features
650e9b5cd6SAndrey Gruzdev  *
660e9b5cd6SAndrey Gruzdev  * Returns: 0 on success, negative value in case of an error
670e9b5cd6SAndrey Gruzdev  *
680e9b5cd6SAndrey Gruzdev  * @features: parameter to receive 'uffdio_api.features'
690e9b5cd6SAndrey Gruzdev  */
700e9b5cd6SAndrey Gruzdev int uffd_query_features(uint64_t *features)
710e9b5cd6SAndrey Gruzdev {
720e9b5cd6SAndrey Gruzdev     int uffd_fd;
730e9b5cd6SAndrey Gruzdev     struct uffdio_api api_struct = { 0 };
740e9b5cd6SAndrey Gruzdev     int ret = -1;
750e9b5cd6SAndrey Gruzdev 
76d5890ea0SPeter Xu     uffd_fd = uffd_open(O_CLOEXEC);
770e9b5cd6SAndrey Gruzdev     if (uffd_fd < 0) {
780e9b5cd6SAndrey Gruzdev         trace_uffd_query_features_nosys(errno);
790e9b5cd6SAndrey Gruzdev         return -1;
800e9b5cd6SAndrey Gruzdev     }
810e9b5cd6SAndrey Gruzdev 
820e9b5cd6SAndrey Gruzdev     api_struct.api = UFFD_API;
830e9b5cd6SAndrey Gruzdev     api_struct.features = 0;
840e9b5cd6SAndrey Gruzdev 
850e9b5cd6SAndrey Gruzdev     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
860e9b5cd6SAndrey Gruzdev         trace_uffd_query_features_api_failed(errno);
870e9b5cd6SAndrey Gruzdev         goto out;
880e9b5cd6SAndrey Gruzdev     }
890e9b5cd6SAndrey Gruzdev     *features = api_struct.features;
900e9b5cd6SAndrey Gruzdev     ret = 0;
910e9b5cd6SAndrey Gruzdev 
920e9b5cd6SAndrey Gruzdev out:
930e9b5cd6SAndrey Gruzdev     close(uffd_fd);
940e9b5cd6SAndrey Gruzdev     return ret;
950e9b5cd6SAndrey Gruzdev }
960e9b5cd6SAndrey Gruzdev 
970e9b5cd6SAndrey Gruzdev /**
980e9b5cd6SAndrey Gruzdev  * uffd_create_fd: create UFFD file descriptor
990e9b5cd6SAndrey Gruzdev  *
1000e9b5cd6SAndrey Gruzdev  * Returns non-negative file descriptor or negative value in case of an error
1010e9b5cd6SAndrey Gruzdev  *
1020e9b5cd6SAndrey Gruzdev  * @features: UFFD features to request
1030e9b5cd6SAndrey Gruzdev  * @non_blocking: create UFFD file descriptor for non-blocking operation
1040e9b5cd6SAndrey Gruzdev  */
1050e9b5cd6SAndrey Gruzdev int uffd_create_fd(uint64_t features, bool non_blocking)
1060e9b5cd6SAndrey Gruzdev {
1070e9b5cd6SAndrey Gruzdev     int uffd_fd;
1080e9b5cd6SAndrey Gruzdev     int flags;
1090e9b5cd6SAndrey Gruzdev     struct uffdio_api api_struct = { 0 };
1100e9b5cd6SAndrey Gruzdev     uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
1110e9b5cd6SAndrey Gruzdev 
1120e9b5cd6SAndrey Gruzdev     flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
113d5890ea0SPeter Xu     uffd_fd = uffd_open(flags);
1140e9b5cd6SAndrey Gruzdev     if (uffd_fd < 0) {
1150e9b5cd6SAndrey Gruzdev         trace_uffd_create_fd_nosys(errno);
1160e9b5cd6SAndrey Gruzdev         return -1;
1170e9b5cd6SAndrey Gruzdev     }
1180e9b5cd6SAndrey Gruzdev 
1190e9b5cd6SAndrey Gruzdev     api_struct.api = UFFD_API;
1200e9b5cd6SAndrey Gruzdev     api_struct.features = features;
1210e9b5cd6SAndrey Gruzdev     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
1220e9b5cd6SAndrey Gruzdev         trace_uffd_create_fd_api_failed(errno);
1230e9b5cd6SAndrey Gruzdev         goto fail;
1240e9b5cd6SAndrey Gruzdev     }
1250e9b5cd6SAndrey Gruzdev     if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
1260e9b5cd6SAndrey Gruzdev         trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
1270e9b5cd6SAndrey Gruzdev         goto fail;
1280e9b5cd6SAndrey Gruzdev     }
1290e9b5cd6SAndrey Gruzdev 
1300e9b5cd6SAndrey Gruzdev     return uffd_fd;
1310e9b5cd6SAndrey Gruzdev 
1320e9b5cd6SAndrey Gruzdev fail:
1330e9b5cd6SAndrey Gruzdev     close(uffd_fd);
1340e9b5cd6SAndrey Gruzdev     return -1;
1350e9b5cd6SAndrey Gruzdev }
1360e9b5cd6SAndrey Gruzdev 
/**
 * uffd_close_fd: close UFFD file descriptor
 *
 * Asserts that the descriptor is non-negative, then closes it.  The
 * result of close() is not reported to the caller.
 *
 * @uffd_fd: UFFD file descriptor
 */
void uffd_close_fd(int uffd_fd)
{
    assert(uffd_fd >= 0);
    (void) close(uffd_fd);
}
1470e9b5cd6SAndrey Gruzdev 
/**
 * uffd_register_memory: register memory range via UFFD-IO
 *
 * Issues UFFDIO_REGISTER for [addr, addr + length) with the given mode.
 *
 * Returns 0 in case of success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask; may be NULL,
 *          written only on success
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
        uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = mode,
    };

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
        trace_uffd_register_memory_failed(addr, length, mode, errno);
        return -1;
    }

    if (ioctls != NULL) {
        *ioctls = reg.ioctls;
    }
    return 0;
}
1780e9b5cd6SAndrey Gruzdev 
/**
 * uffd_unregister_memory: un-register memory range with UFFD-IO
 *
 * Issues UFFDIO_UNREGISTER for [addr, addr + length).
 *
 * Returns 0 in case of success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) != 0) {
        trace_uffd_unregister_memory_failed(addr, length, errno);
        return -1;
    }
    return 0;
}
2020e9b5cd6SAndrey Gruzdev 
/**
 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 *
 * Issues UFFDIO_WRITEPROTECT for [addr, addr + length).  The mode is
 * UFFDIO_WRITEPROTECT_MODE_WP when @wp is set (in which case @dont_wake
 * is ignored), UFFDIO_WRITEPROTECT_MODE_DONTWAKE when un-protecting with
 * @dont_wake set, and 0 otherwise.  A failure is reported through
 * error_report().
 *
 * Returns 0 on success, -1 in case of error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: write-protect/unprotect
 * @dont_wake: do not wake threads waiting on wr-protected page
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
        bool wp, bool dont_wake)
{
    struct uffdio_writeprotect req = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (wp) {
        req.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        req.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &req) != 0) {
        error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                " mode=%" PRIx64 " errno=%i", addr, length,
                (uint64_t) req.mode, errno);
        return -1;
    }
    return 0;
}
2370e9b5cd6SAndrey Gruzdev 
/**
 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 *
 * Issues UFFDIO_COPY to copy @length bytes from @src_addr to @dst_addr in
 * order to resolve a missing page fault somewhere in the destination
 * range.  A failure is reported through error_report().
 *
 * Returns 0 on success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
        uint64_t length, bool dont_wake)
{
    struct uffdio_copy req = {
        .dst = (uintptr_t) dst_addr,
        .src = (uintptr_t) src_addr,
        .len = length,
        .mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0,
    };

    if (ioctl(uffd_fd, UFFDIO_COPY, &req) != 0) {
        error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                length, (uint64_t) req.mode, errno);
        return -1;
    }
    return 0;
}
2710e9b5cd6SAndrey Gruzdev 
/**
 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 *
 * Issues UFFDIO_ZEROPAGE for [addr, addr + length) to resolve a missing
 * page fault within the range.  A failure is reported through
 * error_report().
 *
 * Returns 0 on success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    struct uffdio_zeropage req = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0,
    };

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &req) != 0) {
        error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                " mode=%" PRIx64 " errno=%i", addr, length,
                (uint64_t) req.mode, errno);
        return -1;
    }
    return 0;
}
3010e9b5cd6SAndrey Gruzdev 
/**
 * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
 *
 * Issues UFFDIO_WAKE for [addr, addr + length) to wake up threads waiting
 * on any page from that range.  The main use case is a period during
 * which page faults are resolved via UFFD-IO IOCTLs with MODE_DONTWAKE
 * set, after which all waits for the whole memory range are satisfied in
 * a single call to uffd_wakeup().  A failure is reported through
 * error_report().
 *
 * Returns 0 on success, -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) != 0) {
        error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                addr, length, errno);
        return -1;
    }
    return 0;
}
3310e9b5cd6SAndrey Gruzdev 
/**
 * uffd_read_events: read pending UFFD events
 *
 * Reads up to @count messages from the descriptor, restarting the read
 * when it is interrupted (EINTR).  EAGAIN is treated as "nothing
 * pending"; any other failure is reported through error_report().
 *
 * Returns number of fetched messages, 0 if none is available or
 * -1 in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    const size_t buf_size = count * sizeof(struct uffd_msg);
    ssize_t nread;

    for (;;) {
        nread = read(uffd_fd, msgs, buf_size);
        if (nread >= 0) {
            /* Convert the byte count into a number of whole messages */
            return (int) (nread / sizeof(struct uffd_msg));
        }
        if (errno != EINTR) {
            break;
        }
    }

    if (errno == EAGAIN) {
        return 0;
    }

    error_report("uffd_read_events() failed: errno=%i", errno);
    return -1;
}
3590e9b5cd6SAndrey Gruzdev 
/**
 * uffd_poll_events: poll UFFD file descriptor for read
 *
 * Polls the descriptor for POLLIN, restarting the call when it is
 * interrupted (EINTR).  A poll() failure is reported through
 * error_report() and yields false.
 *
 * Returns true if events are available for read, false otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, handed to poll() unchanged
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    int rc;

    do {
        rc = poll(&pfd, 1, tmo);
    } while (rc < 0 && errno == EINTR);

    if (rc < 0) {
        error_report("uffd_poll_events() failed: errno=%i", errno);
        return false;
    }

    /* rc == 0 means the timeout expired with nothing to read */
    return rc > 0 && (pfd.revents & POLLIN) != 0;
}
387