xref: /qemu/util/userfaultfd.c (revision 0ec8384f)
1 /*
2  * Linux UFFD-WP support
3  *
4  * Copyright Virtuozzo GmbH, 2020
5  *
6  * Authors:
7  *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or
10  * later.  See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "qemu/bitops.h"
15 #include "qemu/error-report.h"
16 #include "qemu/userfaultfd.h"
17 #include "trace.h"
18 #include <poll.h>
19 #include <sys/syscall.h>
20 #include <sys/ioctl.h>
21 #include <fcntl.h>
22 
/*
 * How uffd_open() obtains a new userfaultfd descriptor.
 *
 * The zero value doubles as "not probed yet", so a zero-initialized
 * static variable of this type starts out in the undetected state.
 */
typedef enum {
    /* Detection has not run yet */
    UFFD_UNINITIALIZED = 0,
    /* Descriptors are created through the /dev/userfaultfd device node */
    UFFD_USE_DEV_PATH = 1,
    /* Descriptors are created with the userfaultfd system call */
    UFFD_USE_SYSCALL = 2,
} uffd_open_mode;
28 
29 int uffd_open(int flags)
30 {
31 #if defined(__NR_userfaultfd)
32     static uffd_open_mode open_mode;
33     static int uffd_dev;
34 
35     /* Detect how to generate uffd desc when run the 1st time */
36     if (open_mode == UFFD_UNINITIALIZED) {
37         /*
38          * Make /dev/userfaultfd the default approach because it has better
39          * permission controls, meanwhile allows kernel faults without any
40          * privilege requirement (e.g. SYS_CAP_PTRACE).
41          */
42         uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
43         if (uffd_dev >= 0) {
44             open_mode = UFFD_USE_DEV_PATH;
45         } else {
46             /* Fallback to the system call */
47             open_mode = UFFD_USE_SYSCALL;
48         }
49         trace_uffd_detect_open_mode(open_mode);
50     }
51 
52     if (open_mode == UFFD_USE_DEV_PATH) {
53         assert(uffd_dev >= 0);
54         return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
55     }
56 
57     return syscall(__NR_userfaultfd, flags);
58 #else
59     return -EINVAL;
60 #endif
61 }
62 
63 /**
64  * uffd_query_features: query UFFD features
65  *
66  * Returns: 0 on success, negative value in case of an error
67  *
68  * @features: parameter to receive 'uffdio_api.features'
69  */
70 int uffd_query_features(uint64_t *features)
71 {
72     int uffd_fd;
73     struct uffdio_api api_struct = { 0 };
74     int ret = -1;
75 
76     uffd_fd = uffd_open(O_CLOEXEC);
77     if (uffd_fd < 0) {
78         trace_uffd_query_features_nosys(errno);
79         return -1;
80     }
81 
82     api_struct.api = UFFD_API;
83     api_struct.features = 0;
84 
85     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
86         trace_uffd_query_features_api_failed(errno);
87         goto out;
88     }
89     *features = api_struct.features;
90     ret = 0;
91 
92 out:
93     close(uffd_fd);
94     return ret;
95 }
96 
97 /**
98  * uffd_create_fd: create UFFD file descriptor
99  *
100  * Returns non-negative file descriptor or negative value in case of an error
101  *
102  * @features: UFFD features to request
103  * @non_blocking: create UFFD file descriptor for non-blocking operation
104  */
105 int uffd_create_fd(uint64_t features, bool non_blocking)
106 {
107     int uffd_fd;
108     int flags;
109     struct uffdio_api api_struct = { 0 };
110     uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
111 
112     flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
113     uffd_fd = uffd_open(flags);
114     if (uffd_fd < 0) {
115         trace_uffd_create_fd_nosys(errno);
116         return -1;
117     }
118 
119     api_struct.api = UFFD_API;
120     api_struct.features = features;
121     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
122         trace_uffd_create_fd_api_failed(errno);
123         goto fail;
124     }
125     if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
126         trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
127         goto fail;
128     }
129 
130     return uffd_fd;
131 
132 fail:
133     close(uffd_fd);
134     return -1;
135 }
136 
/**
 * uffd_close_fd: release a UFFD file descriptor
 *
 * Asserts that the descriptor is non-negative and then closes it.
 * The result of close() is not reported to the caller.
 *
 * @uffd_fd: UFFD file descriptor
 */
void uffd_close_fd(int uffd_fd)
{
    /* A negative value is never a valid descriptor: programming error */
    assert(uffd_fd >= 0);

    close(uffd_fd);
}
147 
/**
 * uffd_register_memory: attach a memory range to a UFFD descriptor
 *
 * Issues UFFDIO_REGISTER for [addr, addr + length) with the given mode.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask; may be NULL,
 *          and is written only on success
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
        uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg = {
        .range = {
            .start = (uintptr_t) addr,
            .len = length,
        },
        .mode = mode,
    };

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
        trace_uffd_register_memory_failed(addr, length, mode, errno);
        return -1;
    }

    /* Hand back the ioctl mask reported for this range, if requested */
    if (ioctls) {
        *ioctls = reg.ioctls;
    }
    return 0;
}
178 
/**
 * uffd_unregister_memory: detach a memory range from a UFFD descriptor
 *
 * Issues UFFDIO_UNREGISTER for [addr, addr + length).
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) != 0) {
        trace_uffd_unregister_memory_failed(addr, length, errno);
        return -1;
    }
    return 0;
}
202 
/**
 * uffd_change_protection: set or clear write protection on a memory range
 *
 * Issues UFFDIO_WRITEPROTECT for [addr, addr + length).  A failure is
 * reported through error_report().
 *
 * Returns 0 on success, negative value in case of error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: true to write-protect the range, false to remove the protection
 * @dont_wake: do not wake threads waiting on wr-protected page; ignored
 *             when @wp is true
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
        bool wp, bool dont_wake)
{
    struct uffdio_writeprotect req = {
        .range = {
            .start = (uintptr_t) addr,
            .len = length,
        },
    };

    if (wp) {
        req.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        req.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    } else {
        req.mode = 0;
    }

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &req) != 0) {
        error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
                " mode=%" PRIx64 " errno=%i", addr, length,
                (uint64_t) req.mode, errno);
        return -1;
    }
    return 0;
}
237 
/**
 * uffd_copy_page: populate destination pages from a source buffer
 *
 * Issues UFFDIO_COPY to copy @length bytes from @src_addr to @dst_addr,
 * resolving a missing page fault somewhere in the destination range.
 * A failure is reported through error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
        uint64_t length, bool dont_wake)
{
    struct uffdio_copy req = {
        .dst = (uintptr_t) dst_addr,
        .src = (uintptr_t) src_addr,
        .len = length,
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_COPY_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_COPY, &req) != 0) {
        error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
                " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
                length, (uint64_t) req.mode, errno);
        return -1;
    }
    return 0;
}
271 
/**
 * uffd_zero_page: populate a range of pages with zeroes
 *
 * Issues UFFDIO_ZEROPAGE for [addr, addr + length) to resolve a missing
 * page fault within the range.  A failure is reported through
 * error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    struct uffdio_zeropage req = {
        .range = {
            .start = (uintptr_t) addr,
            .len = length,
        },
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_ZEROPAGE_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &req) != 0) {
        error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
                " mode=%" PRIx64 " errno=%i", addr, length,
                (uint64_t) req.mode, errno);
        return -1;
    }
    return 0;
}
301 
/**
 * uffd_wakeup: wake threads blocked on UFFD-managed page faults
 *
 * Issues UFFDIO_WAKE for [addr, addr + length), waking up threads that
 * wait on any page of the range.  The main use case is a period during
 * which page faults are resolved via UFFD-IO IOCTLs with MODE_DONTWAKE
 * set; afterwards all waits for the whole memory range are satisfied in
 * a single call to uffd_wakeup().  A failure is reported through
 * error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) != 0) {
        error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
                addr, length, errno);
        return -1;
    }
    return 0;
}
331 
/**
 * uffd_read_events: fetch pending UFFD event messages
 *
 * Reads up to @count messages from the descriptor, restarting the read
 * when it is interrupted (EINTR).  EAGAIN is treated as "nothing
 * pending" rather than as an error; any other failure is reported
 * through error_report().
 *
 * Returns number of fetched messages, 0 if none is available or
 * negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    ssize_t nread;

    for (;;) {
        nread = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
        if (nread >= 0) {
            /* Convert the byte count into a number of whole messages */
            return (int) (nread / sizeof(struct uffd_msg));
        }
        if (errno != EINTR) {
            break;
        }
    }

    if (errno == EAGAIN) {
        return 0;
    }

    error_report("uffd_read_events() failed: errno=%i", errno);
    return -1;
}
359 
/**
 * uffd_poll_events: wait for readable events on a UFFD descriptor
 *
 * Polls the descriptor for POLLIN, restarting the wait when it is
 * interrupted (EINTR).  A poll() failure is reported through
 * error_report() and yields false, as does a timeout.
 *
 * Returns true if events are available for read, false otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, passed unchanged to poll()
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = {
        .fd = uffd_fd,
        .events = POLLIN,
        .revents = 0,
    };
    int nready;

    do {
        nready = poll(&pfd, 1, tmo);
    } while (nready < 0 && errno == EINTR);

    if (nready < 0) {
        error_report("uffd_poll_events() failed: errno=%i", errno);
        return false;
    }

    /* nready == 0 means the timeout expired with nothing to read */
    return nready > 0 && (pfd.revents & POLLIN) != 0;
}
387