xref: /qemu/util/userfaultfd.c (revision 7c0dfcf9)
1 /*
2  * Linux UFFD-WP support
3  *
4  * Copyright Virtuozzo GmbH, 2020
5  *
6  * Authors:
7  *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or
10  * later.  See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "qemu/bitops.h"
15 #include "qemu/error-report.h"
16 #include "qemu/userfaultfd.h"
17 #include "trace.h"
18 #include <poll.h>
19 #include <sys/syscall.h>
20 #include <sys/ioctl.h>
21 
/*
 * How uffd_open() obtains a userfaultfd descriptor.
 *
 * UFFD_UNINITIALIZED has to remain 0: uffd_open() keeps its mode in a
 * zero-initialized static variable and treats that value as "not yet
 * detected".
 */
typedef enum {
    UFFD_UNINITIALIZED = 0, /* detection has not been performed yet */
    UFFD_USE_DEV_PATH = 1,  /* USERFAULTFD_IOC_NEW on /dev/userfaultfd */
    UFFD_USE_SYSCALL = 2,   /* userfaultfd system call */
} uffd_open_mode;
27 
28 int uffd_open(int flags)
29 {
30 #if defined(__NR_userfaultfd)
31     static uffd_open_mode open_mode;
32     static int uffd_dev;
33 
34     /* Detect how to generate uffd desc when run the 1st time */
35     if (open_mode == UFFD_UNINITIALIZED) {
36         /*
37          * Make /dev/userfaultfd the default approach because it has better
38          * permission controls, meanwhile allows kernel faults without any
39          * privilege requirement (e.g. SYS_CAP_PTRACE).
40          */
41         uffd_dev = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
42         if (uffd_dev >= 0) {
43             open_mode = UFFD_USE_DEV_PATH;
44         } else {
45             /* Fallback to the system call */
46             open_mode = UFFD_USE_SYSCALL;
47         }
48         trace_uffd_detect_open_mode(open_mode);
49     }
50 
51     if (open_mode == UFFD_USE_DEV_PATH) {
52         assert(uffd_dev >= 0);
53         return ioctl(uffd_dev, USERFAULTFD_IOC_NEW, flags);
54     }
55 
56     return syscall(__NR_userfaultfd, flags);
57 #else
58     return -EINVAL;
59 #endif
60 }
61 
62 /**
63  * uffd_query_features: query UFFD features
64  *
65  * Returns: 0 on success, negative value in case of an error
66  *
67  * @features: parameter to receive 'uffdio_api.features'
68  */
69 int uffd_query_features(uint64_t *features)
70 {
71     int uffd_fd;
72     struct uffdio_api api_struct = { 0 };
73     int ret = -1;
74 
75     uffd_fd = uffd_open(O_CLOEXEC);
76     if (uffd_fd < 0) {
77         trace_uffd_query_features_nosys(errno);
78         return -1;
79     }
80 
81     api_struct.api = UFFD_API;
82     api_struct.features = 0;
83 
84     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
85         trace_uffd_query_features_api_failed(errno);
86         goto out;
87     }
88     *features = api_struct.features;
89     ret = 0;
90 
91 out:
92     close(uffd_fd);
93     return ret;
94 }
95 
96 /**
97  * uffd_create_fd: create UFFD file descriptor
98  *
99  * Returns non-negative file descriptor or negative value in case of an error
100  *
101  * @features: UFFD features to request
102  * @non_blocking: create UFFD file descriptor for non-blocking operation
103  */
104 int uffd_create_fd(uint64_t features, bool non_blocking)
105 {
106     int uffd_fd;
107     int flags;
108     struct uffdio_api api_struct = { 0 };
109     uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
110 
111     flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
112     uffd_fd = uffd_open(flags);
113     if (uffd_fd < 0) {
114         trace_uffd_create_fd_nosys(errno);
115         return -1;
116     }
117 
118     api_struct.api = UFFD_API;
119     api_struct.features = features;
120     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
121         trace_uffd_create_fd_api_failed(errno);
122         goto fail;
123     }
124     if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
125         trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
126         goto fail;
127     }
128 
129     return uffd_fd;
130 
131 fail:
132     close(uffd_fd);
133     return -1;
134 }
135 
/**
 * uffd_close_fd: release a UFFD file descriptor
 *
 * Asserts that the descriptor is non-negative, then closes it. The result
 * of close() is not reported to the caller.
 *
 * @uffd_fd: UFFD file descriptor
 */
void uffd_close_fd(int uffd_fd)
{
    assert(!(uffd_fd < 0));
    close(uffd_fd);
}
146 
147 /**
148  * uffd_register_memory: register memory range via UFFD-IO
149  *
150  * Returns 0 in case of success, negative value in case of an error
151  *
152  * @uffd_fd: UFFD file descriptor
153  * @addr: base address of memory range
154  * @length: length of memory range
155  * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
156  * @ioctls: optional pointer to receive supported IOCTL mask
157  */
158 int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
159         uint64_t mode, uint64_t *ioctls)
160 {
161     struct uffdio_register uffd_register;
162 
163     uffd_register.range.start = (uintptr_t) addr;
164     uffd_register.range.len = length;
165     uffd_register.mode = mode;
166 
167     if (ioctl(uffd_fd, UFFDIO_REGISTER, &uffd_register)) {
168         trace_uffd_register_memory_failed(addr, length, mode, errno);
169         return -1;
170     }
171     if (ioctls) {
172         *ioctls = uffd_register.ioctls;
173     }
174 
175     return 0;
176 }
177 
178 /**
179  * uffd_unregister_memory: un-register memory range with UFFD-IO
180  *
181  * Returns 0 in case of success, negative value in case of an error
182  *
183  * @uffd_fd: UFFD file descriptor
184  * @addr: base address of memory range
185  * @length: length of memory range
186  */
187 int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
188 {
189     struct uffdio_range uffd_range;
190 
191     uffd_range.start = (uintptr_t) addr;
192     uffd_range.len = length;
193 
194     if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &uffd_range)) {
195         trace_uffd_unregister_memory_failed(addr, length, errno);
196         return -1;
197     }
198 
199     return 0;
200 }
201 
202 /**
203  * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
204  *
205  * Returns 0 on success, negative value in case of error
206  *
207  * @uffd_fd: UFFD file descriptor
208  * @addr: base address of memory range
209  * @length: length of memory range
210  * @wp: write-protect/unprotect
211  * @dont_wake: do not wake threads waiting on wr-protected page
212  */
213 int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
214         bool wp, bool dont_wake)
215 {
216     struct uffdio_writeprotect uffd_writeprotect;
217 
218     uffd_writeprotect.range.start = (uintptr_t) addr;
219     uffd_writeprotect.range.len = length;
220     if (!wp && dont_wake) {
221         /* DONTWAKE is meaningful only on protection release */
222         uffd_writeprotect.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
223     } else {
224         uffd_writeprotect.mode = (wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0);
225     }
226 
227     if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &uffd_writeprotect)) {
228         error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
229                 " mode=%" PRIx64 " errno=%i", addr, length,
230                 (uint64_t) uffd_writeprotect.mode, errno);
231         return -1;
232     }
233 
234     return 0;
235 }
236 
237 /**
238  * uffd_copy_page: copy range of pages to destination via UFFD-IO
239  *
240  * Copy range of source pages to the destination to resolve
241  * missing page fault somewhere in the destination range.
242  *
243  * Returns 0 on success, negative value in case of an error
244  *
245  * @uffd_fd: UFFD file descriptor
246  * @dst_addr: destination base address
247  * @src_addr: source base address
248  * @length: length of the range to copy
249  * @dont_wake: do not wake threads waiting on missing page
250  */
251 int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
252         uint64_t length, bool dont_wake)
253 {
254     struct uffdio_copy uffd_copy;
255 
256     uffd_copy.dst = (uintptr_t) dst_addr;
257     uffd_copy.src = (uintptr_t) src_addr;
258     uffd_copy.len = length;
259     uffd_copy.mode = dont_wake ? UFFDIO_COPY_MODE_DONTWAKE : 0;
260 
261     if (ioctl(uffd_fd, UFFDIO_COPY, &uffd_copy)) {
262         error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
263                 " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
264                 length, (uint64_t) uffd_copy.mode, errno);
265         return -1;
266     }
267 
268     return 0;
269 }
270 
271 /**
272  * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
273  *
274  * Fill range pages with zeroes to resolve missing page fault within the range.
275  *
276  * Returns 0 on success, negative value in case of an error
277  *
278  * @uffd_fd: UFFD file descriptor
279  * @addr: base address
280  * @length: length of the range to fill with zeroes
281  * @dont_wake: do not wake threads waiting on missing page
282  */
283 int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
284 {
285     struct uffdio_zeropage uffd_zeropage;
286 
287     uffd_zeropage.range.start = (uintptr_t) addr;
288     uffd_zeropage.range.len = length;
289     uffd_zeropage.mode = dont_wake ? UFFDIO_ZEROPAGE_MODE_DONTWAKE : 0;
290 
291     if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &uffd_zeropage)) {
292         error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
293                 " mode=%" PRIx64 " errno=%i", addr, length,
294                 (uint64_t) uffd_zeropage.mode, errno);
295         return -1;
296     }
297 
298     return 0;
299 }
300 
301 /**
302  * uffd_wakeup: wake up threads waiting on page UFFD-managed page fault resolution
303  *
304  * Wake up threads waiting on any page/pages from the designated range.
305  * The main use case is when during some period, page faults are resolved
306  * via UFFD-IO IOCTLs with MODE_DONTWAKE flag set, then after that all waits
307  * for the whole memory range are satisfied in a single call to uffd_wakeup().
308  *
309  * Returns 0 on success, negative value in case of an error
310  *
311  * @uffd_fd: UFFD file descriptor
312  * @addr: base address
313  * @length: length of the range
314  */
315 int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
316 {
317     struct uffdio_range uffd_range;
318 
319     uffd_range.start = (uintptr_t) addr;
320     uffd_range.len = length;
321 
322     if (ioctl(uffd_fd, UFFDIO_WAKE, &uffd_range)) {
323         error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
324                 addr, length, errno);
325         return -1;
326     }
327 
328     return 0;
329 }
330 
331 /**
332  * uffd_read_events: read pending UFFD events
333  *
334  * Returns number of fetched messages, 0 if non is available or
335  * negative value in case of an error
336  *
337  * @uffd_fd: UFFD file descriptor
338  * @msgs: pointer to message buffer
339  * @count: number of messages that can fit in the buffer
340  */
341 int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
342 {
343     ssize_t res;
344     do {
345         res = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
346     } while (res < 0 && errno == EINTR);
347 
348     if ((res < 0 && errno == EAGAIN)) {
349         return 0;
350     }
351     if (res < 0) {
352         error_report("uffd_read_events() failed: errno=%i", errno);
353         return -1;
354     }
355 
356     return (int) (res / sizeof(struct uffd_msg));
357 }
358 
/**
 * uffd_poll_events: wait until the UFFD file descriptor becomes readable
 *
 * The poll() is restarted when it is interrupted (EINTR). Any other
 * poll() failure is reported through error_report() and yields false.
 *
 * Returns true if events are available for read, false otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, passed to poll() unchanged
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    int nready;

    for (;;) {
        nready = poll(&pfd, 1, tmo);
        if (nready >= 0 || errno != EINTR) {
            break;
        }
    }

    if (nready < 0) {
        error_report("uffd_poll_events() failed: errno=%i", errno);
        return false;
    }

    /* nready == 0 means the timeout expired with nothing to read */
    return nready > 0 && (pfd.revents & POLLIN) != 0;
}
386