xref: /qemu/util/userfaultfd.c (revision 84615a19)
1 /*
2  * Linux UFFD-WP support
3  *
4  * Copyright Virtuozzo GmbH, 2020
5  *
6  * Authors:
7  *  Andrey Gruzdev   <andrey.gruzdev@virtuozzo.com>
8  *
9  * This work is licensed under the terms of the GNU GPL, version 2 or
10  * later.  See the COPYING file in the top-level directory.
11  */
12 
13 #include "qemu/osdep.h"
14 #include "qemu/bitops.h"
15 #include "qemu/error-report.h"
16 #include "qemu/userfaultfd.h"
17 #include "trace.h"
18 #include <poll.h>
19 #include <sys/syscall.h>
20 #include <sys/ioctl.h>
21 
/**
 * uffd_open: open a userfaultfd file descriptor via the raw system call
 *
 * Returns non-negative file descriptor on success; on failure returns -1
 * with errno set
 *
 * @flags: flags passed through unchanged to the userfaultfd system call
 */
int uffd_open(int flags)
{
#if defined(__NR_userfaultfd)
    return syscall(__NR_userfaultfd, flags);
#else
    /*
     * No system call number is known at build time.  Fail the same way
     * the syscall() path does (-1 plus errno) so that callers which
     * inspect errno after a failed open do not read a stale value.
     */
    errno = EINVAL;
    return -1;
#endif
}
30 
31 /**
32  * uffd_query_features: query UFFD features
33  *
34  * Returns: 0 on success, negative value in case of an error
35  *
36  * @features: parameter to receive 'uffdio_api.features'
37  */
38 int uffd_query_features(uint64_t *features)
39 {
40     int uffd_fd;
41     struct uffdio_api api_struct = { 0 };
42     int ret = -1;
43 
44     uffd_fd = uffd_open(O_CLOEXEC);
45     if (uffd_fd < 0) {
46         trace_uffd_query_features_nosys(errno);
47         return -1;
48     }
49 
50     api_struct.api = UFFD_API;
51     api_struct.features = 0;
52 
53     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
54         trace_uffd_query_features_api_failed(errno);
55         goto out;
56     }
57     *features = api_struct.features;
58     ret = 0;
59 
60 out:
61     close(uffd_fd);
62     return ret;
63 }
64 
65 /**
66  * uffd_create_fd: create UFFD file descriptor
67  *
68  * Returns non-negative file descriptor or negative value in case of an error
69  *
70  * @features: UFFD features to request
71  * @non_blocking: create UFFD file descriptor for non-blocking operation
72  */
73 int uffd_create_fd(uint64_t features, bool non_blocking)
74 {
75     int uffd_fd;
76     int flags;
77     struct uffdio_api api_struct = { 0 };
78     uint64_t ioctl_mask = BIT(_UFFDIO_REGISTER) | BIT(_UFFDIO_UNREGISTER);
79 
80     flags = O_CLOEXEC | (non_blocking ? O_NONBLOCK : 0);
81     uffd_fd = uffd_open(flags);
82     if (uffd_fd < 0) {
83         trace_uffd_create_fd_nosys(errno);
84         return -1;
85     }
86 
87     api_struct.api = UFFD_API;
88     api_struct.features = features;
89     if (ioctl(uffd_fd, UFFDIO_API, &api_struct)) {
90         trace_uffd_create_fd_api_failed(errno);
91         goto fail;
92     }
93     if ((api_struct.ioctls & ioctl_mask) != ioctl_mask) {
94         trace_uffd_create_fd_api_noioctl(ioctl_mask, api_struct.ioctls);
95         goto fail;
96     }
97 
98     return uffd_fd;
99 
100 fail:
101     close(uffd_fd);
102     return -1;
103 }
104 
/**
 * uffd_close_fd: close UFFD file descriptor
 *
 * Asserts that the descriptor is non-negative, then closes it.
 * The result of close() is not reported to the caller.
 *
 * @uffd_fd: UFFD file descriptor
 */
void uffd_close_fd(int uffd_fd)
{
    assert(uffd_fd >= 0);

    close(uffd_fd);
}
115 
/**
 * uffd_register_memory: register memory range via UFFD-IO
 *
 * Issues UFFDIO_REGISTER for the given range and mode.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @mode: UFFD register mode (UFFDIO_REGISTER_MODE_MISSING, ...)
 * @ioctls: optional pointer to receive supported IOCTL mask; may be NULL,
 *          written only on success
 */
int uffd_register_memory(int uffd_fd, void *addr, uint64_t length,
        uint64_t mode, uint64_t *ioctls)
{
    struct uffdio_register reg = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = mode,
    };

    if (ioctl(uffd_fd, UFFDIO_REGISTER, &reg) != 0) {
        trace_uffd_register_memory_failed(addr, length, mode, errno);
        return -1;
    }

    if (ioctls != NULL) {
        *ioctls = reg.ioctls;
    }
    return 0;
}
146 
/**
 * uffd_unregister_memory: un-register memory range with UFFD-IO
 *
 * Issues UFFDIO_UNREGISTER for the given range.
 *
 * Returns 0 in case of success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 */
int uffd_unregister_memory(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_UNREGISTER, &range) == 0) {
        return 0;
    }

    trace_uffd_unregister_memory_failed(addr, length, errno);
    return -1;
}
170 
/**
 * uffd_change_protection: protect/un-protect memory range for writes via UFFD-IO
 *
 * Issues UFFDIO_WRITEPROTECT for the given range.  The DONTWAKE mode bit
 * is only passed when protection is being released; when @wp is set,
 * @dont_wake is ignored.  A failure is logged through error_report().
 *
 * Returns 0 on success, negative value in case of error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address of memory range
 * @length: length of memory range
 * @wp: write-protect/unprotect
 * @dont_wake: do not wake threads waiting on wr-protected page
 */
int uffd_change_protection(int uffd_fd, void *addr, uint64_t length,
        bool wp, bool dont_wake)
{
    struct uffdio_writeprotect req = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (wp) {
        req.mode = UFFDIO_WRITEPROTECT_MODE_WP;
    } else if (dont_wake) {
        /* DONTWAKE is meaningful only on protection release */
        req.mode = UFFDIO_WRITEPROTECT_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_WRITEPROTECT, &req) == 0) {
        return 0;
    }

    error_report("uffd_change_protection() failed: addr=%p len=%" PRIu64
            " mode=%" PRIx64 " errno=%i", addr, length,
            (uint64_t) req.mode, errno);
    return -1;
}
205 
/**
 * uffd_copy_page: copy range of pages to destination via UFFD-IO
 *
 * Issues UFFDIO_COPY to copy the source range into the destination in
 * order to resolve a missing page fault somewhere in the destination
 * range.  A failure is logged through error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @dst_addr: destination base address
 * @src_addr: source base address
 * @length: length of the range to copy
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_copy_page(int uffd_fd, void *dst_addr, void *src_addr,
        uint64_t length, bool dont_wake)
{
    struct uffdio_copy req = {
        .dst = (uintptr_t) dst_addr,
        .src = (uintptr_t) src_addr,
        .len = length,
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_COPY_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_COPY, &req) == 0) {
        return 0;
    }

    error_report("uffd_copy_page() failed: dst_addr=%p src_addr=%p length=%" PRIu64
            " mode=%" PRIx64 " errno=%i", dst_addr, src_addr,
            length, (uint64_t) req.mode, errno);
    return -1;
}
239 
/**
 * uffd_zero_page: fill range of pages with zeroes via UFFD-IO
 *
 * Issues UFFDIO_ZEROPAGE on the range to resolve a missing page fault
 * within it.  A failure is logged through error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range to fill with zeroes
 * @dont_wake: do not wake threads waiting on missing page
 */
int uffd_zero_page(int uffd_fd, void *addr, uint64_t length, bool dont_wake)
{
    struct uffdio_zeropage req = {
        .range = { .start = (uintptr_t) addr, .len = length },
        .mode = 0,
    };

    if (dont_wake) {
        req.mode = UFFDIO_ZEROPAGE_MODE_DONTWAKE;
    }

    if (ioctl(uffd_fd, UFFDIO_ZEROPAGE, &req) == 0) {
        return 0;
    }

    error_report("uffd_zero_page() failed: addr=%p length=%" PRIu64
            " mode=%" PRIx64 " errno=%i", addr, length,
            (uint64_t) req.mode, errno);
    return -1;
}
269 
/**
 * uffd_wakeup: wake up threads waiting on UFFD-managed page fault resolution
 *
 * Issues UFFDIO_WAKE to wake up threads waiting on any page/pages from
 * the designated range.  The main use case is a period during which page
 * faults are resolved via UFFD-IO IOCTLs with the MODE_DONTWAKE flag set;
 * afterwards all waits for the whole memory range are satisfied by a
 * single call to uffd_wakeup().  A failure is logged through
 * error_report().
 *
 * Returns 0 on success, negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @addr: base address
 * @length: length of the range
 */
int uffd_wakeup(int uffd_fd, void *addr, uint64_t length)
{
    struct uffdio_range range = {
        .start = (uintptr_t) addr,
        .len = length,
    };

    if (ioctl(uffd_fd, UFFDIO_WAKE, &range) == 0) {
        return 0;
    }

    error_report("uffd_wakeup() failed: addr=%p length=%" PRIu64 " errno=%i",
            addr, length, errno);
    return -1;
}
299 
/**
 * uffd_read_events: read pending UFFD events
 *
 * Reads from the descriptor into @msgs, retrying while the read is
 * interrupted (EINTR).  EAGAIN is reported as "no messages" rather than
 * as an error; any other failure is logged through error_report().
 *
 * Returns number of fetched messages, 0 if none is available or
 * negative value in case of an error
 *
 * @uffd_fd: UFFD file descriptor
 * @msgs: pointer to message buffer
 * @count: number of messages that can fit in the buffer
 */
int uffd_read_events(int uffd_fd, struct uffd_msg *msgs, int count)
{
    ssize_t nread;

    for (;;) {
        nread = read(uffd_fd, msgs, count * sizeof(struct uffd_msg));
        if (nread >= 0 || errno != EINTR) {
            break;
        }
    }

    if (nread >= 0) {
        /* Convert the byte count into a number of whole messages */
        return (int) (nread / sizeof(struct uffd_msg));
    }

    if (errno == EAGAIN) {
        return 0;
    }

    error_report("uffd_read_events() failed: errno=%i", errno);
    return -1;
}
327 
/**
 * uffd_poll_events: poll UFFD file descriptor for read
 *
 * Calls poll() on the descriptor for POLLIN, retrying while the call is
 * interrupted (EINTR).  A timeout yields false; a poll() failure is
 * logged through error_report() and also yields false.
 *
 * Returns true if events are available for read, false otherwise
 *
 * @uffd_fd: UFFD file descriptor
 * @tmo: timeout value, passed unchanged to poll()
 */
bool uffd_poll_events(int uffd_fd, int tmo)
{
    struct pollfd pfd = { .fd = uffd_fd, .events = POLLIN, .revents = 0 };
    int nready;

    for (;;) {
        nready = poll(&pfd, 1, tmo);
        if (nready >= 0 || errno != EINTR) {
            break;
        }
    }

    if (nready > 0) {
        return (pfd.revents & POLLIN) != 0;
    }

    if (nready < 0) {
        error_report("uffd_poll_events() failed: errno=%i", errno);
    }
    return false;
}
355