xref: /linux/tools/testing/selftests/mm/uffd-common.c (revision 1e525507)
1 // SPDX-License-Identifier: GPL-2.0-only
2 /*
3  * Userfaultfd tests util functions
4  *
5  * Copyright (C) 2015-2023  Red Hat, Inc.
6  */
7 
8 #include "uffd-common.h"
9 
/*
 * Fixed base address (1 GiB) that shmem_allocate_area() uses as the mmap()
 * hint for its source/destination mappings and their aliases.
 */
#define BASE_PMD_ADDR ((void *)(1UL << 30))

/*
 * While true, the first successful __copy_page() call made with retry set
 * re-issues the copy (expecting -EEXIST) and then clears this flag.
 */
volatile bool test_uffdio_copy_eexist = true;
/*
 * Test geometry.  nr_pages * page_size is the byte size of every test area;
 * nr_cpus sizes the pipefd array.  nr_pages_per_cpu is not referenced in this
 * part of the file.
 */
unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
/*
 * Test mappings.  All of them are unmapped and reset to NULL by
 * uffd_test_ctx_clear().  area_remap holds the previous area_dst after a
 * UFFD_EVENT_REMAP so that it can still be unmapped.
 */
char *area_src, *area_src_alias, *area_dst, *area_dst_alias, *area_remap;
/*
 * uffd: the userfaultfd descriptor, -1 when none is open.
 * uffd_flags: result of fcntl(uffd, F_GETFD) taken in userfaultfd_open().
 * pipefd: nr_cpus pipes (2 descriptors each) created by uffd_test_ctx_init().
 * finished and test_type are not referenced in this part of the file.
 */
int uffd = -1, uffd_flags, finished, *pipefd, test_type;
/* Selects MAP_SHARED vs MAP_PRIVATE behaviour in the hugetlb helpers. */
bool map_shared;
/* Not referenced in this part of the file. */
bool test_uffdio_wp = true;
/* Array of nr_pages counters, each initialised to 1 by uffd_test_ctx_init(). */
unsigned long long *count_verify;
/* Memory-type specific operations (anon / shmem / hugetlb) used by the tests. */
uffd_test_ops_t *uffd_test_ops;
/* Optional per-test-case hooks (pre_alloc / post_alloc); may be NULL. */
uffd_test_case_ops_t *uffd_test_case_ops;
/* Set to true by uffd_poll_thread() once its poll descriptors are set up. */
atomic_bool ready_for_fork;
22 
23 static int uffd_mem_fd_create(off_t mem_size, bool hugetlb)
24 {
25 	unsigned int memfd_flags = 0;
26 	int mem_fd;
27 
28 	if (hugetlb)
29 		memfd_flags = MFD_HUGETLB;
30 	mem_fd = memfd_create("uffd-test", memfd_flags);
31 	if (mem_fd < 0)
32 		err("memfd_create");
33 	if (ftruncate(mem_fd, mem_size))
34 		err("ftruncate");
35 	if (fallocate(mem_fd,
36 		      FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, 0,
37 		      mem_size))
38 		err("fallocate");
39 
40 	return mem_fd;
41 }
42 
43 static void anon_release_pages(char *rel_area)
44 {
45 	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
46 		err("madvise(MADV_DONTNEED) failed");
47 }
48 
49 static int anon_allocate_area(void **alloc_area, bool is_src)
50 {
51 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
52 			   MAP_ANONYMOUS | MAP_PRIVATE, -1, 0);
53 	if (*alloc_area == MAP_FAILED) {
54 		*alloc_area = NULL;
55 		return -errno;
56 	}
57 	return 0;
58 }
59 
60 static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
61 {
62 }
63 
64 static void hugetlb_release_pages(char *rel_area)
65 {
66 	if (!map_shared) {
67 		if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED))
68 			err("madvise(MADV_DONTNEED) failed");
69 	} else {
70 		if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
71 			err("madvise(MADV_REMOVE) failed");
72 	}
73 }
74 
75 static int hugetlb_allocate_area(void **alloc_area, bool is_src)
76 {
77 	off_t size = nr_pages * page_size;
78 	off_t offset = is_src ? 0 : size;
79 	void *area_alias = NULL;
80 	char **alloc_area_alias;
81 	int mem_fd = uffd_mem_fd_create(size * 2, true);
82 
83 	*alloc_area = mmap(NULL, size, PROT_READ | PROT_WRITE,
84 			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
85 			   (is_src ? 0 : MAP_NORESERVE),
86 			   mem_fd, offset);
87 	if (*alloc_area == MAP_FAILED) {
88 		*alloc_area = NULL;
89 		return -errno;
90 	}
91 
92 	if (map_shared) {
93 		area_alias = mmap(NULL, size, PROT_READ | PROT_WRITE,
94 				  MAP_SHARED, mem_fd, offset);
95 		if (area_alias == MAP_FAILED)
96 			return -errno;
97 	}
98 
99 	if (is_src) {
100 		alloc_area_alias = &area_src_alias;
101 	} else {
102 		alloc_area_alias = &area_dst_alias;
103 	}
104 	if (area_alias)
105 		*alloc_area_alias = area_alias;
106 
107 	close(mem_fd);
108 	return 0;
109 }
110 
111 static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
112 {
113 	if (!map_shared)
114 		return;
115 
116 	*start = (unsigned long) area_dst_alias + offset;
117 }
118 
119 static void shmem_release_pages(char *rel_area)
120 {
121 	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE))
122 		err("madvise(MADV_REMOVE) failed");
123 }
124 
125 static int shmem_allocate_area(void **alloc_area, bool is_src)
126 {
127 	void *area_alias = NULL;
128 	size_t bytes = nr_pages * page_size, hpage_size = read_pmd_pagesize();
129 	unsigned long offset = is_src ? 0 : bytes;
130 	char *p = NULL, *p_alias = NULL;
131 	int mem_fd = uffd_mem_fd_create(bytes * 2, false);
132 
133 	/* TODO: clean this up.  Use a static addr is ugly */
134 	p = BASE_PMD_ADDR;
135 	if (!is_src)
136 		/* src map + alias + interleaved hpages */
137 		p += 2 * (bytes + hpage_size);
138 	p_alias = p;
139 	p_alias += bytes;
140 	p_alias += hpage_size;  /* Prevent src/dst VMA merge */
141 
142 	*alloc_area = mmap(p, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
143 			   mem_fd, offset);
144 	if (*alloc_area == MAP_FAILED) {
145 		*alloc_area = NULL;
146 		return -errno;
147 	}
148 	if (*alloc_area != p)
149 		err("mmap of memfd failed at %p", p);
150 
151 	area_alias = mmap(p_alias, bytes, PROT_READ | PROT_WRITE, MAP_SHARED,
152 			  mem_fd, offset);
153 	if (area_alias == MAP_FAILED) {
154 		munmap(*alloc_area, bytes);
155 		*alloc_area = NULL;
156 		return -errno;
157 	}
158 	if (area_alias != p_alias)
159 		err("mmap of anonymous memory failed at %p", p_alias);
160 
161 	if (is_src)
162 		area_src_alias = area_alias;
163 	else
164 		area_dst_alias = area_alias;
165 
166 	close(mem_fd);
167 	return 0;
168 }
169 
170 static void shmem_alias_mapping(__u64 *start, size_t len, unsigned long offset)
171 {
172 	*start = (unsigned long)area_dst_alias + offset;
173 }
174 
175 static void shmem_check_pmd_mapping(void *p, int expect_nr_hpages)
176 {
177 	if (!check_huge_shmem(area_dst_alias, expect_nr_hpages,
178 			      read_pmd_pagesize()))
179 		err("Did not find expected %d number of hugepages",
180 		    expect_nr_hpages);
181 }
182 
183 struct uffd_test_ops anon_uffd_test_ops = {
184 	.allocate_area = anon_allocate_area,
185 	.release_pages = anon_release_pages,
186 	.alias_mapping = noop_alias_mapping,
187 	.check_pmd_mapping = NULL,
188 };
189 
190 struct uffd_test_ops shmem_uffd_test_ops = {
191 	.allocate_area = shmem_allocate_area,
192 	.release_pages = shmem_release_pages,
193 	.alias_mapping = shmem_alias_mapping,
194 	.check_pmd_mapping = shmem_check_pmd_mapping,
195 };
196 
197 struct uffd_test_ops hugetlb_uffd_test_ops = {
198 	.allocate_area = hugetlb_allocate_area,
199 	.release_pages = hugetlb_release_pages,
200 	.alias_mapping = hugetlb_alias_mapping,
201 	.check_pmd_mapping = NULL,
202 };
203 
204 void uffd_stats_report(struct uffd_args *args, int n_cpus)
205 {
206 	int i;
207 	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
208 
209 	for (i = 0; i < n_cpus; i++) {
210 		miss_total += args[i].missing_faults;
211 		wp_total += args[i].wp_faults;
212 		minor_total += args[i].minor_faults;
213 	}
214 
215 	printf("userfaults: ");
216 	if (miss_total) {
217 		printf("%llu missing (", miss_total);
218 		for (i = 0; i < n_cpus; i++)
219 			printf("%lu+", args[i].missing_faults);
220 		printf("\b) ");
221 	}
222 	if (wp_total) {
223 		printf("%llu wp (", wp_total);
224 		for (i = 0; i < n_cpus; i++)
225 			printf("%lu+", args[i].wp_faults);
226 		printf("\b) ");
227 	}
228 	if (minor_total) {
229 		printf("%llu minor (", minor_total);
230 		for (i = 0; i < n_cpus; i++)
231 			printf("%lu+", args[i].minor_faults);
232 		printf("\b)");
233 	}
234 	printf("\n");
235 }
236 
237 int userfaultfd_open(uint64_t *features)
238 {
239 	struct uffdio_api uffdio_api;
240 
241 	uffd = uffd_open(UFFD_FLAGS);
242 	if (uffd < 0)
243 		return -1;
244 	uffd_flags = fcntl(uffd, F_GETFD, NULL);
245 
246 	uffdio_api.api = UFFD_API;
247 	uffdio_api.features = *features;
248 	if (ioctl(uffd, UFFDIO_API, &uffdio_api))
249 		/* Probably lack of CAP_PTRACE? */
250 		return -1;
251 	if (uffdio_api.api != UFFD_API)
252 		err("UFFDIO_API error: %" PRIu64, (uint64_t)uffdio_api.api);
253 
254 	*features = uffdio_api.features;
255 	return 0;
256 }
257 
258 static inline void munmap_area(void **area)
259 {
260 	if (*area)
261 		if (munmap(*area, nr_pages * page_size))
262 			err("munmap");
263 
264 	*area = NULL;
265 }
266 
267 void uffd_test_ctx_clear(void)
268 {
269 	size_t i;
270 
271 	if (pipefd) {
272 		for (i = 0; i < nr_cpus * 2; ++i) {
273 			if (close(pipefd[i]))
274 				err("close pipefd");
275 		}
276 		free(pipefd);
277 		pipefd = NULL;
278 	}
279 
280 	if (count_verify) {
281 		free(count_verify);
282 		count_verify = NULL;
283 	}
284 
285 	if (uffd != -1) {
286 		if (close(uffd))
287 			err("close uffd");
288 		uffd = -1;
289 	}
290 
291 	munmap_area((void **)&area_src);
292 	munmap_area((void **)&area_src_alias);
293 	munmap_area((void **)&area_dst);
294 	munmap_area((void **)&area_dst_alias);
295 	munmap_area((void **)&area_remap);
296 }
297 
298 int uffd_test_ctx_init(uint64_t features, const char **errmsg)
299 {
300 	unsigned long nr, cpu;
301 	int ret;
302 
303 	if (uffd_test_case_ops && uffd_test_case_ops->pre_alloc) {
304 		ret = uffd_test_case_ops->pre_alloc(errmsg);
305 		if (ret)
306 			return ret;
307 	}
308 
309 	ret = uffd_test_ops->allocate_area((void **)&area_src, true);
310 	ret |= uffd_test_ops->allocate_area((void **)&area_dst, false);
311 	if (ret) {
312 		if (errmsg)
313 			*errmsg = "memory allocation failed";
314 		return ret;
315 	}
316 
317 	if (uffd_test_case_ops && uffd_test_case_ops->post_alloc) {
318 		ret = uffd_test_case_ops->post_alloc(errmsg);
319 		if (ret)
320 			return ret;
321 	}
322 
323 	ret = userfaultfd_open(&features);
324 	if (ret) {
325 		if (errmsg)
326 			*errmsg = "possible lack of priviledge";
327 		return ret;
328 	}
329 
330 	count_verify = malloc(nr_pages * sizeof(unsigned long long));
331 	if (!count_verify)
332 		err("count_verify");
333 
334 	for (nr = 0; nr < nr_pages; nr++) {
335 		*area_mutex(area_src, nr) =
336 			(pthread_mutex_t)PTHREAD_MUTEX_INITIALIZER;
337 		count_verify[nr] = *area_count(area_src, nr) = 1;
338 		/*
339 		 * In the transition between 255 to 256, powerpc will
340 		 * read out of order in my_bcmp and see both bytes as
341 		 * zero, so leave a placeholder below always non-zero
342 		 * after the count, to avoid my_bcmp to trigger false
343 		 * positives.
344 		 */
345 		*(area_count(area_src, nr) + 1) = 1;
346 	}
347 
348 	/*
349 	 * After initialization of area_src, we must explicitly release pages
350 	 * for area_dst to make sure it's fully empty.  Otherwise we could have
351 	 * some area_dst pages be errornously initialized with zero pages,
352 	 * hence we could hit memory corruption later in the test.
353 	 *
354 	 * One example is when THP is globally enabled, above allocate_area()
355 	 * calls could have the two areas merged into a single VMA (as they
356 	 * will have the same VMA flags so they're mergeable).  When we
357 	 * initialize the area_src above, it's possible that some part of
358 	 * area_dst could have been faulted in via one huge THP that will be
359 	 * shared between area_src and area_dst.  It could cause some of the
360 	 * area_dst won't be trapped by missing userfaults.
361 	 *
362 	 * This release_pages() will guarantee even if that happened, we'll
363 	 * proactively split the thp and drop any accidentally initialized
364 	 * pages within area_dst.
365 	 */
366 	uffd_test_ops->release_pages(area_dst);
367 
368 	pipefd = malloc(sizeof(int) * nr_cpus * 2);
369 	if (!pipefd)
370 		err("pipefd");
371 	for (cpu = 0; cpu < nr_cpus; cpu++)
372 		if (pipe2(&pipefd[cpu * 2], O_CLOEXEC | O_NONBLOCK))
373 			err("pipe");
374 
375 	return 0;
376 }
377 
378 void wp_range(int ufd, __u64 start, __u64 len, bool wp)
379 {
380 	struct uffdio_writeprotect prms;
381 
382 	/* Write protection page faults */
383 	prms.range.start = start;
384 	prms.range.len = len;
385 	/* Undo write-protect, do wakeup after that */
386 	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
387 
388 	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms))
389 		err("clear WP failed: address=0x%"PRIx64, (uint64_t)start);
390 }
391 
392 static void continue_range(int ufd, __u64 start, __u64 len, bool wp)
393 {
394 	struct uffdio_continue req;
395 	int ret;
396 
397 	req.range.start = start;
398 	req.range.len = len;
399 	req.mode = 0;
400 	if (wp)
401 		req.mode |= UFFDIO_CONTINUE_MODE_WP;
402 
403 	if (ioctl(ufd, UFFDIO_CONTINUE, &req))
404 		err("UFFDIO_CONTINUE failed for address 0x%" PRIx64,
405 		    (uint64_t)start);
406 
407 	/*
408 	 * Error handling within the kernel for continue is subtly different
409 	 * from copy or zeropage, so it may be a source of bugs. Trigger an
410 	 * error (-EEXIST) on purpose, to verify doing so doesn't cause a BUG.
411 	 */
412 	req.mapped = 0;
413 	ret = ioctl(ufd, UFFDIO_CONTINUE, &req);
414 	if (ret >= 0 || req.mapped != -EEXIST)
415 		err("failed to exercise UFFDIO_CONTINUE error handling, ret=%d, mapped=%" PRId64,
416 		    ret, (int64_t) req.mapped);
417 }
418 
419 int uffd_read_msg(int ufd, struct uffd_msg *msg)
420 {
421 	int ret = read(uffd, msg, sizeof(*msg));
422 
423 	if (ret != sizeof(*msg)) {
424 		if (ret < 0) {
425 			if (errno == EAGAIN || errno == EINTR)
426 				return 1;
427 			err("blocking read error");
428 		} else {
429 			err("short read");
430 		}
431 	}
432 
433 	return 0;
434 }
435 
436 void uffd_handle_page_fault(struct uffd_msg *msg, struct uffd_args *args)
437 {
438 	unsigned long offset;
439 
440 	if (msg->event != UFFD_EVENT_PAGEFAULT)
441 		err("unexpected msg event %u", msg->event);
442 
443 	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
444 		/* Write protect page faults */
445 		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
446 		args->wp_faults++;
447 	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
448 		uint8_t *area;
449 		int b;
450 
451 		/*
452 		 * Minor page faults
453 		 *
454 		 * To prove we can modify the original range for testing
455 		 * purposes, we're going to bit flip this range before
456 		 * continuing.
457 		 *
458 		 * Note that this requires all minor page fault tests operate on
459 		 * area_dst (non-UFFD-registered) and area_dst_alias
460 		 * (UFFD-registered).
461 		 */
462 
463 		area = (uint8_t *)(area_dst +
464 				   ((char *)msg->arg.pagefault.address -
465 				    area_dst_alias));
466 		for (b = 0; b < page_size; ++b)
467 			area[b] = ~area[b];
468 		continue_range(uffd, msg->arg.pagefault.address, page_size,
469 			       args->apply_wp);
470 		args->minor_faults++;
471 	} else {
472 		/*
473 		 * Missing page faults.
474 		 *
475 		 * Here we force a write check for each of the missing mode
476 		 * faults.  It's guaranteed because the only threads that
477 		 * will trigger uffd faults are the locking threads, and
478 		 * their first instruction to touch the missing page will
479 		 * always be pthread_mutex_lock().
480 		 *
481 		 * Note that here we relied on an NPTL glibc impl detail to
482 		 * always read the lock type at the entry of the lock op
483 		 * (pthread_mutex_t.__data.__type, offset 0x10) before
484 		 * doing any locking operations to guarantee that.  It's
485 		 * actually not good to rely on this impl detail because
486 		 * logically a pthread-compatible lib can implement the
487 		 * locks without types and we can fail when linking with
488 		 * them.  However since we used to find bugs with this
489 		 * strict check we still keep it around.  Hopefully this
490 		 * could be a good hint when it fails again.  If one day
491 		 * it'll break on some other impl of glibc we'll revisit.
492 		 */
493 		if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE)
494 			err("unexpected write fault");
495 
496 		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
497 		offset &= ~(page_size-1);
498 
499 		if (copy_page(uffd, offset, args->apply_wp))
500 			args->missing_faults++;
501 	}
502 }
503 
504 void *uffd_poll_thread(void *arg)
505 {
506 	struct uffd_args *args = (struct uffd_args *)arg;
507 	unsigned long cpu = args->cpu;
508 	struct pollfd pollfd[2];
509 	struct uffd_msg msg;
510 	struct uffdio_register uffd_reg;
511 	int ret;
512 	char tmp_chr;
513 
514 	if (!args->handle_fault)
515 		args->handle_fault = uffd_handle_page_fault;
516 
517 	pollfd[0].fd = uffd;
518 	pollfd[0].events = POLLIN;
519 	pollfd[1].fd = pipefd[cpu*2];
520 	pollfd[1].events = POLLIN;
521 
522 	ready_for_fork = true;
523 
524 	for (;;) {
525 		ret = poll(pollfd, 2, -1);
526 		if (ret <= 0) {
527 			if (errno == EINTR || errno == EAGAIN)
528 				continue;
529 			err("poll error: %d", ret);
530 		}
531 		if (pollfd[1].revents) {
532 			if (!(pollfd[1].revents & POLLIN))
533 				err("pollfd[1].revents %d", pollfd[1].revents);
534 			if (read(pollfd[1].fd, &tmp_chr, 1) != 1)
535 				err("read pipefd error");
536 			break;
537 		}
538 		if (!(pollfd[0].revents & POLLIN))
539 			err("pollfd[0].revents %d", pollfd[0].revents);
540 		if (uffd_read_msg(uffd, &msg))
541 			continue;
542 		switch (msg.event) {
543 		default:
544 			err("unexpected msg event %u\n", msg.event);
545 			break;
546 		case UFFD_EVENT_PAGEFAULT:
547 			args->handle_fault(&msg, args);
548 			break;
549 		case UFFD_EVENT_FORK:
550 			close(uffd);
551 			uffd = msg.arg.fork.ufd;
552 			pollfd[0].fd = uffd;
553 			break;
554 		case UFFD_EVENT_REMOVE:
555 			uffd_reg.range.start = msg.arg.remove.start;
556 			uffd_reg.range.len = msg.arg.remove.end -
557 				msg.arg.remove.start;
558 			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range))
559 				err("remove failure");
560 			break;
561 		case UFFD_EVENT_REMAP:
562 			area_remap = area_dst;  /* save for later unmap */
563 			area_dst = (char *)(unsigned long)msg.arg.remap.to;
564 			break;
565 		}
566 	}
567 
568 	return NULL;
569 }
570 
571 static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
572 			    unsigned long offset)
573 {
574 	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
575 				     uffdio_copy->len,
576 				     offset);
577 	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
578 		/* real retval in ufdio_copy.copy */
579 		if (uffdio_copy->copy != -EEXIST)
580 			err("UFFDIO_COPY retry error: %"PRId64,
581 			    (int64_t)uffdio_copy->copy);
582 	} else {
583 		err("UFFDIO_COPY retry unexpected: %"PRId64,
584 		    (int64_t)uffdio_copy->copy);
585 	}
586 }
587 
588 static void wake_range(int ufd, unsigned long addr, unsigned long len)
589 {
590 	struct uffdio_range uffdio_wake;
591 
592 	uffdio_wake.start = addr;
593 	uffdio_wake.len = len;
594 
595 	if (ioctl(ufd, UFFDIO_WAKE, &uffdio_wake))
596 		fprintf(stderr, "error waking %lu\n",
597 			addr), exit(1);
598 }
599 
600 int __copy_page(int ufd, unsigned long offset, bool retry, bool wp)
601 {
602 	struct uffdio_copy uffdio_copy;
603 
604 	if (offset >= nr_pages * page_size)
605 		err("unexpected offset %lu\n", offset);
606 	uffdio_copy.dst = (unsigned long) area_dst + offset;
607 	uffdio_copy.src = (unsigned long) area_src + offset;
608 	uffdio_copy.len = page_size;
609 	if (wp)
610 		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
611 	else
612 		uffdio_copy.mode = 0;
613 	uffdio_copy.copy = 0;
614 	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
615 		/* real retval in ufdio_copy.copy */
616 		if (uffdio_copy.copy != -EEXIST)
617 			err("UFFDIO_COPY error: %"PRId64,
618 			    (int64_t)uffdio_copy.copy);
619 		wake_range(ufd, uffdio_copy.dst, page_size);
620 	} else if (uffdio_copy.copy != page_size) {
621 		err("UFFDIO_COPY error: %"PRId64, (int64_t)uffdio_copy.copy);
622 	} else {
623 		if (test_uffdio_copy_eexist && retry) {
624 			test_uffdio_copy_eexist = false;
625 			retry_copy_page(ufd, &uffdio_copy, offset);
626 		}
627 		return 1;
628 	}
629 	return 0;
630 }
631 
/*
 * Copy one page at @offset via __copy_page() without the -EEXIST retry.
 * Returns what __copy_page() returns.
 */
int copy_page(int ufd, unsigned long offset, bool wp)
{
	const bool retry = false;

	return __copy_page(ufd, offset, retry, wp);
}
636 
637 int move_page(int ufd, unsigned long offset, unsigned long len)
638 {
639 	struct uffdio_move uffdio_move;
640 
641 	if (offset + len > nr_pages * page_size)
642 		err("unexpected offset %lu and length %lu\n", offset, len);
643 	uffdio_move.dst = (unsigned long) area_dst + offset;
644 	uffdio_move.src = (unsigned long) area_src + offset;
645 	uffdio_move.len = len;
646 	uffdio_move.mode = UFFDIO_MOVE_MODE_ALLOW_SRC_HOLES;
647 	uffdio_move.move = 0;
648 	if (ioctl(ufd, UFFDIO_MOVE, &uffdio_move)) {
649 		/* real retval in uffdio_move.move */
650 		if (uffdio_move.move != -EEXIST)
651 			err("UFFDIO_MOVE error: %"PRId64,
652 			    (int64_t)uffdio_move.move);
653 		wake_range(ufd, uffdio_move.dst, len);
654 	} else if (uffdio_move.move != len) {
655 		err("UFFDIO_MOVE error: %"PRId64, (int64_t)uffdio_move.move);
656 	} else
657 		return 1;
658 	return 0;
659 }
660 
661 int uffd_open_dev(unsigned int flags)
662 {
663 	int fd, uffd;
664 
665 	fd = open("/dev/userfaultfd", O_RDWR | O_CLOEXEC);
666 	if (fd < 0)
667 		return fd;
668 	uffd = ioctl(fd, USERFAULTFD_IOC_NEW, flags);
669 	close(fd);
670 
671 	return uffd;
672 }
673 
/*
 * Create a userfaultfd with the userfaultfd system call.  Returns the
 * syscall result, or -1 when __NR_userfaultfd is not defined at build time.
 */
int uffd_open_sys(unsigned int flags)
{
#ifndef __NR_userfaultfd
	return -1;
#else
	return syscall(__NR_userfaultfd, flags);
#endif
}
682 
/*
 * Create a userfaultfd, trying the system call first and falling back to
 * /dev/userfaultfd.  Returns the descriptor, or a negative value if both
 * methods fail.
 */
int uffd_open(unsigned int flags)
{
	int fd = uffd_open_sys(flags);

	if (fd >= 0)
		return fd;

	return uffd_open_dev(flags);
}
692 
693 int uffd_get_features(uint64_t *features)
694 {
695 	struct uffdio_api uffdio_api = { .api = UFFD_API, .features = 0 };
696 	/*
697 	 * This should by default work in most kernels; the feature list
698 	 * will be the same no matter what we pass in here.
699 	 */
700 	int fd = uffd_open(UFFD_USER_MODE_ONLY);
701 
702 	if (fd < 0)
703 		/* Maybe the kernel is older than user-only mode? */
704 		fd = uffd_open(0);
705 
706 	if (fd < 0)
707 		return fd;
708 
709 	if (ioctl(fd, UFFDIO_API, &uffdio_api)) {
710 		close(fd);
711 		return -errno;
712 	}
713 
714 	*features = uffdio_api.features;
715 	close(fd);
716 
717 	return 0;
718 }
719