// SPDX-License-Identifier: GPL-2.0-only
/*
 * Stress userfaultfd syscall.
 *
 *  Copyright (C) 2015  Red Hat, Inc.
 *
 * This test allocates two virtual areas and bounces the physical
 * memory across the two virtual areas (from area_src to area_dst)
 * using userfaultfd.
 *
 * There are three threads running per CPU:
 *
 * 1) one per-CPU thread takes a per-page pthread_mutex in a random
 *    page of the area_dst (while the physical page may still be in
 *    area_src), increments a per-page counter in the same page,
 *    and checks its value against a verification region.
 *
 * 2) another per-CPU thread handles the userfaults generated by
 *    thread 1 above. The userfaultfd blocking-read and poll() modes
 *    are both exercised, interleaved across bounces.
 *
 * 3) one last per-CPU thread transfers the memory in the background
 *    at maximum bandwidth (if not already transferred by thread
 *    2). Each CPU thread takes care of transferring a portion of the
 *    area.
 *
 * When all threads of type 3 have completed the transfer, one bounce
 * is complete. area_src and area_dst are then swapped. All threads
 * are respawned and so the bounce is immediately restarted in the
 * opposite direction.
 *
 * The per-CPU threads of type 1, by triggering userfaults inside
 * pthread_mutex_lock, also verify the atomicity of the memory
 * transfer (UFFDIO_COPY).
 */
36 
37 #define _GNU_SOURCE
38 #include <stdio.h>
39 #include <errno.h>
40 #include <unistd.h>
41 #include <stdlib.h>
42 #include <sys/types.h>
43 #include <sys/stat.h>
44 #include <fcntl.h>
45 #include <time.h>
46 #include <signal.h>
47 #include <poll.h>
48 #include <string.h>
49 #include <sys/mman.h>
50 #include <sys/syscall.h>
51 #include <sys/ioctl.h>
52 #include <sys/wait.h>
53 #include <pthread.h>
54 #include <linux/userfaultfd.h>
55 #include <setjmp.h>
56 #include <stdbool.h>
57 #include <assert.h>
58 #include <inttypes.h>
59 #include <stdint.h>
60 
61 #include "../kselftest.h"
62 
63 #ifdef __NR_userfaultfd
64 
65 static unsigned long nr_cpus, nr_pages, nr_pages_per_cpu, page_size;
66 
67 #define BOUNCE_RANDOM		(1<<0)
68 #define BOUNCE_RACINGFAULTS	(1<<1)
69 #define BOUNCE_VERIFY		(1<<2)
70 #define BOUNCE_POLL		(1<<3)
71 static int bounces;
72 
73 #define TEST_ANON	1
74 #define TEST_HUGETLB	2
75 #define TEST_SHMEM	3
76 static int test_type;
77 
/* exercise the test_uffdio_*_eexist paths every ALARM_INTERVAL_SECS seconds */
79 #define ALARM_INTERVAL_SECS 10
80 static volatile bool test_uffdio_copy_eexist = true;
81 static volatile bool test_uffdio_zeropage_eexist = true;
82 /* Whether to test uffd write-protection */
83 static bool test_uffdio_wp = false;
84 /* Whether to test uffd minor faults */
85 static bool test_uffdio_minor = false;
86 
87 static bool map_shared;
88 static int huge_fd;
89 static char *huge_fd_off0;
90 static unsigned long long *count_verify;
91 static int uffd, uffd_flags, finished, *pipefd;
92 static char *area_src, *area_src_alias, *area_dst, *area_dst_alias;
93 static char *zeropage;
94 pthread_attr_t attr;
95 
96 /* Userfaultfd test statistics */
97 struct uffd_stats {
98 	int cpu;
99 	unsigned long missing_faults;
100 	unsigned long wp_faults;
101 	unsigned long minor_faults;
102 };
103 
104 /* pthread_mutex_t starts at page offset 0 */
105 #define area_mutex(___area, ___nr)					\
106 	((pthread_mutex_t *) ((___area) + (___nr)*page_size))
/*
 * The count is placed in the same page, after the pthread_mutex_t,
 * naturally aligned to avoid unaligned-access faults on non-x86 archs.
 */
111 #define area_count(___area, ___nr)					\
112 	((volatile unsigned long long *) ((unsigned long)		\
113 				 ((___area) + (___nr)*page_size +	\
114 				  sizeof(pthread_mutex_t) +		\
115 				  sizeof(unsigned long long) - 1) &	\
116 				 ~(unsigned long)(sizeof(unsigned long long) \
117 						  -  1)))
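
/*
 * Resulting per-page layout (illustration):
 *   offset 0:                    pthread_mutex_t       (area_mutex())
 *   next 8-byte aligned offset:  unsigned long long    (area_count())
 *   the slot after the count:    non-zero placeholder, set in
 *                                userfaultfd_stress()
 */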
118 
119 const char *examples =
120     "# Run anonymous memory test on 100MiB region with 99999 bounces:\n"
121     "./userfaultfd anon 100 99999\n\n"
    "# Run shared memory test on 1GiB region with 99 bounces:\n"
123     "./userfaultfd shmem 1000 99\n\n"
124     "# Run hugetlb memory test on 256MiB region with 50 bounces (using /dev/hugepages/hugefile):\n"
125     "./userfaultfd hugetlb 256 50 /dev/hugepages/hugefile\n\n"
126     "# Run the same hugetlb test but using shmem:\n"
127     "./userfaultfd hugetlb_shared 256 50 /dev/hugepages/hugefile\n\n"
128     "# 10MiB-~6GiB 999 bounces anonymous test, "
129     "continue forever unless an error triggers\n"
130     "while ./userfaultfd anon $[RANDOM % 6000 + 10] 999; do true; done\n\n";
131 
static void usage(void)
133 {
134 	fprintf(stderr, "\nUsage: ./userfaultfd <test type> <MiB> <bounces> "
135 		"[hugetlbfs_file]\n\n");
136 	fprintf(stderr, "Supported <test type>: anon, hugetlb, "
137 		"hugetlb_shared, shmem\n\n");
138 	fprintf(stderr, "Examples:\n\n");
139 	fprintf(stderr, "%s", examples);
140 	exit(1);
141 }
142 
143 #define uffd_error(code, fmt, ...)                                             \
144 	do {                                                                   \
145 		fprintf(stderr, fmt, ##__VA_ARGS__);                           \
146 		fprintf(stderr, ": %" PRId64 "\n", (int64_t)(code));           \
147 		exit(1);                                                       \
148 	} while (0)
149 
static void uffd_stats_reset(struct uffd_stats *uffd_stats,
			     unsigned long n_cpus)
152 {
153 	int i;
154 
155 	for (i = 0; i < n_cpus; i++) {
156 		uffd_stats[i].cpu = i;
157 		uffd_stats[i].missing_faults = 0;
158 		uffd_stats[i].wp_faults = 0;
159 		uffd_stats[i].minor_faults = 0;
160 	}
161 }
162 
static void uffd_stats_report(struct uffd_stats *stats, int n_cpus)
164 {
165 	int i;
166 	unsigned long long miss_total = 0, wp_total = 0, minor_total = 0;
167 
168 	for (i = 0; i < n_cpus; i++) {
169 		miss_total += stats[i].missing_faults;
170 		wp_total += stats[i].wp_faults;
171 		minor_total += stats[i].minor_faults;
172 	}
173 
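	/* Per-CPU breakdowns; the "\b" erases each list's trailing '+'. */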
174 	printf("userfaults: %llu missing (", miss_total);
175 	for (i = 0; i < n_cpus; i++)
176 		printf("%lu+", stats[i].missing_faults);
177 	printf("\b), %llu wp (", wp_total);
178 	for (i = 0; i < n_cpus; i++)
179 		printf("%lu+", stats[i].wp_faults);
180 	printf("\b), %llu minor (", minor_total);
181 	for (i = 0; i < n_cpus; i++)
182 		printf("%lu+", stats[i].minor_faults);
183 	printf("\b)\n");
184 }
185 
static int anon_release_pages(char *rel_area)
187 {
188 	int ret = 0;
189 
190 	if (madvise(rel_area, nr_pages * page_size, MADV_DONTNEED)) {
191 		perror("madvise");
192 		ret = 1;
193 	}
194 
195 	return ret;
196 }
197 
static void anon_allocate_area(void **alloc_area)
199 {
200 	if (posix_memalign(alloc_area, page_size, nr_pages * page_size)) {
201 		fprintf(stderr, "out of memory\n");
202 		*alloc_area = NULL;
203 	}
204 }
205 
static void noop_alias_mapping(__u64 *start, size_t len, unsigned long offset)
207 {
208 }
209 
210 /* HugeTLB memory */
static int hugetlb_release_pages(char *rel_area)
212 {
213 	int ret = 0;
214 
215 	if (fallocate(huge_fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE,
216 				rel_area == huge_fd_off0 ? 0 :
217 				nr_pages * page_size,
218 				nr_pages * page_size)) {
219 		perror("fallocate");
220 		ret = 1;
221 	}
222 
223 	return ret;
224 }
225 
static void hugetlb_allocate_area(void **alloc_area)
227 {
228 	void *area_alias = NULL;
229 	char **alloc_area_alias;
230 
231 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
232 			   (map_shared ? MAP_SHARED : MAP_PRIVATE) |
233 			   MAP_HUGETLB,
234 			   huge_fd, *alloc_area == area_src ? 0 :
235 			   nr_pages * page_size);
236 	if (*alloc_area == MAP_FAILED) {
237 		perror("mmap of hugetlbfs file failed");
238 		goto fail;
239 	}
240 
241 	if (map_shared) {
242 		area_alias = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
243 				  MAP_SHARED | MAP_HUGETLB,
244 				  huge_fd, *alloc_area == area_src ? 0 :
245 				  nr_pages * page_size);
246 		if (area_alias == MAP_FAILED) {
247 			perror("mmap of hugetlb file alias failed");
248 			goto fail_munmap;
249 		}
250 	}
251 
252 	if (*alloc_area == area_src) {
253 		huge_fd_off0 = *alloc_area;
254 		alloc_area_alias = &area_src_alias;
255 	} else {
256 		alloc_area_alias = &area_dst_alias;
257 	}
258 	if (area_alias)
259 		*alloc_area_alias = area_alias;
260 
261 	return;
262 
263 fail_munmap:
264 	if (munmap(*alloc_area, nr_pages * page_size) < 0) {
265 		perror("hugetlb munmap");
266 		exit(1);
267 	}
268 fail:
269 	*alloc_area = NULL;
270 }
271 
static void hugetlb_alias_mapping(__u64 *start, size_t len, unsigned long offset)
273 {
274 	if (!map_shared)
275 		return;
	/*
	 * We can't zap just the pagetable with hugetlbfs because
	 * MADV_DONTNEED won't work. So exercise -EEXIST on an alias
	 * mapping where the pagetables are not established initially;
	 * this way we'll exercise the -EEXIST path at the fs level.
	 */
282 	*start = (unsigned long) area_dst_alias + offset;
283 }
284 
285 /* Shared memory */
static int shmem_release_pages(char *rel_area)
287 {
288 	int ret = 0;
289 
290 	if (madvise(rel_area, nr_pages * page_size, MADV_REMOVE)) {
291 		perror("madvise");
292 		ret = 1;
293 	}
294 
295 	return ret;
296 }
297 
static void shmem_allocate_area(void **alloc_area)
299 {
300 	*alloc_area = mmap(NULL, nr_pages * page_size, PROT_READ | PROT_WRITE,
301 			   MAP_ANONYMOUS | MAP_SHARED, -1, 0);
302 	if (*alloc_area == MAP_FAILED) {
303 		fprintf(stderr, "shared memory mmap failed\n");
304 		*alloc_area = NULL;
305 	}
306 }
307 
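/*
 * Per memory type operations: the ioctls expected from UFFDIO_REGISTER,
 * plus hooks to allocate the test areas, release their pages and redirect
 * an address through the alias mapping (if any).
 */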
308 struct uffd_test_ops {
309 	unsigned long expected_ioctls;
310 	void (*allocate_area)(void **alloc_area);
311 	int (*release_pages)(char *rel_area);
312 	void (*alias_mapping)(__u64 *start, size_t len, unsigned long offset);
313 };
314 
315 #define SHMEM_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
316 					 (1 << _UFFDIO_COPY) | \
317 					 (1 << _UFFDIO_ZEROPAGE))
318 
319 #define ANON_EXPECTED_IOCTLS		((1 << _UFFDIO_WAKE) | \
320 					 (1 << _UFFDIO_COPY) | \
321 					 (1 << _UFFDIO_ZEROPAGE) | \
322 					 (1 << _UFFDIO_WRITEPROTECT))
323 
324 static struct uffd_test_ops anon_uffd_test_ops = {
325 	.expected_ioctls = ANON_EXPECTED_IOCTLS,
326 	.allocate_area	= anon_allocate_area,
327 	.release_pages	= anon_release_pages,
328 	.alias_mapping = noop_alias_mapping,
329 };
330 
331 static struct uffd_test_ops shmem_uffd_test_ops = {
332 	.expected_ioctls = SHMEM_EXPECTED_IOCTLS,
333 	.allocate_area	= shmem_allocate_area,
334 	.release_pages	= shmem_release_pages,
335 	.alias_mapping = noop_alias_mapping,
336 };
337 
338 static struct uffd_test_ops hugetlb_uffd_test_ops = {
339 	.expected_ioctls = UFFD_API_RANGE_IOCTLS_BASIC & ~(1 << _UFFDIO_CONTINUE),
340 	.allocate_area	= hugetlb_allocate_area,
341 	.release_pages	= hugetlb_release_pages,
342 	.alias_mapping = hugetlb_alias_mapping,
343 };
344 
345 static struct uffd_test_ops *uffd_test_ops;
346 
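/*
 * Byte-by-byte compare: unlike bcmp()/memcmp(), it does not erroneously
 * return "equal" while the memory is changing underneath it (see the
 * comment in locking_thread()).
 */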
static int my_bcmp(char *str1, char *str2, size_t n)
348 {
349 	unsigned long i;
350 	for (i = 0; i < n; i++)
351 		if (str1[i] != str2[i])
352 			return 1;
353 	return 0;
354 }
355 
static void wp_range(int ufd, __u64 start, __u64 len, bool wp)
357 {
358 	struct uffdio_writeprotect prms;
359 
360 	/* Write protection page faults */
361 	prms.range.start = start;
362 	prms.range.len = len;
	/* Set write protection when wp is true; otherwise undo it (with wakeup) */
	prms.mode = wp ? UFFDIO_WRITEPROTECT_MODE_WP : 0;
365 
366 	if (ioctl(ufd, UFFDIO_WRITEPROTECT, &prms)) {
367 		fprintf(stderr, "clear WP failed for address 0x%" PRIx64 "\n",
368 			(uint64_t)start);
369 		exit(1);
370 	}
371 }
372 
static void continue_range(int ufd, __u64 start, __u64 len)
374 {
375 	struct uffdio_continue req;
376 
377 	req.range.start = start;
378 	req.range.len = len;
379 	req.mode = 0;
380 
381 	if (ioctl(ufd, UFFDIO_CONTINUE, &req)) {
382 		fprintf(stderr,
383 			"UFFDIO_CONTINUE failed for address 0x%" PRIx64 "\n",
384 			(uint64_t)start);
385 		exit(1);
386 	}
387 }
388 
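/*
 * Thread type 1 (see header comment): picks a page of area_dst, takes the
 * per-page mutex (possibly triggering a userfault), and increments the
 * per-page counter while checking it against count_verify[].
 */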
static void *locking_thread(void *arg)
390 {
391 	unsigned long cpu = (unsigned long) arg;
392 	struct random_data rand;
	unsigned long page_nr = *(&(page_nr)); /* suppress "maybe uninitialized" warning */
394 	int32_t rand_nr;
395 	unsigned long long count;
396 	char randstate[64];
397 	unsigned int seed;
398 	time_t start;
399 
400 	if (bounces & BOUNCE_RANDOM) {
401 		seed = (unsigned int) time(NULL) - bounces;
402 		if (!(bounces & BOUNCE_RACINGFAULTS))
403 			seed += cpu;
404 		bzero(&rand, sizeof(rand));
405 		bzero(&randstate, sizeof(randstate));
406 		if (initstate_r(seed, randstate, sizeof(randstate), &rand)) {
407 			fprintf(stderr, "srandom_r error\n");
408 			exit(1);
409 		}
410 	} else {
411 		page_nr = -bounces;
412 		if (!(bounces & BOUNCE_RACINGFAULTS))
413 			page_nr += cpu * nr_pages_per_cpu;
414 	}
415 
416 	while (!finished) {
417 		if (bounces & BOUNCE_RANDOM) {
418 			if (random_r(&rand, &rand_nr)) {
419 				fprintf(stderr, "random_r 1 error\n");
420 				exit(1);
421 			}
422 			page_nr = rand_nr;
423 			if (sizeof(page_nr) > sizeof(rand_nr)) {
424 				if (random_r(&rand, &rand_nr)) {
425 					fprintf(stderr, "random_r 2 error\n");
426 					exit(1);
427 				}
428 				page_nr |= (((unsigned long) rand_nr) << 16) <<
429 					   16;
430 			}
431 		} else
432 			page_nr += 1;
433 		page_nr %= nr_pages;
434 
435 		start = time(NULL);
436 		if (bounces & BOUNCE_VERIFY) {
437 			count = *area_count(area_dst, page_nr);
438 			if (!count) {
439 				fprintf(stderr,
440 					"page_nr %lu wrong count %Lu %Lu\n",
441 					page_nr, count,
442 					count_verify[page_nr]);
443 				exit(1);
444 			}
445 
446 
447 			/*
448 			 * We can't use bcmp (or memcmp) because that
449 			 * returns 0 erroneously if the memory is
450 			 * changing under it (even if the end of the
451 			 * page is never changing and always
452 			 * different).
453 			 */
454 #if 1
455 			if (!my_bcmp(area_dst + page_nr * page_size, zeropage,
456 				     page_size)) {
457 				fprintf(stderr,
458 					"my_bcmp page_nr %lu wrong count %Lu %Lu\n",
459 					page_nr, count, count_verify[page_nr]);
460 				exit(1);
461 			}
462 #else
463 			unsigned long loops;
464 
465 			loops = 0;
466 			/* uncomment the below line to test with mutex */
467 			/* pthread_mutex_lock(area_mutex(area_dst, page_nr)); */
468 			while (!bcmp(area_dst + page_nr * page_size, zeropage,
469 				     page_size)) {
470 				loops += 1;
471 				if (loops > 10)
472 					break;
473 			}
474 			/* uncomment below line to test with mutex */
475 			/* pthread_mutex_unlock(area_mutex(area_dst, page_nr)); */
476 			if (loops) {
477 				fprintf(stderr,
478 					"page_nr %lu all zero thread %lu %p %lu\n",
479 					page_nr, cpu, area_dst + page_nr * page_size,
480 					loops);
481 				if (loops > 10)
482 					exit(1);
483 			}
484 #endif
485 		}
486 
487 		pthread_mutex_lock(area_mutex(area_dst, page_nr));
488 		count = *area_count(area_dst, page_nr);
489 		if (count != count_verify[page_nr]) {
490 			fprintf(stderr,
491 				"page_nr %lu memory corruption %Lu %Lu\n",
492 				page_nr, count,
493 				count_verify[page_nr]); exit(1);
494 		}
495 		count++;
496 		*area_count(area_dst, page_nr) = count_verify[page_nr] = count;
497 		pthread_mutex_unlock(area_mutex(area_dst, page_nr));
498 
499 		if (time(NULL) - start > 1)
500 			fprintf(stderr,
501 				"userfault too slow %ld "
502 				"possible false positive with overcommit\n",
503 				time(NULL) - start);
504 	}
505 
506 	return NULL;
507 }
508 
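/*
 * Re-issue UFFDIO_COPY (through the alias mapping, if any) for a page that
 * has already been resolved: the only acceptable outcome is -EEXIST, which
 * exercises the retry path.
 */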
static void retry_copy_page(int ufd, struct uffdio_copy *uffdio_copy,
			    unsigned long offset)
511 {
512 	uffd_test_ops->alias_mapping(&uffdio_copy->dst,
513 				     uffdio_copy->len,
514 				     offset);
515 	if (ioctl(ufd, UFFDIO_COPY, uffdio_copy)) {
		/* the real retval is in uffdio_copy->copy */
517 		if (uffdio_copy->copy != -EEXIST) {
518 			uffd_error(uffdio_copy->copy,
519 				   "UFFDIO_COPY retry error");
520 		}
521 	} else
522 		uffd_error(uffdio_copy->copy, "UFFDIO_COPY retry unexpected");
523 }
524 
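/*
 * Resolve a missing fault with UFFDIO_COPY. Returns 1 if this caller
 * installed the page, 0 if another thread raced and won (-EEXIST).
 */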
static int __copy_page(int ufd, unsigned long offset, bool retry)
526 {
527 	struct uffdio_copy uffdio_copy;
528 
529 	if (offset >= nr_pages * page_size) {
530 		fprintf(stderr, "unexpected offset %lu\n", offset);
531 		exit(1);
532 	}
533 	uffdio_copy.dst = (unsigned long) area_dst + offset;
534 	uffdio_copy.src = (unsigned long) area_src + offset;
535 	uffdio_copy.len = page_size;
536 	if (test_uffdio_wp)
537 		uffdio_copy.mode = UFFDIO_COPY_MODE_WP;
538 	else
539 		uffdio_copy.mode = 0;
540 	uffdio_copy.copy = 0;
541 	if (ioctl(ufd, UFFDIO_COPY, &uffdio_copy)) {
		/* the real retval is in uffdio_copy.copy */
543 		if (uffdio_copy.copy != -EEXIST)
544 			uffd_error(uffdio_copy.copy, "UFFDIO_COPY error");
545 	} else if (uffdio_copy.copy != page_size) {
546 		uffd_error(uffdio_copy.copy, "UFFDIO_COPY unexpected copy");
547 	} else {
548 		if (test_uffdio_copy_eexist && retry) {
549 			test_uffdio_copy_eexist = false;
550 			retry_copy_page(ufd, &uffdio_copy, offset);
551 		}
552 		return 1;
553 	}
554 	return 0;
555 }
556 
static int copy_page_retry(int ufd, unsigned long offset)
558 {
559 	return __copy_page(ufd, offset, true);
560 }
561 
static int copy_page(int ufd, unsigned long offset)
563 {
564 	return __copy_page(ufd, offset, false);
565 }
566 
static int uffd_read_msg(int ufd, struct uffd_msg *msg)
568 {
	int ret = read(ufd, msg, sizeof(*msg));
570 
571 	if (ret != sizeof(*msg)) {
572 		if (ret < 0) {
573 			if (errno == EAGAIN)
574 				return 1;
575 			perror("blocking read error");
576 		} else {
577 			fprintf(stderr, "short read\n");
578 		}
579 		exit(1);
580 	}
581 
582 	return 0;
583 }
584 
static void uffd_handle_page_fault(struct uffd_msg *msg,
				   struct uffd_stats *stats)
587 {
588 	unsigned long offset;
589 
590 	if (msg->event != UFFD_EVENT_PAGEFAULT) {
591 		fprintf(stderr, "unexpected msg event %u\n", msg->event);
592 		exit(1);
593 	}
594 
595 	if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WP) {
596 		/* Write protect page faults */
597 		wp_range(uffd, msg->arg.pagefault.address, page_size, false);
598 		stats->wp_faults++;
599 	} else if (msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_MINOR) {
600 		uint8_t *area;
601 		int b;
602 
603 		/*
604 		 * Minor page faults
605 		 *
606 		 * To prove we can modify the original range for testing
607 		 * purposes, we're going to bit flip this range before
608 		 * continuing.
609 		 *
610 		 * Note that this requires all minor page fault tests operate on
611 		 * area_dst (non-UFFD-registered) and area_dst_alias
612 		 * (UFFD-registered).
613 		 */
614 
615 		area = (uint8_t *)(area_dst +
616 				   ((char *)msg->arg.pagefault.address -
617 				    area_dst_alias));
618 		for (b = 0; b < page_size; ++b)
619 			area[b] = ~area[b];
620 		continue_range(uffd, msg->arg.pagefault.address, page_size);
621 		stats->minor_faults++;
622 	} else {
623 		/* Missing page faults */
624 		if (bounces & BOUNCE_VERIFY &&
625 		    msg->arg.pagefault.flags & UFFD_PAGEFAULT_FLAG_WRITE) {
626 			fprintf(stderr, "unexpected write fault\n");
627 			exit(1);
628 		}
629 
630 		offset = (char *)(unsigned long)msg->arg.pagefault.address - area_dst;
631 		offset &= ~(page_size-1);
632 
633 		if (copy_page(uffd, offset))
634 			stats->missing_faults++;
635 	}
636 }
637 
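/*
 * Thread type 2 (poll mode): waits on the userfaultfd with poll(),
 * dispatches page fault and non-cooperative events, and exits once the
 * per-CPU pipe becomes readable.
 */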
static void *uffd_poll_thread(void *arg)
639 {
640 	struct uffd_stats *stats = (struct uffd_stats *)arg;
641 	unsigned long cpu = stats->cpu;
642 	struct pollfd pollfd[2];
643 	struct uffd_msg msg;
644 	struct uffdio_register uffd_reg;
645 	int ret;
646 	char tmp_chr;
647 
648 	pollfd[0].fd = uffd;
649 	pollfd[0].events = POLLIN;
650 	pollfd[1].fd = pipefd[cpu*2];
651 	pollfd[1].events = POLLIN;
652 
653 	for (;;) {
654 		ret = poll(pollfd, 2, -1);
655 		if (!ret) {
656 			fprintf(stderr, "poll error %d\n", ret);
657 			exit(1);
658 		}
659 		if (ret < 0) {
660 			perror("poll");
661 			exit(1);
662 		}
663 		if (pollfd[1].revents & POLLIN) {
664 			if (read(pollfd[1].fd, &tmp_chr, 1) != 1) {
665 				fprintf(stderr, "read pipefd error\n");
666 				exit(1);
667 			}
668 			break;
669 		}
670 		if (!(pollfd[0].revents & POLLIN)) {
671 			fprintf(stderr, "pollfd[0].revents %d\n",
672 				pollfd[0].revents);
673 			exit(1);
674 		}
675 		if (uffd_read_msg(uffd, &msg))
676 			continue;
677 		switch (msg.event) {
678 		default:
679 			fprintf(stderr, "unexpected msg event %u\n",
680 				msg.event); exit(1);
681 			break;
682 		case UFFD_EVENT_PAGEFAULT:
683 			uffd_handle_page_fault(&msg, stats);
684 			break;
685 		case UFFD_EVENT_FORK:
686 			close(uffd);
687 			uffd = msg.arg.fork.ufd;
688 			pollfd[0].fd = uffd;
689 			break;
690 		case UFFD_EVENT_REMOVE:
691 			uffd_reg.range.start = msg.arg.remove.start;
692 			uffd_reg.range.len = msg.arg.remove.end -
693 				msg.arg.remove.start;
694 			if (ioctl(uffd, UFFDIO_UNREGISTER, &uffd_reg.range)) {
695 				fprintf(stderr, "remove failure\n");
696 				exit(1);
697 			}
698 			break;
699 		case UFFD_EVENT_REMAP:
700 			area_dst = (char *)(unsigned long)msg.arg.remap.to;
701 			break;
702 		}
703 	}
704 
705 	return NULL;
706 }
707 
708 pthread_mutex_t uffd_read_mutex = PTHREAD_MUTEX_INITIALIZER;
709 
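/*
 * Thread type 2 (blocking read mode): releases uffd_read_mutex so the
 * spawning thread knows it has started (cancellation is safe from then
 * on), then serves userfaults until cancelled.
 */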
static void *uffd_read_thread(void *arg)
711 {
712 	struct uffd_stats *stats = (struct uffd_stats *)arg;
713 	struct uffd_msg msg;
714 
715 	pthread_mutex_unlock(&uffd_read_mutex);
716 	/* from here cancellation is ok */
717 
718 	for (;;) {
719 		if (uffd_read_msg(uffd, &msg))
720 			continue;
721 		uffd_handle_page_fault(&msg, stats);
722 	}
723 
724 	return NULL;
725 }
726 
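/*
 * Thread type 3: copies this CPU's share of the pages in the background.
 * If uffd-wp is being tested, the range is write-protected once the first
 * half has been copied, so later writes exercise write-protect faults.
 */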
static void *background_thread(void *arg)
728 {
729 	unsigned long cpu = (unsigned long) arg;
730 	unsigned long page_nr, start_nr, mid_nr, end_nr;
731 
732 	start_nr = cpu * nr_pages_per_cpu;
733 	end_nr = (cpu+1) * nr_pages_per_cpu;
734 	mid_nr = (start_nr + end_nr) / 2;
735 
736 	/* Copy the first half of the pages */
737 	for (page_nr = start_nr; page_nr < mid_nr; page_nr++)
738 		copy_page_retry(uffd, page_nr * page_size);
739 
740 	/*
741 	 * If we need to test uffd-wp, set it up now.  Then we'll have
742 	 * at least the first half of the pages mapped already which
743 	 * can be write-protected for testing
744 	 */
745 	if (test_uffdio_wp)
746 		wp_range(uffd, (unsigned long)area_dst + start_nr * page_size,
747 			nr_pages_per_cpu * page_size, true);
748 
749 	/*
750 	 * Continue the 2nd half of the page copying, handling write
751 	 * protection faults if any
752 	 */
753 	for (page_nr = mid_nr; page_nr < end_nr; page_nr++)
754 		copy_page_retry(uffd, page_nr * page_size);
755 
756 	return NULL;
757 }
758 
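/*
 * Run one bounce: spawn the three per-CPU thread types, wait for the
 * background copy to complete, zap area_src, then stop the locking and
 * fault-handling threads.
 */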
static int stress(struct uffd_stats *uffd_stats)
760 {
761 	unsigned long cpu;
762 	pthread_t locking_threads[nr_cpus];
763 	pthread_t uffd_threads[nr_cpus];
764 	pthread_t background_threads[nr_cpus];
765 
766 	finished = 0;
767 	for (cpu = 0; cpu < nr_cpus; cpu++) {
768 		if (pthread_create(&locking_threads[cpu], &attr,
769 				   locking_thread, (void *)cpu))
770 			return 1;
771 		if (bounces & BOUNCE_POLL) {
772 			if (pthread_create(&uffd_threads[cpu], &attr,
773 					   uffd_poll_thread,
774 					   (void *)&uffd_stats[cpu]))
775 				return 1;
776 		} else {
777 			if (pthread_create(&uffd_threads[cpu], &attr,
778 					   uffd_read_thread,
779 					   (void *)&uffd_stats[cpu]))
780 				return 1;
781 			pthread_mutex_lock(&uffd_read_mutex);
782 		}
783 		if (pthread_create(&background_threads[cpu], &attr,
784 				   background_thread, (void *)cpu))
785 			return 1;
786 	}
787 	for (cpu = 0; cpu < nr_cpus; cpu++)
788 		if (pthread_join(background_threads[cpu], NULL))
789 			return 1;
790 
	/*
	 * Be strict and immediately zap area_src: the whole area has
	 * already been transferred by the background threads. The
	 * area_src could then be faulted in again, in a racy way, by
	 * the still running uffd threads reading zeropages after we
	 * zapped area_src (but they're guaranteed to get -EEXIST from
	 * UFFDIO_COPY without writing zero pages into area_dst
	 * because the background threads already completed).
	 */
800 	if (uffd_test_ops->release_pages(area_src))
801 		return 1;
802 
803 
804 	finished = 1;
805 	for (cpu = 0; cpu < nr_cpus; cpu++)
806 		if (pthread_join(locking_threads[cpu], NULL))
807 			return 1;
808 
809 	for (cpu = 0; cpu < nr_cpus; cpu++) {
810 		char c;
811 		if (bounces & BOUNCE_POLL) {
812 			if (write(pipefd[cpu*2+1], &c, 1) != 1) {
813 				fprintf(stderr, "pipefd write error\n");
814 				return 1;
815 			}
816 			if (pthread_join(uffd_threads[cpu],
817 					 (void *)&uffd_stats[cpu]))
818 				return 1;
819 		} else {
820 			if (pthread_cancel(uffd_threads[cpu]))
821 				return 1;
822 			if (pthread_join(uffd_threads[cpu], NULL))
823 				return 1;
824 		}
825 	}
826 
827 	return 0;
828 }
829 
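/*
 * Open a userfaultfd and perform the UFFDIO_API handshake. On success the
 * feature bits actually granted by the kernel are written back to *features.
 */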
static int userfaultfd_open_ext(uint64_t *features)
831 {
832 	struct uffdio_api uffdio_api;
833 
834 	uffd = syscall(__NR_userfaultfd, O_CLOEXEC | O_NONBLOCK);
835 	if (uffd < 0) {
836 		fprintf(stderr,
837 			"userfaultfd syscall not available in this kernel\n");
838 		return 1;
839 	}
840 	uffd_flags = fcntl(uffd, F_GETFD, NULL);
841 
842 	uffdio_api.api = UFFD_API;
843 	uffdio_api.features = *features;
844 	if (ioctl(uffd, UFFDIO_API, &uffdio_api)) {
845 		fprintf(stderr, "UFFDIO_API failed.\nPlease make sure to "
846 			"run with either root or ptrace capability.\n");
847 		return 1;
848 	}
849 	if (uffdio_api.api != UFFD_API) {
850 		fprintf(stderr, "UFFDIO_API error: %" PRIu64 "\n",
851 			(uint64_t)uffdio_api.api);
852 		return 1;
853 	}
854 
855 	*features = uffdio_api.features;
856 	return 0;
857 }
858 
static int userfaultfd_open(uint64_t features)
860 {
861 	return userfaultfd_open_ext(&features);
862 }
863 
864 sigjmp_buf jbuf, *sigbuf;
865 
static void sighndl(int sig, siginfo_t *siginfo, void *ptr)
867 {
868 	if (sig == SIGBUS) {
869 		if (sigbuf)
870 			siglongjmp(*sigbuf, 1);
871 		abort();
872 	}
873 }
874 
/*
 * For the non-cooperative userfaultfd test we fork() a process that
 * will generate pagefaults, will mremap the area monitored by the
 * userfaultfd and will finally release the monitored area.
 * For anonymous and shared memory the area is divided into two
 * parts, the first part is accessed before mremap, and the second
 * part is accessed after mremap. Since hugetlbfs does not support
 * mremap, the entire monitored area is accessed in a single pass for
 * HUGETLB_TEST.
 * The release of the pages currently generates an event
 * (UFFD_EVENT_REMOVE) for shmem and anonymous memory, hence it is
 * not checked for hugetlb.
 * For the signal test (UFFD_FEATURE_SIGBUS), signal_test = 1: we
 * register the monitored area, generate pagefaults and test that the
 * signal is delivered, using UFFDIO_COPY to allocate the missing page
 * and retry. signal_test = 2 tests the robustness use case: we release
 * the monitored area, fork a process that will generate pagefaults and
 * verify that the signal is delivered. This also tests the
 * UFFD_FEATURE_EVENT_FORK event along with the signal feature; using
 * the monitor thread, we verify that no userfault events are generated.
 */
static int faulting_process(int signal_test)
897 {
898 	unsigned long nr;
899 	unsigned long long count;
900 	unsigned long split_nr_pages;
901 	unsigned long lastnr;
902 	struct sigaction act;
903 	unsigned long signalled = 0;
904 
905 	if (test_type != TEST_HUGETLB)
906 		split_nr_pages = (nr_pages + 1) / 2;
907 	else
908 		split_nr_pages = nr_pages;
909 
910 	if (signal_test) {
911 		sigbuf = &jbuf;
912 		memset(&act, 0, sizeof(act));
913 		act.sa_sigaction = sighndl;
914 		act.sa_flags = SA_SIGINFO;
915 		if (sigaction(SIGBUS, &act, 0)) {
916 			perror("sigaction");
917 			return 1;
918 		}
919 		lastnr = (unsigned long)-1;
920 	}
921 
922 	for (nr = 0; nr < split_nr_pages; nr++) {
923 		int steps = 1;
924 		unsigned long offset = nr * page_size;
925 
926 		if (signal_test) {
927 			if (sigsetjmp(*sigbuf, 1) != 0) {
928 				if (steps == 1 && nr == lastnr) {
929 					fprintf(stderr, "Signal repeated\n");
930 					return 1;
931 				}
932 
933 				lastnr = nr;
934 				if (signal_test == 1) {
935 					if (steps == 1) {
936 						/* This is a MISSING request */
937 						steps++;
938 						if (copy_page(uffd, offset))
939 							signalled++;
940 					} else {
941 						/* This is a WP request */
942 						assert(steps == 2);
943 						wp_range(uffd,
944 							 (__u64)area_dst +
945 							 offset,
946 							 page_size, false);
947 					}
948 				} else {
949 					signalled++;
950 					continue;
951 				}
952 			}
953 		}
954 
955 		count = *area_count(area_dst, nr);
956 		if (count != count_verify[nr]) {
957 			fprintf(stderr,
958 				"nr %lu memory corruption %Lu %Lu\n",
959 				nr, count,
960 				count_verify[nr]);
961 	        }
		/*
		 * Trigger write protection, if it is armed, by writing
		 * the same value back.
		 */
966 		*area_count(area_dst, nr) = count;
967 	}
968 
969 	if (signal_test)
970 		return signalled != split_nr_pages;
971 
972 	if (test_type == TEST_HUGETLB)
973 		return 0;
974 
975 	area_dst = mremap(area_dst, nr_pages * page_size,  nr_pages * page_size,
976 			  MREMAP_MAYMOVE | MREMAP_FIXED, area_src);
977 	if (area_dst == MAP_FAILED) {
978 		perror("mremap");
979 		exit(1);
980 	}
981 
982 	for (; nr < nr_pages; nr++) {
983 		count = *area_count(area_dst, nr);
984 		if (count != count_verify[nr]) {
985 			fprintf(stderr,
986 				"nr %lu memory corruption %Lu %Lu\n",
987 				nr, count,
988 				count_verify[nr]); exit(1);
989 		}
		/*
		 * Trigger write protection, if it is armed, by writing
		 * the same value back.
		 */
994 		*area_count(area_dst, nr) = count;
995 	}
996 
997 	if (uffd_test_ops->release_pages(area_dst))
998 		return 1;
999 
1000 	for (nr = 0; nr < nr_pages; nr++) {
1001 		if (my_bcmp(area_dst + nr * page_size, zeropage, page_size)) {
1002 			fprintf(stderr, "nr %lu is not zero\n", nr);
1003 			exit(1);
1004 		}
1005 	}
1006 
1007 	return 0;
1008 }
1009 
static void retry_uffdio_zeropage(int ufd,
				  struct uffdio_zeropage *uffdio_zeropage,
				  unsigned long offset)
1013 {
1014 	uffd_test_ops->alias_mapping(&uffdio_zeropage->range.start,
1015 				     uffdio_zeropage->range.len,
1016 				     offset);
1017 	if (ioctl(ufd, UFFDIO_ZEROPAGE, uffdio_zeropage)) {
1018 		if (uffdio_zeropage->zeropage != -EEXIST) {
1019 			uffd_error(uffdio_zeropage->zeropage,
1020 				   "UFFDIO_ZEROPAGE retry error");
1021 		}
1022 	} else {
1023 		uffd_error(uffdio_zeropage->zeropage,
1024 			   "UFFDIO_ZEROPAGE retry unexpected");
1025 	}
1026 }
1027 
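/*
 * Resolve a fault with UFFDIO_ZEROPAGE where the memory type supports it.
 * Returns 1 if a zeropage was installed, 0 otherwise (e.g. hugetlbfs,
 * where -EINVAL is the expected result).
 */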
static int __uffdio_zeropage(int ufd, unsigned long offset, bool retry)
1029 {
1030 	struct uffdio_zeropage uffdio_zeropage;
1031 	int ret;
1032 	unsigned long has_zeropage;
1033 	__s64 res;
1034 
1035 	has_zeropage = uffd_test_ops->expected_ioctls & (1 << _UFFDIO_ZEROPAGE);
1036 
1037 	if (offset >= nr_pages * page_size) {
1038 		fprintf(stderr, "unexpected offset %lu\n", offset);
1039 		exit(1);
1040 	}
1041 	uffdio_zeropage.range.start = (unsigned long) area_dst + offset;
1042 	uffdio_zeropage.range.len = page_size;
1043 	uffdio_zeropage.mode = 0;
1044 	ret = ioctl(ufd, UFFDIO_ZEROPAGE, &uffdio_zeropage);
1045 	res = uffdio_zeropage.zeropage;
1046 	if (ret) {
		/* the real retval is in uffdio_zeropage.zeropage */
1048 		if (has_zeropage) {
1049 			uffd_error(res, "UFFDIO_ZEROPAGE %s",
1050 				   res == -EEXIST ? "-EEXIST" : "error");
1051 		} else if (res != -EINVAL)
1052 			uffd_error(res, "UFFDIO_ZEROPAGE not -EINVAL");
1053 	} else if (has_zeropage) {
1054 		if (res != page_size) {
1055 			uffd_error(res, "UFFDIO_ZEROPAGE unexpected");
1056 		} else {
1057 			if (test_uffdio_zeropage_eexist && retry) {
1058 				test_uffdio_zeropage_eexist = false;
1059 				retry_uffdio_zeropage(ufd, &uffdio_zeropage,
1060 						      offset);
1061 			}
1062 			return 1;
1063 		}
1064 	} else
1065 		uffd_error(res, "UFFDIO_ZEROPAGE succeeded");
1066 
1067 	return 0;
1068 }
1069 
static int uffdio_zeropage(int ufd, unsigned long offset)
1071 {
1072 	return __uffdio_zeropage(ufd, offset, false);
1073 }
1074 
1075 /* exercise UFFDIO_ZEROPAGE */
static int userfaultfd_zeropage_test(void)
1077 {
1078 	struct uffdio_register uffdio_register;
1079 	unsigned long expected_ioctls;
1080 
1081 	printf("testing UFFDIO_ZEROPAGE: ");
1082 	fflush(stdout);
1083 
1084 	if (uffd_test_ops->release_pages(area_dst))
1085 		return 1;
1086 
1087 	if (userfaultfd_open(0))
1088 		return 1;
1089 	uffdio_register.range.start = (unsigned long) area_dst;
1090 	uffdio_register.range.len = nr_pages * page_size;
1091 	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1092 	if (test_uffdio_wp)
1093 		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1094 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1095 		fprintf(stderr, "register failure\n");
1096 		exit(1);
1097 	}
1098 
1099 	expected_ioctls = uffd_test_ops->expected_ioctls;
1100 	if ((uffdio_register.ioctls & expected_ioctls) !=
1101 	    expected_ioctls) {
1102 		fprintf(stderr,
1103 			"unexpected missing ioctl for anon memory\n");
1104 		exit(1);
1105 	}
1106 
1107 	if (uffdio_zeropage(uffd, 0)) {
1108 		if (my_bcmp(area_dst, zeropage, page_size)) {
1109 			fprintf(stderr, "zeropage is not zero\n");
1110 			exit(1);
1111 		}
1112 	}
1113 
1114 	close(uffd);
1115 	printf("done.\n");
1116 	return 0;
1117 }
1118 
static int userfaultfd_events_test(void)
1120 {
1121 	struct uffdio_register uffdio_register;
1122 	unsigned long expected_ioctls;
1123 	pthread_t uffd_mon;
1124 	int err, features;
1125 	pid_t pid;
1126 	char c;
1127 	struct uffd_stats stats = { 0 };
1128 
1129 	printf("testing events (fork, remap, remove): ");
1130 	fflush(stdout);
1131 
1132 	if (uffd_test_ops->release_pages(area_dst))
1133 		return 1;
1134 
1135 	features = UFFD_FEATURE_EVENT_FORK | UFFD_FEATURE_EVENT_REMAP |
1136 		UFFD_FEATURE_EVENT_REMOVE;
1137 	if (userfaultfd_open(features))
1138 		return 1;
1139 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1140 
1141 	uffdio_register.range.start = (unsigned long) area_dst;
1142 	uffdio_register.range.len = nr_pages * page_size;
1143 	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1144 	if (test_uffdio_wp)
1145 		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1146 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1147 		fprintf(stderr, "register failure\n");
1148 		exit(1);
1149 	}
1150 
1151 	expected_ioctls = uffd_test_ops->expected_ioctls;
1152 	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1153 		fprintf(stderr, "unexpected missing ioctl for anon memory\n");
1154 		exit(1);
1155 	}
1156 
1157 	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1158 		perror("uffd_poll_thread create");
1159 		exit(1);
1160 	}
1161 
1162 	pid = fork();
1163 	if (pid < 0) {
1164 		perror("fork");
1165 		exit(1);
1166 	}
1167 
1168 	if (!pid)
1169 		exit(faulting_process(0));
1170 
1171 	waitpid(pid, &err, 0);
1172 	if (err) {
1173 		fprintf(stderr, "faulting process failed\n");
1174 		exit(1);
1175 	}
1176 
1177 	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1178 		perror("pipe write");
1179 		exit(1);
1180 	}
1181 	if (pthread_join(uffd_mon, NULL))
1182 		return 1;
1183 
1184 	close(uffd);
1185 
1186 	uffd_stats_report(&stats, 1);
1187 
1188 	return stats.missing_faults != nr_pages;
1189 }
1190 
static int userfaultfd_sig_test(void)
1192 {
1193 	struct uffdio_register uffdio_register;
1194 	unsigned long expected_ioctls;
1195 	unsigned long userfaults;
1196 	pthread_t uffd_mon;
1197 	int err, features;
1198 	pid_t pid;
1199 	char c;
1200 	struct uffd_stats stats = { 0 };
1201 
1202 	printf("testing signal delivery: ");
1203 	fflush(stdout);
1204 
1205 	if (uffd_test_ops->release_pages(area_dst))
1206 		return 1;
1207 
1208 	features = UFFD_FEATURE_EVENT_FORK|UFFD_FEATURE_SIGBUS;
1209 	if (userfaultfd_open(features))
1210 		return 1;
1211 	fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1212 
1213 	uffdio_register.range.start = (unsigned long) area_dst;
1214 	uffdio_register.range.len = nr_pages * page_size;
1215 	uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1216 	if (test_uffdio_wp)
1217 		uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1218 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1219 		fprintf(stderr, "register failure\n");
1220 		exit(1);
1221 	}
1222 
1223 	expected_ioctls = uffd_test_ops->expected_ioctls;
1224 	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1225 		fprintf(stderr, "unexpected missing ioctl for anon memory\n");
1226 		exit(1);
1227 	}
1228 
1229 	if (faulting_process(1)) {
1230 		fprintf(stderr, "faulting process failed\n");
1231 		exit(1);
1232 	}
1233 
1234 	if (uffd_test_ops->release_pages(area_dst))
1235 		return 1;
1236 
1237 	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1238 		perror("uffd_poll_thread create");
1239 		exit(1);
1240 	}
1241 
1242 	pid = fork();
1243 	if (pid < 0) {
1244 		perror("fork");
1245 		exit(1);
1246 	}
1247 
1248 	if (!pid)
1249 		exit(faulting_process(2));
1250 
1251 	waitpid(pid, &err, 0);
1252 	if (err) {
1253 		fprintf(stderr, "faulting process failed\n");
1254 		exit(1);
1255 	}
1256 
1257 	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1258 		perror("pipe write");
1259 		exit(1);
1260 	}
1261 	if (pthread_join(uffd_mon, (void **)&userfaults))
1262 		return 1;
1263 
1264 	printf("done.\n");
1265 	if (userfaults)
1266 		fprintf(stderr, "Signal test failed, userfaults: %ld\n",
1267 			userfaults);
1268 	close(uffd);
1269 	return userfaults != 0;
1270 }
1271 
static int userfaultfd_minor_test(void)
1273 {
1274 	struct uffdio_register uffdio_register;
1275 	unsigned long expected_ioctls;
1276 	unsigned long p;
1277 	pthread_t uffd_mon;
1278 	uint8_t expected_byte;
1279 	void *expected_page;
1280 	char c;
1281 	struct uffd_stats stats = { 0 };
1282 	uint64_t features = UFFD_FEATURE_MINOR_HUGETLBFS;
1283 
1284 	if (!test_uffdio_minor)
1285 		return 0;
1286 
1287 	printf("testing minor faults: ");
1288 	fflush(stdout);
1289 
1290 	if (uffd_test_ops->release_pages(area_dst))
1291 		return 1;
1292 
1293 	if (userfaultfd_open_ext(&features))
1294 		return 1;
1295 	/* If kernel reports the feature isn't supported, skip the test. */
1296 	if (!(features & UFFD_FEATURE_MINOR_HUGETLBFS)) {
1297 		printf("skipping test due to lack of feature support\n");
1298 		fflush(stdout);
1299 		return 0;
1300 	}
1301 
1302 	uffdio_register.range.start = (unsigned long)area_dst_alias;
1303 	uffdio_register.range.len = nr_pages * page_size;
1304 	uffdio_register.mode = UFFDIO_REGISTER_MODE_MINOR;
1305 	if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1306 		fprintf(stderr, "register failure\n");
1307 		exit(1);
1308 	}
1309 
1310 	expected_ioctls = uffd_test_ops->expected_ioctls;
1311 	expected_ioctls |= 1 << _UFFDIO_CONTINUE;
1312 	if ((uffdio_register.ioctls & expected_ioctls) != expected_ioctls) {
1313 		fprintf(stderr, "unexpected missing ioctl(s)\n");
1314 		exit(1);
1315 	}
1316 
1317 	/*
1318 	 * After registering with UFFD, populate the non-UFFD-registered side of
1319 	 * the shared mapping. This should *not* trigger any UFFD minor faults.
1320 	 */
1321 	for (p = 0; p < nr_pages; ++p) {
1322 		memset(area_dst + (p * page_size), p % ((uint8_t)-1),
1323 		       page_size);
1324 	}
1325 
1326 	if (pthread_create(&uffd_mon, &attr, uffd_poll_thread, &stats)) {
1327 		perror("uffd_poll_thread create");
1328 		exit(1);
1329 	}
1330 
1331 	/*
1332 	 * Read each of the pages back using the UFFD-registered mapping. We
1333 	 * expect that the first time we touch a page, it will result in a minor
1334 	 * fault. uffd_poll_thread will resolve the fault by bit-flipping the
1335 	 * page's contents, and then issuing a CONTINUE ioctl.
1336 	 */
1337 
1338 	if (posix_memalign(&expected_page, page_size, page_size)) {
1339 		fprintf(stderr, "out of memory\n");
1340 		return 1;
1341 	}
1342 
1343 	for (p = 0; p < nr_pages; ++p) {
1344 		expected_byte = ~((uint8_t)(p % ((uint8_t)-1)));
1345 		memset(expected_page, expected_byte, page_size);
1346 		if (my_bcmp(expected_page, area_dst_alias + (p * page_size),
1347 			    page_size)) {
1348 			fprintf(stderr,
1349 				"unexpected page contents after minor fault\n");
1350 			exit(1);
1351 		}
1352 	}
1353 
1354 	if (write(pipefd[1], &c, sizeof(c)) != sizeof(c)) {
1355 		perror("pipe write");
1356 		exit(1);
1357 	}
1358 	if (pthread_join(uffd_mon, NULL))
1359 		return 1;
1360 
1361 	close(uffd);
1362 
1363 	uffd_stats_report(&stats, 1);
1364 
1365 	return stats.missing_faults != 0 || stats.minor_faults != nr_pages;
1366 }
1367 
static int userfaultfd_stress(void)
1369 {
1370 	void *area;
1371 	char *tmp_area;
1372 	unsigned long nr;
1373 	struct uffdio_register uffdio_register;
1374 	unsigned long cpu;
1375 	int err;
1376 	struct uffd_stats uffd_stats[nr_cpus];
1377 
1378 	uffd_test_ops->allocate_area((void **)&area_src);
1379 	if (!area_src)
1380 		return 1;
1381 	uffd_test_ops->allocate_area((void **)&area_dst);
1382 	if (!area_dst)
1383 		return 1;
1384 
1385 	if (userfaultfd_open(0))
1386 		return 1;
1387 
1388 	count_verify = malloc(nr_pages * sizeof(unsigned long long));
1389 	if (!count_verify) {
1390 		perror("count_verify");
1391 		return 1;
1392 	}
1393 
1394 	for (nr = 0; nr < nr_pages; nr++) {
1395 		*area_mutex(area_src, nr) = (pthread_mutex_t)
1396 			PTHREAD_MUTEX_INITIALIZER;
1397 		count_verify[nr] = *area_count(area_src, nr) = 1;
		/*
		 * In the transition from 255 to 256, powerpc will
		 * read out of order in my_bcmp and see both bytes as
		 * zero, so leave a placeholder below, always non-zero
		 * after the count, to avoid my_bcmp triggering false
		 * positives.
		 */
1405 		*(area_count(area_src, nr) + 1) = 1;
1406 	}
1407 
1408 	pipefd = malloc(sizeof(int) * nr_cpus * 2);
1409 	if (!pipefd) {
1410 		perror("pipefd");
1411 		return 1;
1412 	}
1413 	for (cpu = 0; cpu < nr_cpus; cpu++) {
1414 		if (pipe2(&pipefd[cpu*2], O_CLOEXEC | O_NONBLOCK)) {
1415 			perror("pipe");
1416 			return 1;
1417 		}
1418 	}
1419 
1420 	if (posix_memalign(&area, page_size, page_size)) {
1421 		fprintf(stderr, "out of memory\n");
1422 		return 1;
1423 	}
1424 	zeropage = area;
1425 	bzero(zeropage, page_size);
1426 
1427 	pthread_mutex_lock(&uffd_read_mutex);
1428 
1429 	pthread_attr_init(&attr);
1430 	pthread_attr_setstacksize(&attr, 16*1024*1024);
1431 
1432 	err = 0;
1433 	while (bounces--) {
1434 		unsigned long expected_ioctls;
1435 
1436 		printf("bounces: %d, mode:", bounces);
1437 		if (bounces & BOUNCE_RANDOM)
1438 			printf(" rnd");
1439 		if (bounces & BOUNCE_RACINGFAULTS)
1440 			printf(" racing");
1441 		if (bounces & BOUNCE_VERIFY)
1442 			printf(" ver");
1443 		if (bounces & BOUNCE_POLL)
1444 			printf(" poll");
1445 		else
1446 			printf(" read");
1447 		printf(", ");
1448 		fflush(stdout);
1449 
1450 		if (bounces & BOUNCE_POLL)
1451 			fcntl(uffd, F_SETFL, uffd_flags | O_NONBLOCK);
1452 		else
1453 			fcntl(uffd, F_SETFL, uffd_flags & ~O_NONBLOCK);
1454 
1455 		/* register */
1456 		uffdio_register.range.start = (unsigned long) area_dst;
1457 		uffdio_register.range.len = nr_pages * page_size;
1458 		uffdio_register.mode = UFFDIO_REGISTER_MODE_MISSING;
1459 		if (test_uffdio_wp)
1460 			uffdio_register.mode |= UFFDIO_REGISTER_MODE_WP;
1461 		if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1462 			fprintf(stderr, "register failure\n");
1463 			return 1;
1464 		}
1465 		expected_ioctls = uffd_test_ops->expected_ioctls;
1466 		if ((uffdio_register.ioctls & expected_ioctls) !=
1467 		    expected_ioctls) {
1468 			fprintf(stderr,
1469 				"unexpected missing ioctl for anon memory\n");
1470 			return 1;
1471 		}
1472 
1473 		if (area_dst_alias) {
1474 			uffdio_register.range.start = (unsigned long)
1475 				area_dst_alias;
1476 			if (ioctl(uffd, UFFDIO_REGISTER, &uffdio_register)) {
1477 				fprintf(stderr, "register failure alias\n");
1478 				return 1;
1479 			}
1480 		}
1481 
		/*
		 * The madvise done previously isn't enough: some
		 * uffd_thread could have read userfaults (one of
		 * those already resolved by the background thread)
		 * and it may be in the process of calling
		 * UFFDIO_COPY. UFFDIO_COPY will read the zapped
		 * area_src and it would map a zero page in it (of
		 * course such a UFFDIO_COPY is perfectly safe as it'd
		 * return -EEXIST). The problem comes at the next
		 * bounce though: that racing UFFDIO_COPY would
		 * generate zeropages in the area_src, invalidating
		 * the previous MADV_DONTNEED. Without this additional
		 * MADV_DONTNEED those zeropage leftovers in the
		 * area_src would lead to -EEXIST failures during the
		 * next bounce, effectively leaving a zeropage in the
		 * area_dst.
		 *
		 * Try commenting out this madvise to see the memory
		 * corruption being caught pretty quickly.
		 *
		 * khugepaged is only inhibited from collapsing THP
		 * after the UFFDIO_REGISTER, so the MADV_DONTNEED
		 * has to be done here, after registration.
		 */
1506 		if (uffd_test_ops->release_pages(area_dst))
1507 			return 1;
1508 
1509 		uffd_stats_reset(uffd_stats, nr_cpus);
1510 
1511 		/* bounce pass */
1512 		if (stress(uffd_stats))
1513 			return 1;
1514 
1515 		/* Clear all the write protections if there is any */
1516 		if (test_uffdio_wp)
1517 			wp_range(uffd, (unsigned long)area_dst,
1518 				 nr_pages * page_size, false);
1519 
1520 		/* unregister */
1521 		if (ioctl(uffd, UFFDIO_UNREGISTER, &uffdio_register.range)) {
1522 			fprintf(stderr, "unregister failure\n");
1523 			return 1;
1524 		}
1525 		if (area_dst_alias) {
1526 			uffdio_register.range.start = (unsigned long) area_dst;
1527 			if (ioctl(uffd, UFFDIO_UNREGISTER,
1528 				  &uffdio_register.range)) {
1529 				fprintf(stderr, "unregister failure alias\n");
1530 				return 1;
1531 			}
1532 		}
1533 
1534 		/* verification */
1535 		if (bounces & BOUNCE_VERIFY) {
1536 			for (nr = 0; nr < nr_pages; nr++) {
1537 				if (*area_count(area_dst, nr) != count_verify[nr]) {
1538 					fprintf(stderr,
1539 						"error area_count %Lu %Lu %lu\n",
1540 						*area_count(area_src, nr),
1541 						count_verify[nr],
1542 						nr);
1543 					err = 1;
1544 					bounces = 0;
1545 				}
1546 			}
1547 		}
1548 
1549 		/* prepare next bounce */
1550 		tmp_area = area_src;
1551 		area_src = area_dst;
1552 		area_dst = tmp_area;
1553 
1554 		tmp_area = area_src_alias;
1555 		area_src_alias = area_dst_alias;
1556 		area_dst_alias = tmp_area;
1557 
1558 		uffd_stats_report(uffd_stats, nr_cpus);
1559 	}
1560 
1561 	if (err)
1562 		return err;
1563 
1564 	close(uffd);
1565 	return userfaultfd_zeropage_test() || userfaultfd_sig_test()
1566 		|| userfaultfd_events_test() || userfaultfd_minor_test();
1567 }
1568 
1569 /*
1570  * Copied from mlock2-tests.c
1571  */
unsigned long default_huge_page_size(void)
1573 {
1574 	unsigned long hps = 0;
1575 	char *line = NULL;
1576 	size_t linelen = 0;
1577 	FILE *f = fopen("/proc/meminfo", "r");
1578 
1579 	if (!f)
1580 		return 0;
1581 	while (getline(&line, &linelen, f) > 0) {
1582 		if (sscanf(line, "Hugepagesize:       %lu kB", &hps) == 1) {
1583 			hps <<= 10;
1584 			break;
1585 		}
1586 	}
1587 
1588 	free(line);
1589 	fclose(f);
1590 	return hps;
1591 }
1592 
static void set_test_type(const char *type)
1594 {
1595 	if (!strcmp(type, "anon")) {
1596 		test_type = TEST_ANON;
1597 		uffd_test_ops = &anon_uffd_test_ops;
1598 		/* Only enable write-protect test for anonymous test */
1599 		test_uffdio_wp = true;
1600 	} else if (!strcmp(type, "hugetlb")) {
1601 		test_type = TEST_HUGETLB;
1602 		uffd_test_ops = &hugetlb_uffd_test_ops;
1603 	} else if (!strcmp(type, "hugetlb_shared")) {
1604 		map_shared = true;
1605 		test_type = TEST_HUGETLB;
1606 		uffd_test_ops = &hugetlb_uffd_test_ops;
1607 		/* Minor faults require shared hugetlb; only enable here. */
1608 		test_uffdio_minor = true;
1609 	} else if (!strcmp(type, "shmem")) {
1610 		map_shared = true;
1611 		test_type = TEST_SHMEM;
1612 		uffd_test_ops = &shmem_uffd_test_ops;
1613 	} else {
1614 		fprintf(stderr, "Unknown test type: %s\n", type); exit(1);
1615 	}
1616 
1617 	if (test_type == TEST_HUGETLB)
1618 		page_size = default_huge_page_size();
1619 	else
1620 		page_size = sysconf(_SC_PAGE_SIZE);
1621 
1622 	if (!page_size) {
1623 		fprintf(stderr, "Unable to determine page size\n");
1624 		exit(2);
1625 	}
1626 	if ((unsigned long) area_count(NULL, 0) + sizeof(unsigned long long) * 2
1627 	    > page_size) {
1628 		fprintf(stderr, "Impossible to run this test\n");
1629 		exit(2);
1630 	}
1631 }
1632 
static void sigalrm(int sig)
1634 {
1635 	if (sig != SIGALRM)
1636 		abort();
1637 	test_uffdio_copy_eexist = true;
1638 	test_uffdio_zeropage_eexist = true;
1639 	alarm(ALARM_INTERVAL_SECS);
1640 }
1641 
int main(int argc, char **argv)
1643 {
1644 	if (argc < 4)
1645 		usage();
1646 
1647 	if (signal(SIGALRM, sigalrm) == SIG_ERR) {
1648 		fprintf(stderr, "failed to arm SIGALRM");
1649 		exit(1);
1650 	}
1651 	alarm(ALARM_INTERVAL_SECS);
1652 
1653 	set_test_type(argv[1]);
1654 
1655 	nr_cpus = sysconf(_SC_NPROCESSORS_ONLN);
1656 	nr_pages_per_cpu = atol(argv[2]) * 1024*1024 / page_size /
1657 		nr_cpus;
1658 	if (!nr_pages_per_cpu) {
1659 		fprintf(stderr, "invalid MiB\n");
1660 		usage();
1661 	}
1662 
1663 	bounces = atoi(argv[3]);
1664 	if (bounces <= 0) {
1665 		fprintf(stderr, "invalid bounces\n");
1666 		usage();
1667 	}
1668 	nr_pages = nr_pages_per_cpu * nr_cpus;
1669 
1670 	if (test_type == TEST_HUGETLB) {
1671 		if (argc < 5)
1672 			usage();
1673 		huge_fd = open(argv[4], O_CREAT | O_RDWR, 0755);
1674 		if (huge_fd < 0) {
1675 			fprintf(stderr, "Open of %s failed", argv[3]);
1676 			perror("open");
1677 			exit(1);
1678 		}
1679 		if (ftruncate(huge_fd, 0)) {
1680 			fprintf(stderr, "ftruncate %s to size 0 failed", argv[3]);
1681 			perror("ftruncate");
1682 			exit(1);
1683 		}
1684 	}
1685 	printf("nr_pages: %lu, nr_pages_per_cpu: %lu\n",
1686 	       nr_pages, nr_pages_per_cpu);
1687 	return userfaultfd_stress();
1688 }
1689 
1690 #else /* __NR_userfaultfd */
1691 
1692 #warning "missing __NR_userfaultfd definition"
1693 
int main(void)
1695 {
1696 	printf("skip: Skipping userfaultfd test (missing __NR_userfaultfd)\n");
1697 	return KSFT_SKIP;
1698 }
1699 
1700 #endif /* __NR_userfaultfd */
1701