1 /*
2  * Copyright (c) 2021 Dell Inc. or its subsidiaries. All Rights Reserved.
3  * Copyright (c) 2022 The FreeBSD Foundation
4  *
5  * Portions of this software were developed by Mark Johnston under sponsorship
6  * from the FreeBSD Foundation.
7  *
8  * Redistribution and use in source and binary forms, with or without
9  * modification, are permitted provided that the following conditions
10  * are met:
11  * 1. Redistributions of source code must retain the above copyright
12  *    notice, this list of conditions and the following disclaimer.
13  * 2. Redistributions in binary form must reproduce the above copyright
14  *    notice, this list of conditions and the following disclaimer in the
15  *    documentation and/or other materials provided with the distribution.
16  *
17  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
18  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
19  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
20  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
21  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
22  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
23  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
24  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
25  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
26  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
27  * SUCH DAMAGE.
28  */
29 
30 /*
31  * Test behavior when a mapping of a shared shadow vm object is
32  * invalidated by COW from another mapping.  In particular, when
33  * minherit(INHERT_SHARE) is applied to a COW mapping, a subsequently
34  * forked child process will share the parent's shadow object.  Thus,
35  * pages already mapped into one sharing process may be written from
36  * another, triggering a copy into the shadow object.  The VM system
37  * expects that a fully shadowed page is unmapped, but at one point the
38  * use of a shared shadow object could break this invariant.
39  *
40  * This is a regression test for an issue isolated by rlibby@FreeBSD.org
41  * from an issue detected by stress2's collapse.sh by jeff@FreeBSD.org.
42  * The issue became CVE-2021-29626.
43  *
44  * This file is written as an ATF test suite but may be compiled as a
45  * standalone program with -DSTANDALONE (and optionally -DDEBUG).
46  */
47 
48 #include <sys/param.h>
49 #include <sys/mman.h>
50 #include <sys/procctl.h>
51 #include <sys/resource.h>
52 #include <sys/sysctl.h>
53 #include <sys/wait.h>
54 
55 #include <machine/atomic.h>
56 
57 #include <err.h>
58 #include <errno.h>
59 #include <stdbool.h>
60 #include <stddef.h>
61 #include <stdio.h>
62 #include <stdlib.h>
63 #include <unistd.h>
64 
#ifdef STANDALONE
/*
 * When built standalone (-DSTANDALONE), provide a minimal ATF_REQUIRE
 * that aborts with the text of the failed expression; otherwise pull in
 * the real macro from atf-c(3).
 */
#define	ATF_REQUIRE(x)	do {		\
	if (!(x))			\
		errx(1, "%s", #x);	\
} while (0)
#else
#include <atf-c.h>
#endif

/*
 * Debug trace output; expands to nothing unless built with -DDEBUG.
 * NOTE(review): the name shadows POSIX dprintf(3) — harmless here since
 * the libc function is never used, but worth confirming.
 */
#ifdef DEBUG
#define	dprintf(...)	printf(__VA_ARGS__)
#else
#define	dprintf(...)
#endif
79 
/* Number of process roles; P0 runs the test, P1-P4 are descendants. */
#define	DEPTH	5

/* Test variations, combined bitwise; iterated over 0..FLAG_MASK. */
#define	FLAG_COLLAPSE		0x1	/* let P1/P4 exit, collapsing the chain */
#define	FLAG_BLOCK_XFER		0x2	/* fork P4 to block the page-transfer path */
#define	FLAG_FULLMOD		0x4	/* P3 writes the whole region, not half */
#define FLAG_MASK		(FLAG_COLLAPSE | FLAG_BLOCK_XFER | FLAG_FULLMOD)

/*
 * Coordination state shared (without COW semantics) among all test
 * processes via an anonymous MAP_SHARED mapping created by P0.
 */
struct shared_state {
	void *p;		/* base of the memory region under test */
	size_t len;		/* size of the region under test */
	size_t modlen;		/* number of bytes P3 overwrites */
	size_t pagesize;	/* base page size, stride for all accesses */
	bool collapse;		/* FLAG_COLLAPSE was set */
	bool block_xfer;	/* FLAG_BLOCK_XFER was set */
	bool lazy_cow;		/* COW via fault rather than mlock/mprotect */
	bool okay;		/* set by P2 once verification succeeds */
	volatile bool exiting[DEPTH];	/* per-role "about to exit" markers */
	volatile bool exit;	/* tells all processes to terminate */
	volatile bool p3_did_write;	/* set once P3 finished writing */
};
100 
101 /*
102  * Program flow.  There are three or four processes that are descendants
103  * of the process running the test (P0), where arrows go from parents to
104  * children, and thicker arrows indicate sharing a certain memory region
105  * without COW semantics:
106  *     P0 -> P1 -> P2 => P3
107  *             \=> P4
108  * The main idea is that P1 maps a memory region, and that region is
109  * shared with P2/P3, but with COW semantics.  When P3 modifies the
110  * memory, P2 ought to see that modification.  P4 optionally exists to
111  * defeat a COW optimization.
112  */
113 
/*
 * Error-exit helpers for the child processes: raise the shared exit
 * flag first, so sibling processes polling ss->exit stop waiting, then
 * fail with err(3)/errx(3) semantics.
 * NB: both macros rely on a variable named "ss" in the caller's scope.
 */
#define	child_err(...)	do {						\
	ss->exit = true;						\
	err(1, __VA_ARGS__);						\
} while (0)

#define	child_errx(...)	do {						\
	ss->exit = true;						\
	errx(1, __VA_ARGS__);						\
} while (0)

/* Polling interval used while spinning on the shared flags. */
#define	SLEEP_TIME_US	1000
125 
126 static void child(struct shared_state *ss, int depth);
127 
/*
 * Fork a descendant playing the role given by "depth".  The new process
 * enters child() and never returns here; the parent gets its pid.
 */
static pid_t
child_fork(struct shared_state *ss, int depth)
{
	pid_t newpid;

	if ((newpid = fork()) == -1)
		child_err("fork");
	if (newpid == 0)
		child(ss, depth);
	return newpid;
}
138 
139 static void
140 child_fault(struct shared_state *ss)
141 {
142 	size_t i;
143 
144 	for (i = 0; i < ss->len; i += ss->pagesize)
145 		(void)((volatile char *)ss->p)[i];
146 }
147 
148 static void
149 child_write(struct shared_state *ss, int val, size_t len)
150 {
151 	size_t i;
152 
153 	for (i = 0; i < len; i += ss->pagesize)
154 		((int *)ss->p)[i / sizeof(int)] = val;
155 	atomic_thread_fence_rel();
156 }
157 
158 static void
159 child_wait_p3_write(struct shared_state *ss)
160 {
161 	while (!ss->p3_did_write) {
162 		if (ss->exit)
163 			exit(1);
164 		usleep(SLEEP_TIME_US);
165 	}
166 	atomic_thread_fence_acq();
167 }
168 
169 static void
170 child_verify(struct shared_state *ss, int depth, int newval, int oldval)
171 {
172 	size_t i;
173 	int expectval, foundval;
174 
175 	for (i = 0; i < ss->len; i += ss->pagesize) {
176 		expectval = i < ss->modlen ? newval : oldval;
177 		foundval = ((int *)ss->p)[i / sizeof(int)];
178 		if (foundval == expectval)
179 			continue;
180 		child_errx("P%d saw %d but expected %d, %d was the old value",
181 		    depth, foundval, expectval, oldval);
182 	}
183 }
184 
/*
 * Body executed by every forked descendant; "depth" selects the role
 * described in the program-flow comment above.  P1 creates and stamps
 * the test region, P2 shares a shadow object with P3, P3 performs the
 * COW-triggering write, and P4 (optional) blocks the page-transfer
 * optimization.  This function never returns: it terminates via exit()
 * or one of the child_err*() macros.
 */
static void
child(struct shared_state *ss, int depth)
{
	/*
	 * NOTE(review): oldval holds int values read from the mapping but
	 * is declared pid_t; the two are layout-compatible on FreeBSD.
	 */
	pid_t mypid, oldval, pid;

	if (depth < 1 || depth >= DEPTH)
		child_errx("Bad depth %d", depth);
	mypid = getpid();
	dprintf("P%d (pid %d) started\n", depth, mypid);
	switch (depth) {
	case 1:
		/* Shared memory undergoing test. */
		ss->p = mmap(NULL, ss->len, PROT_READ | PROT_WRITE,
		    MAP_SHARED | MAP_ANON, -1, 0);
		if (ss->p == MAP_FAILED)
			child_err("mmap");

		/* P1 stamps the shared memory. */
		child_write(ss, mypid, ss->len);
		if (!ss->lazy_cow) {
			/*
			 * Eager mode: wire the pages and make the region
			 * read-only so descendants must COW via the
			 * mlock()/mprotect() path rather than lazy faults.
			 */
			if (mlock(ss->p, ss->len) == -1)
				child_err("mlock");
			if (mprotect(ss->p, ss->len, PROT_READ) == -1)
				child_err("mprotect");
		}
		if (ss->block_xfer) {
			/*
			 * P4 is forked so that its existence blocks a page COW
			 * path where the page is simply transferred between
			 * objects, rather than being copied.
			 */
			child_fork(ss, 4);
		}
		/*
		 * P1 specifies that modifications from its child processes not
		 * be shared with P1.  Child process reads can be serviced from
		 * pages in P1's object, but writes must be COW'd.
		 */
		if (minherit(ss->p, ss->len, INHERIT_COPY) != 0)
			child_err("minherit");
		/* Fork P2. */
		child_fork(ss, depth + 1);
		/* P1 and P4 wait for P3's writes before exiting. */
		child_wait_p3_write(ss);
		/* P1's view must be unchanged: its own stamp everywhere. */
		child_verify(ss, depth, mypid, mypid);
		if (!ss->collapse) {
			/* Hang around to prevent collapse. */
			while (!ss->exit)
				usleep(SLEEP_TIME_US);
		}
		/* Exit so the P2 -> P1/P4 shadow chain can collapse. */
		break;
	case 2:
		/*
		 * P2 now specifies that modifications from its child processes
		 * be shared.  P2 and P3 will share a shadow object.
		 */
		if (minherit(ss->p, ss->len, INHERIT_SHARE) != 0)
			child_err("minherit");

		/*
		 * P2 faults a page in P1's object before P1 exits and the
		 * shadow chain is collapsed.  This may be redundant if the
		 * (read-only) mappings were copied by fork(), but it doesn't
		 * hurt.
		 */
		child_fault(ss);
		oldval = atomic_load_acq_int(ss->p);

		/* Fork P3. */
		pid = child_fork(ss, depth + 1);
		if (ss->collapse) {
			/* Wait for P1 and P4 to exit, triggering collapse. */
			while (!ss->exiting[1] ||
			    (ss->block_xfer && !ss->exiting[4]))
				usleep(SLEEP_TIME_US);
			/*
			 * This is racy, just guess at how long it may take
			 * them to finish exiting.
			 */
			usleep(100 * 1000);
		}
		/* P2 waits for P3's modification. */
		child_wait_p3_write(ss);
		/*
		 * The test proper: P2 must observe P3's writes (pid of P3 in
		 * the modified pages, the old value elsewhere).
		 */
		child_verify(ss, depth, pid, oldval);
		ss->okay = true;
		ss->exit = true;
		break;
	case 3:
		/*
		 * Use mlock()+mprotect() to trigger the COW.  This
		 * exercises a different COW handler than the one used
		 * for lazy faults.
		 */
		if (!ss->lazy_cow) {
			if (mlock(ss->p, ss->len) == -1)
				child_err("mlock");
			if (mprotect(ss->p, ss->len, PROT_READ | PROT_WRITE) ==
			    -1)
				child_err("mprotect");
		}

		/*
		 * P3 writes the memory.  A page is faulted into the shared
		 * P2/P3 shadow object.  P2's mapping of the page in P1's
		 * object must now be shot down, or else P2 will wrongly
		 * continue to have that page mapped.
		 */
		child_write(ss, mypid, ss->modlen);
		ss->p3_did_write = true;
		dprintf("P3 (pid %d) wrote its pid\n", mypid);
		break;
	case 4:
		/* Just hang around until P3 is done writing. */
		oldval = atomic_load_acq_int(ss->p);
		child_wait_p3_write(ss);
		/* P4's view, like P1's, must be unchanged by P3's COW. */
		child_verify(ss, depth, oldval, oldval);
		break;
	default:
		/* Unreachable given the range check above. */
		child_errx("Bad depth %d", depth);
	}

	dprintf("P%d (pid %d) exiting\n", depth, mypid);
	ss->exiting[depth] = true;
	exit(0);
}
311 
312 static void
313 do_one_shared_shadow_inval(bool lazy_cow, size_t pagesize, size_t len,
314     unsigned int flags)
315 {
316 	struct shared_state *ss;
317 	pid_t pid;
318 	int status;
319 
320 	pid = getpid();
321 
322 	dprintf("P0 (pid %d) %s(collapse=%d, block_xfer=%d, full_mod=%d)\n",
323 	    pid, __func__, (int)collapse, (int)block_xfer, (int)full_mod);
324 
325 	ATF_REQUIRE(procctl(P_PID, pid, PROC_REAP_ACQUIRE, NULL) == 0);
326 
327 	/* Shared memory for coordination. */
328 	ss = mmap(NULL, sizeof(*ss), PROT_READ | PROT_WRITE,
329 	    MAP_SHARED | MAP_ANON, -1, 0);
330 	ATF_REQUIRE(ss != MAP_FAILED);
331 
332 	ss->len = len;
333 	ss->modlen = (flags & FLAG_FULLMOD) ? ss->len : ss->len / 2;
334 	ss->pagesize = pagesize;
335 	ss->collapse = (flags & FLAG_COLLAPSE) != 0;
336 	ss->block_xfer = (flags & FLAG_BLOCK_XFER) != 0;
337 	ss->lazy_cow = lazy_cow;
338 
339 	pid = fork();
340 	ATF_REQUIRE(pid != -1);
341 	if (pid == 0)
342 		child(ss, 1);
343 
344 	/* Wait for all descendants to exit. */
345 	do {
346 		pid = wait(&status);
347 		ATF_REQUIRE(WIFEXITED(status));
348 	} while (pid != -1 || errno != ECHILD);
349 
350 	atomic_thread_fence_acq();
351 	ATF_REQUIRE(ss->okay);
352 
353 	ATF_REQUIRE(munmap(ss, sizeof(*ss)) == 0);
354 	ATF_REQUIRE(procctl(P_PID, getpid(), PROC_REAP_RELEASE, NULL) == 0);
355 }
356 
357 static void
358 do_shared_shadow_inval(bool lazy_cow)
359 {
360 	size_t largepagesize, pagesize, pagesizes[MAXPAGESIZES], sysctllen;
361 
362 	sysctllen = sizeof(pagesizes);
363 	ATF_REQUIRE(sysctlbyname("hw.pagesizes", pagesizes, &sysctllen, NULL,
364 	    0) == 0);
365 	ATF_REQUIRE(sysctllen >= sizeof(size_t));
366 
367 	pagesize = pagesizes[0];
368 	largepagesize = MAXPAGESIZES >= 2 &&
369 	    sysctllen >= 2 * sizeof(size_t) && pagesizes[1] != 0 ?
370 	    pagesizes[1] : 2 * 1024 * 1024;
371 
372 	for (unsigned int i = 0; i <= FLAG_MASK; i++) {
373 		do_one_shared_shadow_inval(lazy_cow, pagesize,
374 		    pagesize, i);
375 		do_one_shared_shadow_inval(lazy_cow, pagesize,
376 		    2 * pagesize, i);
377 		do_one_shared_shadow_inval(lazy_cow, pagesize,
378 		    largepagesize - pagesize, i);
379 		do_one_shared_shadow_inval(lazy_cow, pagesize,
380 		    largepagesize, i);
381 		do_one_shared_shadow_inval(lazy_cow, pagesize,
382 		    largepagesize + pagesize, i);
383 	}
384 }
385 
386 static void
387 do_shared_shadow_inval_eager(void)
388 {
389 	struct rlimit rl;
390 
391 	rl.rlim_cur = rl.rlim_max = RLIM_INFINITY;
392 	ATF_REQUIRE(setrlimit(RLIMIT_MEMLOCK, &rl) == 0);
393 
394 	do_shared_shadow_inval(false);
395 }
396 
/*
 * Lazy-COW variant: pages are copied on write faults; no resource-limit
 * adjustment is needed.
 */
static void
do_shared_shadow_inval_lazy(void)
{
	do_shared_shadow_inval(true);
}
402 
403 #ifdef STANDALONE
/* Standalone entry point: run both variants and report success. */
int
main(void)
{
	do_shared_shadow_inval_lazy();
	do_shared_shadow_inval_eager();
	printf("pass\n");
}
411 #else
/* Lazy variant needs no special privileges, hence no head. */
ATF_TC_WITHOUT_HEAD(shared_shadow_inval__lazy_cow);
ATF_TC_BODY(shared_shadow_inval__lazy_cow, tc)
{
	do_shared_shadow_inval_lazy();
}

ATF_TC(shared_shadow_inval__eager_cow);
ATF_TC_HEAD(shared_shadow_inval__eager_cow, tc)
{
	/* Needed to raise the mlock() limit. */
	atf_tc_set_md_var(tc, "require.user", "root");
}
ATF_TC_BODY(shared_shadow_inval__eager_cow, tc)
{
	do_shared_shadow_inval_eager();
}

/* Register both test cases with the ATF test program. */
ATF_TP_ADD_TCS(tp)
{
	ATF_TP_ADD_TC(tp, shared_shadow_inval__lazy_cow);
	ATF_TP_ADD_TC(tp, shared_shadow_inval__eager_cow);
	return (atf_no_error());
}
435 #endif /* !STANDALONE */
436