xref: /freebsd/sys/kern/kern_jail.c (revision 7926a01e)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 1999 Poul-Henning Kamp.
5  * Copyright (c) 2008 Bjoern A. Zeeb.
6  * Copyright (c) 2009 James Gritton.
7  * All rights reserved.
8  *
9  * Redistribution and use in source and binary forms, with or without
10  * modification, are permitted provided that the following conditions
11  * are met:
12  * 1. Redistributions of source code must retain the above copyright
13  *    notice, this list of conditions and the following disclaimer.
14  * 2. Redistributions in binary form must reproduce the above copyright
15  *    notice, this list of conditions and the following disclaimer in the
16  *    documentation and/or other materials provided with the distribution.
17  *
18  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
19  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
20  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
21  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
22  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
23  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
24  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
25  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
26  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
27  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
28  * SUCH DAMAGE.
29  */
30 
31 #include <sys/cdefs.h>
32 __FBSDID("$FreeBSD$");
33 
34 #include "opt_ddb.h"
35 #include "opt_inet.h"
36 #include "opt_inet6.h"
37 #include "opt_nfs.h"
38 
39 #include <sys/param.h>
40 #include <sys/types.h>
41 #include <sys/kernel.h>
42 #include <sys/systm.h>
43 #include <sys/errno.h>
44 #include <sys/sysproto.h>
45 #include <sys/malloc.h>
46 #include <sys/osd.h>
47 #include <sys/priv.h>
48 #include <sys/proc.h>
49 #include <sys/epoch.h>
50 #include <sys/taskqueue.h>
51 #include <sys/fcntl.h>
52 #include <sys/jail.h>
53 #include <sys/linker.h>
54 #include <sys/lock.h>
55 #include <sys/mman.h>
56 #include <sys/mutex.h>
57 #include <sys/racct.h>
58 #include <sys/rctl.h>
59 #include <sys/refcount.h>
60 #include <sys/sx.h>
61 #include <sys/sysent.h>
62 #include <sys/namei.h>
63 #include <sys/mount.h>
64 #include <sys/queue.h>
65 #include <sys/socket.h>
66 #include <sys/syscallsubr.h>
67 #include <sys/sysctl.h>
68 #include <sys/uuid.h>
69 #include <sys/vnode.h>
70 
71 #include <net/if.h>
72 #include <net/vnet.h>
73 
74 #include <netinet/in.h>
75 
76 #ifdef DDB
77 #include <ddb/ddb.h>
78 #endif /* DDB */
79 
80 #include <security/mac/mac_framework.h>
81 
82 #define	PRISON0_HOSTUUID_MODULE	"hostuuid"
83 
84 MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");
85 static MALLOC_DEFINE(M_PRISON_RACCT, "prison_racct", "Prison racct structures");
86 
/*
 * Keep struct prison prison0 and some code in kern_jail_set() readable.
 *
 * _PR_IP_SADDRSEL is the union of the source-address-selection flags for
 * the address families compiled into the kernel (0 when there are none).
 * The expansion is parenthesized so that it acts as a single operand no
 * matter which operator it is combined with at the point of use.
 */
#if defined(INET) && defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL | PR_IP6_SADDRSEL)
#elif defined(INET)
#define	_PR_IP_SADDRSEL	(PR_IP4_SADDRSEL)
#elif defined(INET6)
#define	_PR_IP_SADDRSEL	(PR_IP6_SADDRSEL)
#else
#define	_PR_IP_SADDRSEL	0
#endif
101 
102 /* prison0 describes what is "real" about the system. */
103 struct prison prison0 = {
104 	.pr_id		= 0,
105 	.pr_name	= "0",
106 	.pr_ref		= 1,
107 	.pr_uref	= 1,
108 	.pr_path	= "/",
109 	.pr_securelevel	= -1,
110 	.pr_devfs_rsnum = 0,
111 	.pr_state	= PRISON_STATE_ALIVE,
112 	.pr_childmax	= JAIL_MAX,
113 	.pr_hostuuid	= DEFAULT_HOSTUUID,
114 	.pr_children	= LIST_HEAD_INITIALIZER(prison0.pr_children),
115 #ifdef VIMAGE
116 	.pr_flags	= PR_HOST|PR_VNET|_PR_IP_SADDRSEL,
117 #else
118 	.pr_flags	= PR_HOST|_PR_IP_SADDRSEL,
119 #endif
120 	.pr_allow	= PR_ALLOW_ALL_STATIC,
121 	.pr_permid	= 1,
122 };
123 MTX_SYSINIT(prison0, &prison0.pr_mtx, "jail mutex", MTX_DEF);
124 
125 struct bool_flags {
126 	const char	*name;
127 	const char	*noname;
128 	volatile u_int	 flag;
129 };
130 struct jailsys_flags {
131 	const char	*name;
132 	unsigned	 disable;
133 	unsigned	 new;
134 };
135 
136 /* allprison, allprison_racct and lastprid are protected by allprison_lock. */
137 struct	sx allprison_lock;
138 SX_SYSINIT(allprison_lock, &allprison_lock, "allprison");
139 struct	prisonlist allprison = TAILQ_HEAD_INITIALIZER(allprison);
140 LIST_HEAD(, prison_racct) allprison_racct;
141 int	lastprid = 0;
142 
/* Forward declarations of file-local helpers. */
static int	get_next_prid(struct prison **insprp);
static int	do_jail_attach(struct thread *td, struct prison *pr,
		    int drflags);
static void	prison_complete(void *context, int pending);
static void	prison_deref(struct prison *pr, int flags);
static void	prison_deref_kill(struct prison *pr,
		    struct prisonlist *freeprison);
static int	prison_lock_xlock(struct prison *pr, int flags);
static void	prison_cleanup(struct prison *pr);
static void	prison_free_not_last(struct prison *pr);
static void	prison_proc_free_not_last(struct prison *pr);
static void	prison_proc_relink(struct prison *opr, struct prison *npr,
		    struct proc *p);
static void	prison_set_allow_locked(struct prison *pr, unsigned flag,
		    int enable);
static char	*prison_path(struct prison *pr1, struct prison *pr2);
#ifdef RACCT
/* Resource accounting hooks, only built with RACCT. */
static void	prison_racct_attach(struct prison *pr);
static void	prison_racct_modify(struct prison *pr);
static void	prison_racct_detach(struct prison *pr);
#endif
162 
/*
 * Flags for prison_deref.  The low bits select the operation(s) to perform,
 * the high bits describe which locks the caller already holds.
 */
#define	PD_DEREF	0x01	/* Decrement pr_ref */
#define	PD_DEUREF	0x02	/* Decrement pr_uref */
#define	PD_KILL		0x04	/* Remove jail, kill processes, etc */
#define	PD_LOCKED	0x10	/* pr_mtx is held */
#define	PD_LIST_SLOCKED	0x20	/* allprison_lock is held shared */
#define	PD_LIST_XLOCKED	0x40	/* allprison_lock is held exclusive */

/* Masks covering each group of flags above. */
#define	PD_OP_FLAGS	(PD_DEREF | PD_DEUREF | PD_KILL)
#define	PD_LOCK_FLAGS	(PD_LOCKED | PD_LIST_SLOCKED | PD_LIST_XLOCKED)
172 
173 /*
174  * Parameter names corresponding to PR_* flag values.  Size values are for kvm
175  * as we cannot figure out the size of a sparse array, or an array without a
176  * terminating entry.
177  */
178 static struct bool_flags pr_flag_bool[] = {
179 	{"persist", "nopersist", PR_PERSIST},
180 #ifdef INET
181 	{"ip4.saddrsel", "ip4.nosaddrsel", PR_IP4_SADDRSEL},
182 #endif
183 #ifdef INET6
184 	{"ip6.saddrsel", "ip6.nosaddrsel", PR_IP6_SADDRSEL},
185 #endif
186 };
187 const size_t pr_flag_bool_size = sizeof(pr_flag_bool);
188 
189 static struct jailsys_flags pr_flag_jailsys[] = {
190 	{"host", 0, PR_HOST},
191 #ifdef VIMAGE
192 	{"vnet", 0, PR_VNET},
193 #endif
194 #ifdef INET
195 	{"ip4", PR_IP4_USER, PR_IP4_USER},
196 #endif
197 #ifdef INET6
198 	{"ip6", PR_IP6_USER, PR_IP6_USER},
199 #endif
200 };
201 const size_t pr_flag_jailsys_size = sizeof(pr_flag_jailsys);
202 
203 /*
204  * Make this array full-size so dynamic parameters can be added.
205  * It is protected by prison0.mtx, but lockless reading is allowed
206  * with an atomic check of the flag values.
207  */
208 static struct bool_flags pr_flag_allow[NBBY * NBPW] = {
209 	{"allow.set_hostname", "allow.noset_hostname", PR_ALLOW_SET_HOSTNAME},
210 	{"allow.sysvipc", "allow.nosysvipc", PR_ALLOW_SYSVIPC},
211 	{"allow.raw_sockets", "allow.noraw_sockets", PR_ALLOW_RAW_SOCKETS},
212 	{"allow.chflags", "allow.nochflags", PR_ALLOW_CHFLAGS},
213 	{"allow.mount", "allow.nomount", PR_ALLOW_MOUNT},
214 	{"allow.quotas", "allow.noquotas", PR_ALLOW_QUOTAS},
215 	{"allow.socket_af", "allow.nosocket_af", PR_ALLOW_SOCKET_AF},
216 	{"allow.mlock", "allow.nomlock", PR_ALLOW_MLOCK},
217 	{"allow.reserved_ports", "allow.noreserved_ports",
218 	 PR_ALLOW_RESERVED_PORTS},
219 	{"allow.read_msgbuf", "allow.noread_msgbuf", PR_ALLOW_READ_MSGBUF},
220 	{"allow.unprivileged_proc_debug", "allow.nounprivileged_proc_debug",
221 	 PR_ALLOW_UNPRIV_DEBUG},
222 	{"allow.suser", "allow.nosuser", PR_ALLOW_SUSER},
223 #if defined(VNET_NFSD) && defined(VIMAGE) && defined(NFSD)
224 	{"allow.nfsd", "allow.nonfsd", PR_ALLOW_NFSD},
225 #endif
226 };
227 static unsigned pr_allow_all = PR_ALLOW_ALL_STATIC;
228 const size_t pr_flag_allow_size = sizeof(pr_flag_allow);
229 
230 #define	JAIL_DEFAULT_ALLOW		(PR_ALLOW_SET_HOSTNAME | \
231 					 PR_ALLOW_RESERVED_PORTS | \
232 					 PR_ALLOW_UNPRIV_DEBUG | \
233 					 PR_ALLOW_SUSER)
234 #define	JAIL_DEFAULT_ENFORCE_STATFS	2
235 #define	JAIL_DEFAULT_DEVFS_RSNUM	0
236 static unsigned jail_default_allow = JAIL_DEFAULT_ALLOW;
237 static int jail_default_enforce_statfs = JAIL_DEFAULT_ENFORCE_STATFS;
238 static int jail_default_devfs_rsnum = JAIL_DEFAULT_DEVFS_RSNUM;
239 #if defined(INET) || defined(INET6)
240 static unsigned jail_max_af_ips = 255;
241 #endif
242 
243 /*
244  * Initialize the parts of prison0 that can't be static-initialized with
245  * constants.  This is called from proc0_init() after creating thread0 cpuset.
246  */
247 void
248 prison0_init(void)
249 {
250 	uint8_t *file, *data;
251 	size_t size;
252 	char buf[sizeof(prison0.pr_hostuuid)];
253 	bool valid;
254 
255 	prison0.pr_cpuset = cpuset_ref(thread0.td_cpuset);
256 	prison0.pr_osreldate = osreldate;
257 	strlcpy(prison0.pr_osrelease, osrelease, sizeof(prison0.pr_osrelease));
258 
259 	/* If we have a preloaded hostuuid, use it. */
260 	file = preload_search_by_type(PRISON0_HOSTUUID_MODULE);
261 	if (file != NULL) {
262 		data = preload_fetch_addr(file);
263 		size = preload_fetch_size(file);
264 		if (data != NULL) {
265 			/*
266 			 * The preloaded data may include trailing whitespace, almost
267 			 * certainly a newline; skip over any whitespace or
268 			 * non-printable characters to be safe.
269 			 */
270 			while (size > 0 && data[size - 1] <= 0x20) {
271 				size--;
272 			}
273 
274 			valid = false;
275 
276 			/*
277 			 * Not NUL-terminated when passed from loader, but
278 			 * validate_uuid requires that due to using sscanf (as
279 			 * does the subsequent strlcpy, since it still reads
280 			 * past the given size to return the true length);
281 			 * bounce to a temporary buffer to fix.
282 			 */
283 			if (size >= sizeof(buf))
284 				goto done;
285 
286 			memcpy(buf, data, size);
287 			buf[size] = '\0';
288 
289 			if (validate_uuid(buf, size, NULL, 0) != 0)
290 				goto done;
291 
292 			valid = true;
293 			(void)strlcpy(prison0.pr_hostuuid, buf,
294 			    sizeof(prison0.pr_hostuuid));
295 
296 done:
297 			if (bootverbose && !valid) {
298 				printf("hostuuid: preload data malformed: '%.*s'\n",
299 				    (int)size, data);
300 			}
301 		}
302 	}
303 	if (bootverbose)
304 		printf("hostuuid: using %s\n", prison0.pr_hostuuid);
305 }
306 
307 /*
308  * struct jail_args {
309  *	struct jail *jail;
310  * };
311  */
312 int
313 sys_jail(struct thread *td, struct jail_args *uap)
314 {
315 	uint32_t version;
316 	int error;
317 	struct jail j;
318 
319 	error = copyin(uap->jail, &version, sizeof(uint32_t));
320 	if (error)
321 		return (error);
322 
323 	switch (version) {
324 	case 0:
325 	{
326 		struct jail_v0 j0;
327 
328 		/* FreeBSD single IPv4 jails. */
329 		bzero(&j, sizeof(struct jail));
330 		error = copyin(uap->jail, &j0, sizeof(struct jail_v0));
331 		if (error)
332 			return (error);
333 		j.version = j0.version;
334 		j.path = j0.path;
335 		j.hostname = j0.hostname;
336 		j.ip4s = htonl(j0.ip_number);	/* jail_v0 is host order */
337 		break;
338 	}
339 
340 	case 1:
341 		/*
342 		 * Version 1 was used by multi-IPv4 jail implementations
343 		 * that never made it into the official kernel.
344 		 */
345 		return (EINVAL);
346 
347 	case 2:	/* JAIL_API_VERSION */
348 		/* FreeBSD multi-IPv4/IPv6,noIP jails. */
349 		error = copyin(uap->jail, &j, sizeof(struct jail));
350 		if (error)
351 			return (error);
352 		break;
353 
354 	default:
355 		/* Sci-Fi jails are not supported, sorry. */
356 		return (EINVAL);
357 	}
358 	return (kern_jail(td, &j));
359 }
360 
361 int
362 kern_jail(struct thread *td, struct jail *j)
363 {
364 	struct iovec optiov[2 * (4 + nitems(pr_flag_allow)
365 #ifdef INET
366 			    + 1
367 #endif
368 #ifdef INET6
369 			    + 1
370 #endif
371 			    )];
372 	struct uio opt;
373 	char *u_path, *u_hostname, *u_name;
374 	struct bool_flags *bf;
375 #ifdef INET
376 	uint32_t ip4s;
377 	struct in_addr *u_ip4;
378 #endif
379 #ifdef INET6
380 	struct in6_addr *u_ip6;
381 #endif
382 	size_t tmplen;
383 	int error, enforce_statfs;
384 
385 	bzero(&optiov, sizeof(optiov));
386 	opt.uio_iov = optiov;
387 	opt.uio_iovcnt = 0;
388 	opt.uio_offset = -1;
389 	opt.uio_resid = -1;
390 	opt.uio_segflg = UIO_SYSSPACE;
391 	opt.uio_rw = UIO_READ;
392 	opt.uio_td = td;
393 
394 	/* Set permissions for top-level jails from sysctls. */
395 	if (!jailed(td->td_ucred)) {
396 		for (bf = pr_flag_allow;
397 		     bf < pr_flag_allow + nitems(pr_flag_allow) &&
398 			atomic_load_int(&bf->flag) != 0;
399 		     bf++) {
400 			optiov[opt.uio_iovcnt].iov_base = __DECONST(char *,
401 			    (jail_default_allow & bf->flag)
402 			    ? bf->name : bf->noname);
403 			optiov[opt.uio_iovcnt].iov_len =
404 			    strlen(optiov[opt.uio_iovcnt].iov_base) + 1;
405 			opt.uio_iovcnt += 2;
406 		}
407 		optiov[opt.uio_iovcnt].iov_base = "enforce_statfs";
408 		optiov[opt.uio_iovcnt].iov_len = sizeof("enforce_statfs");
409 		opt.uio_iovcnt++;
410 		enforce_statfs = jail_default_enforce_statfs;
411 		optiov[opt.uio_iovcnt].iov_base = &enforce_statfs;
412 		optiov[opt.uio_iovcnt].iov_len = sizeof(enforce_statfs);
413 		opt.uio_iovcnt++;
414 	}
415 
416 	tmplen = MAXPATHLEN + MAXHOSTNAMELEN + MAXHOSTNAMELEN;
417 #ifdef INET
418 	ip4s = (j->version == 0) ? 1 : j->ip4s;
419 	if (ip4s > jail_max_af_ips)
420 		return (EINVAL);
421 	tmplen += ip4s * sizeof(struct in_addr);
422 #else
423 	if (j->ip4s > 0)
424 		return (EINVAL);
425 #endif
426 #ifdef INET6
427 	if (j->ip6s > jail_max_af_ips)
428 		return (EINVAL);
429 	tmplen += j->ip6s * sizeof(struct in6_addr);
430 #else
431 	if (j->ip6s > 0)
432 		return (EINVAL);
433 #endif
434 	u_path = malloc(tmplen, M_TEMP, M_WAITOK);
435 	u_hostname = u_path + MAXPATHLEN;
436 	u_name = u_hostname + MAXHOSTNAMELEN;
437 #ifdef INET
438 	u_ip4 = (struct in_addr *)(u_name + MAXHOSTNAMELEN);
439 #endif
440 #ifdef INET6
441 #ifdef INET
442 	u_ip6 = (struct in6_addr *)(u_ip4 + ip4s);
443 #else
444 	u_ip6 = (struct in6_addr *)(u_name + MAXHOSTNAMELEN);
445 #endif
446 #endif
447 	optiov[opt.uio_iovcnt].iov_base = "path";
448 	optiov[opt.uio_iovcnt].iov_len = sizeof("path");
449 	opt.uio_iovcnt++;
450 	optiov[opt.uio_iovcnt].iov_base = u_path;
451 	error = copyinstr(j->path, u_path, MAXPATHLEN,
452 	    &optiov[opt.uio_iovcnt].iov_len);
453 	if (error) {
454 		free(u_path, M_TEMP);
455 		return (error);
456 	}
457 	opt.uio_iovcnt++;
458 	optiov[opt.uio_iovcnt].iov_base = "host.hostname";
459 	optiov[opt.uio_iovcnt].iov_len = sizeof("host.hostname");
460 	opt.uio_iovcnt++;
461 	optiov[opt.uio_iovcnt].iov_base = u_hostname;
462 	error = copyinstr(j->hostname, u_hostname, MAXHOSTNAMELEN,
463 	    &optiov[opt.uio_iovcnt].iov_len);
464 	if (error) {
465 		free(u_path, M_TEMP);
466 		return (error);
467 	}
468 	opt.uio_iovcnt++;
469 	if (j->jailname != NULL) {
470 		optiov[opt.uio_iovcnt].iov_base = "name";
471 		optiov[opt.uio_iovcnt].iov_len = sizeof("name");
472 		opt.uio_iovcnt++;
473 		optiov[opt.uio_iovcnt].iov_base = u_name;
474 		error = copyinstr(j->jailname, u_name, MAXHOSTNAMELEN,
475 		    &optiov[opt.uio_iovcnt].iov_len);
476 		if (error) {
477 			free(u_path, M_TEMP);
478 			return (error);
479 		}
480 		opt.uio_iovcnt++;
481 	}
482 #ifdef INET
483 	optiov[opt.uio_iovcnt].iov_base = "ip4.addr";
484 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip4.addr");
485 	opt.uio_iovcnt++;
486 	optiov[opt.uio_iovcnt].iov_base = u_ip4;
487 	optiov[opt.uio_iovcnt].iov_len = ip4s * sizeof(struct in_addr);
488 	if (j->version == 0)
489 		u_ip4->s_addr = j->ip4s;
490 	else {
491 		error = copyin(j->ip4, u_ip4, optiov[opt.uio_iovcnt].iov_len);
492 		if (error) {
493 			free(u_path, M_TEMP);
494 			return (error);
495 		}
496 	}
497 	opt.uio_iovcnt++;
498 #endif
499 #ifdef INET6
500 	optiov[opt.uio_iovcnt].iov_base = "ip6.addr";
501 	optiov[opt.uio_iovcnt].iov_len = sizeof("ip6.addr");
502 	opt.uio_iovcnt++;
503 	optiov[opt.uio_iovcnt].iov_base = u_ip6;
504 	optiov[opt.uio_iovcnt].iov_len = j->ip6s * sizeof(struct in6_addr);
505 	error = copyin(j->ip6, u_ip6, optiov[opt.uio_iovcnt].iov_len);
506 	if (error) {
507 		free(u_path, M_TEMP);
508 		return (error);
509 	}
510 	opt.uio_iovcnt++;
511 #endif
512 	KASSERT(opt.uio_iovcnt <= nitems(optiov),
513 		("kern_jail: too many iovecs (%d)", opt.uio_iovcnt));
514 	error = kern_jail_set(td, &opt, JAIL_CREATE | JAIL_ATTACH);
515 	free(u_path, M_TEMP);
516 	return (error);
517 }
518 
519 /*
520  * struct jail_set_args {
521  *	struct iovec *iovp;
522  *	unsigned int iovcnt;
523  *	int flags;
524  * };
525  */
526 int
527 sys_jail_set(struct thread *td, struct jail_set_args *uap)
528 {
529 	struct uio *auio;
530 	int error;
531 
532 	/* Check that we have an even number of iovecs. */
533 	if (uap->iovcnt & 1)
534 		return (EINVAL);
535 
536 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
537 	if (error)
538 		return (error);
539 	error = kern_jail_set(td, auio, uap->flags);
540 	free(auio, M_IOV);
541 	return (error);
542 }
543 
544 #if defined(INET) || defined(INET6)
545 typedef int prison_addr_cmp_t(const void *, const void *);
546 typedef bool prison_addr_valid_t(const void *);
547 static const struct pr_family {
548 	size_t			size;
549 	prison_addr_cmp_t	*cmp;
550 	prison_addr_valid_t	*valid;
551 	int			ip_flag;
552 } pr_families[PR_FAMILY_MAX] = {
553 #ifdef INET
554 	[PR_INET] = {
555 		.size = sizeof(struct in_addr),
556 		.cmp = prison_qcmp_v4,
557 		.valid = prison_valid_v4,
558 		.ip_flag = PR_IP4_USER,
559 	 },
560 #endif
561 #ifdef INET6
562 	[PR_INET6] = {
563 		.size = sizeof(struct in6_addr),
564 		.cmp = prison_qcmp_v6,
565 		.valid = prison_valid_v6,
566 		.ip_flag = PR_IP6_USER,
567 	},
568 #endif
569 };
570 
571 /*
572  * Network address lists (pr_addrs) allocation for jails.  The addresses
573  * are accessed locklessly by the network stack, thus need to be protected by
574  * the network epoch.
575  */
576 struct prison_ip {
577 	struct epoch_context ctx;
578 	uint32_t	ips;
579 #ifdef FUTURE_C
580 	union {
581 		struct in_addr pr_ip4[];
582 		struct in6_addr pr_ip6[];
583 	};
584 #else /* No future C :( */
585 #define	PR_IP(pip, i)	((const char *)((pip) + 1) + pr_families[af].size * (i))
586 #define	PR_IPD(pip, i)	((char *)((pip) + 1) + pr_families[af].size * (i))
587 #endif
588 };
589 
590 static struct prison_ip *
591 prison_ip_alloc(const pr_family_t af, uint32_t cnt, int flags)
592 {
593 	struct prison_ip *pip;
594 
595 	pip = malloc(sizeof(struct prison_ip) + cnt * pr_families[af].size,
596 	    M_PRISON, flags);
597 	if (pip != NULL)
598 		pip->ips = cnt;
599 	return (pip);
600 }
601 
602 /*
603  * Allocate and copyin user supplied address list, sorting and validating.
604  * kern_jail_set() helper.
605  */
606 static struct prison_ip *
607 prison_ip_copyin(const pr_family_t af, void *op, uint32_t cnt)
608 {
609 	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
610 	const size_t size = pr_families[af].size;
611 	struct prison_ip *pip;
612 
613 	pip = prison_ip_alloc(af, cnt, M_WAITOK);
614 	bcopy(op, pip + 1, cnt * size);
615 	/*
616 	 * IP addresses are all sorted but ip[0] to preserve
617 	 * the primary IP address as given from userland.
618 	 * This special IP is used for unbound outgoing
619 	 * connections as well for "loopback" traffic in case
620 	 * source address selection cannot find any more fitting
621 	 * address to connect from.
622 	 */
623 	if (cnt > 1)
624 		qsort((char *)(pip + 1) + size, cnt - 1, size,
625 		    pr_families[af].cmp);
626 	/*
627 	 * Check for duplicate addresses and do some simple
628 	 * zero and broadcast checks. If users give other bogus
629 	 * addresses it is their problem.
630 	 */
631 	for (int i = 0; i < cnt; i++) {
632 		if (!pr_families[af].valid(PR_IP(pip, i))) {
633 			free(pip, M_PRISON);
634 			return (NULL);
635 		}
636 		if (i + 1 < cnt &&
637 		    (cmp(PR_IP(pip, 0), PR_IP(pip, i + 1)) == 0 ||
638 		     cmp(PR_IP(pip, i), PR_IP(pip, i + 1)) == 0)) {
639 			free(pip, M_PRISON);
640 			return (NULL);
641 		}
642 	}
643 
644 	return (pip);
645 }
646 
647 /*
648  * Allocate and dup parent prison address list.
649  * kern_jail_set() helper.
650  */
651 static void
652 prison_ip_dup(struct prison *ppr, struct prison *pr, const pr_family_t af)
653 {
654 
655 	if (ppr->pr_addrs[af] != NULL) {
656 		pr->pr_addrs[af] = prison_ip_alloc(af,
657 		    ppr->pr_addrs[af]->ips, M_WAITOK);
658 		bcopy(ppr->pr_addrs[af] + 1, pr->pr_addrs[af] + 1,
659 		    pr->pr_addrs[af]->ips * pr_families[af].size);
660 	}
661 }
662 
663 /*
664  * Make sure the new set of IP addresses is a subset of the parent's list.
665  * Don't worry about the parent being unlocked, as any setting is done with
666  * allprison_lock held.
667  * kern_jail_set() helper.
668  */
669 static bool
670 prison_ip_parent_match(const struct prison_ip *ppip,
671     const struct prison_ip *pip, const pr_family_t af)
672 {
673 	prison_addr_cmp_t *const cmp = pr_families[af].cmp;
674 	int i, j;
675 
676 	if (ppip == NULL)
677 		return (false);
678 
679 	for (i = 0; i < ppip->ips; i++)
680 		if (cmp(PR_IP(pip, 0), PR_IP(ppip, i)) == 0)
681 			break;
682 
683 	if (i == ppip->ips)
684 		/* Main address not present in parent. */
685 		return (false);
686 
687 	if (pip->ips > 1) {
688 		for (i = j = 1; i < pip->ips; i++) {
689 			if (cmp(PR_IP(pip, i), PR_IP(ppip, 0)) == 0)
690 				/* Equals to parent primary address. */
691 				continue;
692 			for (; j < ppip->ips; j++)
693 				if (cmp(PR_IP(pip, i), PR_IP(ppip, j)) == 0)
694 					break;
695 			if (j == ppip->ips)
696 				break;
697 		}
698 		if (j == ppip->ips)
699 			/* Address not present in parent. */
700 			return (false);
701 	}
702 	return (true);
703 }
704 
705 /*
706  * Check for conflicting IP addresses.  We permit them if there is no more
707  * than one IP on each jail.  If there is a duplicate on a jail with more
708  * than one IP stop checking and return error.
709  * kern_jail_set() helper.
710  */
711 static bool
712 prison_ip_conflict_check(const struct prison *ppr, const struct prison *pr,
713     const struct prison_ip *pip, pr_family_t af)
714 {
715 	const struct prison *tppr, *tpr;
716 	int descend;
717 
718 #ifdef VIMAGE
719 	for (tppr = ppr; tppr != &prison0; tppr = tppr->pr_parent)
720 		if (tppr->pr_flags & PR_VNET)
721 			break;
722 #else
723 	tppr = &prison0;
724 #endif
725 	FOREACH_PRISON_DESCENDANT(tppr, tpr, descend) {
726 		if (tpr == pr ||
727 #ifdef VIMAGE
728 		    (tpr != tppr && (tpr->pr_flags & PR_VNET)) ||
729 #endif
730 		    !prison_isalive(tpr)) {
731 			descend = 0;
732 			continue;
733 		}
734 		if (!(tpr->pr_flags & pr_families[af].ip_flag))
735 			continue;
736 		descend = 0;
737 		if (tpr->pr_addrs[af] == NULL ||
738 		    (pip->ips == 1 && tpr->pr_addrs[af]->ips == 1))
739 			continue;
740 		for (int i = 0; i < pip->ips; i++)
741 			if (prison_ip_check(tpr, af, PR_IP(pip, i)) == 0)
742 				return (false);
743 	}
744 
745 	return (true);
746 }
747 
748 _Static_assert(offsetof(struct prison_ip, ctx) == 0,
749     "prison must start with epoch context");
750 static void
751 prison_ip_free_deferred(epoch_context_t ctx)
752 {
753 
754 	free(ctx, M_PRISON);
755 }
756 
757 static void
758 prison_ip_free(struct prison_ip *pip)
759 {
760 
761 	if (pip != NULL)
762 		NET_EPOCH_CALL(prison_ip_free_deferred, &pip->ctx);
763 }
764 
765 static void
766 prison_ip_set(struct prison *pr, const pr_family_t af, struct prison_ip *new)
767 {
768 	struct prison_ip **mem, *old;
769 
770 	mtx_assert(&pr->pr_mtx, MA_OWNED);
771 
772 	mem = &pr->pr_addrs[af];
773 
774 	old = *mem;
775 	ck_pr_store_ptr(mem, new);
776 	prison_ip_free(old);
777 }
778 
779 /*
780  * Restrict a prison's IP address list with its parent's, possibly replacing
781  * it.  Return true if succeed, otherwise should redo.
782  * kern_jail_set() helper.
783  */
784 static bool
785 prison_ip_restrict(struct prison *pr, const pr_family_t af,
786     struct prison_ip **newp)
787 {
788 	const struct prison_ip *ppip = pr->pr_parent->pr_addrs[af];
789 	const struct prison_ip *pip = pr->pr_addrs[af];
790 	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
791 	const size_t size = pr_families[af].size;
792 	struct prison_ip *new = newp != NULL ? *newp : NULL;
793 	uint32_t ips;
794 
795 	mtx_assert(&pr->pr_mtx, MA_OWNED);
796 
797 	/*
798 	 * Due to epoch-synchronized access to the IP address lists we always
799 	 * allocate a new list even if the old one has enough space.  We could
800 	 * atomically update an IPv4 address inside a list, but that would
801 	 * screw up sorting, and in case of IPv6 we can't even atomically write
802 	 * one.
803 	 */
804 	if (ppip == NULL) {
805 		if (pip != NULL)
806 			prison_ip_set(pr, af, NULL);
807 		return (true);
808 	}
809 
810 	if (!(pr->pr_flags & pr_families[af].ip_flag)) {
811 		if (new == NULL) {
812 			new = prison_ip_alloc(af, ppip->ips, M_NOWAIT);
813 			if (new == NULL)
814 				return (false); /* Redo */
815 		}
816 		/* This has no user settings, so just copy the parent's list. */
817 		MPASS(new->ips == ppip->ips);
818 		bcopy(ppip + 1, new + 1, ppip->ips * size);
819 		prison_ip_set(pr, af, new);
820 		if (newp != NULL)
821 			*newp = NULL; /* Used */
822 	} else if (pip != NULL) {
823 		/* Remove addresses that aren't in the parent. */
824 		int i;
825 
826 		i = 0; /* index in pip */
827 		ips = 0; /* index in new */
828 
829 		if (new == NULL) {
830 			new = prison_ip_alloc(af, pip->ips, M_NOWAIT);
831 			if (new == NULL)
832 				return (false); /* Redo */
833 		}
834 
835 		for (int pi = 0; pi < ppip->ips; pi++)
836 			if (cmp(PR_IP(pip, 0), PR_IP(ppip, pi)) == 0) {
837 				/* Found our primary address in parent. */
838 				bcopy(PR_IP(pip, i), PR_IPD(new, ips), size);
839 				i++;
840 				ips++;
841 				break;
842 			}
843 		for (int pi = 1; i < pip->ips; ) {
844 			/* Check against primary, which is unsorted. */
845 			if (cmp(PR_IP(pip, i), PR_IP(ppip, 0)) == 0) {
846 				/* Matches parent's primary address. */
847 				bcopy(PR_IP(pip, i), PR_IPD(new, ips), size);
848 				i++;
849 				ips++;
850 				continue;
851 			}
852 			/* The rest are sorted. */
853 			switch (pi >= ppip->ips ? -1 :
854 				cmp(PR_IP(pip, i), PR_IP(ppip, pi))) {
855 			case -1:
856 				i++;
857 				break;
858 			case 0:
859 				bcopy(PR_IP(pip, i), PR_IPD(new, ips), size);
860 				i++;
861 				pi++;
862 				ips++;
863 				break;
864 			case 1:
865 				pi++;
866 				break;
867 			}
868 		}
869 		if (ips == 0) {
870 			if (newp == NULL || *newp == NULL)
871 				prison_ip_free(new);
872 			new = NULL;
873 		} else {
874 			/* Shrink to real size */
875 			KASSERT((new->ips >= ips),
876 			    ("Out-of-bounds write to prison_ip %p", new));
877 			new->ips = ips;
878 		}
879 		prison_ip_set(pr, af, new);
880 		if (newp != NULL)
881 			*newp = NULL; /* Used */
882 	}
883 	return (true);
884 }
885 
886 /*
887  * Fast-path check if an address belongs to a prison.
888  */
889 int
890 prison_ip_check(const struct prison *pr, const pr_family_t af,
891     const void *addr)
892 {
893 	int (*const cmp)(const void *, const void *) = pr_families[af].cmp;
894 	const struct prison_ip *pip;
895 	int i, a, z, d;
896 
897 	MPASS(mtx_owned(&pr->pr_mtx) ||
898 	    in_epoch(net_epoch_preempt) ||
899 	    sx_xlocked(&allprison_lock));
900 
901 	pip = ck_pr_load_ptr(&pr->pr_addrs[af]);
902 	if (__predict_false(pip == NULL))
903 		return (EAFNOSUPPORT);
904 
905 	/* Check the primary IP. */
906 	if (cmp(PR_IP(pip, 0), addr) == 0)
907 		return (0);
908 
909 	/*
910 	 * All the other IPs are sorted so we can do a binary search.
911 	 */
912 	a = 0;
913 	z = pip->ips - 2;
914 	while (a <= z) {
915 		i = (a + z) / 2;
916 		d = cmp(PR_IP(pip, i + 1), addr);
917 		if (d > 0)
918 			z = i - 1;
919 		else if (d < 0)
920 			a = i + 1;
921 		else
922 			return (0);
923 	}
924 
925 	return (EADDRNOTAVAIL);
926 }
927 
928 /*
929  * Grab primary IP.  Historically required mutex, but nothing prevents
930  * us to support epoch-protected access.  Is it used in fast path?
931  * in{6}_jail.c helper
932  */
933 const void *
934 prison_ip_get0(const struct prison *pr, const pr_family_t af)
935 {
936 	const struct prison_ip *pip = pr->pr_addrs[af];
937 
938 	mtx_assert(&pr->pr_mtx, MA_OWNED);
939 	MPASS(pip);
940 
941 	return (pip + 1);
942 }
943 
944 u_int
945 prison_ip_cnt(const struct prison *pr, const pr_family_t af)
946 {
947 
948 	return (pr->pr_addrs[af]->ips);
949 }
950 #endif	/* defined(INET) || defined(INET6) */
951 
952 int
953 kern_jail_set(struct thread *td, struct uio *optuio, int flags)
954 {
955 	struct nameidata nd;
956 #ifdef INET
957 	struct prison_ip *ip4;
958 #endif
959 #ifdef INET6
960 	struct prison_ip *ip6;
961 #endif
962 	struct vfsopt *opt;
963 	struct vfsoptlist *opts;
964 	struct prison *pr, *deadpr, *inspr, *mypr, *ppr, *tpr;
965 	struct vnode *root;
966 	char *domain, *errmsg, *host, *name, *namelc, *p, *path, *uuid;
967 	char *g_path, *osrelstr;
968 	struct bool_flags *bf;
969 	struct jailsys_flags *jsf;
970 #if defined(INET) || defined(INET6)
971 	void *op;
972 #endif
973 	unsigned long hid;
974 	size_t namelen, onamelen, pnamelen;
975 	int born, created, cuflags, descend, drflags, enforce;
976 	int error, errmsg_len, errmsg_pos;
977 	int gotchildmax, gotenforce, gothid, gotrsnum, gotslevel;
978 	int jid, jsys, len, level;
979 	int childmax, osreldt, rsnum, slevel;
980 #ifdef INET
981 	int ip4s;
982 	bool redo_ip4;
983 #endif
984 #ifdef INET6
985 	int ip6s;
986 	bool redo_ip6;
987 #endif
988 	uint64_t pr_allow, ch_allow, pr_flags, ch_flags;
989 	uint64_t pr_allow_diff;
990 	unsigned tallow;
991 	char numbuf[12];
992 	static uint64_t init_permid = 2;
993 
994 	error = priv_check(td, PRIV_JAIL_SET);
995 	if (!error && (flags & JAIL_ATTACH))
996 		error = priv_check(td, PRIV_JAIL_ATTACH);
997 	if (error)
998 		return (error);
999 	mypr = td->td_ucred->cr_prison;
1000 	if ((flags & JAIL_CREATE) && mypr->pr_childmax == 0)
1001 		return (EPERM);
1002 	if (flags & ~JAIL_SET_MASK)
1003 		return (EINVAL);
1004 
1005 	/*
1006 	 * Check all the parameters before committing to anything.  Not all
1007 	 * errors can be caught early, but we may as well try.  Also, this
1008 	 * takes care of some expensive stuff (path lookup) before getting
1009 	 * the allprison lock.
1010 	 *
1011 	 * XXX Jails are not filesystems, and jail parameters are not mount
1012 	 *     options.  But it makes more sense to re-use the vfsopt code
1013 	 *     than duplicate it under a different name.
1014 	 */
1015 	error = vfs_buildopts(optuio, &opts);
1016 	if (error)
1017 		return (error);
1018 #ifdef INET
1019 	ip4 = NULL;
1020 #endif
1021 #ifdef INET6
1022 	ip6 = NULL;
1023 #endif
1024 	g_path = NULL;
1025 
1026 	cuflags = flags & (JAIL_CREATE | JAIL_UPDATE);
1027 	if (!cuflags) {
1028 		error = EINVAL;
1029 		vfs_opterror(opts, "no valid operation (create or update)");
1030 		goto done_errmsg;
1031 	}
1032 
1033 	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
1034 	if (error == ENOENT)
1035 		jid = 0;
1036 	else if (error != 0)
1037 		goto done_free;
1038 
1039 	error = vfs_copyopt(opts, "securelevel", &slevel, sizeof(slevel));
1040 	if (error == ENOENT)
1041 		gotslevel = 0;
1042 	else if (error != 0)
1043 		goto done_free;
1044 	else
1045 		gotslevel = 1;
1046 
1047 	error =
1048 	    vfs_copyopt(opts, "children.max", &childmax, sizeof(childmax));
1049 	if (error == ENOENT)
1050 		gotchildmax = 0;
1051 	else if (error != 0)
1052 		goto done_free;
1053 	else
1054 		gotchildmax = 1;
1055 
1056 	error = vfs_copyopt(opts, "enforce_statfs", &enforce, sizeof(enforce));
1057 	if (error == ENOENT)
1058 		gotenforce = 0;
1059 	else if (error != 0)
1060 		goto done_free;
1061 	else if (enforce < 0 || enforce > 2) {
1062 		error = EINVAL;
1063 		goto done_free;
1064 	} else
1065 		gotenforce = 1;
1066 
1067 	error = vfs_copyopt(opts, "devfs_ruleset", &rsnum, sizeof(rsnum));
1068 	if (error == ENOENT)
1069 		gotrsnum = 0;
1070 	else if (error != 0)
1071 		goto done_free;
1072 	else
1073 		gotrsnum = 1;
1074 
1075 	pr_flags = ch_flags = 0;
1076 	for (bf = pr_flag_bool;
1077 	     bf < pr_flag_bool + nitems(pr_flag_bool);
1078 	     bf++) {
1079 		vfs_flagopt(opts, bf->name, &pr_flags, bf->flag);
1080 		vfs_flagopt(opts, bf->noname, &ch_flags, bf->flag);
1081 	}
1082 	ch_flags |= pr_flags;
1083 	for (jsf = pr_flag_jailsys;
1084 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
1085 	     jsf++) {
1086 		error = vfs_copyopt(opts, jsf->name, &jsys, sizeof(jsys));
1087 		if (error == ENOENT)
1088 			continue;
1089 		if (error != 0)
1090 			goto done_free;
1091 		switch (jsys) {
1092 		case JAIL_SYS_DISABLE:
1093 			if (!jsf->disable) {
1094 				error = EINVAL;
1095 				goto done_free;
1096 			}
1097 			pr_flags |= jsf->disable;
1098 			break;
1099 		case JAIL_SYS_NEW:
1100 			pr_flags |= jsf->new;
1101 			break;
1102 		case JAIL_SYS_INHERIT:
1103 			break;
1104 		default:
1105 			error = EINVAL;
1106 			goto done_free;
1107 		}
1108 		ch_flags |= jsf->new | jsf->disable;
1109 	}
1110 	if ((flags & (JAIL_CREATE | JAIL_ATTACH)) == JAIL_CREATE
1111 	    && !(pr_flags & PR_PERSIST)) {
1112 		error = EINVAL;
1113 		vfs_opterror(opts, "new jail must persist or attach");
1114 		goto done_errmsg;
1115 	}
1116 #ifdef VIMAGE
1117 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_VNET)) {
1118 		error = EINVAL;
1119 		vfs_opterror(opts, "vnet cannot be changed after creation");
1120 		goto done_errmsg;
1121 	}
1122 #endif
1123 #ifdef INET
1124 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP4_USER)) {
1125 		error = EINVAL;
1126 		vfs_opterror(opts, "ip4 cannot be changed after creation");
1127 		goto done_errmsg;
1128 	}
1129 #endif
1130 #ifdef INET6
1131 	if ((flags & JAIL_UPDATE) && (ch_flags & PR_IP6_USER)) {
1132 		error = EINVAL;
1133 		vfs_opterror(opts, "ip6 cannot be changed after creation");
1134 		goto done_errmsg;
1135 	}
1136 #endif
1137 
1138 	pr_allow = ch_allow = 0;
1139 	for (bf = pr_flag_allow;
1140 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
1141 		atomic_load_int(&bf->flag) != 0;
1142 	     bf++) {
1143 		vfs_flagopt(opts, bf->name, &pr_allow, bf->flag);
1144 		vfs_flagopt(opts, bf->noname, &ch_allow, bf->flag);
1145 	}
1146 	ch_allow |= pr_allow;
1147 
1148 	error = vfs_getopt(opts, "name", (void **)&name, &len);
1149 	if (error == ENOENT)
1150 		name = NULL;
1151 	else if (error != 0)
1152 		goto done_free;
1153 	else {
1154 		if (len == 0 || name[len - 1] != '\0') {
1155 			error = EINVAL;
1156 			goto done_free;
1157 		}
1158 		if (len > MAXHOSTNAMELEN) {
1159 			error = ENAMETOOLONG;
1160 			goto done_free;
1161 		}
1162 	}
1163 
1164 	error = vfs_getopt(opts, "host.hostname", (void **)&host, &len);
1165 	if (error == ENOENT)
1166 		host = NULL;
1167 	else if (error != 0)
1168 		goto done_free;
1169 	else {
1170 		ch_flags |= PR_HOST;
1171 		pr_flags |= PR_HOST;
1172 		if (len == 0 || host[len - 1] != '\0') {
1173 			error = EINVAL;
1174 			goto done_free;
1175 		}
1176 		if (len > MAXHOSTNAMELEN) {
1177 			error = ENAMETOOLONG;
1178 			goto done_free;
1179 		}
1180 	}
1181 
1182 	error = vfs_getopt(opts, "host.domainname", (void **)&domain, &len);
1183 	if (error == ENOENT)
1184 		domain = NULL;
1185 	else if (error != 0)
1186 		goto done_free;
1187 	else {
1188 		ch_flags |= PR_HOST;
1189 		pr_flags |= PR_HOST;
1190 		if (len == 0 || domain[len - 1] != '\0') {
1191 			error = EINVAL;
1192 			goto done_free;
1193 		}
1194 		if (len > MAXHOSTNAMELEN) {
1195 			error = ENAMETOOLONG;
1196 			goto done_free;
1197 		}
1198 	}
1199 
1200 	error = vfs_getopt(opts, "host.hostuuid", (void **)&uuid, &len);
1201 	if (error == ENOENT)
1202 		uuid = NULL;
1203 	else if (error != 0)
1204 		goto done_free;
1205 	else {
1206 		ch_flags |= PR_HOST;
1207 		pr_flags |= PR_HOST;
1208 		if (len == 0 || uuid[len - 1] != '\0') {
1209 			error = EINVAL;
1210 			goto done_free;
1211 		}
1212 		if (len > HOSTUUIDLEN) {
1213 			error = ENAMETOOLONG;
1214 			goto done_free;
1215 		}
1216 	}
1217 
1218 #ifdef COMPAT_FREEBSD32
1219 	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
1220 		uint32_t hid32;
1221 
1222 		error = vfs_copyopt(opts, "host.hostid", &hid32, sizeof(hid32));
1223 		hid = hid32;
1224 	} else
1225 #endif
1226 		error = vfs_copyopt(opts, "host.hostid", &hid, sizeof(hid));
1227 	if (error == ENOENT)
1228 		gothid = 0;
1229 	else if (error != 0)
1230 		goto done_free;
1231 	else {
1232 		gothid = 1;
1233 		ch_flags |= PR_HOST;
1234 		pr_flags |= PR_HOST;
1235 	}
1236 
1237 #ifdef INET
1238 	error = vfs_getopt(opts, "ip4.addr", &op, &ip4s);
1239 	if (error == ENOENT)
1240 		ip4s = 0;
1241 	else if (error != 0)
1242 		goto done_free;
1243 	else if (ip4s & (sizeof(struct in_addr) - 1)) {
1244 		error = EINVAL;
1245 		goto done_free;
1246 	} else {
1247 		ch_flags |= PR_IP4_USER;
1248 		pr_flags |= PR_IP4_USER;
1249 		if (ip4s > 0) {
1250 			ip4s /= sizeof(struct in_addr);
1251 			if (ip4s > jail_max_af_ips) {
1252 				error = EINVAL;
1253 				vfs_opterror(opts, "too many IPv4 addresses");
1254 				goto done_errmsg;
1255 			}
1256 			ip4 = prison_ip_copyin(PR_INET, op, ip4s);
1257 			if (ip4 == NULL) {
1258 				error = EINVAL;
1259 				goto done_free;
1260 			}
1261 		}
1262 	}
1263 #endif
1264 
1265 #ifdef INET6
1266 	error = vfs_getopt(opts, "ip6.addr", &op, &ip6s);
1267 	if (error == ENOENT)
1268 		ip6s = 0;
1269 	else if (error != 0)
1270 		goto done_free;
1271 	else if (ip6s & (sizeof(struct in6_addr) - 1)) {
1272 		error = EINVAL;
1273 		goto done_free;
1274 	} else {
1275 		ch_flags |= PR_IP6_USER;
1276 		pr_flags |= PR_IP6_USER;
1277 		if (ip6s > 0) {
1278 			ip6s /= sizeof(struct in6_addr);
1279 			if (ip6s > jail_max_af_ips) {
1280 				error = EINVAL;
1281 				vfs_opterror(opts, "too many IPv6 addresses");
1282 				goto done_errmsg;
1283 			}
1284 			ip6 = prison_ip_copyin(PR_INET6, op, ip6s);
1285 			if (ip6 == NULL) {
1286 				error = EINVAL;
1287 				goto done_free;
1288 			}
1289 		}
1290 	}
1291 #endif
1292 
1293 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1294 	if ((ch_flags & PR_VNET) && (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1295 		error = EINVAL;
1296 		vfs_opterror(opts,
1297 		    "vnet jails cannot have IP address restrictions");
1298 		goto done_errmsg;
1299 	}
1300 #endif
1301 
1302 	error = vfs_getopt(opts, "osrelease", (void **)&osrelstr, &len);
1303 	if (error == ENOENT)
1304 		osrelstr = NULL;
1305 	else if (error != 0)
1306 		goto done_free;
1307 	else {
1308 		if (flags & JAIL_UPDATE) {
1309 			error = EINVAL;
1310 			vfs_opterror(opts,
1311 			    "osrelease cannot be changed after creation");
1312 			goto done_errmsg;
1313 		}
1314 		if (len == 0 || osrelstr[len - 1] != '\0') {
1315 			error = EINVAL;
1316 			goto done_free;
1317 		}
1318 		if (len >= OSRELEASELEN) {
1319 			error = ENAMETOOLONG;
1320 			vfs_opterror(opts,
1321 			    "osrelease string must be 1-%d bytes long",
1322 			    OSRELEASELEN - 1);
1323 			goto done_errmsg;
1324 		}
1325 	}
1326 
1327 	error = vfs_copyopt(opts, "osreldate", &osreldt, sizeof(osreldt));
1328 	if (error == ENOENT)
1329 		osreldt = 0;
1330 	else if (error != 0)
1331 		goto done_free;
1332 	else {
1333 		if (flags & JAIL_UPDATE) {
1334 			error = EINVAL;
1335 			vfs_opterror(opts,
1336 			    "osreldate cannot be changed after creation");
1337 			goto done_errmsg;
1338 		}
1339 		if (osreldt == 0) {
1340 			error = EINVAL;
1341 			vfs_opterror(opts, "osreldate cannot be 0");
1342 			goto done_errmsg;
1343 		}
1344 	}
1345 
1346 	root = NULL;
1347 	error = vfs_getopt(opts, "path", (void **)&path, &len);
1348 	if (error == ENOENT)
1349 		path = NULL;
1350 	else if (error != 0)
1351 		goto done_free;
1352 	else {
1353 		if (flags & JAIL_UPDATE) {
1354 			error = EINVAL;
1355 			vfs_opterror(opts,
1356 			    "path cannot be changed after creation");
1357 			goto done_errmsg;
1358 		}
1359 		if (len == 0 || path[len - 1] != '\0') {
1360 			error = EINVAL;
1361 			goto done_free;
1362 		}
1363 		NDINIT(&nd, LOOKUP, FOLLOW | LOCKLEAF, UIO_SYSSPACE, path);
1364 		error = namei(&nd);
1365 		if (error)
1366 			goto done_free;
1367 		root = nd.ni_vp;
1368 		NDFREE_PNBUF(&nd);
1369 		g_path = malloc(MAXPATHLEN, M_TEMP, M_WAITOK);
1370 		strlcpy(g_path, path, MAXPATHLEN);
1371 		error = vn_path_to_global_path(td, root, g_path, MAXPATHLEN);
1372 		if (error == 0) {
1373 			path = g_path;
1374 		} else {
1375 			/* exit on other errors */
1376 			goto done_free;
1377 		}
1378 		if (root->v_type != VDIR) {
1379 			error = ENOTDIR;
1380 			vput(root);
1381 			goto done_free;
1382 		}
1383 		VOP_UNLOCK(root);
1384 	}
1385 
1386 	/*
1387 	 * Find the specified jail, or at least its parent.
1388 	 * This abuses the file error codes ENOENT and EEXIST.
1389 	 */
1390 	pr = NULL;
1391 	inspr = NULL;
1392 	if (cuflags == JAIL_CREATE && jid == 0 && name != NULL) {
1393 		namelc = strrchr(name, '.');
1394 		jid = strtoul(namelc != NULL ? namelc + 1 : name, &p, 10);
1395 		if (*p != '\0')
1396 			jid = 0;
1397 	}
1398 	sx_xlock(&allprison_lock);
1399 	drflags = PD_LIST_XLOCKED;
1400 	ppr = mypr;
1401 	if (!prison_isalive(ppr)) {
1402 		/* This jail is dying.  This process will surely follow. */
1403 		error = EAGAIN;
1404 		goto done_deref;
1405 	}
1406 	if (jid != 0) {
1407 		if (jid < 0) {
1408 			error = EINVAL;
1409 			vfs_opterror(opts, "negative jid");
1410 			goto done_deref;
1411 		}
1412 		/*
1413 		 * See if a requested jid already exists.  Keep track of
1414 		 * where it can be inserted later.
1415 		 */
1416 		TAILQ_FOREACH(inspr, &allprison, pr_list) {
1417 			if (inspr->pr_id < jid)
1418 				continue;
1419 			if (inspr->pr_id > jid)
1420 				break;
1421 			pr = inspr;
1422 			mtx_lock(&pr->pr_mtx);
1423 			drflags |= PD_LOCKED;
1424 			inspr = NULL;
1425 			break;
1426 		}
1427 		if (pr != NULL) {
1428 			/* Create: jid must not exist. */
1429 			if (cuflags == JAIL_CREATE) {
1430 				/*
1431 				 * Even creators that cannot see the jail will
1432 				 * get EEXIST.
1433 				 */
1434 				error = EEXIST;
1435 				vfs_opterror(opts, "jail %d already exists",
1436 				    jid);
1437 				goto done_deref;
1438 			}
1439 			if (!prison_ischild(mypr, pr)) {
1440 				/*
1441 				 * Updaters get ENOENT if they cannot see the
1442 				 * jail.  This is true even for CREATE | UPDATE,
1443 				 * which normally cannot give this error.
1444 				 */
1445 				error = ENOENT;
1446 				vfs_opterror(opts, "jail %d not found", jid);
1447 				goto done_deref;
1448 			}
1449 			ppr = pr->pr_parent;
1450 			if (!prison_isalive(ppr)) {
1451 				error = ENOENT;
1452 				vfs_opterror(opts, "jail %d is dying",
1453 				    ppr->pr_id);
1454 				goto done_deref;
1455 			}
1456 			if (!prison_isalive(pr)) {
1457 				if (!(flags & JAIL_DYING)) {
1458 					error = ENOENT;
1459 					vfs_opterror(opts, "jail %d is dying",
1460 					    jid);
1461 					goto done_deref;
1462 				}
1463 				if ((flags & JAIL_ATTACH) ||
1464 				    (pr_flags & PR_PERSIST)) {
1465 					/*
1466 					 * A dying jail might be resurrected
1467 					 * (via attach or persist), but first
1468 					 * it must determine if another jail
1469 					 * has claimed its name.  Accomplish
1470 					 * this by implicitly re-setting the
1471 					 * name.
1472 					 */
1473 					if (name == NULL)
1474 						name = prison_name(mypr, pr);
1475 				}
1476 			}
1477 		} else {
1478 			/* Update: jid must exist. */
1479 			if (cuflags == JAIL_UPDATE) {
1480 				error = ENOENT;
1481 				vfs_opterror(opts, "jail %d not found", jid);
1482 				goto done_deref;
1483 			}
1484 		}
1485 	}
1486 	/*
1487 	 * If the caller provided a name, look for a jail by that name.
1488 	 * This has different semantics for creates and updates keyed by jid
1489 	 * (where the name must not already exist in a different jail),
1490 	 * and updates keyed by the name itself (where the name must exist
1491 	 * because that is the jail being updated).
1492 	 */
1493 	namelc = NULL;
1494 	if (name != NULL) {
1495 		namelc = strrchr(name, '.');
1496 		if (namelc == NULL)
1497 			namelc = name;
1498 		else {
1499 			/*
1500 			 * This is a hierarchical name.  Split it into the
1501 			 * parent and child names, and make sure the parent
1502 			 * exists or matches an already found jail.
1503 			 */
1504 			if (pr != NULL) {
1505 				if (strncmp(name, ppr->pr_name, namelc - name)
1506 				    || ppr->pr_name[namelc - name] != '\0') {
1507 					error = EINVAL;
1508 					vfs_opterror(opts,
1509 					    "cannot change jail's parent");
1510 					goto done_deref;
1511 				}
1512 			} else {
1513 				*namelc = '\0';
1514 				ppr = prison_find_name(mypr, name);
1515 				if (ppr == NULL) {
1516 					error = ENOENT;
1517 					vfs_opterror(opts,
1518 					    "jail \"%s\" not found", name);
1519 					goto done_deref;
1520 				}
1521 				mtx_unlock(&ppr->pr_mtx);
1522 				if (!prison_isalive(ppr)) {
1523 					error = ENOENT;
1524 					vfs_opterror(opts,
1525 					    "jail \"%s\" is dying", name);
1526 					goto done_deref;
1527 				}
1528 				*namelc = '.';
1529 			}
1530 			namelc++;
1531 		}
1532 		if (namelc[0] != '\0') {
1533 			pnamelen =
1534 			    (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1535 			deadpr = NULL;
1536 			FOREACH_PRISON_CHILD(ppr, tpr) {
1537 				if (tpr != pr &&
1538 				    !strcmp(tpr->pr_name + pnamelen, namelc)) {
1539 					if (prison_isalive(tpr)) {
1540 						if (pr == NULL &&
1541 						    cuflags != JAIL_CREATE) {
1542 							/*
1543 							 * Use this jail
1544 							 * for updates.
1545 							 */
1546 							pr = tpr;
1547 							mtx_lock(&pr->pr_mtx);
1548 							drflags |= PD_LOCKED;
1549 							break;
1550 						}
1551 						/*
1552 						 * Create, or update(jid):
1553 						 * name must not exist in an
1554 						 * active sibling jail.
1555 						 */
1556 						error = EEXIST;
1557 						vfs_opterror(opts,
1558 						   "jail \"%s\" already exists",
1559 						   name);
1560 						goto done_deref;
1561 					}
1562 					if (pr == NULL &&
1563 					    cuflags != JAIL_CREATE) {
1564 						deadpr = tpr;
1565 					}
1566 				}
1567 			}
1568 			/* If no active jail is found, use a dying one. */
1569 			if (deadpr != NULL && pr == NULL) {
1570 				if (flags & JAIL_DYING) {
1571 					pr = deadpr;
1572 					mtx_lock(&pr->pr_mtx);
1573 					drflags |= PD_LOCKED;
1574 				} else if (cuflags == JAIL_UPDATE) {
1575 					error = ENOENT;
1576 					vfs_opterror(opts,
1577 					    "jail \"%s\" is dying", name);
1578 					goto done_deref;
1579 				}
1580 			}
1581 			/* Update: name must exist if no jid. */
1582 			else if (cuflags == JAIL_UPDATE && pr == NULL) {
1583 				error = ENOENT;
1584 				vfs_opterror(opts, "jail \"%s\" not found",
1585 				    name);
1586 				goto done_deref;
1587 			}
1588 		}
1589 	}
1590 	/* Update: must provide a jid or name. */
1591 	else if (cuflags == JAIL_UPDATE && pr == NULL) {
1592 		error = ENOENT;
1593 		vfs_opterror(opts, "update specified no jail");
1594 		goto done_deref;
1595 	}
1596 
1597 	/* If there's no prison to update, create a new one and link it in. */
1598 	created = pr == NULL;
1599 	if (created) {
1600 		for (tpr = mypr; tpr != NULL; tpr = tpr->pr_parent)
1601 			if (tpr->pr_childcount >= tpr->pr_childmax) {
1602 				error = EPERM;
1603 				vfs_opterror(opts, "prison limit exceeded");
1604 				goto done_deref;
1605 			}
1606 		if (jid == 0 && (jid = get_next_prid(&inspr)) == 0) {
1607 			error = EAGAIN;
1608 			vfs_opterror(opts, "no available jail IDs");
1609 			goto done_deref;
1610 		}
1611 
1612 		pr = malloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
1613 		pr->pr_state = PRISON_STATE_INVALID;
1614 		refcount_init(&pr->pr_ref, 1);
1615 		refcount_init(&pr->pr_uref, 0);
1616 		drflags |= PD_DEREF;
1617 		LIST_INIT(&pr->pr_children);
1618 		mtx_init(&pr->pr_mtx, "jail mutex", NULL, MTX_DEF | MTX_DUPOK);
1619 		TASK_INIT(&pr->pr_task, 0, prison_complete, pr);
1620 
1621 		pr->pr_id = jid;
1622 		pr->pr_permid = init_permid++;
1623 		if (inspr != NULL)
1624 			TAILQ_INSERT_BEFORE(inspr, pr, pr_list);
1625 		else
1626 			TAILQ_INSERT_TAIL(&allprison, pr, pr_list);
1627 
1628 		pr->pr_parent = ppr;
1629 		prison_hold(ppr);
1630 		prison_proc_hold(ppr);
1631 		LIST_INSERT_HEAD(&ppr->pr_children, pr, pr_sibling);
1632 		for (tpr = ppr; tpr != NULL; tpr = tpr->pr_parent)
1633 			tpr->pr_childcount++;
1634 
1635 		/* Set some default values, and inherit some from the parent. */
1636 		if (namelc == NULL)
1637 			namelc = "";
1638 		if (path == NULL) {
1639 			path = "/";
1640 			root = mypr->pr_root;
1641 			vref(root);
1642 		}
1643 		strlcpy(pr->pr_hostuuid, DEFAULT_HOSTUUID, HOSTUUIDLEN);
1644 		pr->pr_flags |= PR_HOST;
1645 #if defined(INET) || defined(INET6)
1646 #ifdef VIMAGE
1647 		if (!(pr_flags & PR_VNET))
1648 #endif
1649 		{
1650 #ifdef INET
1651 			if (!(ch_flags & PR_IP4_USER))
1652 				pr->pr_flags |= PR_IP4 | PR_IP4_USER;
1653 			else if (!(pr_flags & PR_IP4_USER)) {
1654 				pr->pr_flags |= ppr->pr_flags & PR_IP4;
1655 				prison_ip_dup(ppr, pr, PR_INET);
1656 			}
1657 #endif
1658 #ifdef INET6
1659 			if (!(ch_flags & PR_IP6_USER))
1660 				pr->pr_flags |= PR_IP6 | PR_IP6_USER;
1661 			else if (!(pr_flags & PR_IP6_USER)) {
1662 				pr->pr_flags |= ppr->pr_flags & PR_IP6;
1663 				prison_ip_dup(ppr, pr, PR_INET6);
1664 			}
1665 #endif
1666 		}
1667 #endif
1668 		/* Source address selection is always on by default. */
1669 		pr->pr_flags |= _PR_IP_SADDRSEL;
1670 
1671 		pr->pr_securelevel = ppr->pr_securelevel;
1672 		pr->pr_allow = JAIL_DEFAULT_ALLOW & ppr->pr_allow;
1673 		pr->pr_enforce_statfs = jail_default_enforce_statfs;
1674 		pr->pr_devfs_rsnum = ppr->pr_devfs_rsnum;
1675 
1676 		pr->pr_osreldate = osreldt ? osreldt : ppr->pr_osreldate;
1677 		if (osrelstr == NULL)
1678 			strlcpy(pr->pr_osrelease, ppr->pr_osrelease,
1679 			    sizeof(pr->pr_osrelease));
1680 		else
1681 			strlcpy(pr->pr_osrelease, osrelstr,
1682 			    sizeof(pr->pr_osrelease));
1683 
1684 #ifdef VIMAGE
1685 		/* Allocate a new vnet if specified. */
1686 		pr->pr_vnet = (pr_flags & PR_VNET)
1687 		    ? vnet_alloc() : ppr->pr_vnet;
1688 #endif
1689 		/*
1690 		 * Allocate a dedicated cpuset for each jail.
1691 		 * Unlike other initial settings, this may return an error.
1692 		 */
1693 		error = cpuset_create_root(ppr, &pr->pr_cpuset);
1694 		if (error)
1695 			goto done_deref;
1696 
1697 		mtx_lock(&pr->pr_mtx);
1698 		drflags |= PD_LOCKED;
1699 	} else {
1700 		/*
1701 		 * Grab a reference for existing prisons, to ensure they
1702 		 * continue to exist for the duration of the call.
1703 		 */
1704 		prison_hold(pr);
1705 		drflags |= PD_DEREF;
1706 #if defined(VIMAGE) && (defined(INET) || defined(INET6))
1707 		if ((pr->pr_flags & PR_VNET) &&
1708 		    (ch_flags & (PR_IP4_USER | PR_IP6_USER))) {
1709 			error = EINVAL;
1710 			vfs_opterror(opts,
1711 			    "vnet jails cannot have IP address restrictions");
1712 			goto done_deref;
1713 		}
1714 #endif
1715 #ifdef INET
1716 		if (PR_IP4_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1717 			error = EINVAL;
1718 			vfs_opterror(opts,
1719 			    "ip4 cannot be changed after creation");
1720 			goto done_deref;
1721 		}
1722 #endif
1723 #ifdef INET6
1724 		if (PR_IP6_USER & ch_flags & (pr_flags ^ pr->pr_flags)) {
1725 			error = EINVAL;
1726 			vfs_opterror(opts,
1727 			    "ip6 cannot be changed after creation");
1728 			goto done_deref;
1729 		}
1730 #endif
1731 	}
1732 
1733 	/* Do final error checking before setting anything. */
1734 	if (gotslevel) {
1735 		if (slevel < ppr->pr_securelevel) {
1736 			error = EPERM;
1737 			goto done_deref;
1738 		}
1739 	}
1740 	if (gotchildmax) {
1741 		if (childmax >= ppr->pr_childmax) {
1742 			error = EPERM;
1743 			goto done_deref;
1744 		}
1745 	}
1746 	if (gotenforce) {
1747 		if (enforce < ppr->pr_enforce_statfs) {
1748 			error = EPERM;
1749 			goto done_deref;
1750 		}
1751 	}
1752 	if (gotrsnum) {
1753 		/*
1754 		 * devfs_rsnum is a uint16_t
1755 		 */
1756 		if (rsnum < 0 || rsnum > 65535) {
1757 			error = EINVAL;
1758 			goto done_deref;
1759 		}
1760 		/*
1761 		 * Nested jails always inherit parent's devfs ruleset
1762 		 */
1763 		if (jailed(td->td_ucred)) {
1764 			if (rsnum > 0 && rsnum != ppr->pr_devfs_rsnum) {
1765 				error = EPERM;
1766 				goto done_deref;
1767 			} else
1768 				rsnum = ppr->pr_devfs_rsnum;
1769 		}
1770 	}
1771 #ifdef INET
1772 	if (ip4s > 0) {
1773 		if ((ppr->pr_flags & PR_IP4) &&
1774 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET], ip4,
1775 		    PR_INET)) {
1776 			error = EPERM;
1777 			goto done_deref;
1778 		}
1779 		if (!prison_ip_conflict_check(ppr, pr, ip4, PR_INET)) {
1780 			error = EADDRINUSE;
1781 			vfs_opterror(opts, "IPv4 addresses clash");
1782 			goto done_deref;
1783 		}
1784 	}
1785 #endif
1786 #ifdef INET6
1787 	if (ip6s > 0) {
1788 		if ((ppr->pr_flags & PR_IP6) &&
1789 		    !prison_ip_parent_match(ppr->pr_addrs[PR_INET6], ip6,
1790 		    PR_INET6)) {
1791 			error = EPERM;
1792 			goto done_deref;
1793 		}
1794 		if (!prison_ip_conflict_check(ppr, pr, ip6, PR_INET6)) {
1795 			error = EADDRINUSE;
1796 			vfs_opterror(opts, "IPv6 addresses clash");
1797 			goto done_deref;
1798 		}
1799 	}
1800 #endif
1801 	onamelen = namelen = 0;
1802 	if (namelc != NULL) {
1803 		/* Give a default name of the jid.  Also allow the name to be
1804 		 * explicitly the jid - but not any other number, and only in
1805 		 * normal form (no leading zero/etc).
1806 		 */
1807 		if (namelc[0] == '\0')
1808 			snprintf(namelc = numbuf, sizeof(numbuf), "%d", jid);
1809 		else if ((strtoul(namelc, &p, 10) != jid ||
1810 			  namelc[0] < '1' || namelc[0] > '9') && *p == '\0') {
1811 			error = EINVAL;
1812 			vfs_opterror(opts,
1813 			    "name cannot be numeric (unless it is the jid)");
1814 			goto done_deref;
1815 		}
1816 		/*
1817 		 * Make sure the name isn't too long for the prison or its
1818 		 * children.
1819 		 */
1820 		pnamelen = (ppr == &prison0) ? 0 : strlen(ppr->pr_name) + 1;
1821 		onamelen = strlen(pr->pr_name + pnamelen);
1822 		namelen = strlen(namelc);
1823 		if (pnamelen + namelen + 1 > sizeof(pr->pr_name)) {
1824 			error = ENAMETOOLONG;
1825 			goto done_deref;
1826 		}
1827 		FOREACH_PRISON_DESCENDANT(pr, tpr, descend) {
1828 			if (strlen(tpr->pr_name) + (namelen - onamelen) >=
1829 			    sizeof(pr->pr_name)) {
1830 				error = ENAMETOOLONG;
1831 				goto done_deref;
1832 			}
1833 		}
1834 	}
1835 	pr_allow_diff = pr_allow & ~ppr->pr_allow;
1836 	if (pr_allow_diff & ~PR_ALLOW_DIFFERENCES) {
1837 		error = EPERM;
1838 		goto done_deref;
1839 	}
1840 
1841 	/*
1842 	 * Let modules check their parameters.  This requires unlocking and
1843 	 * then re-locking the prison, but this is still a valid state as long
1844 	 * as allprison_lock remains xlocked.
1845 	 */
1846 	mtx_unlock(&pr->pr_mtx);
1847 	drflags &= ~PD_LOCKED;
1848 	error = osd_jail_call(pr, PR_METHOD_CHECK, opts);
1849 	if (error != 0)
1850 		goto done_deref;
1851 	mtx_lock(&pr->pr_mtx);
1852 	drflags |= PD_LOCKED;
1853 
1854 	/* At this point, all valid parameters should have been noted. */
1855 	TAILQ_FOREACH(opt, opts, link) {
1856 		if (!opt->seen && strcmp(opt->name, "errmsg")) {
1857 			error = EINVAL;
1858 			vfs_opterror(opts, "unknown parameter: %s", opt->name);
1859 			goto done_deref;
1860 		}
1861 	}
1862 
1863 	/* Set the parameters of the prison. */
1864 #ifdef INET
1865 	redo_ip4 = false;
1866 	if (pr_flags & PR_IP4_USER) {
1867 		pr->pr_flags |= PR_IP4;
1868 		prison_ip_set(pr, PR_INET, ip4);
1869 		ip4 = NULL;
1870 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1871 #ifdef VIMAGE
1872 			if (tpr->pr_flags & PR_VNET) {
1873 				descend = 0;
1874 				continue;
1875 			}
1876 #endif
1877 			if (!prison_ip_restrict(tpr, PR_INET, NULL)) {
1878 				redo_ip4 = true;
1879 				descend = 0;
1880 			}
1881 		}
1882 	}
1883 #endif
1884 #ifdef INET6
1885 	redo_ip6 = false;
1886 	if (pr_flags & PR_IP6_USER) {
1887 		pr->pr_flags |= PR_IP6;
1888 		prison_ip_set(pr, PR_INET6, ip6);
1889 		ip6 = NULL;
1890 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1891 #ifdef VIMAGE
1892 			if (tpr->pr_flags & PR_VNET) {
1893 				descend = 0;
1894 				continue;
1895 			}
1896 #endif
1897 			if (!prison_ip_restrict(tpr, PR_INET6, NULL)) {
1898 				redo_ip6 = true;
1899 				descend = 0;
1900 			}
1901 		}
1902 	}
1903 #endif
1904 	if (gotslevel) {
1905 		pr->pr_securelevel = slevel;
1906 		/* Set all child jails to be at least this level. */
1907 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1908 			if (tpr->pr_securelevel < slevel)
1909 				tpr->pr_securelevel = slevel;
1910 	}
1911 	if (gotchildmax) {
1912 		pr->pr_childmax = childmax;
1913 		/* Set all child jails to under this limit. */
1914 		FOREACH_PRISON_DESCENDANT_LOCKED_LEVEL(pr, tpr, descend, level)
1915 			if (tpr->pr_childmax > childmax - level)
1916 				tpr->pr_childmax = childmax > level
1917 				    ? childmax - level : 0;
1918 	}
1919 	if (gotenforce) {
1920 		pr->pr_enforce_statfs = enforce;
1921 		/* Pass this restriction on to the children. */
1922 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1923 			if (tpr->pr_enforce_statfs < enforce)
1924 				tpr->pr_enforce_statfs = enforce;
1925 	}
1926 	if (gotrsnum) {
1927 		pr->pr_devfs_rsnum = rsnum;
1928 		/* Pass this restriction on to the children. */
1929 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend)
1930 			tpr->pr_devfs_rsnum = rsnum;
1931 	}
1932 	if (namelc != NULL) {
1933 		if (ppr == &prison0)
1934 			strlcpy(pr->pr_name, namelc, sizeof(pr->pr_name));
1935 		else
1936 			snprintf(pr->pr_name, sizeof(pr->pr_name), "%s.%s",
1937 			    ppr->pr_name, namelc);
1938 		/* Change this component of child names. */
1939 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1940 			bcopy(tpr->pr_name + onamelen, tpr->pr_name + namelen,
1941 			    strlen(tpr->pr_name + onamelen) + 1);
1942 			bcopy(pr->pr_name, tpr->pr_name, namelen);
1943 		}
1944 	}
1945 	if (path != NULL) {
1946 		/* Try to keep a real-rooted full pathname. */
1947 		strlcpy(pr->pr_path, path, sizeof(pr->pr_path));
1948 		pr->pr_root = root;
1949 		root = NULL;
1950 	}
1951 	if (PR_HOST & ch_flags & ~pr_flags) {
1952 		if (pr->pr_flags & PR_HOST) {
1953 			/*
1954 			 * Copy the parent's host info.  As with pr_ip4 above,
1955 			 * the lack of a lock on the parent is not a problem;
1956 			 * it is always set with allprison_lock at least
1957 			 * shared, and is held exclusively here.
1958 			 */
1959 			strlcpy(pr->pr_hostname, pr->pr_parent->pr_hostname,
1960 			    sizeof(pr->pr_hostname));
1961 			strlcpy(pr->pr_domainname, pr->pr_parent->pr_domainname,
1962 			    sizeof(pr->pr_domainname));
1963 			strlcpy(pr->pr_hostuuid, pr->pr_parent->pr_hostuuid,
1964 			    sizeof(pr->pr_hostuuid));
1965 			pr->pr_hostid = pr->pr_parent->pr_hostid;
1966 		}
1967 	} else if (host != NULL || domain != NULL || uuid != NULL || gothid) {
1968 		/* Set this prison, and any descendants without PR_HOST. */
1969 		if (host != NULL)
1970 			strlcpy(pr->pr_hostname, host, sizeof(pr->pr_hostname));
1971 		if (domain != NULL)
1972 			strlcpy(pr->pr_domainname, domain,
1973 			    sizeof(pr->pr_domainname));
1974 		if (uuid != NULL)
1975 			strlcpy(pr->pr_hostuuid, uuid, sizeof(pr->pr_hostuuid));
1976 		if (gothid)
1977 			pr->pr_hostid = hid;
1978 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
1979 			if (tpr->pr_flags & PR_HOST)
1980 				descend = 0;
1981 			else {
1982 				if (host != NULL)
1983 					strlcpy(tpr->pr_hostname,
1984 					    pr->pr_hostname,
1985 					    sizeof(tpr->pr_hostname));
1986 				if (domain != NULL)
1987 					strlcpy(tpr->pr_domainname,
1988 					    pr->pr_domainname,
1989 					    sizeof(tpr->pr_domainname));
1990 				if (uuid != NULL)
1991 					strlcpy(tpr->pr_hostuuid,
1992 					    pr->pr_hostuuid,
1993 					    sizeof(tpr->pr_hostuuid));
1994 				if (gothid)
1995 					tpr->pr_hostid = hid;
1996 			}
1997 		}
1998 	}
1999 	pr->pr_allow = (pr->pr_allow & ~ch_allow) | pr_allow;
2000 	if ((tallow = ch_allow & ~pr_allow))
2001 		prison_set_allow_locked(pr, tallow, 0);
2002 	/*
2003 	 * Persistent prisons get an extra reference, and prisons losing their
2004 	 * persist flag lose that reference.
2005 	 */
2006 	born = !prison_isalive(pr);
2007 	if (ch_flags & PR_PERSIST & (pr_flags ^ pr->pr_flags)) {
2008 		if (pr_flags & PR_PERSIST) {
2009 			prison_hold(pr);
2010 			/*
2011 			 * This may make a dead prison alive again, but wait
2012 			 * to label it as such until after OSD calls have had
2013 			 * a chance to run (and perhaps to fail).
2014 			 */
2015 			refcount_acquire(&pr->pr_uref);
2016 		} else {
2017 			drflags |= PD_DEUREF;
2018 			prison_free_not_last(pr);
2019 		}
2020 	}
2021 	pr->pr_flags = (pr->pr_flags & ~ch_flags) | pr_flags;
2022 	mtx_unlock(&pr->pr_mtx);
2023 	drflags &= ~PD_LOCKED;
2024 	/*
2025 	 * Any errors past this point will need to de-persist newly created
2026 	 * prisons, as well as call remove methods.
2027 	 */
2028 	if (born)
2029 		drflags |= PD_KILL;
2030 
2031 #ifdef RACCT
2032 	if (racct_enable && created)
2033 		prison_racct_attach(pr);
2034 #endif
2035 
2036 	/* Locks may have prevented a complete restriction of child IP
2037 	 * addresses.  If so, allocate some more memory and try again.
2038 	 */
2039 #ifdef INET
2040 	while (redo_ip4) {
2041 		ip4s = pr->pr_addrs[PR_INET]->ips;
2042 		MPASS(ip4 == NULL);
2043 		ip4 = prison_ip_alloc(PR_INET, ip4s, M_WAITOK);
2044 		mtx_lock(&pr->pr_mtx);
2045 		redo_ip4 = false;
2046 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2047 #ifdef VIMAGE
2048 			if (tpr->pr_flags & PR_VNET) {
2049 				descend = 0;
2050 				continue;
2051 			}
2052 #endif
2053 			redo_ip4 = !prison_ip_restrict(tpr, PR_INET, &ip4);
2054 		}
2055 		mtx_unlock(&pr->pr_mtx);
2056 	}
2057 #endif
2058 #ifdef INET6
2059 	while (redo_ip6) {
2060 		ip6s = pr->pr_addrs[PR_INET6]->ips;
2061 		MPASS(ip6 == NULL);
2062 		ip6 = prison_ip_alloc(PR_INET6, ip6s, M_WAITOK);
2063 		mtx_lock(&pr->pr_mtx);
2064 		redo_ip6 = false;
2065 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, tpr, descend) {
2066 #ifdef VIMAGE
2067 			if (tpr->pr_flags & PR_VNET) {
2068 				descend = 0;
2069 				continue;
2070 			}
2071 #endif
2072 			redo_ip6 = !prison_ip_restrict(tpr, PR_INET6, &ip6);
2073 		}
2074 		mtx_unlock(&pr->pr_mtx);
2075 	}
2076 #endif
2077 
2078 	/* Let the modules do their work. */
2079 	if (born) {
2080 		error = osd_jail_call(pr, PR_METHOD_CREATE, opts);
2081 		if (error)
2082 			goto done_deref;
2083 	}
2084 	error = osd_jail_call(pr, PR_METHOD_SET, opts);
2085 	if (error)
2086 		goto done_deref;
2087 
2088 	/*
2089 	 * A new prison is now ready to be seen; either it has gained a user
2090 	 * reference via persistence, or is about to gain one via attachment.
2091 	 */
2092 	if (born) {
2093 		drflags = prison_lock_xlock(pr, drflags);
2094 		pr->pr_state = PRISON_STATE_ALIVE;
2095 	}
2096 
2097 	/* Attach this process to the prison if requested. */
2098 	if (flags & JAIL_ATTACH) {
2099 		error = do_jail_attach(td, pr,
2100 		    prison_lock_xlock(pr, drflags & PD_LOCK_FLAGS));
2101 		drflags &= ~(PD_LOCKED | PD_LIST_XLOCKED);
2102 		if (error) {
2103 			vfs_opterror(opts, "attach failed");
2104 			goto done_deref;
2105 		}
2106 	}
2107 
2108 #ifdef RACCT
2109 	if (racct_enable && !created) {
2110 		if (drflags & PD_LOCKED) {
2111 			mtx_unlock(&pr->pr_mtx);
2112 			drflags &= ~PD_LOCKED;
2113 		}
2114 		if (drflags & PD_LIST_XLOCKED) {
2115 			sx_xunlock(&allprison_lock);
2116 			drflags &= ~PD_LIST_XLOCKED;
2117 		}
2118 		prison_racct_modify(pr);
2119 	}
2120 #endif
2121 
2122 #ifdef VNET_NFSD
2123 	if (born && pr != &prison0 && (pr->pr_allow & PR_ALLOW_NFSD) != 0 &&
2124 	    (pr->pr_root->v_vflag & VV_ROOT) == 0)
2125 		printf("Warning jail jid=%d: mountd/nfsd requires a separate"
2126 		   " file system\n", pr->pr_id);
2127 #endif
2128 
2129 	drflags &= ~PD_KILL;
2130 	td->td_retval[0] = pr->pr_id;
2131 
2132  done_deref:
2133 	/* Release any temporary prison holds and/or locks. */
2134 	if (pr != NULL)
2135 		prison_deref(pr, drflags);
2136 	else if (drflags & PD_LIST_SLOCKED)
2137 		sx_sunlock(&allprison_lock);
2138 	else if (drflags & PD_LIST_XLOCKED)
2139 		sx_xunlock(&allprison_lock);
2140 	if (root != NULL)
2141 		vrele(root);
2142  done_errmsg:
2143 	if (error) {
2144 		/* Write the error message back to userspace. */
2145 		if (vfs_getopt(opts, "errmsg", (void **)&errmsg,
2146 		    &errmsg_len) == 0 && errmsg_len > 0) {
2147 			errmsg_pos = 2 * vfs_getopt_pos(opts, "errmsg") + 1;
2148 			if (optuio->uio_segflg == UIO_SYSSPACE)
2149 				bcopy(errmsg,
2150 				    optuio->uio_iov[errmsg_pos].iov_base,
2151 				    errmsg_len);
2152 			else
2153 				copyout(errmsg,
2154 				    optuio->uio_iov[errmsg_pos].iov_base,
2155 				    errmsg_len);
2156 		}
2157 	}
2158  done_free:
2159 #ifdef INET
2160 	prison_ip_free(ip4);
2161 #endif
2162 #ifdef INET6
2163 	prison_ip_free(ip6);
2164 #endif
2165 	if (g_path != NULL)
2166 		free(g_path, M_TEMP);
2167 	vfs_freeopts(opts);
2168 	return (error);
2169 }
2170 
2171 /*
2172  * Find the next available prison ID.  Return the ID on success, or zero
2173  * on failure.  Also set a pointer to the allprison list entry the prison
2174  * should be inserted before.
2175  */
2176 static int
2177 get_next_prid(struct prison **insprp)
2178 {
2179 	struct prison *inspr;
2180 	int jid, maxid;
2181 
2182 	jid = lastprid % JAIL_MAX + 1;
2183 	if (TAILQ_EMPTY(&allprison) ||
2184 	    TAILQ_LAST(&allprison, prisonlist)->pr_id < jid) {
2185 		/*
2186 		 * A common case is for all jails to be implicitly numbered,
2187 		 * which means they'll go on the end of the list, at least
2188 		 * for the first JAIL_MAX times.
2189 		 */
2190 		inspr = NULL;
2191 	} else {
2192 		/*
2193 		 * Take two passes through the allprison list: first starting
2194 		 * with the proposed jid, then ending with it.
2195 		 */
2196 		for (maxid = JAIL_MAX; maxid != 0; ) {
2197 			TAILQ_FOREACH(inspr, &allprison, pr_list) {
2198 				if (inspr->pr_id < jid)
2199 					continue;
2200 				if (inspr->pr_id > jid) {
2201 					/* Found an opening. */
2202 					maxid = 0;
2203 					break;
2204 				}
2205 				if (++jid > maxid) {
2206 					if (lastprid == maxid || lastprid == 0)
2207 					{
2208 						/*
2209 						 * The entire legal range
2210 						 * has been traversed
2211 						 */
2212 						return 0;
2213 					}
2214 					/* Try again from the start. */
2215 					jid = 1;
2216 					maxid = lastprid;
2217 					break;
2218 				}
2219 			}
2220 			if (inspr == NULL) {
2221 				/* Found room at the end of the list. */
2222 				break;
2223 			}
2224 		}
2225 	}
2226 	*insprp = inspr;
2227 	lastprid = jid;
2228 	return (jid);
2229 }
2230 
2231 /*
2232  * struct jail_get_args {
2233  *	struct iovec *iovp;
2234  *	unsigned int iovcnt;
2235  *	int flags;
2236  * };
2237  */
2238 int
2239 sys_jail_get(struct thread *td, struct jail_get_args *uap)
2240 {
2241 	struct uio *auio;
2242 	int error;
2243 
2244 	/* Check that we have an even number of iovecs. */
2245 	if (uap->iovcnt & 1)
2246 		return (EINVAL);
2247 
2248 	error = copyinuio(uap->iovp, uap->iovcnt, &auio);
2249 	if (error)
2250 		return (error);
2251 	error = kern_jail_get(td, auio, uap->flags);
2252 	if (error == 0)
2253 		error = copyout(auio->uio_iov, uap->iovp,
2254 		    uap->iovcnt * sizeof (struct iovec));
2255 	free(auio, M_IOV);
2256 	return (error);
2257 }
2258 
/*
 * Look up one prison and report its parameters through an option list.
 *
 * The option list is built from optuio with vfs_buildopts().  The prison is
 * selected by the first of "lastjid", "jid" (when non-zero) or "name" that
 * is present in the list.  It is looked up relative to the calling thread's
 * own prison, and a prison that is not alive is only accepted when
 * JAIL_DYING is set in flags.  Each known parameter is then offered to the
 * list with vfs_setopt*(); an ENOENT result from those calls is tolerated
 * (presumably it means the caller did not ask for that parameter -- confirm
 * against vfs_setopt()).
 *
 * td->td_retval[0] is set to the prison's jid once the prison is found.  On
 * success the fetched values and their lengths are written back through the
 * value iovec (index 2 * pos + 1) of each option, and 0 is returned.  On
 * failure an errno value is returned and, when the caller supplied an
 * "errmsg" option, the message text is copied back into its value buffer.
 */
int
kern_jail_get(struct thread *td, struct uio *optuio, int flags)
{
	struct bool_flags *bf;
	struct jailsys_flags *jsf;
	struct prison *pr, *mypr;
	struct vfsopt *opt;
	struct vfsoptlist *opts;
	char *errmsg, *name;
	int drflags, error, errmsg_len, errmsg_pos, i, jid, len, pos;
	unsigned f;

	/* Reject any flag bit outside JAIL_GET_MASK. */
	if (flags & ~JAIL_GET_MASK)
		return (EINVAL);

	/* Get the parameter list. */
	error = vfs_buildopts(optuio, &opts);
	if (error)
		return (error);
	/* Remember where "errmsg" sits; the done: path tests it for >= 0. */
	errmsg_pos = vfs_getopt_pos(opts, "errmsg");
	mypr = td->td_ucred->cr_prison;
	pr = NULL;

	/*
	 * Find the prison specified by one of: lastjid, jid, name.
	 */
	sx_slock(&allprison_lock);
	drflags = PD_LIST_SLOCKED;
	error = vfs_copyopt(opts, "lastjid", &jid, sizeof(jid));
	if (error == 0) {
		/*
		 * "lastjid": take the first prison in list order whose ID is
		 * above jid, that is alive (or JAIL_DYING was given), and
		 * that passes prison_ischild(mypr, pr).
		 */
		TAILQ_FOREACH(pr, &allprison, pr_list) {
			if (pr->pr_id > jid &&
			    ((flags & JAIL_DYING) || prison_isalive(pr)) &&
			    prison_ischild(mypr, pr)) {
				mtx_lock(&pr->pr_mtx);
				drflags |= PD_LOCKED;
				goto found_prison;
			}
		}
		error = ENOENT;
		vfs_opterror(opts, "no jail after %d", jid);
		goto done;
	} else if (error != ENOENT)
		goto done;

	error = vfs_copyopt(opts, "jid", &jid, sizeof(jid));
	if (error == 0) {
		/* A jid of zero falls through to the "name" lookup. */
		if (jid != 0) {
			pr = prison_find_child(mypr, jid);
			if (pr != NULL) {
				/* prison_find_child() returned it locked. */
				drflags |= PD_LOCKED;
				if (!(prison_isalive(pr) ||
				    (flags & JAIL_DYING))) {
					error = ENOENT;
					vfs_opterror(opts, "jail %d is dying",
					    jid);
					goto done;
				}
				goto found_prison;
			}
			error = ENOENT;
			vfs_opterror(opts, "jail %d not found", jid);
			goto done;
		}
	} else if (error != ENOENT)
		goto done;

	error = vfs_getopt(opts, "name", (void **)&name, &len);
	if (error == 0) {
		/* The name must be a non-empty, NUL-terminated buffer. */
		if (len == 0 || name[len - 1] != '\0') {
			error = EINVAL;
			goto done;
		}
		pr = prison_find_name(mypr, name);
		if (pr != NULL) {
			/* prison_find_name() returned it locked. */
			drflags |= PD_LOCKED;
			if (!(prison_isalive(pr) || (flags & JAIL_DYING))) {
				error = ENOENT;
				vfs_opterror(opts, "jail \"%s\" is dying",
				    name);
				goto done;
			}
			goto found_prison;
		}
		error = ENOENT;
		vfs_opterror(opts, "jail \"%s\" not found", name);
		goto done;
	} else if (error != ENOENT)
		goto done;

	/* None of lastjid, jid or name identified a prison. */
	vfs_opterror(opts, "no jail specified");
	error = ENOENT;
	goto done;

 found_prison:
	/* Get the parameters of the prison. */
	/*
	 * Take a reference so the prison can still be handed to
	 * osd_jail_call() below after its mutex has been dropped; PD_DEREF
	 * makes prison_deref() give the reference back.
	 */
	prison_hold(pr);
	drflags |= PD_DEREF;
	td->td_retval[0] = pr->pr_id;
	error = vfs_setopt(opts, "jid", &pr->pr_id, sizeof(pr->pr_id));
	if (error != 0 && error != ENOENT)
		goto done;
	/* A parent that is the caller's own prison is reported as 0. */
	i = (pr->pr_parent == mypr) ? 0 : pr->pr_parent->pr_id;
	error = vfs_setopt(opts, "parent", &i, sizeof(i));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopts(opts, "name", prison_name(mypr, pr));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopt(opts, "cpuset.id", &pr->pr_cpuset->cs_id,
	    sizeof(pr->pr_cpuset->cs_id));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopts(opts, "path", prison_path(mypr, pr));
	if (error != 0 && error != ENOENT)
		goto done;
#ifdef INET
	/*
	 * The data pointer is the address just past the pr_addrs header
	 * object; the length is ips * per-family size, or 0 when the prison
	 * has no address list.
	 */
	error = vfs_setopt_part(opts, "ip4.addr", pr->pr_addrs[PR_INET] + 1,
	    pr->pr_addrs[PR_INET] ? pr->pr_addrs[PR_INET]->ips *
	    pr_families[PR_INET].size : 0 );
	if (error != 0 && error != ENOENT)
		goto done;
#endif
#ifdef INET6
	/* Same layout as the IPv4 case above. */
	error = vfs_setopt_part(opts, "ip6.addr", pr->pr_addrs[PR_INET6] + 1,
	    pr->pr_addrs[PR_INET6] ? pr->pr_addrs[PR_INET6]->ips *
	    pr_families[PR_INET6].size : 0 );
	if (error != 0 && error != ENOENT)
		goto done;
#endif
	error = vfs_setopt(opts, "securelevel", &pr->pr_securelevel,
	    sizeof(pr->pr_securelevel));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopt(opts, "children.cur", &pr->pr_childcount,
	    sizeof(pr->pr_childcount));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopt(opts, "children.max", &pr->pr_childmax,
	    sizeof(pr->pr_childmax));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopts(opts, "host.hostname", pr->pr_hostname);
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopts(opts, "host.domainname", pr->pr_domainname);
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopts(opts, "host.hostuuid", pr->pr_hostuuid);
	if (error != 0 && error != ENOENT)
		goto done;
#ifdef COMPAT_FREEBSD32
	/* A 32-bit (ILP32) process gets the host ID as a uint32_t. */
	if (SV_PROC_FLAG(td->td_proc, SV_ILP32)) {
		uint32_t hid32 = pr->pr_hostid;

		error = vfs_setopt(opts, "host.hostid", &hid32, sizeof(hid32));
	} else
#endif
	/* With COMPAT_FREEBSD32 this statement is the else branch above. */
	error = vfs_setopt(opts, "host.hostid", &pr->pr_hostid,
	    sizeof(pr->pr_hostid));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopt(opts, "enforce_statfs", &pr->pr_enforce_statfs,
	    sizeof(pr->pr_enforce_statfs));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopt(opts, "devfs_ruleset", &pr->pr_devfs_rsnum,
	    sizeof(pr->pr_devfs_rsnum));
	if (error != 0 && error != ENOENT)
		goto done;
	/* Boolean flags: report each under its name and its "no" name. */
	for (bf = pr_flag_bool;
	     bf < pr_flag_bool + nitems(pr_flag_bool);
	     bf++) {
		i = (pr->pr_flags & bf->flag) ? 1 : 0;
		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done;
		i = !i;
		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done;
	}
	/*
	 * Jailsys flags: only the "disable" bit(s) set means
	 * JAIL_SYS_DISABLE, exactly the "new" bit(s) set means JAIL_SYS_NEW,
	 * and anything else is reported as JAIL_SYS_INHERIT.
	 */
	for (jsf = pr_flag_jailsys;
	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
	     jsf++) {
		f = pr->pr_flags & (jsf->disable | jsf->new);
		i = (f != 0 && f == jsf->disable) ? JAIL_SYS_DISABLE
		    : (f == jsf->new) ? JAIL_SYS_NEW
		    : JAIL_SYS_INHERIT;
		error = vfs_setopt(opts, jsf->name, &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done;
	}
	/*
	 * Allow flags: the walk stops at the first table entry whose flag
	 * reads as zero (or at the end of the table).
	 */
	for (bf = pr_flag_allow;
	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
		atomic_load_int(&bf->flag) != 0;
	     bf++) {
		i = (pr->pr_allow & bf->flag) ? 1 : 0;
		error = vfs_setopt(opts, bf->name, &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done;
		i = !i;
		error = vfs_setopt(opts, bf->noname, &i, sizeof(i));
		if (error != 0 && error != ENOENT)
			goto done;
	}
	/* "dying" and "nodying" are the two polarities of !alive. */
	i = !prison_isalive(pr);
	error = vfs_setopt(opts, "dying", &i, sizeof(i));
	if (error != 0 && error != ENOENT)
		goto done;
	i = !i;
	error = vfs_setopt(opts, "nodying", &i, sizeof(i));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopt(opts, "osreldate", &pr->pr_osreldate,
	    sizeof(pr->pr_osreldate));
	if (error != 0 && error != ENOENT)
		goto done;
	error = vfs_setopts(opts, "osrelease", pr->pr_osrelease);
	if (error != 0 && error != ENOENT)
		goto done;

	/* Get the module parameters. */
	/* The prison mutex is dropped before calling into the modules. */
	mtx_unlock(&pr->pr_mtx);
	drflags &= ~PD_LOCKED;
	error = osd_jail_call(pr, PR_METHOD_GET, opts);
	if (error)
		goto done;
	/*
	 * Finished with the prison: release the reference and the list lock
	 * now, and clear pr/drflags so the done: path does not do so again.
	 */
	prison_deref(pr, drflags);
	pr = NULL;
	drflags = 0;

	/* By now, all parameters should have been noted. */
	TAILQ_FOREACH(opt, opts, link) {
		if (!opt->seen && strcmp(opt->name, "errmsg")) {
			error = EINVAL;
			vfs_opterror(opts, "unknown parameter: %s", opt->name);
			goto done;
		}
	}

	/* Write the fetched parameters back to userspace. */
	error = 0;
	TAILQ_FOREACH(opt, opts, link) {
		if (opt->pos >= 0 && opt->pos != errmsg_pos) {
			/* Option N's value lives in iovec 2N + 1. */
			pos = 2 * opt->pos + 1;
			optuio->uio_iov[pos].iov_len = opt->len;
			if (opt->value != NULL) {
				if (optuio->uio_segflg == UIO_SYSSPACE) {
					bcopy(opt->value,
					    optuio->uio_iov[pos].iov_base,
					    opt->len);
				} else {
					error = copyout(opt->value,
					    optuio->uio_iov[pos].iov_base,
					    opt->len);
					if (error)
						break;
				}
			}
		}
	}

 done:
	/* Release any temporary prison holds and/or locks. */
	if (pr != NULL)
		prison_deref(pr, drflags);
	else if (drflags & PD_LIST_SLOCKED)
		sx_sunlock(&allprison_lock);
	if (error && errmsg_pos >= 0) {
		/* Write the error message back to userspace. */
		vfs_getopt(opts, "errmsg", (void **)&errmsg, &errmsg_len);
		/* Convert the option index into its value iovec index. */
		errmsg_pos = 2 * errmsg_pos + 1;
		if (errmsg_len > 0) {
			/* The result of this copy is deliberately unchecked. */
			if (optuio->uio_segflg == UIO_SYSSPACE)
				bcopy(errmsg,
				    optuio->uio_iov[errmsg_pos].iov_base,
				    errmsg_len);
			else
				copyout(errmsg,
				    optuio->uio_iov[errmsg_pos].iov_base,
				    errmsg_len);
		}
	}
	vfs_freeopts(opts);
	return (error);
}
2546 
2547 /*
2548  * struct jail_remove_args {
2549  *	int jid;
2550  * };
2551  */
2552 int
2553 sys_jail_remove(struct thread *td, struct jail_remove_args *uap)
2554 {
2555 	struct prison *pr;
2556 	int error;
2557 
2558 	error = priv_check(td, PRIV_JAIL_REMOVE);
2559 	if (error)
2560 		return (error);
2561 
2562 	sx_xlock(&allprison_lock);
2563 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2564 	if (pr == NULL) {
2565 		sx_xunlock(&allprison_lock);
2566 		return (EINVAL);
2567 	}
2568 	if (!prison_isalive(pr)) {
2569 		/* Silently ignore already-dying prisons. */
2570 		mtx_unlock(&pr->pr_mtx);
2571 		sx_xunlock(&allprison_lock);
2572 		return (0);
2573 	}
2574 	prison_deref(pr, PD_KILL | PD_LOCKED | PD_LIST_XLOCKED);
2575 	return (0);
2576 }
2577 
2578 /*
2579  * struct jail_attach_args {
2580  *	int jid;
2581  * };
2582  */
2583 int
2584 sys_jail_attach(struct thread *td, struct jail_attach_args *uap)
2585 {
2586 	struct prison *pr;
2587 	int error;
2588 
2589 	error = priv_check(td, PRIV_JAIL_ATTACH);
2590 	if (error)
2591 		return (error);
2592 
2593 	sx_slock(&allprison_lock);
2594 	pr = prison_find_child(td->td_ucred->cr_prison, uap->jid);
2595 	if (pr == NULL) {
2596 		sx_sunlock(&allprison_lock);
2597 		return (EINVAL);
2598 	}
2599 
2600 	/* Do not allow a process to attach to a prison that is not alive. */
2601 	if (!prison_isalive(pr)) {
2602 		mtx_unlock(&pr->pr_mtx);
2603 		sx_sunlock(&allprison_lock);
2604 		return (EINVAL);
2605 	}
2606 
2607 	return (do_jail_attach(td, pr, PD_LOCKED | PD_LIST_SLOCKED));
2608 }
2609 
/*
 * Attach the process of thread td to prison pr.
 *
 * Must be entered with pr->pr_mtx owned and allprison_lock held (both are
 * asserted).  drflags describes which locks are held; only its
 * PD_LOCK_FLAGS bits are used.  The prison mutex and allprison_lock are
 * released on every return path.
 *
 * Returns 0 on success, or the error from the step that failed: the
 * PR_METHOD_ATTACH module call, the cpuset update, change_dir(), the MAC
 * chroot check, or pwd_chroot_chdir().
 */
static int
do_jail_attach(struct thread *td, struct prison *pr, int drflags)
{
	struct proc *p;
	struct ucred *newcred, *oldcred;
	int error;

	mtx_assert(&pr->pr_mtx, MA_OWNED);
	sx_assert(&allprison_lock, SX_LOCKED);
	drflags &= PD_LOCK_FLAGS;
	/*
	 * XXX: Note that there is a slight race here if two threads
	 * in the same privileged process attempt to attach to two
	 * different jails at the same time.  It is important for
	 * user processes not to do this, or they might end up with
	 * a process root from one prison, but attached to the jail
	 * of another.
	 */
	/*
	 * Take a reference and a user reference on the target, and record
	 * them in drflags so that prison_deref(pr, drflags) on a failure
	 * path gives both back.
	 */
	prison_hold(pr);
	refcount_acquire(&pr->pr_uref);
	drflags |= PD_DEREF | PD_DEUREF;
	mtx_unlock(&pr->pr_mtx);
	drflags &= ~PD_LOCKED;

	/* Let modules do whatever they need to prepare for attaching. */
	error = osd_jail_call(pr, PR_METHOD_ATTACH, td);
	if (error) {
		prison_deref(pr, drflags);
		return (error);
	}
	/* The list lock is not held across the rest of the attach. */
	sx_unlock(&allprison_lock);
	drflags &= ~(PD_LIST_SLOCKED | PD_LIST_XLOCKED);

	/*
	 * Reparent the newly attached process to this jail.
	 */
	p = td->td_proc;
	error = cpuset_setproc_update_set(p, pr->pr_cpuset);
	if (error)
		goto e_revert_osd;

	/* The checks on the jail root run with its vnode locked. */
	vn_lock(pr->pr_root, LK_EXCLUSIVE | LK_RETRY);
	if ((error = change_dir(pr->pr_root, td)) != 0)
		goto e_unlock;
#ifdef MAC
	if ((error = mac_vnode_check_chroot(td->td_ucred, pr->pr_root)))
		goto e_unlock;
#endif
	VOP_UNLOCK(pr->pr_root);
	if ((error = pwd_chroot_chdir(td, pr->pr_root)))
		goto e_revert_osd;

	/* Give the process a copy of its credential that points at pr. */
	newcred = crget();
	PROC_LOCK(p);
	oldcred = crcopysafe(p, newcred);
	newcred->cr_prison = pr;
	proc_set_cred(p, newcred);
	setsugid(p);
#ifdef RACCT
	racct_proc_ucred_changed(p, oldcred, newcred);
	/*
	 * NOTE(review): this extra hold is taken under RACCT but released
	 * under RCTL below -- confirm RCTL can only be configured together
	 * with RACCT, otherwise the hold is never dropped.
	 */
	crhold(newcred);
#endif
	PROC_UNLOCK(p);
#ifdef RCTL
	rctl_proc_ucred_changed(p, newcred);
	crfree(newcred);
#endif
	prison_proc_relink(oldcred->cr_prison, pr, p);
	/*
	 * drflags is PD_DEREF | PD_DEUREF at this point and is applied to
	 * the OLD prison; the references taken on pr above are not released
	 * on this path.
	 */
	prison_deref(oldcred->cr_prison, drflags);
	crfree(oldcred);

	/*
	 * If the prison was killed while changing credentials, die along
	 * with it.
	 */
	if (!prison_isalive(pr)) {
		PROC_LOCK(p);
		kern_psignal(p, SIGKILL);
		PROC_UNLOCK(p);
	}

	return (0);

 e_unlock:
	VOP_UNLOCK(pr->pr_root);
 e_revert_osd:
	/* Tell modules this thread is still in its old jail after all. */
	/* Retake the list lock (shared) around the module call. */
	sx_slock(&allprison_lock);
	drflags |= PD_LIST_SLOCKED;
	(void)osd_jail_call(td->td_ucred->cr_prison, PR_METHOD_ATTACH, td);
	/* Drops the references taken on pr above and the list lock. */
	prison_deref(pr, drflags);
	return (error);
}
2703 
2704 /*
2705  * Returns a locked prison instance, or NULL on failure.
2706  */
2707 struct prison *
2708 prison_find(int prid)
2709 {
2710 	struct prison *pr;
2711 
2712 	sx_assert(&allprison_lock, SX_LOCKED);
2713 	TAILQ_FOREACH(pr, &allprison, pr_list) {
2714 		if (pr->pr_id < prid)
2715 			continue;
2716 		if (pr->pr_id > prid)
2717 			break;
2718 		KASSERT(prison_isvalid(pr), ("Found invalid prison %p", pr));
2719 		mtx_lock(&pr->pr_mtx);
2720 		return (pr);
2721 	}
2722 	return (NULL);
2723 }
2724 
2725 /*
2726  * Find a prison that is a descendant of mypr.  Returns a locked prison or NULL.
2727  */
2728 struct prison *
2729 prison_find_child(struct prison *mypr, int prid)
2730 {
2731 	struct prison *pr;
2732 	int descend;
2733 
2734 	sx_assert(&allprison_lock, SX_LOCKED);
2735 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2736 		if (pr->pr_id == prid) {
2737 			KASSERT(prison_isvalid(pr),
2738 			    ("Found invalid prison %p", pr));
2739 			mtx_lock(&pr->pr_mtx);
2740 			return (pr);
2741 		}
2742 	}
2743 	return (NULL);
2744 }
2745 
2746 /*
2747  * Look for the name relative to mypr.  Returns a locked prison or NULL.
2748  */
2749 struct prison *
2750 prison_find_name(struct prison *mypr, const char *name)
2751 {
2752 	struct prison *pr, *deadpr;
2753 	size_t mylen;
2754 	int descend;
2755 
2756 	sx_assert(&allprison_lock, SX_LOCKED);
2757 	mylen = (mypr == &prison0) ? 0 : strlen(mypr->pr_name) + 1;
2758 	deadpr = NULL;
2759 	FOREACH_PRISON_DESCENDANT(mypr, pr, descend) {
2760 		if (!strcmp(pr->pr_name + mylen, name)) {
2761 			KASSERT(prison_isvalid(pr),
2762 			    ("Found invalid prison %p", pr));
2763 			if (prison_isalive(pr)) {
2764 				mtx_lock(&pr->pr_mtx);
2765 				return (pr);
2766 			}
2767 			deadpr = pr;
2768 		}
2769 	}
2770 	/* There was no valid prison - perhaps there was a dying one. */
2771 	if (deadpr != NULL)
2772 		mtx_lock(&deadpr->pr_mtx);
2773 	return (deadpr);
2774 }
2775 
2776 /*
2777  * See if a prison has the specific flag set.  The prison should be locked,
2778  * unless checking for flags that are only set at jail creation (such as
2779  * PR_IP4 and PR_IP6), or only the single bit is examined, without regard
2780  * to any other prison data.
2781  */
2782 int
2783 prison_flag(struct ucred *cred, unsigned flag)
2784 {
2785 
2786 	return (cred->cr_prison->pr_flags & flag);
2787 }
2788 
2789 int
2790 prison_allow(struct ucred *cred, unsigned flag)
2791 {
2792 
2793 	return ((cred->cr_prison->pr_allow & flag) != 0);
2794 }
2795 
/*
 * Hold a prison reference (pr_ref).  This is the variant for callers that
 * have the prison locked; it does nothing beyond calling prison_hold().
 *
 * It is generally an error to hold a prison that does not already have a
 * reference.  A prison record stays valid for as long as it has at least
 * one reference, and it will not be removed while either the prison mutex
 * or the allprison lock (shared is enough) is held.
 */
void
prison_hold_locked(struct prison *pr)
{

	/* The lock is not needed to take the reference any more. */
	prison_hold(pr);
}
2810 
2811 void
2812 prison_hold(struct prison *pr)
2813 {
2814 #ifdef INVARIANTS
2815 	int was_valid = refcount_acquire_if_not_zero(&pr->pr_ref);
2816 
2817 	KASSERT(was_valid,
2818 	    ("Trying to hold dead prison %p (jid=%d).", pr, pr->pr_id));
2819 #else
2820 	refcount_acquire(&pr->pr_ref);
2821 #endif
2822 }
2823 
2824 /*
2825  * Remove a prison reference.  If that was the last reference, the
2826  * prison will be removed (at a later time).
2827  */
2828 void
2829 prison_free_locked(struct prison *pr)
2830 {
2831 
2832 	mtx_assert(&pr->pr_mtx, MA_OWNED);
2833 	/*
2834 	 * Locking is no longer required, but unlock because the caller
2835 	 * expects it.
2836 	 */
2837 	mtx_unlock(&pr->pr_mtx);
2838 	prison_free(pr);
2839 }
2840 
2841 void
2842 prison_free(struct prison *pr)
2843 {
2844 
2845 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2846 	    ("Trying to free dead prison %p (jid=%d).",
2847 	     pr, pr->pr_id));
2848 	if (!refcount_release_if_not_last(&pr->pr_ref)) {
2849 		/*
2850 		 * Don't remove the last reference in this context,
2851 		 * in case there are locks held.
2852 		 */
2853 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2854 	}
2855 }
2856 
2857 static void
2858 prison_free_not_last(struct prison *pr)
2859 {
2860 #ifdef INVARIANTS
2861 	int lastref;
2862 
2863 	KASSERT(refcount_load(&pr->pr_ref) > 0,
2864 	    ("Trying to free dead prison %p (jid=%d).",
2865 	     pr, pr->pr_id));
2866 	lastref = refcount_release(&pr->pr_ref);
2867 	KASSERT(!lastref,
2868 	    ("prison_free_not_last freed last ref on prison %p (jid=%d).",
2869 	     pr, pr->pr_id));
2870 #else
2871 	refcount_release(&pr->pr_ref);
2872 #endif
2873 }
2874 
2875 /*
2876  * Hold a prison for user visibility, by incrementing pr_uref.
2877  * It is generally an error to hold a prison that isn't already
2878  * user-visible, except through the jail system calls.  It is also
2879  * an error to hold an invalid prison.  A prison record will remain
2880  * alive as long as it has at least one user reference, and will not
2881  * be set to the dying state until the prison mutex and allprison_lock
2882  * are both freed.
2883  */
2884 void
2885 prison_proc_hold(struct prison *pr)
2886 {
2887 #ifdef INVARIANTS
2888 	int was_alive = refcount_acquire_if_not_zero(&pr->pr_uref);
2889 
2890 	KASSERT(was_alive,
2891 	    ("Cannot add a process to a non-alive prison (jid=%d)", pr->pr_id));
2892 #else
2893 	refcount_acquire(&pr->pr_uref);
2894 #endif
2895 }
2896 
2897 /*
2898  * Remove a prison user reference.  If it was the last reference, the
2899  * prison will be considered "dying", and may be removed once all of
2900  * its references are dropped.
2901  */
2902 void
2903 prison_proc_free(struct prison *pr)
2904 {
2905 
2906 	/*
2907 	 * Locking is only required when releasing the last reference.
2908 	 * This allows assurance that a locked prison will remain alive
2909 	 * until it is unlocked.
2910 	 */
2911 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2912 	    ("Trying to kill a process in a dead prison (jid=%d)", pr->pr_id));
2913 	if (!refcount_release_if_not_last(&pr->pr_uref)) {
2914 		/*
2915 		 * Don't remove the last user reference in this context,
2916 		 * which is expected to be a process that is not only locked,
2917 		 * but also half dead.  Add a reference so any calls to
2918 		 * prison_free() won't re-submit the task.
2919 		 */
2920 		prison_hold(pr);
2921 		mtx_lock(&pr->pr_mtx);
2922 		KASSERT(!(pr->pr_flags & PR_COMPLETE_PROC),
2923 		    ("Redundant last reference in prison_proc_free (jid=%d)",
2924 		     pr->pr_id));
2925 		pr->pr_flags |= PR_COMPLETE_PROC;
2926 		mtx_unlock(&pr->pr_mtx);
2927 		taskqueue_enqueue(taskqueue_thread, &pr->pr_task);
2928 	}
2929 }
2930 
2931 static void
2932 prison_proc_free_not_last(struct prison *pr)
2933 {
2934 #ifdef INVARIANTS
2935 	int lastref;
2936 
2937 	KASSERT(refcount_load(&pr->pr_uref) > 0,
2938 	    ("Trying to free dead prison %p (jid=%d).",
2939 	     pr, pr->pr_id));
2940 	lastref = refcount_release(&pr->pr_uref);
2941 	KASSERT(!lastref,
2942 	    ("prison_proc_free_not_last freed last uref on prison %p (jid=%d).",
2943 	     pr, pr->pr_id));
2944 #else
2945 	refcount_release(&pr->pr_uref);
2946 #endif
2947 }
2948 
2949 void
2950 prison_proc_link(struct prison *pr, struct proc *p)
2951 {
2952 
2953 	sx_assert(&allproc_lock, SA_XLOCKED);
2954 	LIST_INSERT_HEAD(&pr->pr_proclist, p, p_jaillist);
2955 }
2956 
2957 void
2958 prison_proc_unlink(struct prison *pr, struct proc *p)
2959 {
2960 
2961 	sx_assert(&allproc_lock, SA_XLOCKED);
2962 	LIST_REMOVE(p, p_jaillist);
2963 }
2964 
2965 static void
2966 prison_proc_relink(struct prison *opr, struct prison *npr, struct proc *p)
2967 {
2968 
2969 	sx_xlock(&allproc_lock);
2970 	prison_proc_unlink(opr, p);
2971 	prison_proc_link(npr, p);
2972 	sx_xunlock(&allproc_lock);
2973 }
2974 
2975 /*
2976  * Complete a call to either prison_free or prison_proc_free.
2977  */
2978 static void
2979 prison_complete(void *context, int pending)
2980 {
2981 	struct prison *pr = context;
2982 	int drflags;
2983 
2984 	/*
2985 	 * This could be called to release the last reference, or the last
2986 	 * user reference (plus the reference held in prison_proc_free).
2987 	 */
2988 	drflags = prison_lock_xlock(pr, PD_DEREF);
2989 	if (pr->pr_flags & PR_COMPLETE_PROC) {
2990 		pr->pr_flags &= ~PR_COMPLETE_PROC;
2991 		drflags |= PD_DEUREF;
2992 	}
2993 	prison_deref(pr, drflags);
2994 }
2995 
2996 static void
2997 prison_kill_processes_cb(struct proc *p, void *arg __unused)
2998 {
2999 
3000 	kern_psignal(p, SIGKILL);
3001 }
3002 
3003 /*
3004  * Note the iteration does not guarantee acting on all processes.
3005  * Most notably there may be fork or jail_attach in progress.
3006  */
3007 void
3008 prison_proc_iterate(struct prison *pr, void (*cb)(struct proc *, void *),
3009     void *cbarg)
3010 {
3011 	struct prison *ppr;
3012 	struct proc *p;
3013 
3014 	if (atomic_load_int(&pr->pr_childcount) == 0) {
3015 		sx_slock(&allproc_lock);
3016 		LIST_FOREACH(p, &pr->pr_proclist, p_jaillist) {
3017 			if (p->p_state == PRS_NEW)
3018 				continue;
3019 			PROC_LOCK(p);
3020 			cb(p, cbarg);
3021 			PROC_UNLOCK(p);
3022 		}
3023 		sx_sunlock(&allproc_lock);
3024 		if (atomic_load_int(&pr->pr_childcount) == 0)
3025 			return;
3026 		/*
3027 		 * Some jails popped up during the iteration, fall through to a
3028 		 * system-wide search.
3029 		 */
3030 	}
3031 
3032 	sx_slock(&allproc_lock);
3033 	FOREACH_PROC_IN_SYSTEM(p) {
3034 		PROC_LOCK(p);
3035 		if (p->p_state != PRS_NEW && p->p_ucred != NULL) {
3036 			for (ppr = p->p_ucred->cr_prison;
3037 			    ppr != &prison0;
3038 			    ppr = ppr->pr_parent) {
3039 				if (ppr == pr) {
3040 					cb(p, cbarg);
3041 					break;
3042 				}
3043 			}
3044 		}
3045 		PROC_UNLOCK(p);
3046 	}
3047 	sx_sunlock(&allproc_lock);
3048 }
3049 
/*
 * Remove a prison reference and/or user reference (usually).
 * This assumes context that allows sleeping (for allprison_lock),
 * with no non-sleeping locks held, except perhaps the prison itself.
 * If there are no more references, release and delist the prison.
 * On completion, the prison lock and the allprison lock are both
 * unlocked.
 *
 * flags is a mask of PD_* bits, used here as follows:
 *   PD_DEREF         drop one reference (pr_ref) on pr.
 *   PD_DEUREF        drop one user reference (pr_uref) on pr.
 *   PD_KILL          kill pr and its descendants with prison_deref_kill(),
 *                    then send SIGKILL to processes if pr still has user
 *                    references afterwards.
 *   PD_LOCKED        pr->pr_mtx is held and is to be released.
 *   PD_LIST_SLOCKED  allprison_lock is held shared and is to be released.
 *   PD_LIST_XLOCKED  allprison_lock is held exclusively and is to be
 *                    released.
 *
 * When the last reference on a prison goes away, the prison is unlinked
 * and the same treatment (PD_DEREF | PD_DEUREF) is applied to its parent,
 * so one call may release a whole chain of ancestors.
 */
static void
prison_deref(struct prison *pr, int flags)
{
	struct prisonlist freeprison;
	struct prison *killpr, *rpr, *ppr, *tpr;

	killpr = NULL;
	/* Unlinked prisons are collected here and torn down at the end. */
	TAILQ_INIT(&freeprison);
	/*
	 * Release this prison as requested, which may cause its parent
	 * to be released, and then maybe its grandparent, etc.
	 */
	for (;;) {
		if (flags & PD_KILL) {
			/* Kill the prison and its descendants. */
			KASSERT(pr != &prison0,
			    ("prison_deref trying to kill prison0"));
			/* Make sure a reference is held across the kill. */
			if (!(flags & PD_DEREF)) {
				prison_hold(pr);
				flags |= PD_DEREF;
			}
			/*
			 * prison_lock_xlock() returns the updated flag set;
			 * presumably it leaves the prison mutex and the
			 * exclusive list lock held -- confirm against its
			 * definition.
			 */
			flags = prison_lock_xlock(pr, flags);
			prison_deref_kill(pr, &freeprison);
		}
		if (flags & PD_DEUREF) {
			/* Drop a user reference. */
			KASSERT(refcount_load(&pr->pr_uref) > 0,
			    ("prison_deref PD_DEUREF on a dead prison (jid=%d)",
			     pr->pr_id));
			/* Only the last user reference takes the slow path. */
			if (!refcount_release_if_not_last(&pr->pr_uref)) {
				if (!(flags & PD_DEREF)) {
					prison_hold(pr);
					flags |= PD_DEREF;
				}
				flags = prison_lock_xlock(pr, flags);
				if (refcount_release(&pr->pr_uref) &&
				    pr->pr_state == PRISON_STATE_ALIVE) {
					/*
					 * When the last user reference goes,
					 * this becomes a dying prison.
					 */
					KASSERT(
					    refcount_load(&prison0.pr_uref) > 0,
					    ("prison0 pr_uref=0"));
					pr->pr_state = PRISON_STATE_DYING;
					/* prison_cleanup() runs unlocked. */
					mtx_unlock(&pr->pr_mtx);
					flags &= ~PD_LOCKED;
					prison_cleanup(pr);
				}
			}
		}
		if (flags & PD_KILL) {
			/*
			 * Any remaining user references are probably processes
			 * that need to be killed, either in this prison or its
			 * descendants.
			 */
			if (refcount_load(&pr->pr_uref) > 0)
				killpr = pr;
			/* Make sure the parent prison doesn't get killed. */
			flags &= ~PD_KILL;
		}
		if (flags & PD_DEREF) {
			/* Drop a reference. */
			KASSERT(refcount_load(&pr->pr_ref) > 0,
			    ("prison_deref PD_DEREF on a dead prison (jid=%d)",
			     pr->pr_id));
			/* Only the last reference takes the slow path. */
			if (!refcount_release_if_not_last(&pr->pr_ref)) {
				flags = prison_lock_xlock(pr, flags);
				if (refcount_release(&pr->pr_ref)) {
					/*
					 * When the last reference goes,
					 * unlink the prison and set it aside.
					 */
					KASSERT(
					    refcount_load(&pr->pr_uref) == 0,
					    ("prison_deref: last ref, "
					     "but still has %d urefs (jid=%d)",
					     pr->pr_uref, pr->pr_id));
					KASSERT(
					    refcount_load(&prison0.pr_ref) != 0,
					    ("prison0 pr_ref=0"));
					pr->pr_state = PRISON_STATE_INVALID;
					TAILQ_REMOVE(&allprison, pr, pr_list);
					LIST_REMOVE(pr, pr_sibling);
					TAILQ_INSERT_TAIL(&freeprison, pr,
					    pr_list);
					/* One child fewer for each ancestor. */
					for (ppr = pr->pr_parent;
					     ppr != NULL;
					     ppr = ppr->pr_parent)
						ppr->pr_childcount--;
					/*
					 * Removing a prison frees references
					 * from its parent.
					 */
					mtx_unlock(&pr->pr_mtx);
					flags &= ~PD_LOCKED;
					/* Go round again for the parent. */
					pr = pr->pr_parent;
					flags |= PD_DEREF | PD_DEUREF;
					continue;
				}
			}
		}
		break;
	}

	/* Release all the prison locks. */
	if (flags & PD_LOCKED)
		mtx_unlock(&pr->pr_mtx);
	if (flags & PD_LIST_SLOCKED)
		sx_sunlock(&allprison_lock);
	else if (flags & PD_LIST_XLOCKED)
		sx_xunlock(&allprison_lock);

	/* Kill any processes attached to a killed prison. */
	if (killpr != NULL)
		prison_proc_iterate(killpr, prison_kill_processes_cb, NULL);

	/*
	 * Finish removing any unreferenced prisons, which couldn't happen
	 * while allprison_lock was held (to avoid a LOR on vrele).
	 */
	TAILQ_FOREACH_SAFE(rpr, &freeprison, pr_list, tpr) {
#ifdef VIMAGE
		/* Destroy the vnet only when it differs from the parent's. */
		if (rpr->pr_vnet != rpr->pr_parent->pr_vnet)
			vnet_destroy(rpr->pr_vnet);
#endif
		if (rpr->pr_root != NULL)
			vrele(rpr->pr_root);
		mtx_destroy(&rpr->pr_mtx);
#ifdef INET
		prison_ip_free(rpr->pr_addrs[PR_INET]);
#endif
#ifdef INET6
		prison_ip_free(rpr->pr_addrs[PR_INET6]);
#endif
		if (rpr->pr_cpuset != NULL)
			cpuset_rel(rpr->pr_cpuset);
		osd_jail_exit(rpr);
#ifdef RACCT
		if (racct_enable)
			prison_racct_detach(rpr);
#endif
		TAILQ_REMOVE(&freeprison, rpr, pr_list);
		free(rpr, M_PRISON);
	}
}
3205 
3206 /*
3207  * Kill the prison and its descendants.  Mark them as dying, clear the
3208  * persist flag, and call module remove methods.
3209  */
3210 static void
3211 prison_deref_kill(struct prison *pr, struct prisonlist *freeprison)
3212 {
3213 	struct prison *cpr, *ppr, *rpr;
3214 	bool descend;
3215 
3216 	/*
3217 	 * Unlike the descendants, the target prison can be killed
3218 	 * even if it is currently dying.  This is useful for failed
3219 	 * creation in jail_set(2).
3220 	 */
3221 	KASSERT(refcount_load(&pr->pr_ref) > 0,
3222 	    ("Trying to kill dead prison %p (jid=%d).",
3223 	     pr, pr->pr_id));
3224 	refcount_acquire(&pr->pr_uref);
3225 	pr->pr_state = PRISON_STATE_DYING;
3226 	mtx_unlock(&pr->pr_mtx);
3227 
3228 	rpr = NULL;
3229 	FOREACH_PRISON_DESCENDANT_PRE_POST(pr, cpr, descend) {
3230 		if (descend) {
3231 			if (!prison_isalive(cpr)) {
3232 				descend = false;
3233 				continue;
3234 			}
3235 			prison_hold(cpr);
3236 			prison_proc_hold(cpr);
3237 			mtx_lock(&cpr->pr_mtx);
3238 			cpr->pr_state = PRISON_STATE_DYING;
3239 			cpr->pr_flags |= PR_REMOVE;
3240 			mtx_unlock(&cpr->pr_mtx);
3241 			continue;
3242 		}
3243 		if (!(cpr->pr_flags & PR_REMOVE))
3244 			continue;
3245 		prison_cleanup(cpr);
3246 		mtx_lock(&cpr->pr_mtx);
3247 		cpr->pr_flags &= ~PR_REMOVE;
3248 		if (cpr->pr_flags & PR_PERSIST) {
3249 			cpr->pr_flags &= ~PR_PERSIST;
3250 			prison_proc_free_not_last(cpr);
3251 			prison_free_not_last(cpr);
3252 		}
3253 		(void)refcount_release(&cpr->pr_uref);
3254 		if (refcount_release(&cpr->pr_ref)) {
3255 			/*
3256 			 * When the last reference goes, unlink the prison
3257 			 * and set it aside for prison_deref() to handle.
3258 			 * Delay unlinking the sibling list to keep the loop
3259 			 * safe.
3260 			 */
3261 			if (rpr != NULL)
3262 				LIST_REMOVE(rpr, pr_sibling);
3263 			rpr = cpr;
3264 			rpr->pr_state = PRISON_STATE_INVALID;
3265 			TAILQ_REMOVE(&allprison, rpr, pr_list);
3266 			TAILQ_INSERT_TAIL(freeprison, rpr, pr_list);
3267 			/*
3268 			 * Removing a prison frees references from its parent.
3269 			 */
3270 			ppr = rpr->pr_parent;
3271 			prison_proc_free_not_last(ppr);
3272 			prison_free_not_last(ppr);
3273 			for (; ppr != NULL; ppr = ppr->pr_parent)
3274 				ppr->pr_childcount--;
3275 		}
3276 		mtx_unlock(&cpr->pr_mtx);
3277 	}
3278 	if (rpr != NULL)
3279 		LIST_REMOVE(rpr, pr_sibling);
3280 
3281 	prison_cleanup(pr);
3282 	mtx_lock(&pr->pr_mtx);
3283 	if (pr->pr_flags & PR_PERSIST) {
3284 		pr->pr_flags &= ~PR_PERSIST;
3285 		prison_proc_free_not_last(pr);
3286 		prison_free_not_last(pr);
3287 	}
3288 	(void)refcount_release(&pr->pr_uref);
3289 }
3290 
3291 /*
3292  * Given the current locking state in the flags, make sure allprison_lock
3293  * is held exclusive, and the prison is locked.  Return flags indicating
3294  * the new state.
3295  */
3296 static int
3297 prison_lock_xlock(struct prison *pr, int flags)
3298 {
3299 
3300 	if (!(flags & PD_LIST_XLOCKED)) {
3301 		/*
3302 		 * Get allprison_lock, which may be an upgrade,
3303 		 * and may require unlocking the prison.
3304 		 */
3305 		if (flags & PD_LOCKED) {
3306 			mtx_unlock(&pr->pr_mtx);
3307 			flags &= ~PD_LOCKED;
3308 		}
3309 		if (flags & PD_LIST_SLOCKED) {
3310 			if (!sx_try_upgrade(&allprison_lock)) {
3311 				sx_sunlock(&allprison_lock);
3312 				sx_xlock(&allprison_lock);
3313 			}
3314 			flags &= ~PD_LIST_SLOCKED;
3315 		} else
3316 			sx_xlock(&allprison_lock);
3317 		flags |= PD_LIST_XLOCKED;
3318 	}
3319 	if (!(flags & PD_LOCKED)) {
3320 		/* Lock the prison mutex. */
3321 		mtx_lock(&pr->pr_mtx);
3322 		flags |= PD_LOCKED;
3323 	}
3324 	return flags;
3325 }
3326 
3327 /*
3328  * Release a prison's resources when it starts dying (when the last user
3329  * reference is dropped, or when it is killed).
3330  */
3331 static void
3332 prison_cleanup(struct prison *pr)
3333 {
3334 	sx_assert(&allprison_lock, SA_XLOCKED);
3335 	mtx_assert(&pr->pr_mtx, MA_NOTOWNED);
3336 	shm_remove_prison(pr);
3337 	(void)osd_jail_call(pr, PR_METHOD_REMOVE, NULL);
3338 }
3339 
3340 /*
3341  * Set or clear a permission bit in the pr_allow field, passing restrictions
3342  * (cleared permission) down to child jails.
3343  */
3344 void
3345 prison_set_allow(struct ucred *cred, unsigned flag, int enable)
3346 {
3347 	struct prison *pr;
3348 
3349 	pr = cred->cr_prison;
3350 	sx_slock(&allprison_lock);
3351 	mtx_lock(&pr->pr_mtx);
3352 	prison_set_allow_locked(pr, flag, enable);
3353 	mtx_unlock(&pr->pr_mtx);
3354 	sx_sunlock(&allprison_lock);
3355 }
3356 
3357 static void
3358 prison_set_allow_locked(struct prison *pr, unsigned flag, int enable)
3359 {
3360 	struct prison *cpr;
3361 	int descend;
3362 
3363 	if (enable != 0)
3364 		pr->pr_allow |= flag;
3365 	else {
3366 		pr->pr_allow &= ~flag;
3367 		FOREACH_PRISON_DESCENDANT_LOCKED(pr, cpr, descend)
3368 			cpr->pr_allow &= ~flag;
3369 	}
3370 }
3371 
3372 /*
3373  * Check if a jail supports the given address family.
3374  *
3375  * Returns 0 if not jailed or the address family is supported, EAFNOSUPPORT
3376  * if not.
3377  */
3378 int
3379 prison_check_af(struct ucred *cred, int af)
3380 {
3381 	struct prison *pr;
3382 	int error;
3383 
3384 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3385 
3386 	pr = cred->cr_prison;
3387 #ifdef VIMAGE
3388 	/* Prisons with their own network stack are not limited. */
3389 	if (prison_owns_vnet(cred))
3390 		return (0);
3391 #endif
3392 
3393 	error = 0;
3394 	switch (af)
3395 	{
3396 #ifdef INET
3397 	case AF_INET:
3398 		if (pr->pr_flags & PR_IP4)
3399 		{
3400 			mtx_lock(&pr->pr_mtx);
3401 			if ((pr->pr_flags & PR_IP4) &&
3402 			    pr->pr_addrs[PR_INET] == NULL)
3403 				error = EAFNOSUPPORT;
3404 			mtx_unlock(&pr->pr_mtx);
3405 		}
3406 		break;
3407 #endif
3408 #ifdef INET6
3409 	case AF_INET6:
3410 		if (pr->pr_flags & PR_IP6)
3411 		{
3412 			mtx_lock(&pr->pr_mtx);
3413 			if ((pr->pr_flags & PR_IP6) &&
3414 			    pr->pr_addrs[PR_INET6] == NULL)
3415 				error = EAFNOSUPPORT;
3416 			mtx_unlock(&pr->pr_mtx);
3417 		}
3418 		break;
3419 #endif
3420 	case AF_LOCAL:
3421 	case AF_ROUTE:
3422 		break;
3423 	default:
3424 		if (!(pr->pr_allow & PR_ALLOW_SOCKET_AF))
3425 			error = EAFNOSUPPORT;
3426 	}
3427 	return (error);
3428 }
3429 
3430 /*
3431  * Check if given address belongs to the jail referenced by cred (wrapper to
3432  * prison_check_ip[46]).
3433  *
3434  * Returns 0 if jail doesn't restrict the address family or if address belongs
3435  * to jail, EADDRNOTAVAIL if the address doesn't belong, or EAFNOSUPPORT if
3436  * the jail doesn't allow the address family.  IPv4 Address passed in in NBO.
3437  */
3438 int
3439 prison_if(struct ucred *cred, const struct sockaddr *sa)
3440 {
3441 #ifdef INET
3442 	const struct sockaddr_in *sai;
3443 #endif
3444 #ifdef INET6
3445 	const struct sockaddr_in6 *sai6;
3446 #endif
3447 	int error;
3448 
3449 	KASSERT(cred != NULL, ("%s: cred is NULL", __func__));
3450 	KASSERT(sa != NULL, ("%s: sa is NULL", __func__));
3451 
3452 #ifdef VIMAGE
3453 	if (prison_owns_vnet(cred))
3454 		return (0);
3455 #endif
3456 
3457 	error = 0;
3458 	switch (sa->sa_family)
3459 	{
3460 #ifdef INET
3461 	case AF_INET:
3462 		sai = (const struct sockaddr_in *)sa;
3463 		error = prison_check_ip4(cred, &sai->sin_addr);
3464 		break;
3465 #endif
3466 #ifdef INET6
3467 	case AF_INET6:
3468 		sai6 = (const struct sockaddr_in6 *)sa;
3469 		error = prison_check_ip6(cred, &sai6->sin6_addr);
3470 		break;
3471 #endif
3472 	default:
3473 		if (!(cred->cr_prison->pr_allow & PR_ALLOW_SOCKET_AF))
3474 			error = EAFNOSUPPORT;
3475 	}
3476 	return (error);
3477 }
3478 
3479 /*
3480  * Return 0 if jails permit p1 to frob p2, otherwise ESRCH.
3481  */
3482 int
3483 prison_check(struct ucred *cred1, struct ucred *cred2)
3484 {
3485 
3486 	return ((cred1->cr_prison == cred2->cr_prison ||
3487 	    prison_ischild(cred1->cr_prison, cred2->cr_prison)) ? 0 : ESRCH);
3488 }
3489 
3490 /*
3491  * For mountd/nfsd to run within a prison, it must be:
3492  * - A vnet prison.
3493  * - PR_ALLOW_NFSD must be set on it.
3494  * - The root directory (pr_root) of the prison must be
3495  *   a file system mount point, so the mountd can hang
3496  *   export information on it.
3497  * - The prison's enforce_statfs cannot be 0, so that
3498  *   mountd(8) can do exports.
3499  */
3500 bool
3501 prison_check_nfsd(struct ucred *cred)
3502 {
3503 
3504 	if (jailed_without_vnet(cred))
3505 		return (false);
3506 	if (!prison_allow(cred, PR_ALLOW_NFSD))
3507 		return (false);
3508 	if ((cred->cr_prison->pr_root->v_vflag & VV_ROOT) == 0)
3509 		return (false);
3510 	if (cred->cr_prison->pr_enforce_statfs == 0)
3511 		return (false);
3512 	return (true);
3513 }
3514 
3515 /*
3516  * Return 1 if p2 is a child of p1, otherwise 0.
3517  */
3518 int
3519 prison_ischild(struct prison *pr1, struct prison *pr2)
3520 {
3521 
3522 	for (pr2 = pr2->pr_parent; pr2 != NULL; pr2 = pr2->pr_parent)
3523 		if (pr1 == pr2)
3524 			return (1);
3525 	return (0);
3526 }
3527 
3528 /*
3529  * Return true if the prison is currently alive.  A prison is alive if it
3530  * holds user references and it isn't being removed.
3531  */
3532 bool
3533 prison_isalive(const struct prison *pr)
3534 {
3535 
3536 	if (__predict_false(pr->pr_state != PRISON_STATE_ALIVE))
3537 		return (false);
3538 	return (true);
3539 }
3540 
3541 /*
3542  * Return true if the prison is currently alive.  Identified by pr_permid.
3543  */
3544 bool
3545 prison_isalive_permid(const uint64_t prison_permid)
3546 {
3547 	struct prison *pr;
3548 	bool alive;
3549 
3550 	/*
3551 	 * permid == 0 --> never assigned to a prison
3552 	 * permid == 1 --> assigned to prison0, always alive
3553 	 */
3554 	if (prison_permid == 0)
3555 		return (false);
3556 	else if (prison_permid == 1)
3557 		return (true);
3558 	sx_slock(&allprison_lock);
3559 	TAILQ_FOREACH(pr, &allprison, pr_list) {
3560 		if (pr->pr_permid == prison_permid) {
3561 			alive = prison_isalive(pr);
3562 			sx_unlock(&allprison_lock);
3563 			return (alive);
3564 		}
3565 	}
3566 	sx_unlock(&allprison_lock);
3567 	return (false);
3568 }
3569 
3570 /*
3571  * Return true if the prison is currently valid.  A prison is valid if it has
3572  * been fully created, and is not being destroyed.  Note that dying prisons
3573  * are still considered valid.  Invalid prisons won't be found under normal
3574  * circumstances, as they're only put in that state by functions that have
3575  * an exclusive hold on allprison_lock.
3576  */
3577 bool
3578 prison_isvalid(struct prison *pr)
3579 {
3580 
3581 	if (__predict_false(pr->pr_state == PRISON_STATE_INVALID))
3582 		return (false);
3583 	if (__predict_false(refcount_load(&pr->pr_ref) == 0))
3584 		return (false);
3585 	return (true);
3586 }
3587 
/*
 * Return 1 if the credential is jailed and (with VIMAGE) its jail does
 * not own a virtual network stack; return 0 in every other case.
 */
int
jailed_without_vnet(struct ucred *cred)
{

#ifdef VIMAGE
	return ((jailed(cred) && !prison_owns_vnet(cred)) ? 1 : 0);
#else
	return (jailed(cred) ? 1 : 0);
#endif
}
3605 
3606 /*
3607  * Return the correct hostname (domainname, et al) for the passed credential.
3608  */
3609 void
3610 getcredhostname(struct ucred *cred, char *buf, size_t size)
3611 {
3612 	struct prison *pr;
3613 
3614 	/*
3615 	 * A NULL credential can be used to shortcut to the physical
3616 	 * system's hostname.
3617 	 */
3618 	pr = (cred != NULL) ? cred->cr_prison : &prison0;
3619 	mtx_lock(&pr->pr_mtx);
3620 	strlcpy(buf, pr->pr_hostname, size);
3621 	mtx_unlock(&pr->pr_mtx);
3622 }
3623 
3624 void
3625 getcreddomainname(struct ucred *cred, char *buf, size_t size)
3626 {
3627 
3628 	mtx_lock(&cred->cr_prison->pr_mtx);
3629 	strlcpy(buf, cred->cr_prison->pr_domainname, size);
3630 	mtx_unlock(&cred->cr_prison->pr_mtx);
3631 }
3632 
3633 void
3634 getcredhostuuid(struct ucred *cred, char *buf, size_t size)
3635 {
3636 
3637 	mtx_lock(&cred->cr_prison->pr_mtx);
3638 	strlcpy(buf, cred->cr_prison->pr_hostuuid, size);
3639 	mtx_unlock(&cred->cr_prison->pr_mtx);
3640 }
3641 
3642 void
3643 getcredhostid(struct ucred *cred, unsigned long *hostid)
3644 {
3645 
3646 	mtx_lock(&cred->cr_prison->pr_mtx);
3647 	*hostid = cred->cr_prison->pr_hostid;
3648 	mtx_unlock(&cred->cr_prison->pr_mtx);
3649 }
3650 
3651 void
3652 getjailname(struct ucred *cred, char *name, size_t len)
3653 {
3654 
3655 	mtx_lock(&cred->cr_prison->pr_mtx);
3656 	strlcpy(name, cred->cr_prison->pr_name, len);
3657 	mtx_unlock(&cred->cr_prison->pr_mtx);
3658 }
3659 
3660 #ifdef VIMAGE
3661 /*
3662  * Determine whether the prison represented by cred owns
3663  * its vnet rather than having it inherited.
3664  *
3665  * Returns 1 in case the prison owns the vnet, 0 otherwise.
3666  */
3667 int
3668 prison_owns_vnet(struct ucred *cred)
3669 {
3670 
3671 	/*
3672 	 * vnets cannot be added/removed after jail creation,
3673 	 * so no need to lock here.
3674 	 */
3675 	return (cred->cr_prison->pr_flags & PR_VNET ? 1 : 0);
3676 }
3677 #endif
3678 
3679 /*
3680  * Determine whether the subject represented by cred can "see"
3681  * status of a mount point.
3682  * Returns: 0 for permitted, ENOENT otherwise.
3683  * XXX: This function should be called cr_canseemount() and should be
3684  *      placed in kern_prot.c.
3685  */
3686 int
3687 prison_canseemount(struct ucred *cred, struct mount *mp)
3688 {
3689 	struct prison *pr;
3690 	struct statfs *sp;
3691 	size_t len;
3692 
3693 	pr = cred->cr_prison;
3694 	if (pr->pr_enforce_statfs == 0)
3695 		return (0);
3696 	if (pr->pr_root->v_mount == mp)
3697 		return (0);
3698 	if (pr->pr_enforce_statfs == 2)
3699 		return (ENOENT);
3700 	/*
3701 	 * If jail's chroot directory is set to "/" we should be able to see
3702 	 * all mount-points from inside a jail.
3703 	 * This is ugly check, but this is the only situation when jail's
3704 	 * directory ends with '/'.
3705 	 */
3706 	if (strcmp(pr->pr_path, "/") == 0)
3707 		return (0);
3708 	len = strlen(pr->pr_path);
3709 	sp = &mp->mnt_stat;
3710 	if (strncmp(pr->pr_path, sp->f_mntonname, len) != 0)
3711 		return (ENOENT);
3712 	/*
3713 	 * Be sure that we don't have situation where jail's root directory
3714 	 * is "/some/path" and mount point is "/some/pathpath".
3715 	 */
3716 	if (sp->f_mntonname[len] != '\0' && sp->f_mntonname[len] != '/')
3717 		return (ENOENT);
3718 	return (0);
3719 }
3720 
3721 void
3722 prison_enforce_statfs(struct ucred *cred, struct mount *mp, struct statfs *sp)
3723 {
3724 	char jpath[MAXPATHLEN];
3725 	struct prison *pr;
3726 	size_t len;
3727 
3728 	pr = cred->cr_prison;
3729 	if (pr->pr_enforce_statfs == 0)
3730 		return;
3731 	if (prison_canseemount(cred, mp) != 0) {
3732 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3733 		strlcpy(sp->f_mntonname, "[restricted]",
3734 		    sizeof(sp->f_mntonname));
3735 		return;
3736 	}
3737 	if (pr->pr_root->v_mount == mp) {
3738 		/*
3739 		 * Clear current buffer data, so we are sure nothing from
3740 		 * the valid path left there.
3741 		 */
3742 		bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3743 		*sp->f_mntonname = '/';
3744 		return;
3745 	}
3746 	/*
3747 	 * If jail's chroot directory is set to "/" we should be able to see
3748 	 * all mount-points from inside a jail.
3749 	 */
3750 	if (strcmp(pr->pr_path, "/") == 0)
3751 		return;
3752 	len = strlen(pr->pr_path);
3753 	strlcpy(jpath, sp->f_mntonname + len, sizeof(jpath));
3754 	/*
3755 	 * Clear current buffer data, so we are sure nothing from
3756 	 * the valid path left there.
3757 	 */
3758 	bzero(sp->f_mntonname, sizeof(sp->f_mntonname));
3759 	if (*jpath == '\0') {
3760 		/* Should never happen. */
3761 		*sp->f_mntonname = '/';
3762 	} else {
3763 		strlcpy(sp->f_mntonname, jpath, sizeof(sp->f_mntonname));
3764 	}
3765 }
3766 
3767 /*
3768  * Check with permission for a specific privilege is granted within jail.  We
3769  * have a specific list of accepted privileges; the rest are denied.
3770  */
3771 int
3772 prison_priv_check(struct ucred *cred, int priv)
3773 {
3774 	struct prison *pr;
3775 	int error;
3776 
3777 	/*
3778 	 * Some policies have custom handlers. This routine should not be
3779 	 * called for them. See priv_check_cred().
3780 	 */
3781 	switch (priv) {
3782 	case PRIV_VFS_LOOKUP:
3783 	case PRIV_VFS_GENERATION:
3784 		KASSERT(0, ("prison_priv_check instead of a custom handler "
3785 		    "called for %d\n", priv));
3786 	}
3787 
3788 	if (!jailed(cred))
3789 		return (0);
3790 
3791 #ifdef VIMAGE
3792 	/*
3793 	 * Privileges specific to prisons with a virtual network stack.
3794 	 * There might be a duplicate entry here in case the privilege
3795 	 * is only granted conditionally in the legacy jail case.
3796 	 */
3797 	switch (priv) {
3798 		/*
3799 		 * NFS-specific privileges.
3800 		 */
3801 	case PRIV_NFS_DAEMON:
3802 	case PRIV_VFS_GETFH:
3803 	case PRIV_VFS_MOUNT_EXPORTED:
3804 #ifdef VNET_NFSD
3805 		if (!prison_check_nfsd(cred))
3806 #else
3807 		printf("running nfsd in a prison requires a kernel "
3808 		    "built with ''options VNET_NFSD''\n");
3809 #endif
3810 			return (EPERM);
3811 #ifdef notyet
3812 	case PRIV_NFS_LOCKD:
3813 #endif
3814 		/*
3815 		 * Network stack privileges.
3816 		 */
3817 	case PRIV_NET_BRIDGE:
3818 	case PRIV_NET_GRE:
3819 	case PRIV_NET_BPF:
3820 	case PRIV_NET_RAW:		/* Dup, cond. in legacy jail case. */
3821 	case PRIV_NET_ROUTE:
3822 	case PRIV_NET_TAP:
3823 	case PRIV_NET_SETIFMTU:
3824 	case PRIV_NET_SETIFFLAGS:
3825 	case PRIV_NET_SETIFCAP:
3826 	case PRIV_NET_SETIFDESCR:
3827 	case PRIV_NET_SETIFNAME	:
3828 	case PRIV_NET_SETIFMETRIC:
3829 	case PRIV_NET_SETIFPHYS:
3830 	case PRIV_NET_SETIFMAC:
3831 	case PRIV_NET_SETLANPCP:
3832 	case PRIV_NET_ADDMULTI:
3833 	case PRIV_NET_DELMULTI:
3834 	case PRIV_NET_HWIOCTL:
3835 	case PRIV_NET_SETLLADDR:
3836 	case PRIV_NET_ADDIFGROUP:
3837 	case PRIV_NET_DELIFGROUP:
3838 	case PRIV_NET_IFCREATE:
3839 	case PRIV_NET_IFDESTROY:
3840 	case PRIV_NET_ADDIFADDR:
3841 	case PRIV_NET_DELIFADDR:
3842 	case PRIV_NET_LAGG:
3843 	case PRIV_NET_GIF:
3844 	case PRIV_NET_SETIFVNET:
3845 	case PRIV_NET_SETIFFIB:
3846 	case PRIV_NET_OVPN:
3847 	case PRIV_NET_ME:
3848 	case PRIV_NET_WG:
3849 
3850 		/*
3851 		 * 802.11-related privileges.
3852 		 */
3853 	case PRIV_NET80211_VAP_GETKEY:
3854 	case PRIV_NET80211_VAP_MANAGE:
3855 
3856 #ifdef notyet
3857 		/*
3858 		 * ATM privileges.
3859 		 */
3860 	case PRIV_NETATM_CFG:
3861 	case PRIV_NETATM_ADD:
3862 	case PRIV_NETATM_DEL:
3863 	case PRIV_NETATM_SET:
3864 
3865 		/*
3866 		 * Bluetooth privileges.
3867 		 */
3868 	case PRIV_NETBLUETOOTH_RAW:
3869 #endif
3870 
3871 		/*
3872 		 * Netgraph and netgraph module privileges.
3873 		 */
3874 	case PRIV_NETGRAPH_CONTROL:
3875 #ifdef notyet
3876 	case PRIV_NETGRAPH_TTY:
3877 #endif
3878 
3879 		/*
3880 		 * IPv4 and IPv6 privileges.
3881 		 */
3882 	case PRIV_NETINET_IPFW:
3883 	case PRIV_NETINET_DIVERT:
3884 	case PRIV_NETINET_PF:
3885 	case PRIV_NETINET_DUMMYNET:
3886 	case PRIV_NETINET_CARP:
3887 	case PRIV_NETINET_MROUTE:
3888 	case PRIV_NETINET_RAW:
3889 	case PRIV_NETINET_ADDRCTRL6:
3890 	case PRIV_NETINET_ND6:
3891 	case PRIV_NETINET_SCOPE6:
3892 	case PRIV_NETINET_ALIFETIME6:
3893 	case PRIV_NETINET_IPSEC:
3894 	case PRIV_NETINET_BINDANY:
3895 
3896 #ifdef notyet
3897 		/*
3898 		 * NCP privileges.
3899 		 */
3900 	case PRIV_NETNCP:
3901 
3902 		/*
3903 		 * SMB privileges.
3904 		 */
3905 	case PRIV_NETSMB:
3906 #endif
3907 
3908 	/*
3909 	 * No default: or deny here.
3910 	 * In case of no permit fall through to next switch().
3911 	 */
3912 		if (cred->cr_prison->pr_flags & PR_VNET)
3913 			return (0);
3914 	}
3915 #endif /* VIMAGE */
3916 
3917 	switch (priv) {
3918 		/*
3919 		 * Allow ktrace privileges for root in jail.
3920 		 */
3921 	case PRIV_KTRACE:
3922 
3923 #if 0
3924 		/*
3925 		 * Allow jailed processes to configure audit identity and
3926 		 * submit audit records (login, etc).  In the future we may
3927 		 * want to further refine the relationship between audit and
3928 		 * jail.
3929 		 */
3930 	case PRIV_AUDIT_GETAUDIT:
3931 	case PRIV_AUDIT_SETAUDIT:
3932 	case PRIV_AUDIT_SUBMIT:
3933 #endif
3934 
3935 		/*
3936 		 * Allow jailed processes to manipulate process UNIX
3937 		 * credentials in any way they see fit.
3938 		 */
3939 	case PRIV_CRED_SETUID:
3940 	case PRIV_CRED_SETEUID:
3941 	case PRIV_CRED_SETGID:
3942 	case PRIV_CRED_SETEGID:
3943 	case PRIV_CRED_SETGROUPS:
3944 	case PRIV_CRED_SETREUID:
3945 	case PRIV_CRED_SETREGID:
3946 	case PRIV_CRED_SETRESUID:
3947 	case PRIV_CRED_SETRESGID:
3948 
3949 		/*
3950 		 * Jail implements visibility constraints already, so allow
3951 		 * jailed root to override uid/gid-based constraints.
3952 		 */
3953 	case PRIV_SEEOTHERGIDS:
3954 	case PRIV_SEEOTHERUIDS:
3955 
3956 		/*
3957 		 * Jail implements inter-process debugging limits already, so
3958 		 * allow jailed root various debugging privileges.
3959 		 */
3960 	case PRIV_DEBUG_DIFFCRED:
3961 	case PRIV_DEBUG_SUGID:
3962 	case PRIV_DEBUG_UNPRIV:
3963 
3964 		/*
3965 		 * Allow jail to set various resource limits and login
3966 		 * properties, and for now, exceed process resource limits.
3967 		 */
3968 	case PRIV_PROC_LIMIT:
3969 	case PRIV_PROC_SETLOGIN:
3970 	case PRIV_PROC_SETRLIMIT:
3971 
3972 		/*
3973 		 * System V and POSIX IPC privileges are granted in jail.
3974 		 */
3975 	case PRIV_IPC_READ:
3976 	case PRIV_IPC_WRITE:
3977 	case PRIV_IPC_ADMIN:
3978 	case PRIV_IPC_MSGSIZE:
3979 	case PRIV_MQ_ADMIN:
3980 
3981 		/*
3982 		 * Jail operations within a jail work on child jails.
3983 		 */
3984 	case PRIV_JAIL_ATTACH:
3985 	case PRIV_JAIL_SET:
3986 	case PRIV_JAIL_REMOVE:
3987 
3988 		/*
3989 		 * Jail implements its own inter-process limits, so allow
3990 		 * root processes in jail to change scheduling on other
3991 		 * processes in the same jail.  Likewise for signalling.
3992 		 */
3993 	case PRIV_SCHED_DIFFCRED:
3994 	case PRIV_SCHED_CPUSET:
3995 	case PRIV_SIGNAL_DIFFCRED:
3996 	case PRIV_SIGNAL_SUGID:
3997 
3998 		/*
3999 		 * Allow jailed processes to write to sysctls marked as jail
4000 		 * writable.
4001 		 */
4002 	case PRIV_SYSCTL_WRITEJAIL:
4003 
4004 		/*
4005 		 * Allow root in jail to manage a variety of quota
4006 		 * properties.  These should likely be conditional on a
4007 		 * configuration option.
4008 		 */
4009 	case PRIV_VFS_GETQUOTA:
4010 	case PRIV_VFS_SETQUOTA:
4011 
4012 		/*
4013 		 * Since Jail relies on chroot() to implement file system
4014 		 * protections, grant many VFS privileges to root in jail.
4015 		 * Be careful to exclude mount-related and NFS-related
4016 		 * privileges.
4017 		 */
4018 	case PRIV_VFS_READ:
4019 	case PRIV_VFS_WRITE:
4020 	case PRIV_VFS_ADMIN:
4021 	case PRIV_VFS_EXEC:
4022 	case PRIV_VFS_BLOCKRESERVE:	/* XXXRW: Slightly surprising. */
4023 	case PRIV_VFS_CHFLAGS_DEV:
4024 	case PRIV_VFS_CHOWN:
4025 	case PRIV_VFS_CHROOT:
4026 	case PRIV_VFS_RETAINSUGID:
4027 	case PRIV_VFS_FCHROOT:
4028 	case PRIV_VFS_LINK:
4029 	case PRIV_VFS_SETGID:
4030 	case PRIV_VFS_STAT:
4031 	case PRIV_VFS_STICKYFILE:
4032 
4033 		/*
4034 		 * As in the non-jail case, non-root users are expected to be
4035 		 * able to read kernel/physical memory (provided /dev/[k]mem
4036 		 * exists in the jail and they have permission to access it).
4037 		 */
4038 	case PRIV_KMEM_READ:
4039 		return (0);
4040 
4041 		/*
4042 		 * Depending on the global setting, allow privilege of
4043 		 * setting system flags.
4044 		 */
4045 	case PRIV_VFS_SYSFLAGS:
4046 		if (cred->cr_prison->pr_allow & PR_ALLOW_CHFLAGS)
4047 			return (0);
4048 		else
4049 			return (EPERM);
4050 
4051 		/*
4052 		 * Depending on the global setting, allow privilege of
4053 		 * mounting/unmounting file systems.
4054 		 */
4055 	case PRIV_VFS_MOUNT:
4056 	case PRIV_VFS_UNMOUNT:
4057 	case PRIV_VFS_MOUNT_NONUSER:
4058 	case PRIV_VFS_MOUNT_OWNER:
4059 		pr = cred->cr_prison;
4060 		prison_lock(pr);
4061 		if (pr->pr_allow & PR_ALLOW_MOUNT && pr->pr_enforce_statfs < 2)
4062 			error = 0;
4063 		else
4064 			error = EPERM;
4065 		prison_unlock(pr);
4066 		return (error);
4067 
4068 		/*
4069 		 * Jails should hold no disposition on the PRIV_VFS_READ_DIR
4070 		 * policy.  priv_check_cred will not specifically allow it, and
4071 		 * we may want a MAC policy to allow it.
4072 		 */
4073 	case PRIV_VFS_READ_DIR:
4074 		return (0);
4075 
4076 		/*
4077 		 * Conditionnaly allow locking (unlocking) physical pages
4078 		 * in memory.
4079 		 */
4080 	case PRIV_VM_MLOCK:
4081 	case PRIV_VM_MUNLOCK:
4082 		if (cred->cr_prison->pr_allow & PR_ALLOW_MLOCK)
4083 			return (0);
4084 		else
4085 			return (EPERM);
4086 
4087 		/*
4088 		 * Conditionally allow jailed root to bind reserved ports.
4089 		 */
4090 	case PRIV_NETINET_RESERVEDPORT:
4091 		if (cred->cr_prison->pr_allow & PR_ALLOW_RESERVED_PORTS)
4092 			return (0);
4093 		else
4094 			return (EPERM);
4095 
4096 		/*
4097 		 * Allow jailed root to reuse in-use ports.
4098 		 */
4099 	case PRIV_NETINET_REUSEPORT:
4100 		return (0);
4101 
4102 		/*
4103 		 * Allow jailed root to set certain IPv4/6 (option) headers.
4104 		 */
4105 	case PRIV_NETINET_SETHDROPTS:
4106 		return (0);
4107 
4108 		/*
4109 		 * Conditionally allow creating raw sockets in jail.
4110 		 */
4111 	case PRIV_NETINET_RAW:
4112 		if (cred->cr_prison->pr_allow & PR_ALLOW_RAW_SOCKETS)
4113 			return (0);
4114 		else
4115 			return (EPERM);
4116 
4117 		/*
4118 		 * Since jail implements its own visibility limits on netstat
4119 		 * sysctls, allow getcred.  This allows identd to work in
4120 		 * jail.
4121 		 */
4122 	case PRIV_NETINET_GETCRED:
4123 		return (0);
4124 
4125 		/*
4126 		 * Allow jailed root to set loginclass.
4127 		 */
4128 	case PRIV_PROC_SETLOGINCLASS:
4129 		return (0);
4130 
4131 		/*
4132 		 * Do not allow a process inside a jail to read the kernel
4133 		 * message buffer unless explicitly permitted.
4134 		 */
4135 	case PRIV_MSGBUF:
4136 		if (cred->cr_prison->pr_allow & PR_ALLOW_READ_MSGBUF)
4137 			return (0);
4138 		return (EPERM);
4139 
4140 	default:
4141 		/*
4142 		 * In all remaining cases, deny the privilege request.  This
4143 		 * includes almost all network privileges, many system
4144 		 * configuration privileges.
4145 		 */
4146 		return (EPERM);
4147 	}
4148 }
4149 
4150 /*
4151  * Return the part of pr2's name that is relative to pr1, or the whole name
4152  * if it does not directly follow.
4153  */
4154 
4155 char *
4156 prison_name(struct prison *pr1, struct prison *pr2)
4157 {
4158 	char *name;
4159 
4160 	/* Jails see themselves as "0" (if they see themselves at all). */
4161 	if (pr1 == pr2)
4162 		return "0";
4163 	name = pr2->pr_name;
4164 	if (prison_ischild(pr1, pr2)) {
4165 		/*
4166 		 * pr1 isn't locked (and allprison_lock may not be either)
4167 		 * so its length can't be counted on.  But the number of dots
4168 		 * can be counted on - and counted.
4169 		 */
4170 		for (; pr1 != &prison0; pr1 = pr1->pr_parent)
4171 			name = strchr(name, '.') + 1;
4172 	}
4173 	return (name);
4174 }
4175 
4176 /*
4177  * Return the part of pr2's path that is relative to pr1, or the whole path
4178  * if it does not directly follow.
4179  */
4180 static char *
4181 prison_path(struct prison *pr1, struct prison *pr2)
4182 {
4183 	char *path1, *path2;
4184 	int len1;
4185 
4186 	path1 = pr1->pr_path;
4187 	path2 = pr2->pr_path;
4188 	if (!strcmp(path1, "/"))
4189 		return (path2);
4190 	len1 = strlen(path1);
4191 	if (strncmp(path1, path2, len1))
4192 		return (path2);
4193 	if (path2[len1] == '\0')
4194 		return "/";
4195 	if (path2[len1] == '/')
4196 		return (path2 + len1);
4197 	return (path2);
4198 }
4199 
4200 /*
4201  * Jail-related sysctls.
4202  */
4203 static SYSCTL_NODE(_security, OID_AUTO, jail, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
4204     "Jails");
4205 
4206 #if defined(INET) || defined(INET6)
4207 /*
4208  * Copy address array to memory that would be then SYSCTL_OUT-ed.
4209  * sysctl_jail_list() helper.
4210  */
4211 static void
4212 prison_ip_copyout(struct prison *pr, const pr_family_t af, void **out, int *len)
4213 {
4214 	const size_t size = pr_families[af].size;
4215 
4216  again:
4217 	mtx_assert(&pr->pr_mtx, MA_OWNED);
4218 	if (pr->pr_addrs[af] != NULL) {
4219 		if (*len < pr->pr_addrs[af]->ips) {
4220 			*len = pr->pr_addrs[af]->ips;
4221 			mtx_unlock(&pr->pr_mtx);
4222 			*out = realloc(*out, *len * size, M_TEMP, M_WAITOK);
4223 			mtx_lock(&pr->pr_mtx);
4224 			goto again;
4225 		}
4226 		bcopy(pr->pr_addrs[af] + 1, *out, pr->pr_addrs[af]->ips * size);
4227 	}
4228 }
4229 #endif
4230 
4231 static int
4232 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
4233 {
4234 	struct xprison *xp;
4235 	struct prison *pr, *cpr;
4236 #ifdef INET
4237 	struct in_addr *ip4 = NULL;
4238 	int ip4s = 0;
4239 #endif
4240 #ifdef INET6
4241 	struct in6_addr *ip6 = NULL;
4242 	int ip6s = 0;
4243 #endif
4244 	int descend, error;
4245 
4246 	xp = malloc(sizeof(*xp), M_TEMP, M_WAITOK);
4247 	pr = req->td->td_ucred->cr_prison;
4248 	error = 0;
4249 	sx_slock(&allprison_lock);
4250 	FOREACH_PRISON_DESCENDANT(pr, cpr, descend) {
4251 		mtx_lock(&cpr->pr_mtx);
4252 #ifdef INET
4253 		prison_ip_copyout(cpr, PR_INET, (void **)&ip4, &ip4s);
4254 #endif
4255 #ifdef INET6
4256 		prison_ip_copyout(cpr, PR_INET6, (void **)&ip6, &ip6s);
4257 #endif
4258 		bzero(xp, sizeof(*xp));
4259 		xp->pr_version = XPRISON_VERSION;
4260 		xp->pr_id = cpr->pr_id;
4261 		xp->pr_state = cpr->pr_state;
4262 		strlcpy(xp->pr_path, prison_path(pr, cpr), sizeof(xp->pr_path));
4263 		strlcpy(xp->pr_host, cpr->pr_hostname, sizeof(xp->pr_host));
4264 		strlcpy(xp->pr_name, prison_name(pr, cpr), sizeof(xp->pr_name));
4265 #ifdef INET
4266 		xp->pr_ip4s = ip4s;
4267 #endif
4268 #ifdef INET6
4269 		xp->pr_ip6s = ip6s;
4270 #endif
4271 		mtx_unlock(&cpr->pr_mtx);
4272 		error = SYSCTL_OUT(req, xp, sizeof(*xp));
4273 		if (error)
4274 			break;
4275 #ifdef INET
4276 		if (xp->pr_ip4s > 0) {
4277 			error = SYSCTL_OUT(req, ip4,
4278 			    xp->pr_ip4s * sizeof(struct in_addr));
4279 			if (error)
4280 				break;
4281 		}
4282 #endif
4283 #ifdef INET6
4284 		if (xp->pr_ip6s > 0) {
4285 			error = SYSCTL_OUT(req, ip6,
4286 			    xp->pr_ip6s * sizeof(struct in6_addr));
4287 			if (error)
4288 				break;
4289 		}
4290 #endif
4291 	}
4292 	sx_sunlock(&allprison_lock);
4293 	free(xp, M_TEMP);
4294 #ifdef INET
4295 	free(ip4, M_TEMP);
4296 #endif
4297 #ifdef INET6
4298 	free(ip6, M_TEMP);
4299 #endif
4300 	return (error);
4301 }
4302 
4303 SYSCTL_OID(_security_jail, OID_AUTO, list,
4304     CTLTYPE_STRUCT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4305     sysctl_jail_list, "S", "List of active jails");
4306 
4307 static int
4308 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
4309 {
4310 	int error, injail;
4311 
4312 	injail = jailed(req->td->td_ucred);
4313 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
4314 
4315 	return (error);
4316 }
4317 
4318 SYSCTL_PROC(_security_jail, OID_AUTO, jailed,
4319     CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
4320     sysctl_jail_jailed, "I", "Process in jail?");
4321 
4322 static int
4323 sysctl_jail_vnet(SYSCTL_HANDLER_ARGS)
4324 {
4325 	int error, havevnet;
4326 #ifdef VIMAGE
4327 	struct ucred *cred = req->td->td_ucred;
4328 
4329 	havevnet = jailed(cred) && prison_owns_vnet(cred);
4330 #else
4331 	havevnet = 0;
4332 #endif
4333 	error = SYSCTL_OUT(req, &havevnet, sizeof(havevnet));
4334 
4335 	return (error);
4336 }
4337 
/* "vnet" under _security_jail: read-only int computed by sysctl_jail_vnet. */
SYSCTL_PROC(_security_jail, OID_AUTO, vnet,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE, NULL, 0,
    sysctl_jail_vnet, "I", "Jail owns vnet?");

/*
 * Read/write unsigned knob backed directly by the jail_max_af_ips variable;
 * only compiled in when at least one of INET or INET6 is configured.
 */
#if defined(INET) || defined(INET6)
SYSCTL_UINT(_security_jail, OID_AUTO, jail_max_af_ips, CTLFLAG_RW,
    &jail_max_af_ips, 0,
    "Number of IP addresses a jail may have at most per address family (deprecated)");
#endif
4347 
4348 /*
4349  * Default parameters for jail(2) compatibility.  For historical reasons,
4350  * the sysctl names have varying similarity to the parameter names.  Prisons
4351  * just see their own parameters, and can't change them.
4352  */
4353 static int
4354 sysctl_jail_default_allow(SYSCTL_HANDLER_ARGS)
4355 {
4356 	int error, i;
4357 
4358 	/* Get the current flag value, and convert it to a boolean. */
4359 	if (req->td->td_ucred->cr_prison == &prison0) {
4360 		mtx_lock(&prison0.pr_mtx);
4361 		i = (jail_default_allow & arg2) != 0;
4362 		mtx_unlock(&prison0.pr_mtx);
4363 	} else
4364 		i = prison_allow(req->td->td_ucred, arg2);
4365 
4366 	if (arg1 != NULL)
4367 		i = !i;
4368 	error = sysctl_handle_int(oidp, &i, 0, req);
4369 	if (error || !req->newptr)
4370 		return (error);
4371 	i = i ? arg2 : 0;
4372 	if (arg1 != NULL)
4373 		i ^= arg2;
4374 	/*
4375 	 * The sysctls don't have CTLFLAGS_PRISON, so assume prison0
4376 	 * for writing.
4377 	 */
4378 	mtx_lock(&prison0.pr_mtx);
4379 	jail_default_allow = (jail_default_allow & ~arg2) | i;
4380 	mtx_unlock(&prison0.pr_mtx);
4381 	return (0);
4382 }
4383 
/*
 * Deprecated global permission sysctls, all handled by
 * sysctl_jail_default_allow.  The second handler argument (arg2) is the
 * PR_ALLOW_* bit being exposed.  The first (arg1) is NULL for a sysctl that
 * reads as the bit itself, or non-NULL ((void *)1) for one that reads as
 * its inverse, as socket_unixiproute_only does.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, set_hostname_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SET_HOSTNAME, sysctl_jail_default_allow, "I",
    "Processes in jail can set their hostnames (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, socket_unixiproute_only,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    (void *)1, PR_ALLOW_SOCKET_AF, sysctl_jail_default_allow, "I",
    "Processes in jail are limited to creating UNIX/IP/route sockets only (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, sysvipc_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_SYSVIPC, sysctl_jail_default_allow, "I",
    "Processes in jail can use System V IPC primitives (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, allow_raw_sockets,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_RAW_SOCKETS, sysctl_jail_default_allow, "I",
    "Prison root can create raw sockets (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, chflags_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_CHFLAGS, sysctl_jail_default_allow, "I",
    "Processes in jail can alter system file flags (deprecated)");
SYSCTL_PROC(_security_jail, OID_AUTO, mount_allowed,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    NULL, PR_ALLOW_MOUNT, sysctl_jail_default_allow, "I",
    "Processes in jail can mount/unmount jail-friendly file systems (deprecated)");
4408 
4409 static int
4410 sysctl_jail_default_level(SYSCTL_HANDLER_ARGS)
4411 {
4412 	struct prison *pr;
4413 	int level, error;
4414 
4415 	pr = req->td->td_ucred->cr_prison;
4416 	level = (pr == &prison0) ? *(int *)arg1 : *(int *)((char *)pr + arg2);
4417 	error = sysctl_handle_int(oidp, &level, 0, req);
4418 	if (error || !req->newptr)
4419 		return (error);
4420 	*(int *)arg1 = level;
4421 	return (0);
4422 }
4423 
/*
 * Deprecated integer defaults handled by sysctl_jail_default_level.  Each
 * passes the address of the global default as arg1 and the offset of the
 * corresponding struct prison field as arg2.  enforce_statfs is read/write;
 * devfs_ruleset is read-only.
 */
SYSCTL_PROC(_security_jail, OID_AUTO, enforce_statfs,
    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
    &jail_default_enforce_statfs, offsetof(struct prison, pr_enforce_statfs),
    sysctl_jail_default_level, "I",
    "Processes in jail cannot see all mounted file systems (deprecated)");

SYSCTL_PROC(_security_jail, OID_AUTO, devfs_ruleset,
    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_MPSAFE,
    &jail_default_devfs_rsnum, offsetof(struct prison, pr_devfs_rsnum),
    sysctl_jail_default_level, "I",
    "Ruleset for the devfs filesystem in jail (deprecated)");
4435 
/*
 * Nodes to describe jail parameters.  Maximum length of string parameters
 * is returned in the string itself, and the other parameters exist merely
 * to make themselves and their types known.  (As implemented by
 * sysctl_jail_param: structure parameters report their arg2 value as a
 * size_t, and integer/long parameters report zero.)
 */
SYSCTL_NODE(_security_jail, OID_AUTO, param, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
    "Jail parameters");
4443 
4444 int
4445 sysctl_jail_param(SYSCTL_HANDLER_ARGS)
4446 {
4447 	int i;
4448 	long l;
4449 	size_t s;
4450 	char numbuf[12];
4451 
4452 	switch (oidp->oid_kind & CTLTYPE)
4453 	{
4454 	case CTLTYPE_LONG:
4455 	case CTLTYPE_ULONG:
4456 		l = 0;
4457 #ifdef SCTL_MASK32
4458 		if (!(req->flags & SCTL_MASK32))
4459 #endif
4460 			return (SYSCTL_OUT(req, &l, sizeof(l)));
4461 	case CTLTYPE_INT:
4462 	case CTLTYPE_UINT:
4463 		i = 0;
4464 		return (SYSCTL_OUT(req, &i, sizeof(i)));
4465 	case CTLTYPE_STRING:
4466 		snprintf(numbuf, sizeof(numbuf), "%jd", (intmax_t)arg2);
4467 		return
4468 		    (sysctl_handle_string(oidp, numbuf, sizeof(numbuf), req));
4469 	case CTLTYPE_STRUCT:
4470 		s = (size_t)arg2;
4471 		return (SYSCTL_OUT(req, &s, sizeof(s)));
4472 	}
4473 	return (0);
4474 }
4475 
/*
 * CTLFLAG_RDTUN in the following indicates jail parameters that can be set at
 * jail creation time but cannot be changed in an existing jail.
 */
/* Top-level parameters. */
SYSCTL_JAIL_PARAM(, jid, CTLTYPE_INT | CTLFLAG_RDTUN, "I", "Jail ID");
SYSCTL_JAIL_PARAM(, parent, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail parent ID");
SYSCTL_JAIL_PARAM_STRING(, name, CTLFLAG_RW, MAXHOSTNAMELEN, "Jail name");
SYSCTL_JAIL_PARAM_STRING(, path, CTLFLAG_RDTUN, MAXPATHLEN, "Jail root path");
SYSCTL_JAIL_PARAM(, securelevel, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail secure level");
SYSCTL_JAIL_PARAM(, osreldate, CTLTYPE_INT | CTLFLAG_RDTUN, "I",
    "Jail value for kern.osreldate and uname -K");
SYSCTL_JAIL_PARAM_STRING(, osrelease, CTLFLAG_RDTUN, OSRELEASELEN,
    "Jail value for kern.osrelease and uname -r");
SYSCTL_JAIL_PARAM(, enforce_statfs, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Jail cannot see all mounted file systems");
SYSCTL_JAIL_PARAM(, devfs_ruleset, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Ruleset for in-jail devfs mounts");
SYSCTL_JAIL_PARAM(, persist, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail persistence");
#ifdef VIMAGE
SYSCTL_JAIL_PARAM(, vnet, CTLTYPE_INT | CTLFLAG_RDTUN,
    "E,jailsys", "Virtual network stack");
#endif
SYSCTL_JAIL_PARAM(, dying, CTLTYPE_INT | CTLFLAG_RD,
    "B", "Jail is in the process of shutting down");

/* children.*: child jail accounting. */
SYSCTL_JAIL_PARAM_NODE(children, "Number of child jails");
SYSCTL_JAIL_PARAM(_children, cur, CTLTYPE_INT | CTLFLAG_RD,
    "I", "Current number of child jails");
SYSCTL_JAIL_PARAM(_children, max, CTLTYPE_INT | CTLFLAG_RW,
    "I", "Maximum number of child jails");

/* host.*: per-jail host identity. */
SYSCTL_JAIL_PARAM_SYS_NODE(host, CTLFLAG_RW, "Jail host info");
SYSCTL_JAIL_PARAM_STRING(_host, hostname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail hostname");
SYSCTL_JAIL_PARAM_STRING(_host, domainname, CTLFLAG_RW, MAXHOSTNAMELEN,
    "Jail NIS domainname");
SYSCTL_JAIL_PARAM_STRING(_host, hostuuid, CTLFLAG_RW, HOSTUUIDLEN,
    "Jail host UUID");
SYSCTL_JAIL_PARAM(_host, hostid, CTLTYPE_ULONG | CTLFLAG_RW,
    "LU", "Jail host ID");

/* cpuset.*: the jail's cpuset. */
SYSCTL_JAIL_PARAM_NODE(cpuset, "Jail cpuset");
SYSCTL_JAIL_PARAM(_cpuset, id, CTLTYPE_INT | CTLFLAG_RD, "I", "Jail cpuset ID");

/*
 * ip4.* and ip6.*: per-family address parameters, each compiled in only
 * when its address family is configured.  The struct "addr" parameters
 * carry the size of one address (sizeof(struct in_addr) or
 * sizeof(struct in6_addr)).
 */
#ifdef INET
SYSCTL_JAIL_PARAM_SYS_NODE(ip4, CTLFLAG_RDTUN,
    "Jail IPv4 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip4, addr, CTLFLAG_RW, sizeof(struct in_addr),
    "S,in_addr,a", "Jail IPv4 addresses");
SYSCTL_JAIL_PARAM(_ip4, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv4 source address selection rather than the "
    "primary jail IPv4 address.");
#endif
#ifdef INET6
SYSCTL_JAIL_PARAM_SYS_NODE(ip6, CTLFLAG_RDTUN,
    "Jail IPv6 address virtualization");
SYSCTL_JAIL_PARAM_STRUCT(_ip6, addr, CTLFLAG_RW, sizeof(struct in6_addr),
    "S,in6_addr,a", "Jail IPv6 addresses");
SYSCTL_JAIL_PARAM(_ip6, saddrsel, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Do (not) use IPv6 source address selection rather than the "
    "primary jail IPv6 address.");
#endif
4540 
/*
 * allow.*: statically declared permission parameters, all read/write ints
 * with the "B" format.  Further allow.* entries are created at run time by
 * prison_add_allow().
 */
SYSCTL_JAIL_PARAM_NODE(allow, "Jail permission flags");
SYSCTL_JAIL_PARAM(_allow, set_hostname, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set hostname");
SYSCTL_JAIL_PARAM(_allow, sysvipc, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may use SYSV IPC");
SYSCTL_JAIL_PARAM(_allow, raw_sockets, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create raw sockets");
SYSCTL_JAIL_PARAM(_allow, chflags, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may alter system file flags");
SYSCTL_JAIL_PARAM(_allow, quotas, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may set file quotas");
SYSCTL_JAIL_PARAM(_allow, socket_af, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may create sockets other than just UNIX/IPv4/IPv6/route");
SYSCTL_JAIL_PARAM(_allow, mlock, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may lock (unlock) physical pages in memory");
SYSCTL_JAIL_PARAM(_allow, reserved_ports, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may bind sockets to reserved ports");
SYSCTL_JAIL_PARAM(_allow, read_msgbuf, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may read the kernel message buffer");
SYSCTL_JAIL_PARAM(_allow, unprivileged_proc_debug, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Unprivileged processes may use process debugging facilities");
SYSCTL_JAIL_PARAM(_allow, suser, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Processes in jail with uid 0 have privilege");
/* Only present when VNET_NFSD, VIMAGE and NFSD are all configured. */
#if defined(VNET_NFSD) && defined(VIMAGE) && defined(NFSD)
SYSCTL_JAIL_PARAM(_allow, nfsd, CTLTYPE_INT | CTLFLAG_RW,
    "B", "Mountd/nfsd may run in the jail");
#endif

/* allow.mount and its subnode, which prison_add_vfs() populates per file system. */
SYSCTL_JAIL_PARAM_SUBNODE(allow, mount, "Jail mount/unmount permission flags");
SYSCTL_JAIL_PARAM(_allow_mount, , CTLTYPE_INT | CTLFLAG_RW,
    "B", "Jail may mount/unmount jail-friendly file systems in general");
4572 
4573 /*
4574  * Add a dynamic parameter allow.<name>, or allow.<prefix>.<name>.  Return
4575  * its associated bit in the pr_allow bitmask, or zero if the parameter was
4576  * not created.
4577  */
4578 unsigned
4579 prison_add_allow(const char *prefix, const char *name, const char *prefix_descr,
4580     const char *descr)
4581 {
4582 	struct bool_flags *bf;
4583 	struct sysctl_oid *parent;
4584 	char *allow_name, *allow_noname, *allowed;
4585 #ifndef NO_SYSCTL_DESCR
4586 	char *descr_deprecated;
4587 #endif
4588 	u_int allow_flag;
4589 
4590 	if (prefix
4591 	    ? asprintf(&allow_name, M_PRISON, "allow.%s.%s", prefix, name)
4592 		< 0 ||
4593 	      asprintf(&allow_noname, M_PRISON, "allow.%s.no%s", prefix, name)
4594 		< 0
4595 	    : asprintf(&allow_name, M_PRISON, "allow.%s", name) < 0 ||
4596 	      asprintf(&allow_noname, M_PRISON, "allow.no%s", name) < 0) {
4597 		free(allow_name, M_PRISON);
4598 		return 0;
4599 	}
4600 
4601 	/*
4602 	 * See if this parameter has already beed added, i.e. a module was
4603 	 * previously loaded/unloaded.
4604 	 */
4605 	mtx_lock(&prison0.pr_mtx);
4606 	for (bf = pr_flag_allow;
4607 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
4608 		atomic_load_int(&bf->flag) != 0;
4609 	     bf++) {
4610 		if (strcmp(bf->name, allow_name) == 0) {
4611 			allow_flag = bf->flag;
4612 			goto no_add;
4613 		}
4614 	}
4615 
4616 	/*
4617 	 * Find a free bit in pr_allow_all, failing if there are none
4618 	 * (which shouldn't happen as long as we keep track of how many
4619 	 * potential dynamic flags exist).
4620 	 */
4621 	for (allow_flag = 1;; allow_flag <<= 1) {
4622 		if (allow_flag == 0)
4623 			goto no_add;
4624 		if ((pr_allow_all & allow_flag) == 0)
4625 			break;
4626 	}
4627 
4628 	/* Note the parameter in the next open slot in pr_flag_allow. */
4629 	for (bf = pr_flag_allow; ; bf++) {
4630 		if (bf == pr_flag_allow + nitems(pr_flag_allow)) {
4631 			/* This should never happen, but is not fatal. */
4632 			allow_flag = 0;
4633 			goto no_add;
4634 		}
4635 		if (atomic_load_int(&bf->flag) == 0)
4636 			break;
4637 	}
4638 	bf->name = allow_name;
4639 	bf->noname = allow_noname;
4640 	pr_allow_all |= allow_flag;
4641 	/*
4642 	 * prison0 always has permission for the new parameter.
4643 	 * Other jails must have it granted to them.
4644 	 */
4645 	prison0.pr_allow |= allow_flag;
4646 	/* The flag indicates a valid entry, so make sure it is set last. */
4647 	atomic_store_rel_int(&bf->flag, allow_flag);
4648 	mtx_unlock(&prison0.pr_mtx);
4649 
4650 	/*
4651 	 * Create sysctls for the parameter, and the back-compat global
4652 	 * permission.
4653 	 */
4654 	parent = prefix
4655 	    ? SYSCTL_ADD_NODE(NULL,
4656 		  SYSCTL_CHILDREN(&sysctl___security_jail_param_allow),
4657 		  OID_AUTO, prefix, CTLFLAG_MPSAFE, 0, prefix_descr)
4658 	    : &sysctl___security_jail_param_allow;
4659 	(void)SYSCTL_ADD_PROC(NULL, SYSCTL_CHILDREN(parent), OID_AUTO,
4660 	    name, CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE,
4661 	    NULL, 0, sysctl_jail_param, "B", descr);
4662 	if ((prefix
4663 	     ? asprintf(&allowed, M_TEMP, "%s_%s_allowed", prefix, name)
4664 	     : asprintf(&allowed, M_TEMP, "%s_allowed", name)) >= 0) {
4665 #ifndef NO_SYSCTL_DESCR
4666 		(void)asprintf(&descr_deprecated, M_TEMP, "%s (deprecated)",
4667 		    descr);
4668 #endif
4669 		(void)SYSCTL_ADD_PROC(NULL,
4670 		    SYSCTL_CHILDREN(&sysctl___security_jail), OID_AUTO, allowed,
4671 		    CTLTYPE_INT | CTLFLAG_RW | CTLFLAG_MPSAFE, NULL, allow_flag,
4672 		    sysctl_jail_default_allow, "I", descr_deprecated);
4673 #ifndef NO_SYSCTL_DESCR
4674 		free(descr_deprecated, M_TEMP);
4675 #endif
4676 		free(allowed, M_TEMP);
4677 	}
4678 	return allow_flag;
4679 
4680  no_add:
4681 	mtx_unlock(&prison0.pr_mtx);
4682 	free(allow_name, M_PRISON);
4683 	free(allow_noname, M_PRISON);
4684 	return allow_flag;
4685 }
4686 
4687 /*
4688  * The VFS system will register jail-aware filesystems here.  They each get
4689  * a parameter allow.mount.xxxfs and a flag to check when a jailed user
4690  * attempts to mount.
4691  */
4692 void
4693 prison_add_vfs(struct vfsconf *vfsp)
4694 {
4695 #ifdef NO_SYSCTL_DESCR
4696 
4697 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4698 	    NULL, NULL);
4699 #else
4700 	char *descr;
4701 
4702 	(void)asprintf(&descr, M_TEMP, "Jail may mount the %s file system",
4703 	    vfsp->vfc_name);
4704 	vfsp->vfc_prison_flag = prison_add_allow("mount", vfsp->vfc_name,
4705 	    NULL, descr);
4706 	free(descr, M_TEMP);
4707 #endif
4708 }
4709 
4710 #ifdef RACCT
4711 void
4712 prison_racct_foreach(void (*callback)(struct racct *racct,
4713     void *arg2, void *arg3), void (*pre)(void), void (*post)(void),
4714     void *arg2, void *arg3)
4715 {
4716 	struct prison_racct *prr;
4717 
4718 	ASSERT_RACCT_ENABLED();
4719 
4720 	sx_slock(&allprison_lock);
4721 	if (pre != NULL)
4722 		(pre)();
4723 	LIST_FOREACH(prr, &allprison_racct, prr_next)
4724 		(callback)(prr->prr_racct, arg2, arg3);
4725 	if (post != NULL)
4726 		(post)();
4727 	sx_sunlock(&allprison_lock);
4728 }
4729 
4730 static struct prison_racct *
4731 prison_racct_find_locked(const char *name)
4732 {
4733 	struct prison_racct *prr;
4734 
4735 	ASSERT_RACCT_ENABLED();
4736 	sx_assert(&allprison_lock, SA_XLOCKED);
4737 
4738 	if (name[0] == '\0' || strlen(name) >= MAXHOSTNAMELEN)
4739 		return (NULL);
4740 
4741 	LIST_FOREACH(prr, &allprison_racct, prr_next) {
4742 		if (strcmp(name, prr->prr_name) != 0)
4743 			continue;
4744 
4745 		/* Found prison_racct with a matching name? */
4746 		prison_racct_hold(prr);
4747 		return (prr);
4748 	}
4749 
4750 	/* Add new prison_racct. */
4751 	prr = malloc(sizeof(*prr), M_PRISON_RACCT, M_ZERO | M_WAITOK);
4752 	racct_create(&prr->prr_racct);
4753 
4754 	strcpy(prr->prr_name, name);
4755 	refcount_init(&prr->prr_refcount, 1);
4756 	LIST_INSERT_HEAD(&allprison_racct, prr, prr_next);
4757 
4758 	return (prr);
4759 }
4760 
4761 struct prison_racct *
4762 prison_racct_find(const char *name)
4763 {
4764 	struct prison_racct *prr;
4765 
4766 	ASSERT_RACCT_ENABLED();
4767 
4768 	sx_xlock(&allprison_lock);
4769 	prr = prison_racct_find_locked(name);
4770 	sx_xunlock(&allprison_lock);
4771 	return (prr);
4772 }
4773 
4774 void
4775 prison_racct_hold(struct prison_racct *prr)
4776 {
4777 
4778 	ASSERT_RACCT_ENABLED();
4779 
4780 	refcount_acquire(&prr->prr_refcount);
4781 }
4782 
4783 static void
4784 prison_racct_free_locked(struct prison_racct *prr)
4785 {
4786 
4787 	ASSERT_RACCT_ENABLED();
4788 	sx_assert(&allprison_lock, SA_XLOCKED);
4789 
4790 	if (refcount_release(&prr->prr_refcount)) {
4791 		racct_destroy(&prr->prr_racct);
4792 		LIST_REMOVE(prr, prr_next);
4793 		free(prr, M_PRISON_RACCT);
4794 	}
4795 }
4796 
4797 void
4798 prison_racct_free(struct prison_racct *prr)
4799 {
4800 
4801 	ASSERT_RACCT_ENABLED();
4802 	sx_assert(&allprison_lock, SA_UNLOCKED);
4803 
4804 	if (refcount_release_if_not_last(&prr->prr_refcount))
4805 		return;
4806 
4807 	sx_xlock(&allprison_lock);
4808 	prison_racct_free_locked(prr);
4809 	sx_xunlock(&allprison_lock);
4810 }
4811 
4812 static void
4813 prison_racct_attach(struct prison *pr)
4814 {
4815 	struct prison_racct *prr;
4816 
4817 	ASSERT_RACCT_ENABLED();
4818 	sx_assert(&allprison_lock, SA_XLOCKED);
4819 
4820 	prr = prison_racct_find_locked(pr->pr_name);
4821 	KASSERT(prr != NULL, ("cannot find prison_racct"));
4822 
4823 	pr->pr_prison_racct = prr;
4824 }
4825 
4826 /*
4827  * Handle jail renaming.  From the racct point of view, renaming means
4828  * moving from one prison_racct to another.
4829  */
4830 static void
4831 prison_racct_modify(struct prison *pr)
4832 {
4833 #ifdef RCTL
4834 	struct proc *p;
4835 	struct ucred *cred;
4836 #endif
4837 	struct prison_racct *oldprr;
4838 
4839 	ASSERT_RACCT_ENABLED();
4840 
4841 	sx_slock(&allproc_lock);
4842 	sx_xlock(&allprison_lock);
4843 
4844 	if (strcmp(pr->pr_name, pr->pr_prison_racct->prr_name) == 0) {
4845 		sx_xunlock(&allprison_lock);
4846 		sx_sunlock(&allproc_lock);
4847 		return;
4848 	}
4849 
4850 	oldprr = pr->pr_prison_racct;
4851 	pr->pr_prison_racct = NULL;
4852 
4853 	prison_racct_attach(pr);
4854 
4855 	/*
4856 	 * Move resource utilisation records.
4857 	 */
4858 	racct_move(pr->pr_prison_racct->prr_racct, oldprr->prr_racct);
4859 
4860 #ifdef RCTL
4861 	/*
4862 	 * Force rctl to reattach rules to processes.
4863 	 */
4864 	FOREACH_PROC_IN_SYSTEM(p) {
4865 		PROC_LOCK(p);
4866 		cred = crhold(p->p_ucred);
4867 		PROC_UNLOCK(p);
4868 		rctl_proc_ucred_changed(p, cred);
4869 		crfree(cred);
4870 	}
4871 #endif
4872 
4873 	sx_sunlock(&allproc_lock);
4874 	prison_racct_free_locked(oldprr);
4875 	sx_xunlock(&allprison_lock);
4876 }
4877 
4878 static void
4879 prison_racct_detach(struct prison *pr)
4880 {
4881 
4882 	ASSERT_RACCT_ENABLED();
4883 	sx_assert(&allprison_lock, SA_UNLOCKED);
4884 
4885 	if (pr->pr_prison_racct == NULL)
4886 		return;
4887 	prison_racct_free(pr->pr_prison_racct);
4888 	pr->pr_prison_racct = NULL;
4889 }
4890 #endif /* RACCT */
4891 
4892 #ifdef DDB
4893 
4894 static void
4895 db_show_prison(struct prison *pr)
4896 {
4897 	struct bool_flags *bf;
4898 	struct jailsys_flags *jsf;
4899 #if defined(INET) || defined(INET6)
4900 	int ii;
4901 #endif
4902 	unsigned f;
4903 #ifdef INET
4904 	char ip4buf[INET_ADDRSTRLEN];
4905 #endif
4906 #ifdef INET6
4907 	char ip6buf[INET6_ADDRSTRLEN];
4908 #endif
4909 
4910 	db_printf("prison %p:\n", pr);
4911 	db_printf(" jid             = %d\n", pr->pr_id);
4912 	db_printf(" name            = %s\n", pr->pr_name);
4913 	db_printf(" parent          = %p\n", pr->pr_parent);
4914 	db_printf(" ref             = %d\n", pr->pr_ref);
4915 	db_printf(" uref            = %d\n", pr->pr_uref);
4916 	db_printf(" state           = %s\n",
4917 	    pr->pr_state == PRISON_STATE_ALIVE ? "alive" :
4918 	    pr->pr_state == PRISON_STATE_DYING ? "dying" :
4919 	    "invalid");
4920 	db_printf(" path            = %s\n", pr->pr_path);
4921 	db_printf(" cpuset          = %d\n", pr->pr_cpuset
4922 	    ? pr->pr_cpuset->cs_id : -1);
4923 #ifdef VIMAGE
4924 	db_printf(" vnet            = %p\n", pr->pr_vnet);
4925 #endif
4926 	db_printf(" root            = %p\n", pr->pr_root);
4927 	db_printf(" securelevel     = %d\n", pr->pr_securelevel);
4928 	db_printf(" devfs_rsnum     = %d\n", pr->pr_devfs_rsnum);
4929 	db_printf(" children.max    = %d\n", pr->pr_childmax);
4930 	db_printf(" children.cur    = %d\n", pr->pr_childcount);
4931 	db_printf(" child           = %p\n", LIST_FIRST(&pr->pr_children));
4932 	db_printf(" sibling         = %p\n", LIST_NEXT(pr, pr_sibling));
4933 	db_printf(" flags           = 0x%x", pr->pr_flags);
4934 	for (bf = pr_flag_bool; bf < pr_flag_bool + nitems(pr_flag_bool); bf++)
4935 		if (pr->pr_flags & bf->flag)
4936 			db_printf(" %s", bf->name);
4937 	for (jsf = pr_flag_jailsys;
4938 	     jsf < pr_flag_jailsys + nitems(pr_flag_jailsys);
4939 	     jsf++) {
4940 		f = pr->pr_flags & (jsf->disable | jsf->new);
4941 		db_printf(" %-16s= %s\n", jsf->name,
4942 		    (f != 0 && f == jsf->disable) ? "disable"
4943 		    : (f == jsf->new) ? "new"
4944 		    : "inherit");
4945 	}
4946 	db_printf(" allow           = 0x%x", pr->pr_allow);
4947 	for (bf = pr_flag_allow;
4948 	     bf < pr_flag_allow + nitems(pr_flag_allow) &&
4949 		atomic_load_int(&bf->flag) != 0;
4950 	     bf++)
4951 		if (pr->pr_allow & bf->flag)
4952 			db_printf(" %s", bf->name);
4953 	db_printf("\n");
4954 	db_printf(" enforce_statfs  = %d\n", pr->pr_enforce_statfs);
4955 	db_printf(" host.hostname   = %s\n", pr->pr_hostname);
4956 	db_printf(" host.domainname = %s\n", pr->pr_domainname);
4957 	db_printf(" host.hostuuid   = %s\n", pr->pr_hostuuid);
4958 	db_printf(" host.hostid     = %lu\n", pr->pr_hostid);
4959 #ifdef INET
4960 	if (pr->pr_addrs[PR_INET] != NULL) {
4961 		pr_family_t af = PR_INET;
4962 
4963 		db_printf(" ip4s            = %d\n", pr->pr_addrs[af]->ips);
4964 		for (ii = 0; ii < pr->pr_addrs[af]->ips; ii++)
4965 			db_printf(" %s %s\n",
4966 			    ii == 0 ? "ip4.addr        =" : "                 ",
4967 			    inet_ntoa_r(
4968 			    *(const struct in_addr *)PR_IP(pr->pr_addrs[af], ii),
4969 			    ip4buf));
4970 	}
4971 #endif
4972 #ifdef INET6
4973 	if (pr->pr_addrs[PR_INET6] != NULL) {
4974 		pr_family_t af = PR_INET6;
4975 
4976 		db_printf(" ip6s            = %d\n", pr->pr_addrs[af]->ips);
4977 		for (ii = 0; ii < pr->pr_addrs[af]->ips; ii++)
4978 			db_printf(" %s %s\n",
4979 			    ii == 0 ? "ip6.addr        =" : "                 ",
4980 			    ip6_sprintf(ip6buf,
4981 			    (const struct in6_addr *)PR_IP(pr->pr_addrs[af], ii)));
4982 	}
4983 #endif
4984 }
4985 
4986 DB_SHOW_COMMAND(prison, db_show_prison_command)
4987 {
4988 	struct prison *pr;
4989 
4990 	if (!have_addr) {
4991 		/*
4992 		 * Show all prisons in the list, and prison0 which is not
4993 		 * listed.
4994 		 */
4995 		db_show_prison(&prison0);
4996 		if (!db_pager_quit) {
4997 			TAILQ_FOREACH(pr, &allprison, pr_list) {
4998 				db_show_prison(pr);
4999 				if (db_pager_quit)
5000 					break;
5001 			}
5002 		}
5003 		return;
5004 	}
5005 
5006 	if (addr == 0)
5007 		pr = &prison0;
5008 	else {
5009 		/* Look for a prison with the ID and with references. */
5010 		TAILQ_FOREACH(pr, &allprison, pr_list)
5011 			if (pr->pr_id == addr && pr->pr_ref > 0)
5012 				break;
5013 		if (pr == NULL)
5014 			/* Look again, without requiring a reference. */
5015 			TAILQ_FOREACH(pr, &allprison, pr_list)
5016 				if (pr->pr_id == addr)
5017 					break;
5018 		if (pr == NULL)
5019 			/* Assume address points to a valid prison. */
5020 			pr = (struct prison *)addr;
5021 	}
5022 	db_show_prison(pr);
5023 }
5024 
5025 #endif /* DDB */
5026