xref: /dragonfly/sys/kern/kern_jail.c (revision 7d84b73d)
1 /*
2  * ----------------------------------------------------------------------------
3  * "THE BEER-WARE LICENSE" (Revision 42):
4  * <phk@FreeBSD.ORG> wrote this file.  As long as you retain this notice you
5  * can do whatever you want with this stuff. If we meet some day, and you think
6  * this stuff is worth it, you can buy me a beer in return.   Poul-Henning Kamp
7  * ----------------------------------------------------------------------------
8  *
9  */
10 /*-
11  * Copyright (c) 2006 Victor Balada Diaz <victor@bsdes.net>
12  * All rights reserved.
13  *
14  * Redistribution and use in source and binary forms, with or without
15  * modification, are permitted provided that the following conditions
16  * are met:
17  * 1. Redistributions of source code must retain the above copyright
18  *    notice, this list of conditions and the following disclaimer.
19  * 2. Redistributions in binary form must reproduce the above copyright
20  *    notice, this list of conditions and the following disclaimer in the
21  *    documentation and/or other materials provided with the distribution.
22  *
23  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
24  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
27  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33  * SUCH DAMAGE.
34  */
35 
36 
37 /*
38  * $FreeBSD: src/sys/kern/kern_jail.c,v 1.6.2.3 2001/08/17 01:00:26 rwatson Exp $
39  */
40 
41 #include "opt_inet6.h"
42 
43 #include <sys/param.h>
44 #include <sys/types.h>
45 #include <sys/kernel.h>
46 #include <sys/systm.h>
47 #include <sys/errno.h>
48 #include <sys/sysmsg.h>
49 #include <sys/malloc.h>
50 #include <sys/nlookup.h>
51 #include <sys/namecache.h>
52 #include <sys/proc.h>
53 #include <sys/caps.h>
54 #include <sys/jail.h>
55 #include <sys/socket.h>
56 #include <sys/sysctl.h>
57 #include <sys/kern_syscall.h>
58 #include <net/if.h>
59 #include <netinet/in.h>
60 #include <netinet6/in6_var.h>
61 
/* Forward declarations for helpers defined later in this file. */
static struct prison	*prison_find(int);
static void		prison_ipcache_init(struct prison *);

/*
 * Capability template for new jails: sys_jail() copies this value into
 * pr_caps of each prison it creates.  Individual bits are exported
 * under the jail.defaults sysctl node.
 */
__read_mostly static prison_cap_t	prison_default_caps;

/*
 * Malloc type used in this file for prison structures, their IP list
 * entries and the per-jail sysctl context.
 */
MALLOC_DEFINE(M_PRISON, "prison", "Prison structures");

/* Root sysctl node: "jail". */
SYSCTL_NODE(, OID_AUTO, jail, CTLFLAG_RW, 0,
    "All jails settings");

/* Sub-node "jail.defaults" holding the default capability bits. */
SYSCTL_NODE(_jail, OID_AUTO, defaults, CTLFLAG_RW, 0,
    "Default options for jails");

/*
 * Optional reference debugging.  When PRISON_DEBUG is defined,
 * prison_hold() and prison_free() print a backtrace and decrement
 * jail.debug each time they are called while it is positive.
 */
/*#define PRISON_DEBUG*/
#ifdef PRISON_DEBUG
__read_mostly static int prison_debug;
SYSCTL_INT(_jail, OID_AUTO, debug, CTLFLAG_RW, &prison_debug, 0,
    "Debug prison refs");
#endif
81 
/*
 * jail.defaults.* knobs.  Each entry exposes one PRISON_CAP_* bit of
 * prison_default_caps as a read/write sysctl.  The template is copied
 * into a prison when it is created (see sys_jail()), so changing a
 * default here does not alter the pr_caps of prisons that already exist.
 *
 * NOTE(review): the numeric argument following &prison_default_caps
 * (1 for set_hostname_allowed, 0 elsewhere) is presumably the initial
 * value of the bit -- confirm against the SYSCTL_BIT64 definition.
 */
SYSCTL_BIT64(_jail_defaults, OID_AUTO, set_hostname_allowed, CTLFLAG_RW,
    &prison_default_caps, 1, PRISON_CAP_SYS_SET_HOSTNAME,
    "Processes in jail can set their hostnames");

SYSCTL_BIT64(_jail_defaults, OID_AUTO, socket_unixiproute_only, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_NET_UNIXIPROUTE,
    "Processes in jail are limited to creating UNIX/IPv[46]/route sockets only");

SYSCTL_BIT64(_jail_defaults, OID_AUTO, sysvipc_allowed, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_SYS_SYSVIPC,
    "Processes in jail can use System V IPC primitives");

SYSCTL_BIT64(_jail_defaults, OID_AUTO, chflags_allowed, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_VFS_CHFLAGS,
    "Processes in jail can alter system file flags");

SYSCTL_BIT64(_jail_defaults, OID_AUTO, allow_raw_sockets, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_NET_RAW_SOCKETS,
    "Process in jail can create raw sockets");

SYSCTL_BIT64(_jail_defaults, OID_AUTO, allow_listen_override, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_NET_LISTEN_OVERRIDE,
    "Process in jail can override host wildcard listen");

/* Per-filesystem mount permissions; consulted by prison_priv_check(). */
SYSCTL_BIT64(_jail_defaults, OID_AUTO, vfs_mount_nullfs, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_VFS_MOUNT_NULLFS,
    "Process in jail can mount nullfs(5) filesystems");

SYSCTL_BIT64(_jail_defaults, OID_AUTO, vfs_mount_tmpfs, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_VFS_MOUNT_TMPFS,
    "Process in jail can mount tmpfs(5) filesystems");

SYSCTL_BIT64(_jail_defaults, OID_AUTO, vfs_mount_devfs, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_VFS_MOUNT_DEVFS,
    "Process in jail can mount devfs(5) filesystems");

SYSCTL_BIT64(_jail_defaults, OID_AUTO, vfs_mount_procfs, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_VFS_MOUNT_PROCFS,
    "Process in jail can mount procfs(5) filesystems");

SYSCTL_BIT64(_jail_defaults, OID_AUTO, vfs_mount_fusefs, CTLFLAG_RW,
    &prison_default_caps, 0, PRISON_CAP_VFS_MOUNT_FUSEFS,
    "Process in jail can mount fuse filesystems");
125 
/* Most recent jail id handed out by assign_prison_id(). */
static int	lastprid = 0;
/* Number of prisons currently linked on allprison. */
static int	prisoncount = 0;

/*
 * Lock taken around accesses to allprison, prisoncount and the
 * per-prison IP lists.  It is initialized with LK_CANRECURSE; sys_jail()
 * holds it while calling helpers that acquire it again.
 */
static struct lock jail_lock =
       LOCK_INITIALIZER("jail", 0, LK_CANRECURSE);

/* List of all prisons, linked through pr_list. */
LIST_HEAD(prisonlist, prison);
static struct prisonlist allprison = LIST_HEAD_INITIALIZER(&allprison);
134 
135 static int
136 kern_jail_attach(int jid)
137 {
138 	struct proc *p = curthread->td_proc;
139 	struct prison *pr;
140 	struct ucred *cr;
141 	int error;
142 
143 	pr = prison_find(jid);
144 	if (pr == NULL)
145 		return(EINVAL);
146 
147 	error = kern_chroot(&pr->pr_root);
148 	if (error)
149 		return(error);
150 
151 	prison_hold(pr);
152 	lwkt_gettoken(&p->p_token);
153 	cr = cratom_proc(p);
154 	cr->cr_prison = pr;
155 	p->p_flags |= P_JAILED;
156 	caps_set_locked(p, SYSCAP_RESTRICTEDROOT, __SYSCAP_ALL);
157 	lwkt_reltoken(&p->p_token);
158 
159 	return(0);
160 }
161 
162 static int
163 assign_prison_id(struct prison *pr)
164 {
165 	int tryprid;
166 	struct prison *tpr;
167 
168 	tryprid = lastprid + 1;
169 	if (tryprid == JAIL_MAX)
170 		tryprid = 1;
171 
172 	lockmgr(&jail_lock, LK_EXCLUSIVE);
173 next:
174 	LIST_FOREACH(tpr, &allprison, pr_list) {
175 		if (tpr->pr_id != tryprid)
176 			continue;
177 		tryprid++;
178 		if (tryprid == JAIL_MAX) {
179 			lockmgr(&jail_lock, LK_RELEASE);
180 			return (ERANGE);
181 		}
182 		goto next;
183 	}
184 	pr->pr_id = lastprid = tryprid;
185 	lockmgr(&jail_lock, LK_RELEASE);
186 
187 	return (0);
188 }
189 
190 static int
191 kern_jail(struct prison *pr, struct jail *j)
192 {
193 	int error;
194 	struct nlookupdata nd;
195 
196 	error = nlookup_init(&nd, j->path, UIO_USERSPACE, NLC_FOLLOW);
197 	if (error) {
198 		nlookup_done(&nd);
199 		return (error);
200 	}
201 	error = nlookup(&nd);
202 	if (error) {
203 		nlookup_done(&nd);
204 		return (error);
205 	}
206 	cache_copy(&nd.nl_nch, &pr->pr_root);
207 
208 	varsymset_init(&pr->pr_varsymset, NULL);
209 	prison_ipcache_init(pr);
210 
211 	error = assign_prison_id(pr);
212 	if (error) {
213 		varsymset_clean(&pr->pr_varsymset);
214 		nlookup_done(&nd);
215 		return (error);
216 	}
217 
218 	lockmgr(&jail_lock, LK_EXCLUSIVE);
219 	LIST_INSERT_HEAD(&allprison, pr, pr_list);
220 	++prisoncount;
221 	lockmgr(&jail_lock, LK_RELEASE);
222 
223 	error = prison_sysctl_create(pr);
224 	if (error)
225 		goto out;
226 
227 	error = kern_jail_attach(pr->pr_id);
228 	if (error)
229 		goto out2;
230 
231 	nlookup_done(&nd);
232 	return 0;
233 
234 out2:
235 	prison_sysctl_done(pr);
236 
237 out:
238 	lockmgr(&jail_lock, LK_EXCLUSIVE);
239 	LIST_REMOVE(pr, pr_list);
240 	--prisoncount;
241 	lockmgr(&jail_lock, LK_RELEASE);
242 	varsymset_clean(&pr->pr_varsymset);
243 	nlookup_done(&nd);
244 	return (error);
245 }
246 
247 /*
248  * jail()
249  *
250  * jail_args(syscallarg(struct jail *) jail)
251  *
252  * MPALMOSTSAFE
253  */
254 int
255 sys_jail(struct sysmsg *sysmsg, const struct jail_args *uap)
256 {
257 	struct prison *pr;
258 	struct jail_ip_storage *jip;
259 	struct jail j;
260 	int error;
261 	uint32_t jversion;
262 
263 	sysmsg->sysmsg_result = -1;
264 
265 	error = caps_priv_check_self(SYSCAP_NOJAIL_CREATE);
266 	if (error)
267 		return (error);
268 
269 	error = copyin(uap->jail, &jversion, sizeof(jversion));
270 	if (error)
271 		return (error);
272 
273 	pr = kmalloc(sizeof(*pr), M_PRISON, M_WAITOK | M_ZERO);
274 	SLIST_INIT(&pr->pr_ips);
275 	lockmgr(&jail_lock, LK_EXCLUSIVE);
276 
277 	switch (jversion) {
278 	case 0:
279 		/* Single IPv4 jails. */
280 		{
281 		struct jail_v0 jv0;
282 		struct sockaddr_in ip4addr;
283 
284 		error = copyin(uap->jail, &jv0, sizeof(jv0));
285 		if (error)
286 			goto out;
287 
288 		j.path = jv0.path;
289 		j.hostname = jv0.hostname;
290 
291 		jip = kmalloc(sizeof(*jip),  M_PRISON, M_WAITOK | M_ZERO);
292 		ip4addr.sin_family = AF_INET;
293 		ip4addr.sin_addr.s_addr = htonl(jv0.ip_number);
294 		memcpy(&jip->ip, &ip4addr, sizeof(ip4addr));
295 		SLIST_INSERT_HEAD(&pr->pr_ips, jip, entries);
296 		break;
297 		}
298 
299 	case 1:
300 		/*
301 		 * DragonFly multi noIP/IPv4/IPv6 jails
302 		 *
303 		 * NOTE: This version is unsupported by FreeBSD
304 		 * (which uses version 2 instead).
305 		 */
306 
307 		error = copyin(uap->jail, &j, sizeof(j));
308 		if (error)
309 			goto out;
310 
311 		for (int i = 0; i < j.n_ips; i++) {
312 			jip = kmalloc(sizeof(*jip), M_PRISON,
313 				      M_WAITOK | M_ZERO);
314 			SLIST_INSERT_HEAD(&pr->pr_ips, jip, entries);
315 			error = copyin(&j.ips[i], &jip->ip,
316 					sizeof(struct sockaddr_storage));
317 			if (error)
318 				goto out;
319 		}
320 		break;
321 	default:
322 		error = EINVAL;
323 		goto out;
324 	}
325 
326 	error = copyinstr(j.hostname, &pr->pr_host, sizeof(pr->pr_host), 0);
327 	if (error)
328 		goto out;
329 
330 	/* Use default capabilities as a template */
331 	pr->pr_caps = prison_default_caps;
332 
333 	error = kern_jail(pr, &j);
334 	if (error)
335 		goto out;
336 
337 	sysmsg->sysmsg_result = pr->pr_id;
338 	lockmgr(&jail_lock, LK_RELEASE);
339 
340 	return (0);
341 
342 out:
343 	/* Delete all ips */
344 	while (!SLIST_EMPTY(&pr->pr_ips)) {
345 		jip = SLIST_FIRST(&pr->pr_ips);
346 		SLIST_REMOVE_HEAD(&pr->pr_ips, entries);
347 		kfree(jip, M_PRISON);
348 	}
349 	lockmgr(&jail_lock, LK_RELEASE);
350 	kfree(pr, M_PRISON);
351 
352 	return (error);
353 }
354 
355 /*
356  * int jail_attach(int jid);
357  *
358  * MPALMOSTSAFE
359  */
360 int
361 sys_jail_attach(struct sysmsg *sysmsg, const struct jail_attach_args *uap)
362 {
363 	int error;
364 
365 	error = caps_priv_check_self(SYSCAP_NOJAIL_ATTACH);
366 	if (error)
367 		return(error);
368 	lockmgr(&jail_lock, LK_EXCLUSIVE);
369 	error = kern_jail_attach(uap->jid);
370 	lockmgr(&jail_lock, LK_RELEASE);
371 	return (error);
372 }
373 
374 static void
375 prison_ipcache_init(struct prison *pr)
376 {
377 	struct jail_ip_storage *jis;
378 	struct sockaddr_in *ip4;
379 	struct sockaddr_in6 *ip6;
380 
381 	lockmgr(&jail_lock, LK_EXCLUSIVE);
382 	SLIST_FOREACH(jis, &pr->pr_ips, entries) {
383 		switch (jis->ip.ss_family) {
384 		case AF_INET:
385 			ip4 = (struct sockaddr_in *)&jis->ip;
386 			if ((ntohl(ip4->sin_addr.s_addr) >> IN_CLASSA_NSHIFT) ==
387 			    IN_LOOPBACKNET) {
388 				/* loopback address */
389 				if (pr->local_ip4 == NULL)
390 					pr->local_ip4 = ip4;
391 			} else {
392 				/* public address */
393 				if (pr->nonlocal_ip4 == NULL)
394 					pr->nonlocal_ip4 = ip4;
395 			}
396 			break;
397 
398 		case AF_INET6:
399 			ip6 = (struct sockaddr_in6 *)&jis->ip;
400 			if (IN6_IS_ADDR_LOOPBACK(&ip6->sin6_addr)) {
401 				/* loopback address */
402 				if (pr->local_ip6 == NULL)
403 					pr->local_ip6 = ip6;
404 			} else {
405 				/* public address */
406 				if (pr->nonlocal_ip6 == NULL)
407 					pr->nonlocal_ip6 = ip6;
408 			}
409 			break;
410 		}
411 	}
412 	lockmgr(&jail_lock, LK_RELEASE);
413 }
414 
415 /*
416  * Changes INADDR_LOOPBACK for a valid jail address.
417  * ip is in network byte order.
418  * Returns 1 if the ip is among jail valid ips.
419  * Returns 0 if is not among jail valid ips or
420  * if couldn't replace INADDR_LOOPBACK for a valid
421  * IP.
422  */
423 int
424 prison_replace_wildcards(struct thread *td, struct sockaddr *ip)
425 {
426 	struct sockaddr_in *ip4 = (struct sockaddr_in *)ip;
427 	struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)ip;
428 	struct prison *pr;
429 
430 	if (td->td_proc == NULL || td->td_ucred == NULL)
431 		return (1);
432 	if ((pr = td->td_ucred->cr_prison) == NULL)
433 		return (1);
434 
435 	if ((ip->sa_family == AF_INET &&
436 	    ip4->sin_addr.s_addr == htonl(INADDR_ANY)) ||
437 	    (ip->sa_family == AF_INET6 &&
438 	    IN6_IS_ADDR_UNSPECIFIED(&ip6->sin6_addr)))
439 		return (1);
440 	if ((ip->sa_family == AF_INET &&
441 	    ip4->sin_addr.s_addr == htonl(INADDR_LOOPBACK)) ||
442 	    (ip->sa_family == AF_INET6 &&
443 	    IN6_IS_ADDR_LOOPBACK(&ip6->sin6_addr))) {
444 		if (!prison_get_local(pr, ip->sa_family, ip) &&
445 		    !prison_get_nonlocal(pr, ip->sa_family, ip))
446 			return(0);
447 		else
448 			return(1);
449 	}
450 	if (jailed_ip(pr, ip))
451 		return(1);
452 	return(0);
453 }
454 
455 /*
456  * Convert the localhost IP to the actual jail IP
457  */
458 int
459 prison_remote_ip(struct thread *td, struct sockaddr *ip)
460 {
461 	struct sockaddr_in *ip4 = (struct sockaddr_in *)ip;
462 	struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)ip;
463 	struct prison *pr;
464 
465 	if (td == NULL || td->td_proc == NULL || td->td_ucred == NULL)
466 		return(1);
467 	if ((pr = td->td_ucred->cr_prison) == NULL)
468 		return(1);
469 	if ((ip->sa_family == AF_INET &&
470 	    ip4->sin_addr.s_addr == htonl(INADDR_LOOPBACK)) ||
471 	    (ip->sa_family == AF_INET6 &&
472 	    IN6_IS_ADDR_LOOPBACK(&ip6->sin6_addr))) {
473 		if (!prison_get_local(pr, ip->sa_family, ip) &&
474 		    !prison_get_nonlocal(pr, ip->sa_family, ip))
475 			return(0);
476 		else
477 			return(1);
478 	}
479 	return(1);
480 }
481 
482 /*
483  * Convert the jail IP back to localhost
484  *
485  * Used by getsockname() and getpeername() to convert the in-jail loopback
486  * address back to LOCALHOST.  For example, 127.0.0.2 -> 127.0.0.1.  The
487  * idea is that programs running inside the jail should be unaware that they
488  * are using a different loopback IP than the host.
489  */
490 __read_mostly static struct in6_addr sin6_localhost = IN6ADDR_LOOPBACK_INIT;
491 
492 int
493 prison_local_ip(struct thread *td, struct sockaddr *ip)
494 {
495 	struct sockaddr_in *ip4 = (struct sockaddr_in *)ip;
496 	struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)ip;
497 	struct prison *pr;
498 
499 	if (td == NULL || td->td_proc == NULL || td->td_ucred == NULL)
500 		return(1);
501 	if ((pr = td->td_ucred->cr_prison) == NULL)
502 		return(1);
503 	if (ip->sa_family == AF_INET && pr->local_ip4 &&
504 	    pr->local_ip4->sin_addr.s_addr == ip4->sin_addr.s_addr &&
505 	    pr->local_ip4->sin_addr.s_addr != htonl(INADDR_LOOPBACK)) {
506 		ip4->sin_addr.s_addr = htonl(INADDR_LOOPBACK);
507 		return(0);
508 	}
509 	if (ip->sa_family == AF_INET6 && pr->local_ip6 &&
510 	    bcmp(&pr->local_ip6->sin6_addr, &ip6->sin6_addr,
511 		 sizeof(ip6->sin6_addr)) == 0) {
512 		bcopy(&sin6_localhost, &ip6->sin6_addr, sizeof(ip6->sin6_addr));
513 		return(0);
514 	}
515 	return(1);
516 }
517 
518 /*
519  * Prison get non loopback ip:
520  * - af is the address family of the ip we want (AF_INET|AF_INET6).
521  * - If ip != NULL, put the first IP address that is not a loopback address
522  *   into *ip.
523  *
524  * ip is in network by order and we don't touch it unless we find a valid ip.
525  * No matter if ip == NULL or not, we return either a valid struct sockaddr *,
526  * or NULL.  This struct may not be modified.
527  */
528 struct sockaddr *
529 prison_get_nonlocal(struct prison *pr, sa_family_t af, struct sockaddr *ip)
530 {
531 	struct sockaddr_in *ip4 = (struct sockaddr_in *)ip;
532 	struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)ip;
533 
534 	/* Check if it is cached */
535 	switch(af) {
536 	case AF_INET:
537 		if (ip4 != NULL && pr->nonlocal_ip4 != NULL)
538 			ip4->sin_addr.s_addr = pr->nonlocal_ip4->sin_addr.s_addr;
539 		return (struct sockaddr *)pr->nonlocal_ip4;
540 
541 	case AF_INET6:
542 		if (ip6 != NULL && pr->nonlocal_ip6 != NULL)
543 			ip6->sin6_addr = pr->nonlocal_ip6->sin6_addr;
544 		return (struct sockaddr *)pr->nonlocal_ip6;
545 	}
546 
547 	/* NOTREACHED */
548 	return NULL;
549 }
550 
551 /*
552  * Prison get loopback ip.
553  * - af is the address family of the ip we want (AF_INET|AF_INET6).
554  * - If ip != NULL, put the first IP address that is not a loopback address
555  *   into *ip.
556  *
557  * ip is in network by order and we don't touch it unless we find a valid ip.
558  * No matter if ip == NULL or not, we return either a valid struct sockaddr *,
559  * or NULL.  This struct may not be modified.
560  */
561 struct sockaddr *
562 prison_get_local(struct prison *pr, sa_family_t af, struct sockaddr *ip)
563 {
564 	struct sockaddr_in *ip4 = (struct sockaddr_in *)ip;
565 	struct sockaddr_in6 *ip6 = (struct sockaddr_in6 *)ip;
566 
567 	/* Check if it is cached */
568 	switch(af) {
569 	case AF_INET:
570 		if (ip4 != NULL && pr->local_ip4 != NULL)
571 			ip4->sin_addr.s_addr = pr->local_ip4->sin_addr.s_addr;
572 		return (struct sockaddr *)pr->local_ip4;
573 
574 	case AF_INET6:
575 		if (ip6 != NULL && pr->local_ip6 != NULL)
576 			ip6->sin6_addr = pr->local_ip6->sin6_addr;
577 		return (struct sockaddr *)pr->local_ip6;
578 	}
579 
580 	/* NOTREACHED */
581 	return NULL;
582 }
583 
584 /* Check if the IP is among ours, if it is return 1, else 0 */
585 int
586 jailed_ip(struct prison *pr, const struct sockaddr *ip)
587 {
588 	const struct jail_ip_storage *jis;
589 	const struct sockaddr_in *jip4, *ip4;
590 	const struct sockaddr_in6 *jip6, *ip6;
591 
592 	if (pr == NULL)
593 		return(0);
594 	ip4 = (const struct sockaddr_in *)ip;
595 	ip6 = (const struct sockaddr_in6 *)ip;
596 
597 	lockmgr(&jail_lock, LK_EXCLUSIVE);
598 	SLIST_FOREACH(jis, &pr->pr_ips, entries) {
599 		switch (ip->sa_family) {
600 		case AF_INET:
601 			jip4 = (const struct sockaddr_in *) &jis->ip;
602 			if (jip4->sin_family == AF_INET &&
603 			    ip4->sin_addr.s_addr == jip4->sin_addr.s_addr) {
604 				lockmgr(&jail_lock, LK_RELEASE);
605 				return(1);
606 			}
607 			break;
608 		case AF_INET6:
609 			jip6 = (const struct sockaddr_in6 *) &jis->ip;
610 			if (jip6->sin6_family == AF_INET6 &&
611 			    IN6_ARE_ADDR_EQUAL(&ip6->sin6_addr,
612 				&jip6->sin6_addr)) {
613 				lockmgr(&jail_lock, LK_RELEASE);
614 				return(1);
615 			}
616 			break;
617 		}
618 	}
619 	lockmgr(&jail_lock, LK_RELEASE);
620 	/* Ip not in list */
621 	return(0);
622 }
623 
624 int
625 prison_if(struct ucred *cred, struct sockaddr *sa)
626 {
627 	struct prison *pr;
628 	struct sockaddr_in *sai = (struct sockaddr_in*) sa;
629 
630 	pr = cred->cr_prison;
631 
632 	if (((sai->sin_family != AF_INET) && (sai->sin_family != AF_INET6))
633 	    && PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_NET_UNIXIPROUTE))
634 		return(1);
635 	else if ((sai->sin_family != AF_INET) && (sai->sin_family != AF_INET6))
636 		return(0);
637 	else if (jailed_ip(pr, sa))
638 		return(0);
639 	return(1);
640 }
641 
642 /*
643  * Returns a prison instance, or NULL on failure.
644  */
645 static struct prison *
646 prison_find(int prid)
647 {
648 	struct prison *pr;
649 
650 	lockmgr(&jail_lock, LK_EXCLUSIVE);
651 	LIST_FOREACH(pr, &allprison, pr_list) {
652 		if (pr->pr_id == prid)
653 			break;
654 	}
655 	lockmgr(&jail_lock, LK_RELEASE);
656 
657 	return(pr);
658 }
659 
660 static int
661 sysctl_jail_list(SYSCTL_HANDLER_ARGS)
662 {
663 	struct thread *td = curthread;
664 	struct jail_ip_storage *jip;
665 #ifdef INET6
666 	struct sockaddr_in6 *jsin6;
667 #endif
668 	struct sockaddr_in *jsin;
669 	struct lwp *lp;
670 	struct prison *pr;
671 	unsigned int jlssize, jlsused;
672 	int count, error;
673 	char *jls; /* Jail list */
674 	char *oip; /* Output ip */
675 	char *fullpath, *freepath;
676 
677 	jlsused = 0;
678 
679 	if (jailed(td->td_ucred))
680 		return (0);
681 	lp = td->td_lwp;
682 retry:
683 	count = prisoncount;
684 
685 	if (count == 0)
686 		return(0);
687 
688 	jlssize = (count * 1024);
689 	jls = kmalloc(jlssize + 1, M_TEMP, M_WAITOK | M_ZERO);
690 	if (count < prisoncount) {
691 		kfree(jls, M_TEMP);
692 		goto retry;
693 	}
694 	count = prisoncount;
695 
696 	lockmgr(&jail_lock, LK_EXCLUSIVE);
697 	LIST_FOREACH(pr, &allprison, pr_list) {
698 		error = cache_fullpath(lp->lwp_proc, &pr->pr_root, NULL,
699 					&fullpath, &freepath, 0);
700 		if (error)
701 			continue;
702 		if (jlsused && jlsused < jlssize)
703 			jls[jlsused++] = '\n';
704 		count = ksnprintf(jls + jlsused, (jlssize - jlsused),
705 				 "%d %s %s",
706 				 pr->pr_id, pr->pr_host, fullpath);
707 		kfree(freepath, M_TEMP);
708 		if (count < 0)
709 			goto end;
710 		jlsused += count;
711 
712 		/* Copy the IPS */
713 		SLIST_FOREACH(jip, &pr->pr_ips, entries) {
714 			char buf[INET_ADDRSTRLEN];
715 
716 			jsin = (struct sockaddr_in *)&jip->ip;
717 
718 			switch(jsin->sin_family) {
719 			case AF_INET:
720 				oip = kinet_ntoa(jsin->sin_addr, buf);
721 				break;
722 #ifdef INET6
723 			case AF_INET6:
724 				jsin6 = (struct sockaddr_in6 *)&jip->ip;
725 				oip = ip6_sprintf(&jsin6->sin6_addr);
726 				break;
727 #endif
728 			default:
729 				oip = "?family?";
730 				break;
731 			}
732 
733 			if ((jlssize - jlsused) < (strlen(oip) + 1)) {
734 				error = ERANGE;
735 				goto end;
736 			}
737 			count = ksnprintf(jls + jlsused, (jlssize - jlsused),
738 					  " %s", oip);
739 			if (count < 0)
740 				goto end;
741 			jlsused += count;
742 		}
743 	}
744 
745 	/*
746 	 * The format is:
747 	 * pr_id <SPC> hostname1 <SPC> PATH1 <SPC> IP1 <SPC> IP2\npr_id...
748 	 */
749 	error = SYSCTL_OUT(req, jls, jlsused);
750 end:
751 	lockmgr(&jail_lock, LK_RELEASE);
752 	kfree(jls, M_TEMP);
753 
754 	return(error);
755 }
756 
757 SYSCTL_OID(_jail, OID_AUTO, list, CTLTYPE_STRING | CTLFLAG_RD, NULL, 0,
758 	   sysctl_jail_list, "A", "List of active jails");
759 
760 static int
761 sysctl_jail_jailed(SYSCTL_HANDLER_ARGS)
762 {
763 	int error, injail;
764 
765 	injail = jailed(req->td->td_ucred);
766 	error = SYSCTL_OUT(req, &injail, sizeof(injail));
767 
768 	return (error);
769 }
770 
771 SYSCTL_PROC(_jail, OID_AUTO, jailed,
772 	    CTLTYPE_INT | CTLFLAG_RD | CTLFLAG_NOLOCK, NULL, 0,
773 	    sysctl_jail_jailed, "I", "Process in jail?");
774 
775 /*
776  * MPSAFE
777  */
778 void
779 prison_hold(struct prison *pr)
780 {
781 	atomic_add_int(&pr->pr_ref, 1);
782 #ifdef PRISON_DEBUG
783 	if (prison_debug > 0) {
784 		--prison_debug;
785 		print_backtrace(-1);
786 	}
787 #endif
788 }
789 
790 /*
791  * MPALMOSTSAFE
792  */
793 void
794 prison_free(struct prison *pr)
795 {
796 	struct jail_ip_storage *jls;
797 
798 #ifdef PRISON_DEBUG
799 	if (prison_debug > 0) {
800 		--prison_debug;
801 		print_backtrace(-1);
802 	}
803 #endif
804 	KKASSERT(pr->pr_ref > 0);
805 	if (atomic_fetchadd_int(&pr->pr_ref, -1) != 1)
806 		return;
807 
808 	/*
809 	 * The global jail lock is needed on the last ref to adjust
810 	 * the list.
811 	 */
812 	lockmgr(&jail_lock, LK_EXCLUSIVE);
813 	if (pr->pr_ref) {
814 		lockmgr(&jail_lock, LK_RELEASE);
815 		return;
816 	}
817 	LIST_REMOVE(pr, pr_list);
818 	--prisoncount;
819 
820 	/*
821 	 * Clean up
822 	 */
823 	while (!SLIST_EMPTY(&pr->pr_ips)) {
824 		jls = SLIST_FIRST(&pr->pr_ips);
825 		SLIST_REMOVE_HEAD(&pr->pr_ips, entries);
826 		kfree(jls, M_PRISON);
827 	}
828 	lockmgr(&jail_lock, LK_RELEASE);
829 
830 	if (pr->pr_linux != NULL)
831 		kfree(pr->pr_linux, M_PRISON);
832 	varsymset_clean(&pr->pr_varsymset);
833 
834 	/* Release the sysctl tree */
835 	prison_sysctl_done(pr);
836 
837 	cache_drop(&pr->pr_root);
838 	kfree(pr, M_PRISON);
839 }
840 
841 /*
842  * Check if permisson for a specific privilege is granted within jail.
843  *
844  * MPSAFE
845  */
846 int
847 prison_priv_check(struct ucred *cred, int cap)
848 {
849 	struct prison *pr = cred->cr_prison;
850 
851 	if (!jailed(cred))
852 		return (0);
853 
854 	switch (cap & ~__SYSCAP_XFLAGS) {
855 	case SYSCAP_RESTRICTEDROOT:		/* meta group 1 */
856 		/* RESTRICTEDROOT fallbacks disallowed in jails */
857 		return EPERM;
858 	case SYSCAP_SENSITIVEROOT:		/* meta group 2 */
859 	case SYSCAP_NOEXEC:			/* meta group 3 */
860 	case SYSCAP_NOCRED:			/* meta group 4 */
861 		return 0;
862 	case SYSCAP_NOJAIL:			/* meta group 5 */
863 		/* all jail ops disallowed in jails */
864 		return EPERM;
865 	case SYSCAP_NONET:			/* meta group 6 */
866 		return 0;
867 	case SYSCAP_NONET_SENSITIVE:		/* meta group 7 */
868 		/* all sensitive network ops disallowed in jails */
869 		return EPERM;
870 	case SYSCAP_NOVFS:			/* meta group 8 */
871 	case SYSCAP_NOVFS_SENSITIVE:		/* meta group 9 */
872 	case SYSCAP_NOMOUNT:			/* meta group 10 */
873 	case SYSCAP_NO11:			/* meta group 11 */
874 	case SYSCAP_NO12:			/* meta group 12 */
875 	case SYSCAP_NO13:			/* meta group 13 */
876 	case SYSCAP_NO14:			/* meta group 14 */
877 	case SYSCAP_NO15:			/* meta group 15 */
878 		return (0);
879 
880 	/* ----- */				/* group 1 - disallowed */
881 
882 	case SYSCAP_NOPROC_TRESPASS:		/* group 2 allowed */
883 	case SYSCAP_NOPROC_SETLOGIN:
884 	case SYSCAP_NOPROC_SETRLIMIT:
885 	case SYSCAP_NOSYSCTL_WR:
886 	case SYSCAP_NOVARSYM_SYS:
887 	case SYSCAP_NOSETHOSTNAME:
888 	case SYSCAP_NOQUOTA_WR:
889 	case SYSCAP_NODEBUG_UNPRIV:
890 	case SYSCAP_NOSCHED:
891 	case SYSCAP_NOSCHED_CPUSET:
892 	case SYSCAP_NOSETTIME:
893 		return (0);
894 
895 	case SYSCAP_NOEXEC_SUID:		/* group 3 allowed */
896 	case SYSCAP_NOEXEC_SGID:
897 		return (0);
898 
899 	case SYSCAP_NOCRED_SETUID:		/* group 4 allowed */
900 	case SYSCAP_NOCRED_SETGID:
901 	case SYSCAP_NOCRED_SETEUID:
902 	case SYSCAP_NOCRED_SETEGID:
903 	case SYSCAP_NOCRED_SETREUID:
904 	case SYSCAP_NOCRED_SETREGID:
905 	case SYSCAP_NOCRED_SETRESUID:
906 	case SYSCAP_NOCRED_SETRESGID:
907 	case SYSCAP_NOCRED_SETGROUPS:
908 		return (0);
909 
910 	case SYSCAP_NOJAIL_CREATE:		/* group 5 disallowed */
911 	case SYSCAP_NOJAIL_ATTACH:
912 		return EPERM;
913 
914 	case SYSCAP_NONET_RESPORT:		/* group 6 mostly allowed */
915 		/*
916 		 * Allow reserved ports
917 		 */
918 		return 0;
919 	case SYSCAP_NONET_RAW:
920 		/*
921 		 * Conditionally allow creating raw sockets in jail.
922 		 */
923 		if (PRISON_CAP_ISSET(pr->pr_caps,
924 			PRISON_CAP_NET_RAW_SOCKETS))
925 			return (0);
926 		else
927 			return (EPERM);
928 
929 	/* ----- */				/* group 7 - disallowed */
930 
931 	case SYSCAP_NOVFS_SYSFLAGS:		/* group 8 - allowed */
932 	case SYSCAP_NOVFS_CHOWN:
933 	case SYSCAP_NOVFS_CHMOD:
934 	case SYSCAP_NOVFS_LINK:
935 	case SYSCAP_NOVFS_CHFLAGS_DEV:
936 	case SYSCAP_NOVFS_SETATTR:
937 	case SYSCAP_NOVFS_SETGID:
938 	case SYSCAP_NOVFS_GENERATION:
939 	case SYSCAP_NOVFS_RETAINSUGID:
940 		return (0);
941 
942 	case SYSCAP_NOVFS_MKNOD_BAD:		/* group 9 - allowed */
943 	case SYSCAP_NOVFS_MKNOD_WHT:
944 	case SYSCAP_NOVFS_MKNOD_DIR:
945 	case SYSCAP_NOVFS_MKNOD_DEV:
946 	case SYSCAP_NOVFS_IOCTL:
947 	case SYSCAP_NOVFS_CHROOT:
948 	case SYSCAP_NOVFS_REVOKE:
949 		return (0);
950 
951 	case SYSCAP_NOMOUNT_NULLFS:		/* group 10 - conditional */
952 		if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_VFS_MOUNT_NULLFS))
953 			return (0);
954 		else
955 			return (EPERM);
956 	case SYSCAP_NOMOUNT_DEVFS:
957 		if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_VFS_MOUNT_DEVFS))
958 			return (0);
959 		else
960 			return (EPERM);
961 	case SYSCAP_NOMOUNT_TMPFS:
962 		if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_VFS_MOUNT_TMPFS))
963 			return (0);
964 		else
965 			return (EPERM);
966 	case SYSCAP_NOMOUNT_PROCFS:
967 		if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_VFS_MOUNT_PROCFS))
968 			return (0);
969 		else
970 			return (EPERM);
971 	case SYSCAP_NOMOUNT_FUSE:
972 		if (PRISON_CAP_ISSET(pr->pr_caps, PRISON_CAP_VFS_MOUNT_FUSEFS))
973 			return (0);
974 		else
975 			return (EPERM);
976 	case SYSCAP_NOMOUNT_UMOUNT:
977 		return (0);
978 
979 	default:
980 		/* otherwise disallow */
981 		return (EPERM);
982 	}
983 }
984 
985 
986 /*
987  * Create a per-jail sysctl tree to control the prison
988  */
989 int
990 prison_sysctl_create(struct prison *pr)
991 {
992 	char id_str[7];
993 
994 	ksnprintf(id_str, 6, "%d", pr->pr_id);
995 
996 	pr->pr_sysctl_ctx = (struct sysctl_ctx_list *) kmalloc(
997 		sizeof(struct sysctl_ctx_list), M_PRISON, M_WAITOK | M_ZERO);
998 
999 	sysctl_ctx_init(pr->pr_sysctl_ctx);
1000 
1001 	/* Main jail node */
1002 	pr->pr_sysctl_tree = SYSCTL_ADD_NODE(pr->pr_sysctl_ctx,
1003 	    SYSCTL_STATIC_CHILDREN(_jail),
1004 	    OID_AUTO, id_str, CTLFLAG_RD, 0,
1005 	    "Jail specific settings");
1006 
1007 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1008 	    OID_AUTO, "sys_set_hostname", CTLFLAG_RW,
1009 	    &pr->pr_caps, 0, PRISON_CAP_SYS_SET_HOSTNAME,
1010 	    "Processes in jail can set their hostnames");
1011 
1012 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1013 	    OID_AUTO, "sys_sysvipc", CTLFLAG_RW,
1014 	    &pr->pr_caps, 0, PRISON_CAP_SYS_SYSVIPC,
1015 	    "Processes in jail can use System V IPC primitives");
1016 
1017 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1018 	    OID_AUTO, "net_unixiproute", CTLFLAG_RW,
1019 	    &pr->pr_caps, 0, PRISON_CAP_NET_UNIXIPROUTE,
1020 	    "Processes in jail are limited to creating UNIX/IPv[46]/route sockets only");
1021 
1022 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1023 	    OID_AUTO, "net_raw_sockets", CTLFLAG_RW,
1024 	    &pr->pr_caps, 0, PRISON_CAP_NET_RAW_SOCKETS,
1025 	    "Process in jail can create raw sockets");
1026 
1027 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1028 	    OID_AUTO, "allow_listen_override", CTLFLAG_RW,
1029 	    &pr->pr_caps, 0, PRISON_CAP_NET_LISTEN_OVERRIDE,
1030 	    "Process in jail can create raw sockets");
1031 
1032 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1033 	    OID_AUTO, "vfs_chflags", CTLFLAG_RW,
1034 	    &pr->pr_caps, 0, PRISON_CAP_VFS_CHFLAGS,
1035 	    "Process in jail can override host wildcard listen");
1036 
1037 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1038 	    OID_AUTO, "vfs_mount_nullfs", CTLFLAG_RW,
1039 	    &pr->pr_caps, 0, PRISON_CAP_VFS_MOUNT_NULLFS,
1040 	    "Processes in jail can mount nullfs(5) filesystems");
1041 
1042 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1043 	    OID_AUTO, "vfs_mount_tmpfs", CTLFLAG_RW,
1044 	    &pr->pr_caps, 0, PRISON_CAP_VFS_MOUNT_TMPFS,
1045 	    "Processes in jail can mount tmpfs(5) filesystems");
1046 
1047 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1048 	    OID_AUTO, "vfs_mount_devfs", CTLFLAG_RW,
1049 	    &pr->pr_caps, 0, PRISON_CAP_VFS_MOUNT_DEVFS,
1050 	    "Processes in jail can mount devfs(5) filesystems");
1051 
1052 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1053 	    OID_AUTO, "vfs_mount_procfs", CTLFLAG_RW,
1054 	    &pr->pr_caps, 0, PRISON_CAP_VFS_MOUNT_PROCFS,
1055 	    "Processes in jail can mount procfs(5) filesystems");
1056 
1057 	SYSCTL_ADD_BIT64(pr->pr_sysctl_ctx, SYSCTL_CHILDREN(pr->pr_sysctl_tree),
1058 	    OID_AUTO, "vfs_mount_fusefs", CTLFLAG_RW,
1059 	    &pr->pr_caps, 0, PRISON_CAP_VFS_MOUNT_FUSEFS,
1060 	    "Processes in jail can mount fuse filesystems");
1061 
1062 	return 0;
1063 }
1064 
1065 int
1066 prison_sysctl_done(struct prison *pr)
1067 {
1068 	if (pr->pr_sysctl_tree) {
1069 		sysctl_ctx_free(pr->pr_sysctl_ctx);
1070 		kfree(pr->pr_sysctl_ctx, M_PRISON);
1071 		pr->pr_sysctl_tree = NULL;
1072 	}
1073 
1074 	return 0;
1075 }
1076