xref: /freebsd/sys/kern/kern_rctl.c (revision 0957b409)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  * All rights reserved.
6  *
7  * This software was developed by Edward Tomasz Napierala under sponsorship
8  * from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  * $FreeBSD$
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/bus.h>
39 #include <sys/malloc.h>
40 #include <sys/queue.h>
41 #include <sys/refcount.h>
42 #include <sys/jail.h>
43 #include <sys/kernel.h>
44 #include <sys/limits.h>
45 #include <sys/loginclass.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/racct.h>
49 #include <sys/rctl.h>
50 #include <sys/resourcevar.h>
51 #include <sys/sx.h>
52 #include <sys/sysent.h>
53 #include <sys/sysproto.h>
54 #include <sys/systm.h>
55 #include <sys/types.h>
56 #include <sys/eventhandler.h>
57 #include <sys/lock.h>
58 #include <sys/mutex.h>
59 #include <sys/rwlock.h>
60 #include <sys/sbuf.h>
61 #include <sys/taskqueue.h>
62 #include <sys/tree.h>
63 #include <vm/uma.h>
64 
65 #ifdef RCTL
66 #ifndef RACCT
67 #error "The RCTL option requires the RACCT option"
68 #endif
69 
/*
 * Declare the "rctl" feature together with its description string.
 * NOTE(review): FEATURE() is defined outside this file; presumably this is
 * what lets the feature be detected at runtime -- confirm in its header.
 */
FEATURE(rctl, "Resource Limits");
71 
/*
 * HRF_* flag values.
 * NOTE(review): these are not referenced in the part of the file reviewed
 * here; their consumers need to be confirmed before changing them.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer size limits, in bytes.  RCTL_MAX_OUTBUFSIZE is the initial value
 * of the rctl_maxbufsize tunable; RCTL_LOG_BUFSIZE is the size of the
 * temporary buffer rctl_enforce() uses to format log and devctl messages.
 *
 * The products are parenthesized so that each macro behaves as a single
 * value wherever it is expanded, e.g. as a divisor or next to a cast.
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Safety margin that rctl_pcpu_available() subtracts from the available
 * %cpu amount for limits larger than twice this value.
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
81 
/*
 * Tunables exported under kern.racct.rctl by the SYSCTL_UINT() declarations
 * below.  The two rate limits are handed to ppsratecheck() in rctl_enforce()
 * to bound the number of log and devctl messages.
 */
static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
static int rctl_log_rate_limit = 10;
static int rctl_devctl_rate_limit = 10;

/*
 * Values below are initialized in rctl_init().
 */
static int rctl_throttle_min = -1;
static int rctl_throttle_max = -1;
static int rctl_throttle_pct = -1;
static int rctl_throttle_pct2 = -1;

/*
 * Handlers for the throttle_* knobs; they validate a new value before
 * storing it, which is why those knobs use SYSCTL_PROC() rather than a
 * plain integer sysctl.
 */
static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);

SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW, 0, "Resource Limits");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
    &rctl_maxbufsize, 0, "Maximum output buffer size");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
    &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
    &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
/*
 * Each throttle_* knob is also registered as a loader tunable that writes
 * straight into the backing variable.
 */
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_min_sysctl, "IU",
    "Shortest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_max_sysctl, "IU",
    "Longest throttling duration, in hz");
TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct_sysctl, "IU",
    "Throttling penalty for process consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
    CTLTYPE_UINT | CTLFLAG_RWTUN, 0, 0, &rctl_throttle_pct2_sysctl, "IU",
    "Throttling penalty for container consumption, in percent");
TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
122 
/*
 * 'rctl_rule_link' connects a rule with every racct it's related to.
 * For example, rule 'user:X:openfiles:deny=N/process' is linked
 * with uidinfo for user X, and to each process of that user.
 *
 * Links live on the racct's r_rule_links list and each one holds a
 * reference on the rule it points to.
 */
struct rctl_rule_link {
	/* Linkage on the owning racct's r_rule_links list. */
	LIST_ENTRY(rctl_rule_link)	rrl_next;
	/* The rule this link refers to. */
	struct rctl_rule		*rrl_rule;
	/*
	 * Set to 1 by rctl_enforce() once a log, devctl or signal action
	 * has fired for this link, and reset to 0 when the usage is back
	 * within the limit, so that the action is not repeated.
	 */
	int				rrl_exceeded;
};
133 
/*
 * One entry of a name <-> value lookup table.  Tables are arrays of these
 * terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* Textual name; NULL ends a table. */
	int		d_value;	/* Numeric constant for that name. */
};
138 
/*
 * Names of the rule subject types, used by rctl_subject_type_name() to turn
 * an RCTL_SUBJECT_TYPE_* value back into text.
 */
static struct dict subjectnames[] = {
	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
	{ "user", RCTL_SUBJECT_TYPE_USER },
	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
	{ NULL, -1 }};
145 
/*
 * Names of the accounted resources, used by rctl_resource_name() to turn a
 * RACCT_* value back into text.
 */
static struct dict resourcenames[] = {
	{ "cputime", RACCT_CPU },
	{ "datasize", RACCT_DATA },
	{ "stacksize", RACCT_STACK },
	{ "coredumpsize", RACCT_CORE },
	{ "memoryuse", RACCT_RSS },
	{ "memorylocked", RACCT_MEMLOCK },
	{ "maxproc", RACCT_NPROC },
	{ "openfiles", RACCT_NOFILE },
	{ "vmemoryuse", RACCT_VMEM },
	{ "pseudoterminals", RACCT_NPTS },
	{ "swapuse", RACCT_SWAP },
	{ "nthr", RACCT_NTHR },
	{ "msgqqueued", RACCT_MSGQQUEUED },
	{ "msgqsize", RACCT_MSGQSIZE },
	{ "nmsgq", RACCT_NMSGQ },
	{ "nsem", RACCT_NSEM },
	{ "nsemop", RACCT_NSEMOP },
	{ "nshm", RACCT_NSHM },
	{ "shmsize", RACCT_SHMSIZE },
	{ "wallclock", RACCT_WALLCLOCK },
	{ "pcpu", RACCT_PCTCPU },
	{ "readbps", RACCT_READBPS },
	{ "writebps", RACCT_WRITEBPS },
	{ "readiops", RACCT_READIOPS },
	{ "writeiops", RACCT_WRITEIOPS },
	{ NULL, -1 }};
173 
/*
 * Names of the rule actions, used by rctl_action_name() to turn an
 * RCTL_ACTION_* value back into text.  rctl_enforce() passes the value of
 * any signal action directly to kern_psignal(), relying on the
 * RCTL_ACTION_SIG* constants being equal to the matching signal numbers.
 */
static struct dict actionnames[] = {
	{ "sighup", RCTL_ACTION_SIGHUP },
	{ "sigint", RCTL_ACTION_SIGINT },
	{ "sigquit", RCTL_ACTION_SIGQUIT },
	{ "sigill", RCTL_ACTION_SIGILL },
	{ "sigtrap", RCTL_ACTION_SIGTRAP },
	{ "sigabrt", RCTL_ACTION_SIGABRT },
	{ "sigemt", RCTL_ACTION_SIGEMT },
	{ "sigfpe", RCTL_ACTION_SIGFPE },
	{ "sigkill", RCTL_ACTION_SIGKILL },
	{ "sigbus", RCTL_ACTION_SIGBUS },
	{ "sigsegv", RCTL_ACTION_SIGSEGV },
	{ "sigsys", RCTL_ACTION_SIGSYS },
	{ "sigpipe", RCTL_ACTION_SIGPIPE },
	{ "sigalrm", RCTL_ACTION_SIGALRM },
	{ "sigterm", RCTL_ACTION_SIGTERM },
	{ "sigurg", RCTL_ACTION_SIGURG },
	{ "sigstop", RCTL_ACTION_SIGSTOP },
	{ "sigtstp", RCTL_ACTION_SIGTSTP },
	{ "sigchld", RCTL_ACTION_SIGCHLD },
	{ "sigttin", RCTL_ACTION_SIGTTIN },
	{ "sigttou", RCTL_ACTION_SIGTTOU },
	{ "sigio", RCTL_ACTION_SIGIO },
	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
	{ "sigprof", RCTL_ACTION_SIGPROF },
	{ "sigwinch", RCTL_ACTION_SIGWINCH },
	{ "siginfo", RCTL_ACTION_SIGINFO },
	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
	{ "sigthr", RCTL_ACTION_SIGTHR },
	{ "deny", RCTL_ACTION_DENY },
	{ "log", RCTL_ACTION_LOG },
	{ "devctl", RCTL_ACTION_DEVCTL },
	{ "throttle", RCTL_ACTION_THROTTLE },
	{ NULL, -1 }};
211 
/* Initialization routine, registered with SYSINIT() below. */
static void rctl_init(void);
SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);

/* UMA zones that struct rctl_rule and struct rctl_rule_link come from. */
static uma_zone_t rctl_rule_zone;
static uma_zone_t rctl_rule_link_zone;

/* Forward declarations of helpers used before their definitions. */
static int rctl_rule_fully_specified(const struct rctl_rule *rule);
static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);

/* Malloc type for the temporary message buffers in rctl_enforce(). */
static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
222 
223 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
224 {
225 	int error, val = rctl_throttle_min;
226 
227 	error = sysctl_handle_int(oidp, &val, 0, req);
228 	if (error || !req->newptr)
229 		return (error);
230 	if (val < 1 || val > rctl_throttle_max)
231 		return (EINVAL);
232 
233 	RACCT_LOCK();
234 	rctl_throttle_min = val;
235 	RACCT_UNLOCK();
236 
237 	return (0);
238 }
239 
240 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
241 {
242 	int error, val = rctl_throttle_max;
243 
244 	error = sysctl_handle_int(oidp, &val, 0, req);
245 	if (error || !req->newptr)
246 		return (error);
247 	if (val < rctl_throttle_min)
248 		return (EINVAL);
249 
250 	RACCT_LOCK();
251 	rctl_throttle_max = val;
252 	RACCT_UNLOCK();
253 
254 	return (0);
255 }
256 
257 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
258 {
259 	int error, val = rctl_throttle_pct;
260 
261 	error = sysctl_handle_int(oidp, &val, 0, req);
262 	if (error || !req->newptr)
263 		return (error);
264 	if (val < 0)
265 		return (EINVAL);
266 
267 	RACCT_LOCK();
268 	rctl_throttle_pct = val;
269 	RACCT_UNLOCK();
270 
271 	return (0);
272 }
273 
274 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
275 {
276 	int error, val = rctl_throttle_pct2;
277 
278 	error = sysctl_handle_int(oidp, &val, 0, req);
279 	if (error || !req->newptr)
280 		return (error);
281 	if (val < 0)
282 		return (EINVAL);
283 
284 	RACCT_LOCK();
285 	rctl_throttle_pct2 = val;
286 	RACCT_UNLOCK();
287 
288 	return (0);
289 }
290 
291 static const char *
292 rctl_subject_type_name(int subject)
293 {
294 	int i;
295 
296 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
297 		if (subjectnames[i].d_value == subject)
298 			return (subjectnames[i].d_name);
299 	}
300 
301 	panic("rctl_subject_type_name: unknown subject type %d", subject);
302 }
303 
304 static const char *
305 rctl_action_name(int action)
306 {
307 	int i;
308 
309 	for (i = 0; actionnames[i].d_name != NULL; i++) {
310 		if (actionnames[i].d_value == action)
311 			return (actionnames[i].d_name);
312 	}
313 
314 	panic("rctl_action_name: unknown action %d", action);
315 }
316 
317 const char *
318 rctl_resource_name(int resource)
319 {
320 	int i;
321 
322 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
323 		if (resourcenames[i].d_value == resource)
324 			return (resourcenames[i].d_name);
325 	}
326 
327 	panic("rctl_resource_name: unknown resource %d", resource);
328 }
329 
330 static struct racct *
331 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
332 {
333 	struct ucred *cred = p->p_ucred;
334 
335 	ASSERT_RACCT_ENABLED();
336 	RACCT_LOCK_ASSERT();
337 
338 	switch (rule->rr_per) {
339 	case RCTL_SUBJECT_TYPE_PROCESS:
340 		return (p->p_racct);
341 	case RCTL_SUBJECT_TYPE_USER:
342 		return (cred->cr_ruidinfo->ui_racct);
343 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
344 		return (cred->cr_loginclass->lc_racct);
345 	case RCTL_SUBJECT_TYPE_JAIL:
346 		return (cred->cr_prison->pr_prison_racct->prr_racct);
347 	default:
348 		panic("%s: unknown per %d", __func__, rule->rr_per);
349 	}
350 }
351 
352 /*
353  * Return the amount of resource that can be allocated by 'p' before
354  * hitting 'rule'.
355  */
356 static int64_t
357 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
358 {
359 	const struct racct *racct;
360 	int64_t available;
361 
362 	ASSERT_RACCT_ENABLED();
363 	RACCT_LOCK_ASSERT();
364 
365 	racct = rctl_proc_rule_to_racct(p, rule);
366 	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
367 
368 	return (available);
369 }
370 
371 /*
372  * Called every second for proc, uidinfo, loginclass, and jail containers.
373  * If the limit isn't exceeded, it decreases the usage amount to zero.
374  * Otherwise, it decreases it by the value of the limit.  This way
375  * resource consumption exceeding the limit "carries over" to the next
376  * period.
377  */
378 void
379 rctl_throttle_decay(struct racct *racct, int resource)
380 {
381 	struct rctl_rule *rule;
382 	struct rctl_rule_link *link;
383 	int64_t minavailable;
384 
385 	ASSERT_RACCT_ENABLED();
386 	RACCT_LOCK_ASSERT();
387 
388 	minavailable = INT64_MAX;
389 
390 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
391 		rule = link->rrl_rule;
392 
393 		if (rule->rr_resource != resource)
394 			continue;
395 		if (rule->rr_action != RCTL_ACTION_THROTTLE)
396 			continue;
397 
398 		if (rule->rr_amount < minavailable)
399 			minavailable = rule->rr_amount;
400 	}
401 
402 	if (racct->r_resources[resource] < minavailable) {
403 		racct->r_resources[resource] = 0;
404 	} else {
405 		/*
406 		 * Cap utilization counter at ten times the limit.  Otherwise,
407 		 * if we changed the rule lowering the allowed amount, it could
408 		 * take unreasonably long time for the accumulated resource
409 		 * usage to drop.
410 		 */
411 		if (racct->r_resources[resource] > minavailable * 10)
412 			racct->r_resources[resource] = minavailable * 10;
413 
414 		racct->r_resources[resource] -= minavailable;
415 	}
416 }
417 
418 /*
419  * Special version of rctl_get_available() for the %CPU resource.
420  * We slightly cheat here and return less than we normally would.
421  */
422 int64_t
423 rctl_pcpu_available(const struct proc *p) {
424 	struct rctl_rule *rule;
425 	struct rctl_rule_link *link;
426 	int64_t available, minavailable, limit;
427 
428 	ASSERT_RACCT_ENABLED();
429 	RACCT_LOCK_ASSERT();
430 
431 	minavailable = INT64_MAX;
432 	limit = 0;
433 
434 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
435 		rule = link->rrl_rule;
436 		if (rule->rr_resource != RACCT_PCTCPU)
437 			continue;
438 		if (rule->rr_action != RCTL_ACTION_DENY)
439 			continue;
440 		available = rctl_available_resource(p, rule);
441 		if (available < minavailable) {
442 			minavailable = available;
443 			limit = rule->rr_amount;
444 		}
445 	}
446 
447 	/*
448 	 * Return slightly less than actual value of the available
449 	 * %cpu resource.  This makes %cpu throttling more aggressive
450 	 * and lets us act sooner than the limits are already exceeded.
451 	 */
452 	if (limit != 0) {
453 		if (limit > 2 * RCTL_PCPU_SHIFT)
454 			minavailable -= RCTL_PCPU_SHIFT;
455 		else
456 			minavailable -= (limit / 2);
457 	}
458 
459 	return (minavailable);
460 }
461 
/*
 * Saturating addition: returns a + b, or UINT64_MAX if the sum does not
 * fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum overflows exactly when b exceeds the room left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
477 
/*
 * Saturating multiplication: returns a * b, or UINT64_MAX if the product
 * does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* Multiplying by zero cannot overflow and must not be divided by. */
	if (b == 0)
		return (0);

	return ((a > UINT64_MAX / b) ? UINT64_MAX : a * b);
}
487 
488 /*
489  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
490  * to what it keeps allocated now.  Returns non-zero if the allocation should
491  * be denied, 0 otherwise.
492  */
493 int
494 rctl_enforce(struct proc *p, int resource, uint64_t amount)
495 {
496 	static struct timeval log_lasttime, devctl_lasttime;
497 	static int log_curtime = 0, devctl_curtime = 0;
498 	struct rctl_rule *rule;
499 	struct rctl_rule_link *link;
500 	struct sbuf sb;
501 	char *buf;
502 	int64_t available;
503 	uint64_t sleep_ms, sleep_ratio;
504 	int should_deny = 0;
505 
506 	ASSERT_RACCT_ENABLED();
507 	RACCT_LOCK_ASSERT();
508 
509 	/*
510 	 * There may be more than one matching rule; go through all of them.
511 	 * Denial should be done last, after logging and sending signals.
512 	 */
513 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
514 		rule = link->rrl_rule;
515 		if (rule->rr_resource != resource)
516 			continue;
517 
518 		available = rctl_available_resource(p, rule);
519 		if (available >= (int64_t)amount) {
520 			link->rrl_exceeded = 0;
521 			continue;
522 		}
523 
524 		switch (rule->rr_action) {
525 		case RCTL_ACTION_DENY:
526 			should_deny = 1;
527 			continue;
528 		case RCTL_ACTION_LOG:
529 			/*
530 			 * If rrl_exceeded != 0, it means we've already
531 			 * logged a warning for this process.
532 			 */
533 			if (link->rrl_exceeded != 0)
534 				continue;
535 
536 			/*
537 			 * If the process state is not fully initialized yet,
538 			 * we can't access most of the required fields, e.g.
539 			 * p->p_comm.  This happens when called from fork1().
540 			 * Ignore this rule for now; it will be processed just
541 			 * after fork, when called from racct_proc_fork_done().
542 			 */
543 			if (p->p_state != PRS_NORMAL)
544 				continue;
545 
546 			if (!ppsratecheck(&log_lasttime, &log_curtime,
547 			    rctl_log_rate_limit))
548 				continue;
549 
550 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
551 			if (buf == NULL) {
552 				printf("rctl_enforce: out of memory\n");
553 				continue;
554 			}
555 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
556 			rctl_rule_to_sbuf(&sb, rule);
557 			sbuf_finish(&sb);
558 			printf("rctl: rule \"%s\" matched by pid %d "
559 			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
560 			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
561 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
562 			sbuf_delete(&sb);
563 			free(buf, M_RCTL);
564 			link->rrl_exceeded = 1;
565 			continue;
566 		case RCTL_ACTION_DEVCTL:
567 			if (link->rrl_exceeded != 0)
568 				continue;
569 
570 			if (p->p_state != PRS_NORMAL)
571 				continue;
572 
573 			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
574 			    rctl_devctl_rate_limit))
575 				continue;
576 
577 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
578 			if (buf == NULL) {
579 				printf("rctl_enforce: out of memory\n");
580 				continue;
581 			}
582 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
583 			sbuf_printf(&sb, "rule=");
584 			rctl_rule_to_sbuf(&sb, rule);
585 			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
586 			    p->p_pid, p->p_ucred->cr_ruid,
587 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
588 			sbuf_finish(&sb);
589 			devctl_notify_f("RCTL", "rule", "matched",
590 			    sbuf_data(&sb), M_NOWAIT);
591 			sbuf_delete(&sb);
592 			free(buf, M_RCTL);
593 			link->rrl_exceeded = 1;
594 			continue;
595 		case RCTL_ACTION_THROTTLE:
596 			if (p->p_state != PRS_NORMAL)
597 				continue;
598 
599 			/*
600 			 * Make the process sleep for a fraction of second
601 			 * proportional to the ratio of process' resource
602 			 * utilization compared to the limit.  The point is
603 			 * to penalize resource hogs: processes that consume
604 			 * more of the available resources sleep for longer.
605 			 *
606 			 * We're trying to defer division until the very end,
607 			 * to minimize the rounding effects.  The following
608 			 * calculation could have been written in a clearer
609 			 * way like this:
610 			 *
611 			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
612 			 *     rule->rr_amount;
613 			 * sleep_ms *= rctl_throttle_pct / 100;
614 			 * if (sleep_ms < rctl_throttle_min)
615 			 *         sleep_ms = rctl_throttle_min;
616 			 *
617 			 */
618 			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
619 			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
620 			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
621 				sleep_ms = rctl_throttle_min * rule->rr_amount;
622 
623 			/*
624 			 * Multiply that by the ratio of the resource
625 			 * consumption for the container compared to the limit,
626 			 * squared.  In other words, a process in a container
627 			 * that is two times over the limit will be throttled
628 			 * four times as much for hitting the same rule.  The
629 			 * point is to penalize processes more if the container
630 			 * itself (eg certain UID or jail) is above the limit.
631 			 */
632 			if (available < 0)
633 				sleep_ratio = -available / rule->rr_amount;
634 			else
635 				sleep_ratio = 0;
636 			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
637 			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
638 			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
639 
640 			/*
641 			 * Finally the division.
642 			 */
643 			sleep_ms /= rule->rr_amount;
644 
645 			if (sleep_ms > rctl_throttle_max)
646 				sleep_ms = rctl_throttle_max;
647 #if 0
648 			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
649 			   __func__, p->p_pid, p->p_comm,
650 			   p->p_racct->r_resources[resource],
651 			   rule->rr_amount, (uintmax_t)sleep_ms,
652 			   (uintmax_t)sleep_ratio, (intmax_t)available);
653 #endif
654 
655 			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
656 			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
657 			racct_proc_throttle(p, sleep_ms);
658 			continue;
659 		default:
660 			if (link->rrl_exceeded != 0)
661 				continue;
662 
663 			if (p->p_state != PRS_NORMAL)
664 				continue;
665 
666 			KASSERT(rule->rr_action > 0 &&
667 			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
668 			    ("rctl_enforce: unknown action %d",
669 			     rule->rr_action));
670 
671 			/*
672 			 * We're using the fact that RCTL_ACTION_SIG* values
673 			 * are equal to their counterparts from sys/signal.h.
674 			 */
675 			kern_psignal(p, rule->rr_action);
676 			link->rrl_exceeded = 1;
677 			continue;
678 		}
679 	}
680 
681 	if (should_deny) {
682 		/*
683 		 * Return fake error code; the caller should change it
684 		 * into one proper for the situation - EFSIZ, ENOMEM etc.
685 		 */
686 		return (EDOOFUS);
687 	}
688 
689 	return (0);
690 }
691 
692 uint64_t
693 rctl_get_limit(struct proc *p, int resource)
694 {
695 	struct rctl_rule *rule;
696 	struct rctl_rule_link *link;
697 	uint64_t amount = UINT64_MAX;
698 
699 	ASSERT_RACCT_ENABLED();
700 	RACCT_LOCK_ASSERT();
701 
702 	/*
703 	 * There may be more than one matching rule; go through all of them.
704 	 * Denial should be done last, after logging and sending signals.
705 	 */
706 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
707 		rule = link->rrl_rule;
708 		if (rule->rr_resource != resource)
709 			continue;
710 		if (rule->rr_action != RCTL_ACTION_DENY)
711 			continue;
712 		if (rule->rr_amount < amount)
713 			amount = rule->rr_amount;
714 	}
715 
716 	return (amount);
717 }
718 
719 uint64_t
720 rctl_get_available(struct proc *p, int resource)
721 {
722 	struct rctl_rule *rule;
723 	struct rctl_rule_link *link;
724 	int64_t available, minavailable, allocated;
725 
726 	minavailable = INT64_MAX;
727 
728 	ASSERT_RACCT_ENABLED();
729 	RACCT_LOCK_ASSERT();
730 
731 	/*
732 	 * There may be more than one matching rule; go through all of them.
733 	 * Denial should be done last, after logging and sending signals.
734 	 */
735 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
736 		rule = link->rrl_rule;
737 		if (rule->rr_resource != resource)
738 			continue;
739 		if (rule->rr_action != RCTL_ACTION_DENY)
740 			continue;
741 		available = rctl_available_resource(p, rule);
742 		if (available < minavailable)
743 			minavailable = available;
744 	}
745 
746 	/*
747 	 * XXX: Think about this _hard_.
748 	 */
749 	allocated = p->p_racct->r_resources[resource];
750 	if (minavailable < INT64_MAX - allocated)
751 		minavailable += allocated;
752 	if (minavailable < 0)
753 		minavailable = 0;
754 
755 	return (minavailable);
756 }
757 
758 static int
759 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
760 {
761 
762 	ASSERT_RACCT_ENABLED();
763 
764 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
765 		if (rule->rr_subject_type != filter->rr_subject_type)
766 			return (0);
767 
768 		switch (filter->rr_subject_type) {
769 		case RCTL_SUBJECT_TYPE_PROCESS:
770 			if (filter->rr_subject.rs_proc != NULL &&
771 			    rule->rr_subject.rs_proc !=
772 			    filter->rr_subject.rs_proc)
773 				return (0);
774 			break;
775 		case RCTL_SUBJECT_TYPE_USER:
776 			if (filter->rr_subject.rs_uip != NULL &&
777 			    rule->rr_subject.rs_uip !=
778 			    filter->rr_subject.rs_uip)
779 				return (0);
780 			break;
781 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
782 			if (filter->rr_subject.rs_loginclass != NULL &&
783 			    rule->rr_subject.rs_loginclass !=
784 			    filter->rr_subject.rs_loginclass)
785 				return (0);
786 			break;
787 		case RCTL_SUBJECT_TYPE_JAIL:
788 			if (filter->rr_subject.rs_prison_racct != NULL &&
789 			    rule->rr_subject.rs_prison_racct !=
790 			    filter->rr_subject.rs_prison_racct)
791 				return (0);
792 			break;
793 		default:
794 			panic("rctl_rule_matches: unknown subject type %d",
795 			    filter->rr_subject_type);
796 		}
797 	}
798 
799 	if (filter->rr_resource != RACCT_UNDEFINED) {
800 		if (rule->rr_resource != filter->rr_resource)
801 			return (0);
802 	}
803 
804 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
805 		if (rule->rr_action != filter->rr_action)
806 			return (0);
807 	}
808 
809 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
810 		if (rule->rr_amount != filter->rr_amount)
811 			return (0);
812 	}
813 
814 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
815 		if (rule->rr_per != filter->rr_per)
816 			return (0);
817 	}
818 
819 	return (1);
820 }
821 
822 static int
823 str2value(const char *str, int *value, struct dict *table)
824 {
825 	int i;
826 
827 	if (value == NULL)
828 		return (EINVAL);
829 
830 	for (i = 0; table[i].d_name != NULL; i++) {
831 		if (strcasecmp(table[i].d_name, str) == 0) {
832 			*value =  table[i].d_value;
833 			return (0);
834 		}
835 	}
836 
837 	return (EINVAL);
838 }
839 
840 static int
841 str2id(const char *str, id_t *value)
842 {
843 	char *end;
844 
845 	if (str == NULL)
846 		return (EINVAL);
847 
848 	*value = strtoul(str, &end, 10);
849 	if ((size_t)(end - str) != strlen(str))
850 		return (EINVAL);
851 
852 	return (0);
853 }
854 
/*
 * Parse 'str' as a decimal number into '*value'.
 *
 * Returns 0 on success, EINVAL when 'str' is NULL or is not consumed
 * entirely by the conversion, and ERANGE when the converted number is
 * negative once stored as an int64_t.  Note that '*value' is written
 * before either check is made.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *stop;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &stop, 10);

	/* The conversion must have stopped at the terminating NUL. */
	if (*stop != '\0')
		return (EINVAL);

	return ((*value < 0) ? ERANGE : 0);
}
872 
873 /*
874  * Connect the rule to the racct, increasing refcount for the rule.
875  */
876 static void
877 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
878 {
879 	struct rctl_rule_link *link;
880 
881 	ASSERT_RACCT_ENABLED();
882 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
883 
884 	rctl_rule_acquire(rule);
885 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
886 	link->rrl_rule = rule;
887 	link->rrl_exceeded = 0;
888 
889 	RACCT_LOCK();
890 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
891 	RACCT_UNLOCK();
892 }
893 
894 static int
895 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
896 {
897 	struct rctl_rule_link *link;
898 
899 	ASSERT_RACCT_ENABLED();
900 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
901 	RACCT_LOCK_ASSERT();
902 
903 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
904 	if (link == NULL)
905 		return (ENOMEM);
906 	rctl_rule_acquire(rule);
907 	link->rrl_rule = rule;
908 	link->rrl_exceeded = 0;
909 
910 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
911 
912 	return (0);
913 }
914 
915 /*
916  * Remove limits for a rules matching the filter and release
917  * the refcounts for the rules, possibly freeing them.  Returns
918  * the number of limit structures removed.
919  */
920 static int
921 rctl_racct_remove_rules(struct racct *racct,
922     const struct rctl_rule *filter)
923 {
924 	struct rctl_rule_link *link, *linktmp;
925 	int removed = 0;
926 
927 	ASSERT_RACCT_ENABLED();
928 	RACCT_LOCK_ASSERT();
929 
930 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
931 		if (!rctl_rule_matches(link->rrl_rule, filter))
932 			continue;
933 
934 		LIST_REMOVE(link, rrl_next);
935 		rctl_rule_release(link->rrl_rule);
936 		uma_zfree(rctl_rule_link_zone, link);
937 		removed++;
938 	}
939 	return (removed);
940 }
941 
942 static void
943 rctl_rule_acquire_subject(struct rctl_rule *rule)
944 {
945 
946 	ASSERT_RACCT_ENABLED();
947 
948 	switch (rule->rr_subject_type) {
949 	case RCTL_SUBJECT_TYPE_UNDEFINED:
950 	case RCTL_SUBJECT_TYPE_PROCESS:
951 		break;
952 	case RCTL_SUBJECT_TYPE_JAIL:
953 		if (rule->rr_subject.rs_prison_racct != NULL)
954 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
955 		break;
956 	case RCTL_SUBJECT_TYPE_USER:
957 		if (rule->rr_subject.rs_uip != NULL)
958 			uihold(rule->rr_subject.rs_uip);
959 		break;
960 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
961 		if (rule->rr_subject.rs_loginclass != NULL)
962 			loginclass_hold(rule->rr_subject.rs_loginclass);
963 		break;
964 	default:
965 		panic("rctl_rule_acquire_subject: unknown subject type %d",
966 		    rule->rr_subject_type);
967 	}
968 }
969 
970 static void
971 rctl_rule_release_subject(struct rctl_rule *rule)
972 {
973 
974 	ASSERT_RACCT_ENABLED();
975 
976 	switch (rule->rr_subject_type) {
977 	case RCTL_SUBJECT_TYPE_UNDEFINED:
978 	case RCTL_SUBJECT_TYPE_PROCESS:
979 		break;
980 	case RCTL_SUBJECT_TYPE_JAIL:
981 		if (rule->rr_subject.rs_prison_racct != NULL)
982 			prison_racct_free(rule->rr_subject.rs_prison_racct);
983 		break;
984 	case RCTL_SUBJECT_TYPE_USER:
985 		if (rule->rr_subject.rs_uip != NULL)
986 			uifree(rule->rr_subject.rs_uip);
987 		break;
988 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
989 		if (rule->rr_subject.rs_loginclass != NULL)
990 			loginclass_free(rule->rr_subject.rs_loginclass);
991 		break;
992 	default:
993 		panic("rctl_rule_release_subject: unknown subject type %d",
994 		    rule->rr_subject_type);
995 	}
996 }
997 
998 struct rctl_rule *
999 rctl_rule_alloc(int flags)
1000 {
1001 	struct rctl_rule *rule;
1002 
1003 	ASSERT_RACCT_ENABLED();
1004 
1005 	rule = uma_zalloc(rctl_rule_zone, flags);
1006 	if (rule == NULL)
1007 		return (NULL);
1008 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1009 	rule->rr_subject.rs_proc = NULL;
1010 	rule->rr_subject.rs_uip = NULL;
1011 	rule->rr_subject.rs_loginclass = NULL;
1012 	rule->rr_subject.rs_prison_racct = NULL;
1013 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1014 	rule->rr_resource = RACCT_UNDEFINED;
1015 	rule->rr_action = RCTL_ACTION_UNDEFINED;
1016 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1017 	refcount_init(&rule->rr_refcount, 1);
1018 
1019 	return (rule);
1020 }
1021 
1022 struct rctl_rule *
1023 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1024 {
1025 	struct rctl_rule *copy;
1026 
1027 	ASSERT_RACCT_ENABLED();
1028 
1029 	copy = uma_zalloc(rctl_rule_zone, flags);
1030 	if (copy == NULL)
1031 		return (NULL);
1032 	copy->rr_subject_type = rule->rr_subject_type;
1033 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1034 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1035 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1036 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1037 	copy->rr_per = rule->rr_per;
1038 	copy->rr_resource = rule->rr_resource;
1039 	copy->rr_action = rule->rr_action;
1040 	copy->rr_amount = rule->rr_amount;
1041 	refcount_init(&copy->rr_refcount, 1);
1042 	rctl_rule_acquire_subject(copy);
1043 
1044 	return (copy);
1045 }
1046 
1047 void
1048 rctl_rule_acquire(struct rctl_rule *rule)
1049 {
1050 
1051 	ASSERT_RACCT_ENABLED();
1052 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1053 
1054 	refcount_acquire(&rule->rr_refcount);
1055 }
1056 
1057 static void
1058 rctl_rule_free(void *context, int pending)
1059 {
1060 	struct rctl_rule *rule;
1061 
1062 	rule = (struct rctl_rule *)context;
1063 
1064 	ASSERT_RACCT_ENABLED();
1065 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1066 
1067 	/*
1068 	 * We don't need locking here; rule is guaranteed to be inaccessible.
1069 	 */
1070 
1071 	rctl_rule_release_subject(rule);
1072 	uma_zfree(rctl_rule_zone, rule);
1073 }
1074 
1075 void
1076 rctl_rule_release(struct rctl_rule *rule)
1077 {
1078 
1079 	ASSERT_RACCT_ENABLED();
1080 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1081 
1082 	if (refcount_release(&rule->rr_refcount)) {
1083 		/*
1084 		 * rctl_rule_release() is often called when iterating
1085 		 * over all the uidinfo structures in the system,
1086 		 * holding uihashtbl_lock.  Since rctl_rule_free()
1087 		 * might end up calling uifree(), this would lead
1088 		 * to lock recursion.  Use taskqueue to avoid this.
1089 		 */
1090 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1091 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1092 	}
1093 }
1094 
1095 static int
1096 rctl_rule_fully_specified(const struct rctl_rule *rule)
1097 {
1098 
1099 	ASSERT_RACCT_ENABLED();
1100 
1101 	switch (rule->rr_subject_type) {
1102 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1103 		return (0);
1104 	case RCTL_SUBJECT_TYPE_PROCESS:
1105 		if (rule->rr_subject.rs_proc == NULL)
1106 			return (0);
1107 		break;
1108 	case RCTL_SUBJECT_TYPE_USER:
1109 		if (rule->rr_subject.rs_uip == NULL)
1110 			return (0);
1111 		break;
1112 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1113 		if (rule->rr_subject.rs_loginclass == NULL)
1114 			return (0);
1115 		break;
1116 	case RCTL_SUBJECT_TYPE_JAIL:
1117 		if (rule->rr_subject.rs_prison_racct == NULL)
1118 			return (0);
1119 		break;
1120 	default:
1121 		panic("rctl_rule_fully_specified: unknown subject type %d",
1122 		    rule->rr_subject_type);
1123 	}
1124 	if (rule->rr_resource == RACCT_UNDEFINED)
1125 		return (0);
1126 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1127 		return (0);
1128 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1129 		return (0);
1130 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1131 		return (0);
1132 
1133 	return (1);
1134 }
1135 
1136 static int
1137 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1138 {
1139 	struct rctl_rule *rule;
1140 	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1141 	     *amountstr, *perstr;
1142 	id_t id;
1143 	int error = 0;
1144 
1145 	ASSERT_RACCT_ENABLED();
1146 
1147 	rule = rctl_rule_alloc(M_WAITOK);
1148 
1149 	subjectstr = strsep(&rulestr, ":");
1150 	subject_idstr = strsep(&rulestr, ":");
1151 	resourcestr = strsep(&rulestr, ":");
1152 	actionstr = strsep(&rulestr, "=/");
1153 	amountstr = strsep(&rulestr, "/");
1154 	perstr = rulestr;
1155 
1156 	if (subjectstr == NULL || subjectstr[0] == '\0')
1157 		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1158 	else {
1159 		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1160 		if (error != 0)
1161 			goto out;
1162 	}
1163 
1164 	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1165 		rule->rr_subject.rs_proc = NULL;
1166 		rule->rr_subject.rs_uip = NULL;
1167 		rule->rr_subject.rs_loginclass = NULL;
1168 		rule->rr_subject.rs_prison_racct = NULL;
1169 	} else {
1170 		switch (rule->rr_subject_type) {
1171 		case RCTL_SUBJECT_TYPE_UNDEFINED:
1172 			error = EINVAL;
1173 			goto out;
1174 		case RCTL_SUBJECT_TYPE_PROCESS:
1175 			error = str2id(subject_idstr, &id);
1176 			if (error != 0)
1177 				goto out;
1178 			sx_assert(&allproc_lock, SA_LOCKED);
1179 			rule->rr_subject.rs_proc = pfind(id);
1180 			if (rule->rr_subject.rs_proc == NULL) {
1181 				error = ESRCH;
1182 				goto out;
1183 			}
1184 			PROC_UNLOCK(rule->rr_subject.rs_proc);
1185 			break;
1186 		case RCTL_SUBJECT_TYPE_USER:
1187 			error = str2id(subject_idstr, &id);
1188 			if (error != 0)
1189 				goto out;
1190 			rule->rr_subject.rs_uip = uifind(id);
1191 			break;
1192 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1193 			rule->rr_subject.rs_loginclass =
1194 			    loginclass_find(subject_idstr);
1195 			if (rule->rr_subject.rs_loginclass == NULL) {
1196 				error = ENAMETOOLONG;
1197 				goto out;
1198 			}
1199 			break;
1200 		case RCTL_SUBJECT_TYPE_JAIL:
1201 			rule->rr_subject.rs_prison_racct =
1202 			    prison_racct_find(subject_idstr);
1203 			if (rule->rr_subject.rs_prison_racct == NULL) {
1204 				error = ENAMETOOLONG;
1205 				goto out;
1206 			}
1207 			break;
1208                default:
1209                        panic("rctl_string_to_rule: unknown subject type %d",
1210                            rule->rr_subject_type);
1211                }
1212 	}
1213 
1214 	if (resourcestr == NULL || resourcestr[0] == '\0')
1215 		rule->rr_resource = RACCT_UNDEFINED;
1216 	else {
1217 		error = str2value(resourcestr, &rule->rr_resource,
1218 		    resourcenames);
1219 		if (error != 0)
1220 			goto out;
1221 	}
1222 
1223 	if (actionstr == NULL || actionstr[0] == '\0')
1224 		rule->rr_action = RCTL_ACTION_UNDEFINED;
1225 	else {
1226 		error = str2value(actionstr, &rule->rr_action, actionnames);
1227 		if (error != 0)
1228 			goto out;
1229 	}
1230 
1231 	if (amountstr == NULL || amountstr[0] == '\0')
1232 		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1233 	else {
1234 		error = str2int64(amountstr, &rule->rr_amount);
1235 		if (error != 0)
1236 			goto out;
1237 		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1238 			if (rule->rr_amount > INT64_MAX / 1000000) {
1239 				error = ERANGE;
1240 				goto out;
1241 			}
1242 			rule->rr_amount *= 1000000;
1243 		}
1244 	}
1245 
1246 	if (perstr == NULL || perstr[0] == '\0')
1247 		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1248 	else {
1249 		error = str2value(perstr, &rule->rr_per, subjectnames);
1250 		if (error != 0)
1251 			goto out;
1252 	}
1253 
1254 out:
1255 	if (error == 0)
1256 		*rulep = rule;
1257 	else
1258 		rctl_rule_release(rule);
1259 
1260 	return (error);
1261 }
1262 
1263 /*
1264  * Link a rule with all the subjects it applies to.
1265  */
1266 int
1267 rctl_rule_add(struct rctl_rule *rule)
1268 {
1269 	struct proc *p;
1270 	struct ucred *cred;
1271 	struct uidinfo *uip;
1272 	struct prison *pr;
1273 	struct prison_racct *prr;
1274 	struct loginclass *lc;
1275 	struct rctl_rule *rule2;
1276 	int match;
1277 
1278 	ASSERT_RACCT_ENABLED();
1279 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1280 
1281 	/*
1282 	 * Some rules just don't make sense, like "deny" rule for an undeniable
1283 	 * resource.  The exception are the RSS and %CPU resources - they are
1284 	 * not deniable in the racct sense, but the limit is enforced in
1285 	 * a different way.
1286 	 */
1287 	if (rule->rr_action == RCTL_ACTION_DENY &&
1288 	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1289 	    rule->rr_resource != RACCT_RSS &&
1290 	    rule->rr_resource != RACCT_PCTCPU) {
1291 		return (EOPNOTSUPP);
1292 	}
1293 
1294 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1295 	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1296 		return (EOPNOTSUPP);
1297 	}
1298 
1299 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1300 	    rule->rr_resource == RACCT_PCTCPU) {
1301 		return (EOPNOTSUPP);
1302 	}
1303 
1304 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1305 	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1306 		return (EOPNOTSUPP);
1307 	}
1308 
1309 	/*
1310 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1311 	 * rules, remove ones differing only by "amount".
1312 	 */
1313 	if (rule->rr_action == RCTL_ACTION_DENY) {
1314 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1315 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1316 		rctl_rule_remove(rule2);
1317 		rctl_rule_release(rule2);
1318 	} else
1319 		rctl_rule_remove(rule);
1320 
1321 	switch (rule->rr_subject_type) {
1322 	case RCTL_SUBJECT_TYPE_PROCESS:
1323 		p = rule->rr_subject.rs_proc;
1324 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1325 
1326 		rctl_racct_add_rule(p->p_racct, rule);
1327 		/*
1328 		 * In case of per-process rule, we don't have anything more
1329 		 * to do.
1330 		 */
1331 		return (0);
1332 
1333 	case RCTL_SUBJECT_TYPE_USER:
1334 		uip = rule->rr_subject.rs_uip;
1335 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1336 		rctl_racct_add_rule(uip->ui_racct, rule);
1337 		break;
1338 
1339 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1340 		lc = rule->rr_subject.rs_loginclass;
1341 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1342 		rctl_racct_add_rule(lc->lc_racct, rule);
1343 		break;
1344 
1345 	case RCTL_SUBJECT_TYPE_JAIL:
1346 		prr = rule->rr_subject.rs_prison_racct;
1347 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1348 		rctl_racct_add_rule(prr->prr_racct, rule);
1349 		break;
1350 
1351 	default:
1352 		panic("rctl_rule_add: unknown subject type %d",
1353 		    rule->rr_subject_type);
1354 	}
1355 
1356 	/*
1357 	 * Now go through all the processes and add the new rule to the ones
1358 	 * it applies to.
1359 	 */
1360 	sx_assert(&allproc_lock, SA_LOCKED);
1361 	FOREACH_PROC_IN_SYSTEM(p) {
1362 		cred = p->p_ucred;
1363 		switch (rule->rr_subject_type) {
1364 		case RCTL_SUBJECT_TYPE_USER:
1365 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1366 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1367 				break;
1368 			continue;
1369 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1370 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1371 				break;
1372 			continue;
1373 		case RCTL_SUBJECT_TYPE_JAIL:
1374 			match = 0;
1375 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1376 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1377 					match = 1;
1378 					break;
1379 				}
1380 			}
1381 			if (match)
1382 				break;
1383 			continue;
1384 		default:
1385 			panic("rctl_rule_add: unknown subject type %d",
1386 			    rule->rr_subject_type);
1387 		}
1388 
1389 		rctl_racct_add_rule(p->p_racct, rule);
1390 	}
1391 
1392 	return (0);
1393 }
1394 
/*
 * "Pre" hook handed to the *_racct_foreach() iterators in this file;
 * takes the racct lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1401 
/*
 * "Post" hook handed to the *_racct_foreach() iterators in this file;
 * drops the racct lock.
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1408 
/*
 * Per-racct callback used by rctl_rule_remove().  'arg2' is the filter
 * rule; 'arg3' points to an int that is incremented by the value
 * returned from rctl_racct_remove_rules().
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *foundp = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*foundp += rctl_racct_remove_rules(racct, filter);
}
1422 
1423 /*
1424  * Remove all rules that match the filter.
1425  */
1426 int
1427 rctl_rule_remove(struct rctl_rule *filter)
1428 {
1429 	struct proc *p;
1430 	int found = 0;
1431 
1432 	ASSERT_RACCT_ENABLED();
1433 
1434 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1435 	    filter->rr_subject.rs_proc != NULL) {
1436 		p = filter->rr_subject.rs_proc;
1437 		RACCT_LOCK();
1438 		found = rctl_racct_remove_rules(p->p_racct, filter);
1439 		RACCT_UNLOCK();
1440 		if (found)
1441 			return (0);
1442 		return (ESRCH);
1443 	}
1444 
1445 	loginclass_racct_foreach(rctl_rule_remove_callback,
1446 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1447 	    filter, (void *)&found);
1448 	ui_racct_foreach(rctl_rule_remove_callback,
1449 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1450 	    filter, (void *)&found);
1451 	prison_racct_foreach(rctl_rule_remove_callback,
1452 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1453 	    filter, (void *)&found);
1454 
1455 	sx_assert(&allproc_lock, SA_LOCKED);
1456 	RACCT_LOCK();
1457 	FOREACH_PROC_IN_SYSTEM(p) {
1458 		found += rctl_racct_remove_rules(p->p_racct, filter);
1459 	}
1460 	RACCT_UNLOCK();
1461 
1462 	if (found)
1463 		return (0);
1464 	return (ESRCH);
1465 }
1466 
1467 /*
1468  * Appends a rule to the sbuf.
1469  */
1470 static void
1471 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1472 {
1473 	int64_t amount;
1474 
1475 	ASSERT_RACCT_ENABLED();
1476 
1477 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1478 
1479 	switch (rule->rr_subject_type) {
1480 	case RCTL_SUBJECT_TYPE_PROCESS:
1481 		if (rule->rr_subject.rs_proc == NULL)
1482 			sbuf_printf(sb, ":");
1483 		else
1484 			sbuf_printf(sb, "%d:",
1485 			    rule->rr_subject.rs_proc->p_pid);
1486 		break;
1487 	case RCTL_SUBJECT_TYPE_USER:
1488 		if (rule->rr_subject.rs_uip == NULL)
1489 			sbuf_printf(sb, ":");
1490 		else
1491 			sbuf_printf(sb, "%d:",
1492 			    rule->rr_subject.rs_uip->ui_uid);
1493 		break;
1494 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1495 		if (rule->rr_subject.rs_loginclass == NULL)
1496 			sbuf_printf(sb, ":");
1497 		else
1498 			sbuf_printf(sb, "%s:",
1499 			    rule->rr_subject.rs_loginclass->lc_name);
1500 		break;
1501 	case RCTL_SUBJECT_TYPE_JAIL:
1502 		if (rule->rr_subject.rs_prison_racct == NULL)
1503 			sbuf_printf(sb, ":");
1504 		else
1505 			sbuf_printf(sb, "%s:",
1506 			    rule->rr_subject.rs_prison_racct->prr_name);
1507 		break;
1508 	default:
1509 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1510 		    rule->rr_subject_type);
1511 	}
1512 
1513 	amount = rule->rr_amount;
1514 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1515 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1516 		amount /= 1000000;
1517 
1518 	sbuf_printf(sb, "%s:%s=%jd",
1519 	    rctl_resource_name(rule->rr_resource),
1520 	    rctl_action_name(rule->rr_action),
1521 	    amount);
1522 
1523 	if (rule->rr_per != rule->rr_subject_type)
1524 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1525 }
1526 
1527 /*
1528  * Routine used by RCTL syscalls to read in input string.
1529  */
1530 static int
1531 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1532 {
1533 	char *str;
1534 	int error;
1535 
1536 	ASSERT_RACCT_ENABLED();
1537 
1538 	if (inbuflen <= 0)
1539 		return (EINVAL);
1540 	if (inbuflen > RCTL_MAX_INBUFSIZE)
1541 		return (E2BIG);
1542 
1543 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1544 	error = copyinstr(inbufp, str, inbuflen, NULL);
1545 	if (error != 0) {
1546 		free(str, M_RCTL);
1547 		return (error);
1548 	}
1549 
1550 	*inputstr = str;
1551 
1552 	return (0);
1553 }
1554 
1555 /*
1556  * Routine used by RCTL syscalls to write out output string.
1557  */
1558 static int
1559 rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
1560 {
1561 	int error;
1562 
1563 	ASSERT_RACCT_ENABLED();
1564 
1565 	if (outputsbuf == NULL)
1566 		return (0);
1567 
1568 	sbuf_finish(outputsbuf);
1569 	if (outbuflen < sbuf_len(outputsbuf) + 1) {
1570 		sbuf_delete(outputsbuf);
1571 		return (ERANGE);
1572 	}
1573 	error = copyout(sbuf_data(outputsbuf), outbufp,
1574 	    sbuf_len(outputsbuf) + 1);
1575 	sbuf_delete(outputsbuf);
1576 	return (error);
1577 }
1578 
1579 static struct sbuf *
1580 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1581 {
1582 	struct sbuf *sb;
1583 	int64_t amount;
1584 	int i;
1585 
1586 	ASSERT_RACCT_ENABLED();
1587 
1588 	sb = sbuf_new_auto();
1589 	for (i = 0; i <= RACCT_MAX; i++) {
1590 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1591 			continue;
1592 		RACCT_LOCK();
1593 		amount = racct->r_resources[i];
1594 		RACCT_UNLOCK();
1595 		if (RACCT_IS_IN_MILLIONS(i))
1596 			amount /= 1000000;
1597 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1598 	}
1599 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1600 	return (sb);
1601 }
1602 
1603 int
1604 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1605 {
1606 	struct rctl_rule *filter;
1607 	struct sbuf *outputsbuf = NULL;
1608 	struct proc *p;
1609 	struct uidinfo *uip;
1610 	struct loginclass *lc;
1611 	struct prison_racct *prr;
1612 	char *inputstr;
1613 	int error;
1614 
1615 	if (!racct_enable)
1616 		return (ENOSYS);
1617 
1618 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1619 	if (error != 0)
1620 		return (error);
1621 
1622 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1623 	if (error != 0)
1624 		return (error);
1625 
1626 	sx_slock(&allproc_lock);
1627 	error = rctl_string_to_rule(inputstr, &filter);
1628 	free(inputstr, M_RCTL);
1629 	if (error != 0) {
1630 		sx_sunlock(&allproc_lock);
1631 		return (error);
1632 	}
1633 
1634 	switch (filter->rr_subject_type) {
1635 	case RCTL_SUBJECT_TYPE_PROCESS:
1636 		p = filter->rr_subject.rs_proc;
1637 		if (p == NULL) {
1638 			error = EINVAL;
1639 			goto out;
1640 		}
1641 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1642 		break;
1643 	case RCTL_SUBJECT_TYPE_USER:
1644 		uip = filter->rr_subject.rs_uip;
1645 		if (uip == NULL) {
1646 			error = EINVAL;
1647 			goto out;
1648 		}
1649 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1650 		break;
1651 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1652 		lc = filter->rr_subject.rs_loginclass;
1653 		if (lc == NULL) {
1654 			error = EINVAL;
1655 			goto out;
1656 		}
1657 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1658 		break;
1659 	case RCTL_SUBJECT_TYPE_JAIL:
1660 		prr = filter->rr_subject.rs_prison_racct;
1661 		if (prr == NULL) {
1662 			error = EINVAL;
1663 			goto out;
1664 		}
1665 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1666 		break;
1667 	default:
1668 		error = EINVAL;
1669 	}
1670 out:
1671 	rctl_rule_release(filter);
1672 	sx_sunlock(&allproc_lock);
1673 	if (error != 0)
1674 		return (error);
1675 
1676 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1677 
1678 	return (error);
1679 }
1680 
1681 static void
1682 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1683 {
1684 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1685 	struct rctl_rule_link *link;
1686 	struct sbuf *sb = (struct sbuf *)arg3;
1687 
1688 	ASSERT_RACCT_ENABLED();
1689 	RACCT_LOCK_ASSERT();
1690 
1691 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1692 		if (!rctl_rule_matches(link->rrl_rule, filter))
1693 			continue;
1694 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1695 		sbuf_printf(sb, ",");
1696 	}
1697 }
1698 
1699 int
1700 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1701 {
1702 	struct sbuf *sb;
1703 	struct rctl_rule *filter;
1704 	struct rctl_rule_link *link;
1705 	struct proc *p;
1706 	char *inputstr, *buf;
1707 	size_t bufsize;
1708 	int error;
1709 
1710 	if (!racct_enable)
1711 		return (ENOSYS);
1712 
1713 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1714 	if (error != 0)
1715 		return (error);
1716 
1717 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1718 	if (error != 0)
1719 		return (error);
1720 
1721 	sx_slock(&allproc_lock);
1722 	error = rctl_string_to_rule(inputstr, &filter);
1723 	free(inputstr, M_RCTL);
1724 	if (error != 0) {
1725 		sx_sunlock(&allproc_lock);
1726 		return (error);
1727 	}
1728 
1729 	bufsize = uap->outbuflen;
1730 	if (bufsize > rctl_maxbufsize) {
1731 		sx_sunlock(&allproc_lock);
1732 		return (E2BIG);
1733 	}
1734 
1735 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1736 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1737 	KASSERT(sb != NULL, ("sbuf_new failed"));
1738 
1739 	FOREACH_PROC_IN_SYSTEM(p) {
1740 		RACCT_LOCK();
1741 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1742 			/*
1743 			 * Non-process rules will be added to the buffer later.
1744 			 * Adding them here would result in duplicated output.
1745 			 */
1746 			if (link->rrl_rule->rr_subject_type !=
1747 			    RCTL_SUBJECT_TYPE_PROCESS)
1748 				continue;
1749 			if (!rctl_rule_matches(link->rrl_rule, filter))
1750 				continue;
1751 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1752 			sbuf_printf(sb, ",");
1753 		}
1754 		RACCT_UNLOCK();
1755 	}
1756 
1757 	loginclass_racct_foreach(rctl_get_rules_callback,
1758 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1759 	    filter, sb);
1760 	ui_racct_foreach(rctl_get_rules_callback,
1761 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1762 	    filter, sb);
1763 	prison_racct_foreach(rctl_get_rules_callback,
1764 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1765 	    filter, sb);
1766 	if (sbuf_error(sb) == ENOMEM) {
1767 		error = ERANGE;
1768 		goto out;
1769 	}
1770 
1771 	/*
1772 	 * Remove trailing ",".
1773 	 */
1774 	if (sbuf_len(sb) > 0)
1775 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1776 
1777 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1778 out:
1779 	rctl_rule_release(filter);
1780 	sx_sunlock(&allproc_lock);
1781 	free(buf, M_RCTL);
1782 	return (error);
1783 }
1784 
1785 int
1786 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1787 {
1788 	struct sbuf *sb;
1789 	struct rctl_rule *filter;
1790 	struct rctl_rule_link *link;
1791 	char *inputstr, *buf;
1792 	size_t bufsize;
1793 	int error;
1794 
1795 	if (!racct_enable)
1796 		return (ENOSYS);
1797 
1798 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1799 	if (error != 0)
1800 		return (error);
1801 
1802 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1803 	if (error != 0)
1804 		return (error);
1805 
1806 	sx_slock(&allproc_lock);
1807 	error = rctl_string_to_rule(inputstr, &filter);
1808 	free(inputstr, M_RCTL);
1809 	if (error != 0) {
1810 		sx_sunlock(&allproc_lock);
1811 		return (error);
1812 	}
1813 
1814 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1815 		rctl_rule_release(filter);
1816 		sx_sunlock(&allproc_lock);
1817 		return (EINVAL);
1818 	}
1819 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1820 		rctl_rule_release(filter);
1821 		sx_sunlock(&allproc_lock);
1822 		return (EOPNOTSUPP);
1823 	}
1824 	if (filter->rr_subject.rs_proc == NULL) {
1825 		rctl_rule_release(filter);
1826 		sx_sunlock(&allproc_lock);
1827 		return (EINVAL);
1828 	}
1829 
1830 	bufsize = uap->outbuflen;
1831 	if (bufsize > rctl_maxbufsize) {
1832 		rctl_rule_release(filter);
1833 		sx_sunlock(&allproc_lock);
1834 		return (E2BIG);
1835 	}
1836 
1837 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1838 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1839 	KASSERT(sb != NULL, ("sbuf_new failed"));
1840 
1841 	RACCT_LOCK();
1842 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1843 	    rrl_next) {
1844 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1845 		sbuf_printf(sb, ",");
1846 	}
1847 	RACCT_UNLOCK();
1848 	if (sbuf_error(sb) == ENOMEM) {
1849 		error = ERANGE;
1850 		sbuf_delete(sb);
1851 		goto out;
1852 	}
1853 
1854 	/*
1855 	 * Remove trailing ",".
1856 	 */
1857 	if (sbuf_len(sb) > 0)
1858 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1859 
1860 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1861 out:
1862 	rctl_rule_release(filter);
1863 	sx_sunlock(&allproc_lock);
1864 	free(buf, M_RCTL);
1865 	return (error);
1866 }
1867 
1868 int
1869 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1870 {
1871 	struct rctl_rule *rule;
1872 	char *inputstr;
1873 	int error;
1874 
1875 	if (!racct_enable)
1876 		return (ENOSYS);
1877 
1878 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1879 	if (error != 0)
1880 		return (error);
1881 
1882 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1883 	if (error != 0)
1884 		return (error);
1885 
1886 	sx_slock(&allproc_lock);
1887 	error = rctl_string_to_rule(inputstr, &rule);
1888 	free(inputstr, M_RCTL);
1889 	if (error != 0) {
1890 		sx_sunlock(&allproc_lock);
1891 		return (error);
1892 	}
1893 	/*
1894 	 * The 'per' part of a rule is optional.
1895 	 */
1896 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1897 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1898 		rule->rr_per = rule->rr_subject_type;
1899 
1900 	if (!rctl_rule_fully_specified(rule)) {
1901 		error = EINVAL;
1902 		goto out;
1903 	}
1904 
1905 	error = rctl_rule_add(rule);
1906 
1907 out:
1908 	rctl_rule_release(rule);
1909 	sx_sunlock(&allproc_lock);
1910 	return (error);
1911 }
1912 
1913 int
1914 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1915 {
1916 	struct rctl_rule *filter;
1917 	char *inputstr;
1918 	int error;
1919 
1920 	if (!racct_enable)
1921 		return (ENOSYS);
1922 
1923 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1924 	if (error != 0)
1925 		return (error);
1926 
1927 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1928 	if (error != 0)
1929 		return (error);
1930 
1931 	sx_slock(&allproc_lock);
1932 	error = rctl_string_to_rule(inputstr, &filter);
1933 	free(inputstr, M_RCTL);
1934 	if (error != 0) {
1935 		sx_sunlock(&allproc_lock);
1936 		return (error);
1937 	}
1938 
1939 	error = rctl_rule_remove(filter);
1940 	rctl_rule_release(filter);
1941 	sx_sunlock(&allproc_lock);
1942 
1943 	return (error);
1944 }
1945 
1946 /*
1947  * Update RCTL rule list after credential change.
1948  */
1949 void
1950 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1951 {
1952 	LIST_HEAD(, rctl_rule_link) newrules;
1953 	struct rctl_rule_link *link, *newlink;
1954 	struct uidinfo *newuip;
1955 	struct loginclass *newlc;
1956 	struct prison_racct *newprr;
1957 	int rulecnt, i;
1958 
1959 	if (!racct_enable)
1960 		return;
1961 
1962 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1963 
1964 	newuip = newcred->cr_ruidinfo;
1965 	newlc = newcred->cr_loginclass;
1966 	newprr = newcred->cr_prison->pr_prison_racct;
1967 
1968 	LIST_INIT(&newrules);
1969 
1970 again:
1971 	/*
1972 	 * First, count the rules that apply to the process with new
1973 	 * credentials.
1974 	 */
1975 	rulecnt = 0;
1976 	RACCT_LOCK();
1977 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1978 		if (link->rrl_rule->rr_subject_type ==
1979 		    RCTL_SUBJECT_TYPE_PROCESS)
1980 			rulecnt++;
1981 	}
1982 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1983 		rulecnt++;
1984 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1985 		rulecnt++;
1986 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1987 		rulecnt++;
1988 	RACCT_UNLOCK();
1989 
1990 	/*
1991 	 * Create temporary list.  We've dropped the rctl_lock in order
1992 	 * to use M_WAITOK.
1993 	 */
1994 	for (i = 0; i < rulecnt; i++) {
1995 		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
1996 		newlink->rrl_rule = NULL;
1997 		newlink->rrl_exceeded = 0;
1998 		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
1999 	}
2000 
2001 	newlink = LIST_FIRST(&newrules);
2002 
2003 	/*
2004 	 * Assign rules to the newly allocated list entries.
2005 	 */
2006 	RACCT_LOCK();
2007 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2008 		if (link->rrl_rule->rr_subject_type ==
2009 		    RCTL_SUBJECT_TYPE_PROCESS) {
2010 			if (newlink == NULL)
2011 				goto goaround;
2012 			rctl_rule_acquire(link->rrl_rule);
2013 			newlink->rrl_rule = link->rrl_rule;
2014 			newlink->rrl_exceeded = link->rrl_exceeded;
2015 			newlink = LIST_NEXT(newlink, rrl_next);
2016 			rulecnt--;
2017 		}
2018 	}
2019 
2020 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2021 		if (newlink == NULL)
2022 			goto goaround;
2023 		rctl_rule_acquire(link->rrl_rule);
2024 		newlink->rrl_rule = link->rrl_rule;
2025 		newlink->rrl_exceeded = link->rrl_exceeded;
2026 		newlink = LIST_NEXT(newlink, rrl_next);
2027 		rulecnt--;
2028 	}
2029 
2030 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2031 		if (newlink == NULL)
2032 			goto goaround;
2033 		rctl_rule_acquire(link->rrl_rule);
2034 		newlink->rrl_rule = link->rrl_rule;
2035 		newlink->rrl_exceeded = link->rrl_exceeded;
2036 		newlink = LIST_NEXT(newlink, rrl_next);
2037 		rulecnt--;
2038 	}
2039 
2040 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2041 		if (newlink == NULL)
2042 			goto goaround;
2043 		rctl_rule_acquire(link->rrl_rule);
2044 		newlink->rrl_rule = link->rrl_rule;
2045 		newlink->rrl_exceeded = link->rrl_exceeded;
2046 		newlink = LIST_NEXT(newlink, rrl_next);
2047 		rulecnt--;
2048 	}
2049 
2050 	if (rulecnt == 0) {
2051 		/*
2052 		 * Free the old rule list.
2053 		 */
2054 		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2055 			link = LIST_FIRST(&p->p_racct->r_rule_links);
2056 			LIST_REMOVE(link, rrl_next);
2057 			rctl_rule_release(link->rrl_rule);
2058 			uma_zfree(rctl_rule_link_zone, link);
2059 		}
2060 
2061 		/*
2062 		 * Replace lists and we're done.
2063 		 *
2064 		 * XXX: Is there any way to switch list heads instead
2065 		 *      of iterating here?
2066 		 */
2067 		while (!LIST_EMPTY(&newrules)) {
2068 			newlink = LIST_FIRST(&newrules);
2069 			LIST_REMOVE(newlink, rrl_next);
2070 			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2071 			    newlink, rrl_next);
2072 		}
2073 
2074 		RACCT_UNLOCK();
2075 
2076 		return;
2077 	}
2078 
2079 goaround:
2080 	RACCT_UNLOCK();
2081 
2082 	/*
2083 	 * Rule list changed while we were not holding the rctl_lock.
2084 	 * Free the new list and try again.
2085 	 */
2086 	while (!LIST_EMPTY(&newrules)) {
2087 		newlink = LIST_FIRST(&newrules);
2088 		LIST_REMOVE(newlink, rrl_next);
2089 		if (newlink->rrl_rule != NULL)
2090 			rctl_rule_release(newlink->rrl_rule);
2091 		uma_zfree(rctl_rule_link_zone, newlink);
2092 	}
2093 
2094 	goto again;
2095 }
2096 
2097 /*
2098  * Assign RCTL rules to the newly created process.
2099  */
2100 int
2101 rctl_proc_fork(struct proc *parent, struct proc *child)
2102 {
2103 	struct rctl_rule *rule;
2104 	struct rctl_rule_link *link;
2105 	int error;
2106 
2107 	ASSERT_RACCT_ENABLED();
2108 	RACCT_LOCK_ASSERT();
2109 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2110 
2111 	LIST_INIT(&child->p_racct->r_rule_links);
2112 
2113 	/*
2114 	 * Go through limits applicable to the parent and assign them
2115 	 * to the child.  Rules with 'process' subject have to be duplicated
2116 	 * in order to make their rr_subject point to the new process.
2117 	 */
2118 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2119 		if (link->rrl_rule->rr_subject_type ==
2120 		    RCTL_SUBJECT_TYPE_PROCESS) {
2121 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2122 			if (rule == NULL)
2123 				goto fail;
2124 			KASSERT(rule->rr_subject.rs_proc == parent,
2125 			    ("rule->rr_subject.rs_proc != parent"));
2126 			rule->rr_subject.rs_proc = child;
2127 			error = rctl_racct_add_rule_locked(child->p_racct,
2128 			    rule);
2129 			rctl_rule_release(rule);
2130 			if (error != 0)
2131 				goto fail;
2132 		} else {
2133 			error = rctl_racct_add_rule_locked(child->p_racct,
2134 			    link->rrl_rule);
2135 			if (error != 0)
2136 				goto fail;
2137 		}
2138 	}
2139 
2140 	return (0);
2141 
2142 fail:
2143 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2144 		link = LIST_FIRST(&child->p_racct->r_rule_links);
2145 		LIST_REMOVE(link, rrl_next);
2146 		rctl_rule_release(link->rrl_rule);
2147 		uma_zfree(rctl_rule_link_zone, link);
2148 	}
2149 
2150 	return (EAGAIN);
2151 }
2152 
2153 /*
2154  * Release rules attached to the racct.
2155  */
2156 void
2157 rctl_racct_release(struct racct *racct)
2158 {
2159 	struct rctl_rule_link *link;
2160 
2161 	ASSERT_RACCT_ENABLED();
2162 	RACCT_LOCK_ASSERT();
2163 
2164 	while (!LIST_EMPTY(&racct->r_rule_links)) {
2165 		link = LIST_FIRST(&racct->r_rule_links);
2166 		LIST_REMOVE(link, rrl_next);
2167 		rctl_rule_release(link->rrl_rule);
2168 		uma_zfree(rctl_rule_link_zone, link);
2169 	}
2170 }
2171 
2172 static void
2173 rctl_init(void)
2174 {
2175 
2176 	if (!racct_enable)
2177 		return;
2178 
2179 	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2180 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2181 	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2182 	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2183 	    UMA_ALIGN_PTR, 0);
2184 
2185 	/*
2186 	 * Set default values, making sure not to overwrite the ones
2187 	 * fetched from tunables.  Most of those could be set at the
2188 	 * declaration, except for the rctl_throttle_max - we cannot
2189 	 * set it there due to hz not being compile time constant.
2190 	 */
2191 	if (rctl_throttle_min < 1)
2192 		rctl_throttle_min = 1;
2193 	if (rctl_throttle_max < rctl_throttle_min)
2194 		rctl_throttle_max = 2 * hz;
2195 	if (rctl_throttle_pct < 0)
2196 		rctl_throttle_pct = 100;
2197 	if (rctl_throttle_pct2 < 0)
2198 		rctl_throttle_pct2 = 100;
2199 }
2200 
2201 #else /* !RCTL */
2202 
2203 int
2204 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
2205 {
2206 
2207 	return (ENOSYS);
2208 }
2209 
2210 int
2211 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
2212 {
2213 
2214 	return (ENOSYS);
2215 }
2216 
2217 int
2218 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
2219 {
2220 
2221 	return (ENOSYS);
2222 }
2223 
2224 int
2225 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
2226 {
2227 
2228 	return (ENOSYS);
2229 }
2230 
2231 int
2232 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
2233 {
2234 
2235 	return (ENOSYS);
2236 }
2237 
2238 #endif /* !RCTL */
2239