xref: /freebsd/sys/kern/kern_rctl.c (revision c697fb7f)
1 /*-
2  * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3  *
4  * Copyright (c) 2010 The FreeBSD Foundation
5  * All rights reserved.
6  *
7  * This software was developed by Edward Tomasz Napierala under sponsorship
8  * from the FreeBSD Foundation.
9  *
10  * Redistribution and use in source and binary forms, with or without
11  * modification, are permitted provided that the following conditions
12  * are met:
13  * 1. Redistributions of source code must retain the above copyright
14  *    notice, this list of conditions and the following disclaimer.
15  * 2. Redistributions in binary form must reproduce the above copyright
16  *    notice, this list of conditions and the following disclaimer in the
17  *    documentation and/or other materials provided with the distribution.
18  *
19  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
20  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
22  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
23  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
25  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
26  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
27  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
28  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
29  * SUCH DAMAGE.
30  *
31  * $FreeBSD$
32  */
33 
34 #include <sys/cdefs.h>
35 __FBSDID("$FreeBSD$");
36 
37 #include <sys/param.h>
38 #include <sys/bus.h>
39 #include <sys/malloc.h>
40 #include <sys/queue.h>
41 #include <sys/refcount.h>
42 #include <sys/jail.h>
43 #include <sys/kernel.h>
44 #include <sys/limits.h>
45 #include <sys/loginclass.h>
46 #include <sys/priv.h>
47 #include <sys/proc.h>
48 #include <sys/racct.h>
49 #include <sys/rctl.h>
50 #include <sys/resourcevar.h>
51 #include <sys/sx.h>
52 #include <sys/sysent.h>
53 #include <sys/sysproto.h>
54 #include <sys/systm.h>
55 #include <sys/types.h>
56 #include <sys/eventhandler.h>
57 #include <sys/lock.h>
58 #include <sys/mutex.h>
59 #include <sys/rwlock.h>
60 #include <sys/sbuf.h>
61 #include <sys/taskqueue.h>
62 #include <sys/tree.h>
63 #include <vm/uma.h>
64 
65 #ifdef RCTL
66 #ifndef RACCT
67 #error "The RCTL option requires the RACCT option"
68 #endif
69 
70 FEATURE(rctl, "Resource Limits");
71 
/*
 * HRF_* flag values.
 * NOTE(review): not referenced in the visible part of this file; confirm
 * their meaning against the code that uses them.
 */
#define	HRF_DEFAULT		0
#define	HRF_DONT_INHERIT	1
#define	HRF_DONT_ACCUMULATE	2

/*
 * Buffer sizes, in bytes.  The expansions are parenthesized so that each
 * macro stays a single operand when used inside a larger expression such
 * as "len / RCTL_MAX_INBUFSIZE".  RCTL_MAX_OUTBUFSIZE is the initial value
 * of rctl_maxbufsize; RCTL_LOG_BUFSIZE is the size of the message buffer
 * used by the "log" and "devctl" actions in rctl_enforce().
 */
#define	RCTL_MAX_INBUFSIZE	(4 * 1024)
#define	RCTL_MAX_OUTBUFSIZE	(16 * 1024 * 1024)
#define	RCTL_LOG_BUFSIZE	128

/*
 * Largest safety margin that rctl_pcpu_available() subtracts from the
 * available %CPU.
 */
#define	RCTL_PCPU_SHIFT		(10 * 1000000)
81 
82 static unsigned int rctl_maxbufsize = RCTL_MAX_OUTBUFSIZE;
83 static int rctl_log_rate_limit = 10;
84 static int rctl_devctl_rate_limit = 10;
85 
86 /*
87  * Values below are initialized in rctl_init().
88  */
89 static int rctl_throttle_min = -1;
90 static int rctl_throttle_max = -1;
91 static int rctl_throttle_pct = -1;
92 static int rctl_throttle_pct2 = -1;
93 
94 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS);
95 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS);
96 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS);
97 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS);
98 
99 SYSCTL_NODE(_kern_racct, OID_AUTO, rctl, CTLFLAG_RW | CTLFLAG_MPSAFE, 0,
100     "Resource Limits");
101 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, maxbufsize, CTLFLAG_RWTUN,
102     &rctl_maxbufsize, 0, "Maximum output buffer size");
103 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, log_rate_limit, CTLFLAG_RW,
104     &rctl_log_rate_limit, 0, "Maximum number of log messages per second");
105 SYSCTL_UINT(_kern_racct_rctl, OID_AUTO, devctl_rate_limit, CTLFLAG_RWTUN,
106     &rctl_devctl_rate_limit, 0, "Maximum number of devctl messages per second");
107 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_min,
108     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
109     &rctl_throttle_min_sysctl, "IU",
110     "Shortest throttling duration, in hz");
111 TUNABLE_INT("kern.racct.rctl.throttle_min", &rctl_throttle_min);
112 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_max,
113     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
114     &rctl_throttle_max_sysctl, "IU",
115     "Longest throttling duration, in hz");
116 TUNABLE_INT("kern.racct.rctl.throttle_max", &rctl_throttle_max);
117 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct,
118     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
119     &rctl_throttle_pct_sysctl, "IU",
120     "Throttling penalty for process consumption, in percent");
121 TUNABLE_INT("kern.racct.rctl.throttle_pct", &rctl_throttle_pct);
122 SYSCTL_PROC(_kern_racct_rctl, OID_AUTO, throttle_pct2,
123     CTLTYPE_UINT | CTLFLAG_RWTUN | CTLFLAG_MPSAFE, 0, 0,
124     &rctl_throttle_pct2_sysctl, "IU",
125     "Throttling penalty for container consumption, in percent");
126 TUNABLE_INT("kern.racct.rctl.throttle_pct2", &rctl_throttle_pct2);
127 
128 /*
129  * 'rctl_rule_link' connects a rule with every racct it's related to.
130  * For example, rule 'user:X:openfiles:deny=N/process' is linked
131  * with uidinfo for user X, and to each process of that user.
132  */
133 struct rctl_rule_link {
134 	LIST_ENTRY(rctl_rule_link)	rrl_next;
135 	struct rctl_rule		*rrl_rule;
136 	int				rrl_exceeded;
137 };
138 
/*
 * One entry of a name <-> value lookup table.  The tables built from it
 * are terminated by an entry whose d_name is NULL.
 */
struct dict {
	const char	*d_name;	/* textual name */
	int		d_value;	/* matching numeric constant */
};
143 
144 static struct dict subjectnames[] = {
145 	{ "process", RCTL_SUBJECT_TYPE_PROCESS },
146 	{ "user", RCTL_SUBJECT_TYPE_USER },
147 	{ "loginclass", RCTL_SUBJECT_TYPE_LOGINCLASS },
148 	{ "jail", RCTL_SUBJECT_TYPE_JAIL },
149 	{ NULL, -1 }};
150 
151 static struct dict resourcenames[] = {
152 	{ "cputime", RACCT_CPU },
153 	{ "datasize", RACCT_DATA },
154 	{ "stacksize", RACCT_STACK },
155 	{ "coredumpsize", RACCT_CORE },
156 	{ "memoryuse", RACCT_RSS },
157 	{ "memorylocked", RACCT_MEMLOCK },
158 	{ "maxproc", RACCT_NPROC },
159 	{ "openfiles", RACCT_NOFILE },
160 	{ "vmemoryuse", RACCT_VMEM },
161 	{ "pseudoterminals", RACCT_NPTS },
162 	{ "swapuse", RACCT_SWAP },
163 	{ "nthr", RACCT_NTHR },
164 	{ "msgqqueued", RACCT_MSGQQUEUED },
165 	{ "msgqsize", RACCT_MSGQSIZE },
166 	{ "nmsgq", RACCT_NMSGQ },
167 	{ "nsem", RACCT_NSEM },
168 	{ "nsemop", RACCT_NSEMOP },
169 	{ "nshm", RACCT_NSHM },
170 	{ "shmsize", RACCT_SHMSIZE },
171 	{ "wallclock", RACCT_WALLCLOCK },
172 	{ "pcpu", RACCT_PCTCPU },
173 	{ "readbps", RACCT_READBPS },
174 	{ "writebps", RACCT_WRITEBPS },
175 	{ "readiops", RACCT_READIOPS },
176 	{ "writeiops", RACCT_WRITEIOPS },
177 	{ NULL, -1 }};
178 
179 static struct dict actionnames[] = {
180 	{ "sighup", RCTL_ACTION_SIGHUP },
181 	{ "sigint", RCTL_ACTION_SIGINT },
182 	{ "sigquit", RCTL_ACTION_SIGQUIT },
183 	{ "sigill", RCTL_ACTION_SIGILL },
184 	{ "sigtrap", RCTL_ACTION_SIGTRAP },
185 	{ "sigabrt", RCTL_ACTION_SIGABRT },
186 	{ "sigemt", RCTL_ACTION_SIGEMT },
187 	{ "sigfpe", RCTL_ACTION_SIGFPE },
188 	{ "sigkill", RCTL_ACTION_SIGKILL },
189 	{ "sigbus", RCTL_ACTION_SIGBUS },
190 	{ "sigsegv", RCTL_ACTION_SIGSEGV },
191 	{ "sigsys", RCTL_ACTION_SIGSYS },
192 	{ "sigpipe", RCTL_ACTION_SIGPIPE },
193 	{ "sigalrm", RCTL_ACTION_SIGALRM },
194 	{ "sigterm", RCTL_ACTION_SIGTERM },
195 	{ "sigurg", RCTL_ACTION_SIGURG },
196 	{ "sigstop", RCTL_ACTION_SIGSTOP },
197 	{ "sigtstp", RCTL_ACTION_SIGTSTP },
198 	{ "sigchld", RCTL_ACTION_SIGCHLD },
199 	{ "sigttin", RCTL_ACTION_SIGTTIN },
200 	{ "sigttou", RCTL_ACTION_SIGTTOU },
201 	{ "sigio", RCTL_ACTION_SIGIO },
202 	{ "sigxcpu", RCTL_ACTION_SIGXCPU },
203 	{ "sigxfsz", RCTL_ACTION_SIGXFSZ },
204 	{ "sigvtalrm", RCTL_ACTION_SIGVTALRM },
205 	{ "sigprof", RCTL_ACTION_SIGPROF },
206 	{ "sigwinch", RCTL_ACTION_SIGWINCH },
207 	{ "siginfo", RCTL_ACTION_SIGINFO },
208 	{ "sigusr1", RCTL_ACTION_SIGUSR1 },
209 	{ "sigusr2", RCTL_ACTION_SIGUSR2 },
210 	{ "sigthr", RCTL_ACTION_SIGTHR },
211 	{ "deny", RCTL_ACTION_DENY },
212 	{ "log", RCTL_ACTION_LOG },
213 	{ "devctl", RCTL_ACTION_DEVCTL },
214 	{ "throttle", RCTL_ACTION_THROTTLE },
215 	{ NULL, -1 }};
216 
217 static void rctl_init(void);
218 SYSINIT(rctl, SI_SUB_RACCT, SI_ORDER_FIRST, rctl_init, NULL);
219 
220 static uma_zone_t rctl_rule_zone;
221 static uma_zone_t rctl_rule_link_zone;
222 
223 static int rctl_rule_fully_specified(const struct rctl_rule *rule);
224 static void rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule);
225 
226 static MALLOC_DEFINE(M_RCTL, "rctl", "Resource Limits");
227 
228 static int rctl_throttle_min_sysctl(SYSCTL_HANDLER_ARGS)
229 {
230 	int error, val = rctl_throttle_min;
231 
232 	error = sysctl_handle_int(oidp, &val, 0, req);
233 	if (error || !req->newptr)
234 		return (error);
235 	if (val < 1 || val > rctl_throttle_max)
236 		return (EINVAL);
237 
238 	RACCT_LOCK();
239 	rctl_throttle_min = val;
240 	RACCT_UNLOCK();
241 
242 	return (0);
243 }
244 
245 static int rctl_throttle_max_sysctl(SYSCTL_HANDLER_ARGS)
246 {
247 	int error, val = rctl_throttle_max;
248 
249 	error = sysctl_handle_int(oidp, &val, 0, req);
250 	if (error || !req->newptr)
251 		return (error);
252 	if (val < rctl_throttle_min)
253 		return (EINVAL);
254 
255 	RACCT_LOCK();
256 	rctl_throttle_max = val;
257 	RACCT_UNLOCK();
258 
259 	return (0);
260 }
261 
262 static int rctl_throttle_pct_sysctl(SYSCTL_HANDLER_ARGS)
263 {
264 	int error, val = rctl_throttle_pct;
265 
266 	error = sysctl_handle_int(oidp, &val, 0, req);
267 	if (error || !req->newptr)
268 		return (error);
269 	if (val < 0)
270 		return (EINVAL);
271 
272 	RACCT_LOCK();
273 	rctl_throttle_pct = val;
274 	RACCT_UNLOCK();
275 
276 	return (0);
277 }
278 
279 static int rctl_throttle_pct2_sysctl(SYSCTL_HANDLER_ARGS)
280 {
281 	int error, val = rctl_throttle_pct2;
282 
283 	error = sysctl_handle_int(oidp, &val, 0, req);
284 	if (error || !req->newptr)
285 		return (error);
286 	if (val < 0)
287 		return (EINVAL);
288 
289 	RACCT_LOCK();
290 	rctl_throttle_pct2 = val;
291 	RACCT_UNLOCK();
292 
293 	return (0);
294 }
295 
296 static const char *
297 rctl_subject_type_name(int subject)
298 {
299 	int i;
300 
301 	for (i = 0; subjectnames[i].d_name != NULL; i++) {
302 		if (subjectnames[i].d_value == subject)
303 			return (subjectnames[i].d_name);
304 	}
305 
306 	panic("rctl_subject_type_name: unknown subject type %d", subject);
307 }
308 
309 static const char *
310 rctl_action_name(int action)
311 {
312 	int i;
313 
314 	for (i = 0; actionnames[i].d_name != NULL; i++) {
315 		if (actionnames[i].d_value == action)
316 			return (actionnames[i].d_name);
317 	}
318 
319 	panic("rctl_action_name: unknown action %d", action);
320 }
321 
322 const char *
323 rctl_resource_name(int resource)
324 {
325 	int i;
326 
327 	for (i = 0; resourcenames[i].d_name != NULL; i++) {
328 		if (resourcenames[i].d_value == resource)
329 			return (resourcenames[i].d_name);
330 	}
331 
332 	panic("rctl_resource_name: unknown resource %d", resource);
333 }
334 
335 static struct racct *
336 rctl_proc_rule_to_racct(const struct proc *p, const struct rctl_rule *rule)
337 {
338 	struct ucred *cred = p->p_ucred;
339 
340 	ASSERT_RACCT_ENABLED();
341 	RACCT_LOCK_ASSERT();
342 
343 	switch (rule->rr_per) {
344 	case RCTL_SUBJECT_TYPE_PROCESS:
345 		return (p->p_racct);
346 	case RCTL_SUBJECT_TYPE_USER:
347 		return (cred->cr_ruidinfo->ui_racct);
348 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
349 		return (cred->cr_loginclass->lc_racct);
350 	case RCTL_SUBJECT_TYPE_JAIL:
351 		return (cred->cr_prison->pr_prison_racct->prr_racct);
352 	default:
353 		panic("%s: unknown per %d", __func__, rule->rr_per);
354 	}
355 }
356 
357 /*
358  * Return the amount of resource that can be allocated by 'p' before
359  * hitting 'rule'.
360  */
361 static int64_t
362 rctl_available_resource(const struct proc *p, const struct rctl_rule *rule)
363 {
364 	const struct racct *racct;
365 	int64_t available;
366 
367 	ASSERT_RACCT_ENABLED();
368 	RACCT_LOCK_ASSERT();
369 
370 	racct = rctl_proc_rule_to_racct(p, rule);
371 	available = rule->rr_amount - racct->r_resources[rule->rr_resource];
372 
373 	return (available);
374 }
375 
376 /*
377  * Called every second for proc, uidinfo, loginclass, and jail containers.
378  * If the limit isn't exceeded, it decreases the usage amount to zero.
379  * Otherwise, it decreases it by the value of the limit.  This way
380  * resource consumption exceeding the limit "carries over" to the next
381  * period.
382  */
383 void
384 rctl_throttle_decay(struct racct *racct, int resource)
385 {
386 	struct rctl_rule *rule;
387 	struct rctl_rule_link *link;
388 	int64_t minavailable;
389 
390 	ASSERT_RACCT_ENABLED();
391 	RACCT_LOCK_ASSERT();
392 
393 	minavailable = INT64_MAX;
394 
395 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
396 		rule = link->rrl_rule;
397 
398 		if (rule->rr_resource != resource)
399 			continue;
400 		if (rule->rr_action != RCTL_ACTION_THROTTLE)
401 			continue;
402 
403 		if (rule->rr_amount < minavailable)
404 			minavailable = rule->rr_amount;
405 	}
406 
407 	if (racct->r_resources[resource] < minavailable) {
408 		racct->r_resources[resource] = 0;
409 	} else {
410 		/*
411 		 * Cap utilization counter at ten times the limit.  Otherwise,
412 		 * if we changed the rule lowering the allowed amount, it could
413 		 * take unreasonably long time for the accumulated resource
414 		 * usage to drop.
415 		 */
416 		if (racct->r_resources[resource] > minavailable * 10)
417 			racct->r_resources[resource] = minavailable * 10;
418 
419 		racct->r_resources[resource] -= minavailable;
420 	}
421 }
422 
423 /*
424  * Special version of rctl_get_available() for the %CPU resource.
425  * We slightly cheat here and return less than we normally would.
426  */
427 int64_t
428 rctl_pcpu_available(const struct proc *p) {
429 	struct rctl_rule *rule;
430 	struct rctl_rule_link *link;
431 	int64_t available, minavailable, limit;
432 
433 	ASSERT_RACCT_ENABLED();
434 	RACCT_LOCK_ASSERT();
435 
436 	minavailable = INT64_MAX;
437 	limit = 0;
438 
439 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
440 		rule = link->rrl_rule;
441 		if (rule->rr_resource != RACCT_PCTCPU)
442 			continue;
443 		if (rule->rr_action != RCTL_ACTION_DENY)
444 			continue;
445 		available = rctl_available_resource(p, rule);
446 		if (available < minavailable) {
447 			minavailable = available;
448 			limit = rule->rr_amount;
449 		}
450 	}
451 
452 	/*
453 	 * Return slightly less than actual value of the available
454 	 * %cpu resource.  This makes %cpu throttling more aggressive
455 	 * and lets us act sooner than the limits are already exceeded.
456 	 */
457 	if (limit != 0) {
458 		if (limit > 2 * RCTL_PCPU_SHIFT)
459 			minavailable -= RCTL_PCPU_SHIFT;
460 		else
461 			minavailable -= (limit / 2);
462 	}
463 
464 	return (minavailable);
465 }
466 
/*
 * Saturating addition: returns a + b, or UINT64_MAX if the sum does not
 * fit in 64 bits.
 */
static uint64_t
xadd(uint64_t a, uint64_t b)
{

	/* The sum overflows exactly when b exceeds the room left above a. */
	if (b > UINT64_MAX - a)
		return (UINT64_MAX);

	return (a + b);
}
482 
/*
 * Saturating multiplication: returns a * b, or UINT64_MAX if the product
 * does not fit in 64 bits.
 */
static uint64_t
xmul(uint64_t a, uint64_t b)
{

	/* Handle zero up front so that the division below is safe. */
	if (b == 0)
		return (0);

	if (a > UINT64_MAX / b)
		return (UINT64_MAX);

	return (a * b);
}
492 
493 /*
494  * Check whether the proc 'p' can allocate 'amount' of 'resource' in addition
495  * to what it keeps allocated now.  Returns non-zero if the allocation should
496  * be denied, 0 otherwise.
497  */
498 int
499 rctl_enforce(struct proc *p, int resource, uint64_t amount)
500 {
501 	static struct timeval log_lasttime, devctl_lasttime;
502 	static int log_curtime = 0, devctl_curtime = 0;
503 	struct rctl_rule *rule;
504 	struct rctl_rule_link *link;
505 	struct sbuf sb;
506 	char *buf;
507 	int64_t available;
508 	uint64_t sleep_ms, sleep_ratio;
509 	int should_deny = 0;
510 
511 	ASSERT_RACCT_ENABLED();
512 	RACCT_LOCK_ASSERT();
513 
514 	/*
515 	 * There may be more than one matching rule; go through all of them.
516 	 * Denial should be done last, after logging and sending signals.
517 	 */
518 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
519 		rule = link->rrl_rule;
520 		if (rule->rr_resource != resource)
521 			continue;
522 
523 		available = rctl_available_resource(p, rule);
524 		if (available >= (int64_t)amount) {
525 			link->rrl_exceeded = 0;
526 			continue;
527 		}
528 
529 		switch (rule->rr_action) {
530 		case RCTL_ACTION_DENY:
531 			should_deny = 1;
532 			continue;
533 		case RCTL_ACTION_LOG:
534 			/*
535 			 * If rrl_exceeded != 0, it means we've already
536 			 * logged a warning for this process.
537 			 */
538 			if (link->rrl_exceeded != 0)
539 				continue;
540 
541 			/*
542 			 * If the process state is not fully initialized yet,
543 			 * we can't access most of the required fields, e.g.
544 			 * p->p_comm.  This happens when called from fork1().
545 			 * Ignore this rule for now; it will be processed just
546 			 * after fork, when called from racct_proc_fork_done().
547 			 */
548 			if (p->p_state != PRS_NORMAL)
549 				continue;
550 
551 			if (!ppsratecheck(&log_lasttime, &log_curtime,
552 			    rctl_log_rate_limit))
553 				continue;
554 
555 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
556 			if (buf == NULL) {
557 				printf("rctl_enforce: out of memory\n");
558 				continue;
559 			}
560 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
561 			rctl_rule_to_sbuf(&sb, rule);
562 			sbuf_finish(&sb);
563 			printf("rctl: rule \"%s\" matched by pid %d "
564 			    "(%s), uid %d, jail %s\n", sbuf_data(&sb),
565 			    p->p_pid, p->p_comm, p->p_ucred->cr_uid,
566 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
567 			sbuf_delete(&sb);
568 			free(buf, M_RCTL);
569 			link->rrl_exceeded = 1;
570 			continue;
571 		case RCTL_ACTION_DEVCTL:
572 			if (link->rrl_exceeded != 0)
573 				continue;
574 
575 			if (p->p_state != PRS_NORMAL)
576 				continue;
577 
578 			if (!ppsratecheck(&devctl_lasttime, &devctl_curtime,
579 			    rctl_devctl_rate_limit))
580 				continue;
581 
582 			buf = malloc(RCTL_LOG_BUFSIZE, M_RCTL, M_NOWAIT);
583 			if (buf == NULL) {
584 				printf("rctl_enforce: out of memory\n");
585 				continue;
586 			}
587 			sbuf_new(&sb, buf, RCTL_LOG_BUFSIZE, SBUF_FIXEDLEN);
588 			sbuf_printf(&sb, "rule=");
589 			rctl_rule_to_sbuf(&sb, rule);
590 			sbuf_printf(&sb, " pid=%d ruid=%d jail=%s",
591 			    p->p_pid, p->p_ucred->cr_ruid,
592 			    p->p_ucred->cr_prison->pr_prison_racct->prr_name);
593 			sbuf_finish(&sb);
594 			devctl_notify_f("RCTL", "rule", "matched",
595 			    sbuf_data(&sb), M_NOWAIT);
596 			sbuf_delete(&sb);
597 			free(buf, M_RCTL);
598 			link->rrl_exceeded = 1;
599 			continue;
600 		case RCTL_ACTION_THROTTLE:
601 			if (p->p_state != PRS_NORMAL)
602 				continue;
603 
604 			/*
605 			 * Make the process sleep for a fraction of second
606 			 * proportional to the ratio of process' resource
607 			 * utilization compared to the limit.  The point is
608 			 * to penalize resource hogs: processes that consume
609 			 * more of the available resources sleep for longer.
610 			 *
611 			 * We're trying to defer division until the very end,
612 			 * to minimize the rounding effects.  The following
613 			 * calculation could have been written in a clearer
614 			 * way like this:
615 			 *
616 			 * sleep_ms = hz * p->p_racct->r_resources[resource] /
617 			 *     rule->rr_amount;
618 			 * sleep_ms *= rctl_throttle_pct / 100;
619 			 * if (sleep_ms < rctl_throttle_min)
620 			 *         sleep_ms = rctl_throttle_min;
621 			 *
622 			 */
623 			sleep_ms = xmul(hz, p->p_racct->r_resources[resource]);
624 			sleep_ms = xmul(sleep_ms,  rctl_throttle_pct) / 100;
625 			if (sleep_ms < rctl_throttle_min * rule->rr_amount)
626 				sleep_ms = rctl_throttle_min * rule->rr_amount;
627 
628 			/*
629 			 * Multiply that by the ratio of the resource
630 			 * consumption for the container compared to the limit,
631 			 * squared.  In other words, a process in a container
632 			 * that is two times over the limit will be throttled
633 			 * four times as much for hitting the same rule.  The
634 			 * point is to penalize processes more if the container
635 			 * itself (eg certain UID or jail) is above the limit.
636 			 */
637 			if (available < 0)
638 				sleep_ratio = -available / rule->rr_amount;
639 			else
640 				sleep_ratio = 0;
641 			sleep_ratio = xmul(sleep_ratio, sleep_ratio);
642 			sleep_ratio = xmul(sleep_ratio, rctl_throttle_pct2) / 100;
643 			sleep_ms = xadd(sleep_ms, xmul(sleep_ms, sleep_ratio));
644 
645 			/*
646 			 * Finally the division.
647 			 */
648 			sleep_ms /= rule->rr_amount;
649 
650 			if (sleep_ms > rctl_throttle_max)
651 				sleep_ms = rctl_throttle_max;
652 #if 0
653 			printf("%s: pid %d (%s), %jd of %jd, will sleep for %ju ms (ratio %ju, available %jd)\n",
654 			   __func__, p->p_pid, p->p_comm,
655 			   p->p_racct->r_resources[resource],
656 			   rule->rr_amount, (uintmax_t)sleep_ms,
657 			   (uintmax_t)sleep_ratio, (intmax_t)available);
658 #endif
659 
660 			KASSERT(sleep_ms >= rctl_throttle_min, ("%s: %ju < %d\n",
661 			    __func__, (uintmax_t)sleep_ms, rctl_throttle_min));
662 			racct_proc_throttle(p, sleep_ms);
663 			continue;
664 		default:
665 			if (link->rrl_exceeded != 0)
666 				continue;
667 
668 			if (p->p_state != PRS_NORMAL)
669 				continue;
670 
671 			KASSERT(rule->rr_action > 0 &&
672 			    rule->rr_action <= RCTL_ACTION_SIGNAL_MAX,
673 			    ("rctl_enforce: unknown action %d",
674 			     rule->rr_action));
675 
676 			/*
677 			 * We're using the fact that RCTL_ACTION_SIG* values
678 			 * are equal to their counterparts from sys/signal.h.
679 			 */
680 			kern_psignal(p, rule->rr_action);
681 			link->rrl_exceeded = 1;
682 			continue;
683 		}
684 	}
685 
686 	if (should_deny) {
687 		/*
688 		 * Return fake error code; the caller should change it
689 		 * into one proper for the situation - EFSIZ, ENOMEM etc.
690 		 */
691 		return (EDOOFUS);
692 	}
693 
694 	return (0);
695 }
696 
697 uint64_t
698 rctl_get_limit(struct proc *p, int resource)
699 {
700 	struct rctl_rule *rule;
701 	struct rctl_rule_link *link;
702 	uint64_t amount = UINT64_MAX;
703 
704 	ASSERT_RACCT_ENABLED();
705 	RACCT_LOCK_ASSERT();
706 
707 	/*
708 	 * There may be more than one matching rule; go through all of them.
709 	 * Denial should be done last, after logging and sending signals.
710 	 */
711 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
712 		rule = link->rrl_rule;
713 		if (rule->rr_resource != resource)
714 			continue;
715 		if (rule->rr_action != RCTL_ACTION_DENY)
716 			continue;
717 		if (rule->rr_amount < amount)
718 			amount = rule->rr_amount;
719 	}
720 
721 	return (amount);
722 }
723 
724 uint64_t
725 rctl_get_available(struct proc *p, int resource)
726 {
727 	struct rctl_rule *rule;
728 	struct rctl_rule_link *link;
729 	int64_t available, minavailable, allocated;
730 
731 	minavailable = INT64_MAX;
732 
733 	ASSERT_RACCT_ENABLED();
734 	RACCT_LOCK_ASSERT();
735 
736 	/*
737 	 * There may be more than one matching rule; go through all of them.
738 	 * Denial should be done last, after logging and sending signals.
739 	 */
740 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
741 		rule = link->rrl_rule;
742 		if (rule->rr_resource != resource)
743 			continue;
744 		if (rule->rr_action != RCTL_ACTION_DENY)
745 			continue;
746 		available = rctl_available_resource(p, rule);
747 		if (available < minavailable)
748 			minavailable = available;
749 	}
750 
751 	/*
752 	 * XXX: Think about this _hard_.
753 	 */
754 	allocated = p->p_racct->r_resources[resource];
755 	if (minavailable < INT64_MAX - allocated)
756 		minavailable += allocated;
757 	if (minavailable < 0)
758 		minavailable = 0;
759 
760 	return (minavailable);
761 }
762 
763 static int
764 rctl_rule_matches(const struct rctl_rule *rule, const struct rctl_rule *filter)
765 {
766 
767 	ASSERT_RACCT_ENABLED();
768 
769 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED) {
770 		if (rule->rr_subject_type != filter->rr_subject_type)
771 			return (0);
772 
773 		switch (filter->rr_subject_type) {
774 		case RCTL_SUBJECT_TYPE_PROCESS:
775 			if (filter->rr_subject.rs_proc != NULL &&
776 			    rule->rr_subject.rs_proc !=
777 			    filter->rr_subject.rs_proc)
778 				return (0);
779 			break;
780 		case RCTL_SUBJECT_TYPE_USER:
781 			if (filter->rr_subject.rs_uip != NULL &&
782 			    rule->rr_subject.rs_uip !=
783 			    filter->rr_subject.rs_uip)
784 				return (0);
785 			break;
786 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
787 			if (filter->rr_subject.rs_loginclass != NULL &&
788 			    rule->rr_subject.rs_loginclass !=
789 			    filter->rr_subject.rs_loginclass)
790 				return (0);
791 			break;
792 		case RCTL_SUBJECT_TYPE_JAIL:
793 			if (filter->rr_subject.rs_prison_racct != NULL &&
794 			    rule->rr_subject.rs_prison_racct !=
795 			    filter->rr_subject.rs_prison_racct)
796 				return (0);
797 			break;
798 		default:
799 			panic("rctl_rule_matches: unknown subject type %d",
800 			    filter->rr_subject_type);
801 		}
802 	}
803 
804 	if (filter->rr_resource != RACCT_UNDEFINED) {
805 		if (rule->rr_resource != filter->rr_resource)
806 			return (0);
807 	}
808 
809 	if (filter->rr_action != RCTL_ACTION_UNDEFINED) {
810 		if (rule->rr_action != filter->rr_action)
811 			return (0);
812 	}
813 
814 	if (filter->rr_amount != RCTL_AMOUNT_UNDEFINED) {
815 		if (rule->rr_amount != filter->rr_amount)
816 			return (0);
817 	}
818 
819 	if (filter->rr_per != RCTL_SUBJECT_TYPE_UNDEFINED) {
820 		if (rule->rr_per != filter->rr_per)
821 			return (0);
822 	}
823 
824 	return (1);
825 }
826 
827 static int
828 str2value(const char *str, int *value, struct dict *table)
829 {
830 	int i;
831 
832 	if (value == NULL)
833 		return (EINVAL);
834 
835 	for (i = 0; table[i].d_name != NULL; i++) {
836 		if (strcasecmp(table[i].d_name, str) == 0) {
837 			*value =  table[i].d_value;
838 			return (0);
839 		}
840 	}
841 
842 	return (EINVAL);
843 }
844 
845 static int
846 str2id(const char *str, id_t *value)
847 {
848 	char *end;
849 
850 	if (str == NULL)
851 		return (EINVAL);
852 
853 	*value = strtoul(str, &end, 10);
854 	if ((size_t)(end - str) != strlen(str))
855 		return (EINVAL);
856 
857 	return (0);
858 }
859 
/*
 * Parse 'str' as a non-negative decimal number and store it in '*value'.
 *
 * Returns 0 on success.  Returns EINVAL when 'str' is NULL, when it holds
 * no digits at all (e.g. the empty string, which would otherwise be
 * silently parsed as 0), or when anything follows the number.  Returns
 * ERANGE when the converted value is negative once stored as int64_t.
 * Note that '*value' is written even when an error is returned after the
 * conversion.
 *
 * NOTE(review): strtoul() returns unsigned long, which is narrower than
 * int64_t on ILP32 platforms - consider a 64-bit conversion routine there.
 */
static int
str2int64(const char *str, int64_t *value)
{
	char *end;

	if (str == NULL)
		return (EINVAL);

	*value = strtoul(str, &end, 10);

	/* Nothing was converted, or the number is not the whole string. */
	if (end == str || *end != '\0')
		return (EINVAL);

	if (*value < 0)
		return (ERANGE);

	return (0);
}
877 
878 /*
879  * Connect the rule to the racct, increasing refcount for the rule.
880  */
881 static void
882 rctl_racct_add_rule(struct racct *racct, struct rctl_rule *rule)
883 {
884 	struct rctl_rule_link *link;
885 
886 	ASSERT_RACCT_ENABLED();
887 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
888 
889 	rctl_rule_acquire(rule);
890 	link = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
891 	link->rrl_rule = rule;
892 	link->rrl_exceeded = 0;
893 
894 	RACCT_LOCK();
895 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
896 	RACCT_UNLOCK();
897 }
898 
899 static int
900 rctl_racct_add_rule_locked(struct racct *racct, struct rctl_rule *rule)
901 {
902 	struct rctl_rule_link *link;
903 
904 	ASSERT_RACCT_ENABLED();
905 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
906 	RACCT_LOCK_ASSERT();
907 
908 	link = uma_zalloc(rctl_rule_link_zone, M_NOWAIT);
909 	if (link == NULL)
910 		return (ENOMEM);
911 	rctl_rule_acquire(rule);
912 	link->rrl_rule = rule;
913 	link->rrl_exceeded = 0;
914 
915 	LIST_INSERT_HEAD(&racct->r_rule_links, link, rrl_next);
916 
917 	return (0);
918 }
919 
920 /*
921  * Remove limits for a rules matching the filter and release
922  * the refcounts for the rules, possibly freeing them.  Returns
923  * the number of limit structures removed.
924  */
925 static int
926 rctl_racct_remove_rules(struct racct *racct,
927     const struct rctl_rule *filter)
928 {
929 	struct rctl_rule_link *link, *linktmp;
930 	int removed = 0;
931 
932 	ASSERT_RACCT_ENABLED();
933 	RACCT_LOCK_ASSERT();
934 
935 	LIST_FOREACH_SAFE(link, &racct->r_rule_links, rrl_next, linktmp) {
936 		if (!rctl_rule_matches(link->rrl_rule, filter))
937 			continue;
938 
939 		LIST_REMOVE(link, rrl_next);
940 		rctl_rule_release(link->rrl_rule);
941 		uma_zfree(rctl_rule_link_zone, link);
942 		removed++;
943 	}
944 	return (removed);
945 }
946 
947 static void
948 rctl_rule_acquire_subject(struct rctl_rule *rule)
949 {
950 
951 	ASSERT_RACCT_ENABLED();
952 
953 	switch (rule->rr_subject_type) {
954 	case RCTL_SUBJECT_TYPE_UNDEFINED:
955 	case RCTL_SUBJECT_TYPE_PROCESS:
956 		break;
957 	case RCTL_SUBJECT_TYPE_JAIL:
958 		if (rule->rr_subject.rs_prison_racct != NULL)
959 			prison_racct_hold(rule->rr_subject.rs_prison_racct);
960 		break;
961 	case RCTL_SUBJECT_TYPE_USER:
962 		if (rule->rr_subject.rs_uip != NULL)
963 			uihold(rule->rr_subject.rs_uip);
964 		break;
965 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
966 		if (rule->rr_subject.rs_loginclass != NULL)
967 			loginclass_hold(rule->rr_subject.rs_loginclass);
968 		break;
969 	default:
970 		panic("rctl_rule_acquire_subject: unknown subject type %d",
971 		    rule->rr_subject_type);
972 	}
973 }
974 
975 static void
976 rctl_rule_release_subject(struct rctl_rule *rule)
977 {
978 
979 	ASSERT_RACCT_ENABLED();
980 
981 	switch (rule->rr_subject_type) {
982 	case RCTL_SUBJECT_TYPE_UNDEFINED:
983 	case RCTL_SUBJECT_TYPE_PROCESS:
984 		break;
985 	case RCTL_SUBJECT_TYPE_JAIL:
986 		if (rule->rr_subject.rs_prison_racct != NULL)
987 			prison_racct_free(rule->rr_subject.rs_prison_racct);
988 		break;
989 	case RCTL_SUBJECT_TYPE_USER:
990 		if (rule->rr_subject.rs_uip != NULL)
991 			uifree(rule->rr_subject.rs_uip);
992 		break;
993 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
994 		if (rule->rr_subject.rs_loginclass != NULL)
995 			loginclass_free(rule->rr_subject.rs_loginclass);
996 		break;
997 	default:
998 		panic("rctl_rule_release_subject: unknown subject type %d",
999 		    rule->rr_subject_type);
1000 	}
1001 }
1002 
1003 struct rctl_rule *
1004 rctl_rule_alloc(int flags)
1005 {
1006 	struct rctl_rule *rule;
1007 
1008 	ASSERT_RACCT_ENABLED();
1009 
1010 	rule = uma_zalloc(rctl_rule_zone, flags);
1011 	if (rule == NULL)
1012 		return (NULL);
1013 	rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1014 	rule->rr_subject.rs_proc = NULL;
1015 	rule->rr_subject.rs_uip = NULL;
1016 	rule->rr_subject.rs_loginclass = NULL;
1017 	rule->rr_subject.rs_prison_racct = NULL;
1018 	rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1019 	rule->rr_resource = RACCT_UNDEFINED;
1020 	rule->rr_action = RCTL_ACTION_UNDEFINED;
1021 	rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1022 	refcount_init(&rule->rr_refcount, 1);
1023 
1024 	return (rule);
1025 }
1026 
1027 struct rctl_rule *
1028 rctl_rule_duplicate(const struct rctl_rule *rule, int flags)
1029 {
1030 	struct rctl_rule *copy;
1031 
1032 	ASSERT_RACCT_ENABLED();
1033 
1034 	copy = uma_zalloc(rctl_rule_zone, flags);
1035 	if (copy == NULL)
1036 		return (NULL);
1037 	copy->rr_subject_type = rule->rr_subject_type;
1038 	copy->rr_subject.rs_proc = rule->rr_subject.rs_proc;
1039 	copy->rr_subject.rs_uip = rule->rr_subject.rs_uip;
1040 	copy->rr_subject.rs_loginclass = rule->rr_subject.rs_loginclass;
1041 	copy->rr_subject.rs_prison_racct = rule->rr_subject.rs_prison_racct;
1042 	copy->rr_per = rule->rr_per;
1043 	copy->rr_resource = rule->rr_resource;
1044 	copy->rr_action = rule->rr_action;
1045 	copy->rr_amount = rule->rr_amount;
1046 	refcount_init(&copy->rr_refcount, 1);
1047 	rctl_rule_acquire_subject(copy);
1048 
1049 	return (copy);
1050 }
1051 
1052 void
1053 rctl_rule_acquire(struct rctl_rule *rule)
1054 {
1055 
1056 	ASSERT_RACCT_ENABLED();
1057 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1058 
1059 	refcount_acquire(&rule->rr_refcount);
1060 }
1061 
1062 static void
1063 rctl_rule_free(void *context, int pending)
1064 {
1065 	struct rctl_rule *rule;
1066 
1067 	rule = (struct rctl_rule *)context;
1068 
1069 	ASSERT_RACCT_ENABLED();
1070 	KASSERT(rule->rr_refcount == 0, ("rule->rr_refcount != 0"));
1071 
1072 	/*
1073 	 * We don't need locking here; rule is guaranteed to be inaccessible.
1074 	 */
1075 
1076 	rctl_rule_release_subject(rule);
1077 	uma_zfree(rctl_rule_zone, rule);
1078 }
1079 
1080 void
1081 rctl_rule_release(struct rctl_rule *rule)
1082 {
1083 
1084 	ASSERT_RACCT_ENABLED();
1085 	KASSERT(rule->rr_refcount > 0, ("rule->rr_refcount <= 0"));
1086 
1087 	if (refcount_release(&rule->rr_refcount)) {
1088 		/*
1089 		 * rctl_rule_release() is often called when iterating
1090 		 * over all the uidinfo structures in the system,
1091 		 * holding uihashtbl_lock.  Since rctl_rule_free()
1092 		 * might end up calling uifree(), this would lead
1093 		 * to lock recursion.  Use taskqueue to avoid this.
1094 		 */
1095 		TASK_INIT(&rule->rr_task, 0, rctl_rule_free, rule);
1096 		taskqueue_enqueue(taskqueue_thread, &rule->rr_task);
1097 	}
1098 }
1099 
1100 static int
1101 rctl_rule_fully_specified(const struct rctl_rule *rule)
1102 {
1103 
1104 	ASSERT_RACCT_ENABLED();
1105 
1106 	switch (rule->rr_subject_type) {
1107 	case RCTL_SUBJECT_TYPE_UNDEFINED:
1108 		return (0);
1109 	case RCTL_SUBJECT_TYPE_PROCESS:
1110 		if (rule->rr_subject.rs_proc == NULL)
1111 			return (0);
1112 		break;
1113 	case RCTL_SUBJECT_TYPE_USER:
1114 		if (rule->rr_subject.rs_uip == NULL)
1115 			return (0);
1116 		break;
1117 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1118 		if (rule->rr_subject.rs_loginclass == NULL)
1119 			return (0);
1120 		break;
1121 	case RCTL_SUBJECT_TYPE_JAIL:
1122 		if (rule->rr_subject.rs_prison_racct == NULL)
1123 			return (0);
1124 		break;
1125 	default:
1126 		panic("rctl_rule_fully_specified: unknown subject type %d",
1127 		    rule->rr_subject_type);
1128 	}
1129 	if (rule->rr_resource == RACCT_UNDEFINED)
1130 		return (0);
1131 	if (rule->rr_action == RCTL_ACTION_UNDEFINED)
1132 		return (0);
1133 	if (rule->rr_amount == RCTL_AMOUNT_UNDEFINED)
1134 		return (0);
1135 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED)
1136 		return (0);
1137 
1138 	return (1);
1139 }
1140 
1141 static int
1142 rctl_string_to_rule(char *rulestr, struct rctl_rule **rulep)
1143 {
1144 	struct rctl_rule *rule;
1145 	char *subjectstr, *subject_idstr, *resourcestr, *actionstr,
1146 	     *amountstr, *perstr;
1147 	id_t id;
1148 	int error = 0;
1149 
1150 	ASSERT_RACCT_ENABLED();
1151 
1152 	rule = rctl_rule_alloc(M_WAITOK);
1153 
1154 	subjectstr = strsep(&rulestr, ":");
1155 	subject_idstr = strsep(&rulestr, ":");
1156 	resourcestr = strsep(&rulestr, ":");
1157 	actionstr = strsep(&rulestr, "=/");
1158 	amountstr = strsep(&rulestr, "/");
1159 	perstr = rulestr;
1160 
1161 	if (subjectstr == NULL || subjectstr[0] == '\0')
1162 		rule->rr_subject_type = RCTL_SUBJECT_TYPE_UNDEFINED;
1163 	else {
1164 		error = str2value(subjectstr, &rule->rr_subject_type, subjectnames);
1165 		if (error != 0)
1166 			goto out;
1167 	}
1168 
1169 	if (subject_idstr == NULL || subject_idstr[0] == '\0') {
1170 		rule->rr_subject.rs_proc = NULL;
1171 		rule->rr_subject.rs_uip = NULL;
1172 		rule->rr_subject.rs_loginclass = NULL;
1173 		rule->rr_subject.rs_prison_racct = NULL;
1174 	} else {
1175 		switch (rule->rr_subject_type) {
1176 		case RCTL_SUBJECT_TYPE_UNDEFINED:
1177 			error = EINVAL;
1178 			goto out;
1179 		case RCTL_SUBJECT_TYPE_PROCESS:
1180 			error = str2id(subject_idstr, &id);
1181 			if (error != 0)
1182 				goto out;
1183 			sx_assert(&allproc_lock, SA_LOCKED);
1184 			rule->rr_subject.rs_proc = pfind(id);
1185 			if (rule->rr_subject.rs_proc == NULL) {
1186 				error = ESRCH;
1187 				goto out;
1188 			}
1189 			PROC_UNLOCK(rule->rr_subject.rs_proc);
1190 			break;
1191 		case RCTL_SUBJECT_TYPE_USER:
1192 			error = str2id(subject_idstr, &id);
1193 			if (error != 0)
1194 				goto out;
1195 			rule->rr_subject.rs_uip = uifind(id);
1196 			break;
1197 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1198 			rule->rr_subject.rs_loginclass =
1199 			    loginclass_find(subject_idstr);
1200 			if (rule->rr_subject.rs_loginclass == NULL) {
1201 				error = ENAMETOOLONG;
1202 				goto out;
1203 			}
1204 			break;
1205 		case RCTL_SUBJECT_TYPE_JAIL:
1206 			rule->rr_subject.rs_prison_racct =
1207 			    prison_racct_find(subject_idstr);
1208 			if (rule->rr_subject.rs_prison_racct == NULL) {
1209 				error = ENAMETOOLONG;
1210 				goto out;
1211 			}
1212 			break;
1213                default:
1214                        panic("rctl_string_to_rule: unknown subject type %d",
1215                            rule->rr_subject_type);
1216                }
1217 	}
1218 
1219 	if (resourcestr == NULL || resourcestr[0] == '\0')
1220 		rule->rr_resource = RACCT_UNDEFINED;
1221 	else {
1222 		error = str2value(resourcestr, &rule->rr_resource,
1223 		    resourcenames);
1224 		if (error != 0)
1225 			goto out;
1226 	}
1227 
1228 	if (actionstr == NULL || actionstr[0] == '\0')
1229 		rule->rr_action = RCTL_ACTION_UNDEFINED;
1230 	else {
1231 		error = str2value(actionstr, &rule->rr_action, actionnames);
1232 		if (error != 0)
1233 			goto out;
1234 	}
1235 
1236 	if (amountstr == NULL || amountstr[0] == '\0')
1237 		rule->rr_amount = RCTL_AMOUNT_UNDEFINED;
1238 	else {
1239 		error = str2int64(amountstr, &rule->rr_amount);
1240 		if (error != 0)
1241 			goto out;
1242 		if (RACCT_IS_IN_MILLIONS(rule->rr_resource)) {
1243 			if (rule->rr_amount > INT64_MAX / 1000000) {
1244 				error = ERANGE;
1245 				goto out;
1246 			}
1247 			rule->rr_amount *= 1000000;
1248 		}
1249 	}
1250 
1251 	if (perstr == NULL || perstr[0] == '\0')
1252 		rule->rr_per = RCTL_SUBJECT_TYPE_UNDEFINED;
1253 	else {
1254 		error = str2value(perstr, &rule->rr_per, subjectnames);
1255 		if (error != 0)
1256 			goto out;
1257 	}
1258 
1259 out:
1260 	if (error == 0)
1261 		*rulep = rule;
1262 	else
1263 		rctl_rule_release(rule);
1264 
1265 	return (error);
1266 }
1267 
1268 /*
1269  * Link a rule with all the subjects it applies to.
1270  */
1271 int
1272 rctl_rule_add(struct rctl_rule *rule)
1273 {
1274 	struct proc *p;
1275 	struct ucred *cred;
1276 	struct uidinfo *uip;
1277 	struct prison *pr;
1278 	struct prison_racct *prr;
1279 	struct loginclass *lc;
1280 	struct rctl_rule *rule2;
1281 	int match;
1282 
1283 	ASSERT_RACCT_ENABLED();
1284 	KASSERT(rctl_rule_fully_specified(rule), ("rule not fully specified"));
1285 
1286 	/*
1287 	 * Some rules just don't make sense, like "deny" rule for an undeniable
1288 	 * resource.  The exception are the RSS and %CPU resources - they are
1289 	 * not deniable in the racct sense, but the limit is enforced in
1290 	 * a different way.
1291 	 */
1292 	if (rule->rr_action == RCTL_ACTION_DENY &&
1293 	    !RACCT_IS_DENIABLE(rule->rr_resource) &&
1294 	    rule->rr_resource != RACCT_RSS &&
1295 	    rule->rr_resource != RACCT_PCTCPU) {
1296 		return (EOPNOTSUPP);
1297 	}
1298 
1299 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1300 	    !RACCT_IS_DECAYING(rule->rr_resource)) {
1301 		return (EOPNOTSUPP);
1302 	}
1303 
1304 	if (rule->rr_action == RCTL_ACTION_THROTTLE &&
1305 	    rule->rr_resource == RACCT_PCTCPU) {
1306 		return (EOPNOTSUPP);
1307 	}
1308 
1309 	if (rule->rr_per == RCTL_SUBJECT_TYPE_PROCESS &&
1310 	    RACCT_IS_SLOPPY(rule->rr_resource)) {
1311 		return (EOPNOTSUPP);
1312 	}
1313 
1314 	/*
1315 	 * Make sure there are no duplicated rules.  Also, for the "deny"
1316 	 * rules, remove ones differing only by "amount".
1317 	 */
1318 	if (rule->rr_action == RCTL_ACTION_DENY) {
1319 		rule2 = rctl_rule_duplicate(rule, M_WAITOK);
1320 		rule2->rr_amount = RCTL_AMOUNT_UNDEFINED;
1321 		rctl_rule_remove(rule2);
1322 		rctl_rule_release(rule2);
1323 	} else
1324 		rctl_rule_remove(rule);
1325 
1326 	switch (rule->rr_subject_type) {
1327 	case RCTL_SUBJECT_TYPE_PROCESS:
1328 		p = rule->rr_subject.rs_proc;
1329 		KASSERT(p != NULL, ("rctl_rule_add: NULL proc"));
1330 
1331 		rctl_racct_add_rule(p->p_racct, rule);
1332 		/*
1333 		 * In case of per-process rule, we don't have anything more
1334 		 * to do.
1335 		 */
1336 		return (0);
1337 
1338 	case RCTL_SUBJECT_TYPE_USER:
1339 		uip = rule->rr_subject.rs_uip;
1340 		KASSERT(uip != NULL, ("rctl_rule_add: NULL uip"));
1341 		rctl_racct_add_rule(uip->ui_racct, rule);
1342 		break;
1343 
1344 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1345 		lc = rule->rr_subject.rs_loginclass;
1346 		KASSERT(lc != NULL, ("rctl_rule_add: NULL loginclass"));
1347 		rctl_racct_add_rule(lc->lc_racct, rule);
1348 		break;
1349 
1350 	case RCTL_SUBJECT_TYPE_JAIL:
1351 		prr = rule->rr_subject.rs_prison_racct;
1352 		KASSERT(prr != NULL, ("rctl_rule_add: NULL pr"));
1353 		rctl_racct_add_rule(prr->prr_racct, rule);
1354 		break;
1355 
1356 	default:
1357 		panic("rctl_rule_add: unknown subject type %d",
1358 		    rule->rr_subject_type);
1359 	}
1360 
1361 	/*
1362 	 * Now go through all the processes and add the new rule to the ones
1363 	 * it applies to.
1364 	 */
1365 	sx_assert(&allproc_lock, SA_LOCKED);
1366 	FOREACH_PROC_IN_SYSTEM(p) {
1367 		cred = p->p_ucred;
1368 		switch (rule->rr_subject_type) {
1369 		case RCTL_SUBJECT_TYPE_USER:
1370 			if (cred->cr_uidinfo == rule->rr_subject.rs_uip ||
1371 			    cred->cr_ruidinfo == rule->rr_subject.rs_uip)
1372 				break;
1373 			continue;
1374 		case RCTL_SUBJECT_TYPE_LOGINCLASS:
1375 			if (cred->cr_loginclass == rule->rr_subject.rs_loginclass)
1376 				break;
1377 			continue;
1378 		case RCTL_SUBJECT_TYPE_JAIL:
1379 			match = 0;
1380 			for (pr = cred->cr_prison; pr != NULL; pr = pr->pr_parent) {
1381 				if (pr->pr_prison_racct == rule->rr_subject.rs_prison_racct) {
1382 					match = 1;
1383 					break;
1384 				}
1385 			}
1386 			if (match)
1387 				break;
1388 			continue;
1389 		default:
1390 			panic("rctl_rule_add: unknown subject type %d",
1391 			    rule->rr_subject_type);
1392 		}
1393 
1394 		rctl_racct_add_rule(p->p_racct, rule);
1395 	}
1396 
1397 	return (0);
1398 }
1399 
/*
 * "Pre" hook passed to the *_racct_foreach() iterators in this file:
 * takes the racct lock.
 */
static void
rctl_rule_pre_callback(void)
{

	RACCT_LOCK();
}
1406 
/*
 * "Post" hook passed to the *_racct_foreach() iterators in this file:
 * drops the racct lock.
 */
static void
rctl_rule_post_callback(void)
{

	RACCT_UNLOCK();
}
1413 
/*
 * Per-racct callback for rctl_rule_remove().
 *
 * 'arg2' is the filter rule and 'arg3' points to an int to which the
 * number reported by rctl_racct_remove_rules() is added.  The racct
 * lock is asserted to be held.
 */
static void
rctl_rule_remove_callback(struct racct *racct, void *arg2, void *arg3)
{
	struct rctl_rule *filter = arg2;
	int *totalp = arg3;

	ASSERT_RACCT_ENABLED();
	RACCT_LOCK_ASSERT();

	*totalp += rctl_racct_remove_rules(racct, filter);
}
1427 
1428 /*
1429  * Remove all rules that match the filter.
1430  */
1431 int
1432 rctl_rule_remove(struct rctl_rule *filter)
1433 {
1434 	struct proc *p;
1435 	int found = 0;
1436 
1437 	ASSERT_RACCT_ENABLED();
1438 
1439 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_PROCESS &&
1440 	    filter->rr_subject.rs_proc != NULL) {
1441 		p = filter->rr_subject.rs_proc;
1442 		RACCT_LOCK();
1443 		found = rctl_racct_remove_rules(p->p_racct, filter);
1444 		RACCT_UNLOCK();
1445 		if (found)
1446 			return (0);
1447 		return (ESRCH);
1448 	}
1449 
1450 	loginclass_racct_foreach(rctl_rule_remove_callback,
1451 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1452 	    filter, (void *)&found);
1453 	ui_racct_foreach(rctl_rule_remove_callback,
1454 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1455 	    filter, (void *)&found);
1456 	prison_racct_foreach(rctl_rule_remove_callback,
1457 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1458 	    filter, (void *)&found);
1459 
1460 	sx_assert(&allproc_lock, SA_LOCKED);
1461 	RACCT_LOCK();
1462 	FOREACH_PROC_IN_SYSTEM(p) {
1463 		found += rctl_racct_remove_rules(p->p_racct, filter);
1464 	}
1465 	RACCT_UNLOCK();
1466 
1467 	if (found)
1468 		return (0);
1469 	return (ESRCH);
1470 }
1471 
1472 /*
1473  * Appends a rule to the sbuf.
1474  */
1475 static void
1476 rctl_rule_to_sbuf(struct sbuf *sb, const struct rctl_rule *rule)
1477 {
1478 	int64_t amount;
1479 
1480 	ASSERT_RACCT_ENABLED();
1481 
1482 	sbuf_printf(sb, "%s:", rctl_subject_type_name(rule->rr_subject_type));
1483 
1484 	switch (rule->rr_subject_type) {
1485 	case RCTL_SUBJECT_TYPE_PROCESS:
1486 		if (rule->rr_subject.rs_proc == NULL)
1487 			sbuf_printf(sb, ":");
1488 		else
1489 			sbuf_printf(sb, "%d:",
1490 			    rule->rr_subject.rs_proc->p_pid);
1491 		break;
1492 	case RCTL_SUBJECT_TYPE_USER:
1493 		if (rule->rr_subject.rs_uip == NULL)
1494 			sbuf_printf(sb, ":");
1495 		else
1496 			sbuf_printf(sb, "%d:",
1497 			    rule->rr_subject.rs_uip->ui_uid);
1498 		break;
1499 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1500 		if (rule->rr_subject.rs_loginclass == NULL)
1501 			sbuf_printf(sb, ":");
1502 		else
1503 			sbuf_printf(sb, "%s:",
1504 			    rule->rr_subject.rs_loginclass->lc_name);
1505 		break;
1506 	case RCTL_SUBJECT_TYPE_JAIL:
1507 		if (rule->rr_subject.rs_prison_racct == NULL)
1508 			sbuf_printf(sb, ":");
1509 		else
1510 			sbuf_printf(sb, "%s:",
1511 			    rule->rr_subject.rs_prison_racct->prr_name);
1512 		break;
1513 	default:
1514 		panic("rctl_rule_to_sbuf: unknown subject type %d",
1515 		    rule->rr_subject_type);
1516 	}
1517 
1518 	amount = rule->rr_amount;
1519 	if (amount != RCTL_AMOUNT_UNDEFINED &&
1520 	    RACCT_IS_IN_MILLIONS(rule->rr_resource))
1521 		amount /= 1000000;
1522 
1523 	sbuf_printf(sb, "%s:%s=%jd",
1524 	    rctl_resource_name(rule->rr_resource),
1525 	    rctl_action_name(rule->rr_action),
1526 	    amount);
1527 
1528 	if (rule->rr_per != rule->rr_subject_type)
1529 		sbuf_printf(sb, "/%s", rctl_subject_type_name(rule->rr_per));
1530 }
1531 
1532 /*
1533  * Routine used by RCTL syscalls to read in input string.
1534  */
1535 static int
1536 rctl_read_inbuf(char **inputstr, const char *inbufp, size_t inbuflen)
1537 {
1538 	char *str;
1539 	int error;
1540 
1541 	ASSERT_RACCT_ENABLED();
1542 
1543 	if (inbuflen <= 0)
1544 		return (EINVAL);
1545 	if (inbuflen > RCTL_MAX_INBUFSIZE)
1546 		return (E2BIG);
1547 
1548 	str = malloc(inbuflen + 1, M_RCTL, M_WAITOK);
1549 	error = copyinstr(inbufp, str, inbuflen, NULL);
1550 	if (error != 0) {
1551 		free(str, M_RCTL);
1552 		return (error);
1553 	}
1554 
1555 	*inputstr = str;
1556 
1557 	return (0);
1558 }
1559 
/*
 * Routine used by RCTL syscalls to write out output string.
 *
 * Finishes 'outputsbuf' and copies its contents, including the
 * terminating NUL, to the user address 'outbufp'.  The sbuf is always
 * deleted.  Returns ERANGE when 'outbuflen' is too small, otherwise
 * the result of copyout().  A NULL sbuf is a no-op that returns 0.
 */
static int
rctl_write_outbuf(struct sbuf *outputsbuf, char *outbufp, size_t outbuflen)
{
	int error;

	ASSERT_RACCT_ENABLED();

	if (outputsbuf == NULL)
		return (0);

	sbuf_finish(outputsbuf);
	if (outbuflen < sbuf_len(outputsbuf) + 1)
		error = ERANGE;
	else
		error = copyout(sbuf_data(outputsbuf), outbufp,
		    sbuf_len(outputsbuf) + 1);
	sbuf_delete(outputsbuf);

	return (error);
}
1583 
1584 static struct sbuf *
1585 rctl_racct_to_sbuf(struct racct *racct, int sloppy)
1586 {
1587 	struct sbuf *sb;
1588 	int64_t amount;
1589 	int i;
1590 
1591 	ASSERT_RACCT_ENABLED();
1592 
1593 	sb = sbuf_new_auto();
1594 	for (i = 0; i <= RACCT_MAX; i++) {
1595 		if (sloppy == 0 && RACCT_IS_SLOPPY(i))
1596 			continue;
1597 		RACCT_LOCK();
1598 		amount = racct->r_resources[i];
1599 		RACCT_UNLOCK();
1600 		if (RACCT_IS_IN_MILLIONS(i))
1601 			amount /= 1000000;
1602 		sbuf_printf(sb, "%s=%jd,", rctl_resource_name(i), amount);
1603 	}
1604 	sbuf_setpos(sb, sbuf_len(sb) - 1);
1605 	return (sb);
1606 }
1607 
1608 int
1609 sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
1610 {
1611 	struct rctl_rule *filter;
1612 	struct sbuf *outputsbuf = NULL;
1613 	struct proc *p;
1614 	struct uidinfo *uip;
1615 	struct loginclass *lc;
1616 	struct prison_racct *prr;
1617 	char *inputstr;
1618 	int error;
1619 
1620 	if (!racct_enable)
1621 		return (ENOSYS);
1622 
1623 	error = priv_check(td, PRIV_RCTL_GET_RACCT);
1624 	if (error != 0)
1625 		return (error);
1626 
1627 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1628 	if (error != 0)
1629 		return (error);
1630 
1631 	sx_slock(&allproc_lock);
1632 	error = rctl_string_to_rule(inputstr, &filter);
1633 	free(inputstr, M_RCTL);
1634 	if (error != 0) {
1635 		sx_sunlock(&allproc_lock);
1636 		return (error);
1637 	}
1638 
1639 	switch (filter->rr_subject_type) {
1640 	case RCTL_SUBJECT_TYPE_PROCESS:
1641 		p = filter->rr_subject.rs_proc;
1642 		if (p == NULL) {
1643 			error = EINVAL;
1644 			goto out;
1645 		}
1646 		outputsbuf = rctl_racct_to_sbuf(p->p_racct, 0);
1647 		break;
1648 	case RCTL_SUBJECT_TYPE_USER:
1649 		uip = filter->rr_subject.rs_uip;
1650 		if (uip == NULL) {
1651 			error = EINVAL;
1652 			goto out;
1653 		}
1654 		outputsbuf = rctl_racct_to_sbuf(uip->ui_racct, 1);
1655 		break;
1656 	case RCTL_SUBJECT_TYPE_LOGINCLASS:
1657 		lc = filter->rr_subject.rs_loginclass;
1658 		if (lc == NULL) {
1659 			error = EINVAL;
1660 			goto out;
1661 		}
1662 		outputsbuf = rctl_racct_to_sbuf(lc->lc_racct, 1);
1663 		break;
1664 	case RCTL_SUBJECT_TYPE_JAIL:
1665 		prr = filter->rr_subject.rs_prison_racct;
1666 		if (prr == NULL) {
1667 			error = EINVAL;
1668 			goto out;
1669 		}
1670 		outputsbuf = rctl_racct_to_sbuf(prr->prr_racct, 1);
1671 		break;
1672 	default:
1673 		error = EINVAL;
1674 	}
1675 out:
1676 	rctl_rule_release(filter);
1677 	sx_sunlock(&allproc_lock);
1678 	if (error != 0)
1679 		return (error);
1680 
1681 	error = rctl_write_outbuf(outputsbuf, uap->outbufp, uap->outbuflen);
1682 
1683 	return (error);
1684 }
1685 
1686 static void
1687 rctl_get_rules_callback(struct racct *racct, void *arg2, void *arg3)
1688 {
1689 	struct rctl_rule *filter = (struct rctl_rule *)arg2;
1690 	struct rctl_rule_link *link;
1691 	struct sbuf *sb = (struct sbuf *)arg3;
1692 
1693 	ASSERT_RACCT_ENABLED();
1694 	RACCT_LOCK_ASSERT();
1695 
1696 	LIST_FOREACH(link, &racct->r_rule_links, rrl_next) {
1697 		if (!rctl_rule_matches(link->rrl_rule, filter))
1698 			continue;
1699 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1700 		sbuf_printf(sb, ",");
1701 	}
1702 }
1703 
1704 int
1705 sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
1706 {
1707 	struct sbuf *sb;
1708 	struct rctl_rule *filter;
1709 	struct rctl_rule_link *link;
1710 	struct proc *p;
1711 	char *inputstr, *buf;
1712 	size_t bufsize;
1713 	int error;
1714 
1715 	if (!racct_enable)
1716 		return (ENOSYS);
1717 
1718 	error = priv_check(td, PRIV_RCTL_GET_RULES);
1719 	if (error != 0)
1720 		return (error);
1721 
1722 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1723 	if (error != 0)
1724 		return (error);
1725 
1726 	sx_slock(&allproc_lock);
1727 	error = rctl_string_to_rule(inputstr, &filter);
1728 	free(inputstr, M_RCTL);
1729 	if (error != 0) {
1730 		sx_sunlock(&allproc_lock);
1731 		return (error);
1732 	}
1733 
1734 	bufsize = uap->outbuflen;
1735 	if (bufsize > rctl_maxbufsize) {
1736 		sx_sunlock(&allproc_lock);
1737 		return (E2BIG);
1738 	}
1739 
1740 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1741 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1742 	KASSERT(sb != NULL, ("sbuf_new failed"));
1743 
1744 	FOREACH_PROC_IN_SYSTEM(p) {
1745 		RACCT_LOCK();
1746 		LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1747 			/*
1748 			 * Non-process rules will be added to the buffer later.
1749 			 * Adding them here would result in duplicated output.
1750 			 */
1751 			if (link->rrl_rule->rr_subject_type !=
1752 			    RCTL_SUBJECT_TYPE_PROCESS)
1753 				continue;
1754 			if (!rctl_rule_matches(link->rrl_rule, filter))
1755 				continue;
1756 			rctl_rule_to_sbuf(sb, link->rrl_rule);
1757 			sbuf_printf(sb, ",");
1758 		}
1759 		RACCT_UNLOCK();
1760 	}
1761 
1762 	loginclass_racct_foreach(rctl_get_rules_callback,
1763 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1764 	    filter, sb);
1765 	ui_racct_foreach(rctl_get_rules_callback,
1766 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1767 	    filter, sb);
1768 	prison_racct_foreach(rctl_get_rules_callback,
1769 	    rctl_rule_pre_callback, rctl_rule_post_callback,
1770 	    filter, sb);
1771 	if (sbuf_error(sb) == ENOMEM) {
1772 		error = ERANGE;
1773 		goto out;
1774 	}
1775 
1776 	/*
1777 	 * Remove trailing ",".
1778 	 */
1779 	if (sbuf_len(sb) > 0)
1780 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1781 
1782 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1783 out:
1784 	rctl_rule_release(filter);
1785 	sx_sunlock(&allproc_lock);
1786 	free(buf, M_RCTL);
1787 	return (error);
1788 }
1789 
1790 int
1791 sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
1792 {
1793 	struct sbuf *sb;
1794 	struct rctl_rule *filter;
1795 	struct rctl_rule_link *link;
1796 	char *inputstr, *buf;
1797 	size_t bufsize;
1798 	int error;
1799 
1800 	if (!racct_enable)
1801 		return (ENOSYS);
1802 
1803 	error = priv_check(td, PRIV_RCTL_GET_LIMITS);
1804 	if (error != 0)
1805 		return (error);
1806 
1807 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1808 	if (error != 0)
1809 		return (error);
1810 
1811 	sx_slock(&allproc_lock);
1812 	error = rctl_string_to_rule(inputstr, &filter);
1813 	free(inputstr, M_RCTL);
1814 	if (error != 0) {
1815 		sx_sunlock(&allproc_lock);
1816 		return (error);
1817 	}
1818 
1819 	if (filter->rr_subject_type == RCTL_SUBJECT_TYPE_UNDEFINED) {
1820 		rctl_rule_release(filter);
1821 		sx_sunlock(&allproc_lock);
1822 		return (EINVAL);
1823 	}
1824 	if (filter->rr_subject_type != RCTL_SUBJECT_TYPE_PROCESS) {
1825 		rctl_rule_release(filter);
1826 		sx_sunlock(&allproc_lock);
1827 		return (EOPNOTSUPP);
1828 	}
1829 	if (filter->rr_subject.rs_proc == NULL) {
1830 		rctl_rule_release(filter);
1831 		sx_sunlock(&allproc_lock);
1832 		return (EINVAL);
1833 	}
1834 
1835 	bufsize = uap->outbuflen;
1836 	if (bufsize > rctl_maxbufsize) {
1837 		rctl_rule_release(filter);
1838 		sx_sunlock(&allproc_lock);
1839 		return (E2BIG);
1840 	}
1841 
1842 	buf = malloc(bufsize, M_RCTL, M_WAITOK);
1843 	sb = sbuf_new(NULL, buf, bufsize, SBUF_FIXEDLEN);
1844 	KASSERT(sb != NULL, ("sbuf_new failed"));
1845 
1846 	RACCT_LOCK();
1847 	LIST_FOREACH(link, &filter->rr_subject.rs_proc->p_racct->r_rule_links,
1848 	    rrl_next) {
1849 		rctl_rule_to_sbuf(sb, link->rrl_rule);
1850 		sbuf_printf(sb, ",");
1851 	}
1852 	RACCT_UNLOCK();
1853 	if (sbuf_error(sb) == ENOMEM) {
1854 		error = ERANGE;
1855 		sbuf_delete(sb);
1856 		goto out;
1857 	}
1858 
1859 	/*
1860 	 * Remove trailing ",".
1861 	 */
1862 	if (sbuf_len(sb) > 0)
1863 		sbuf_setpos(sb, sbuf_len(sb) - 1);
1864 
1865 	error = rctl_write_outbuf(sb, uap->outbufp, uap->outbuflen);
1866 out:
1867 	rctl_rule_release(filter);
1868 	sx_sunlock(&allproc_lock);
1869 	free(buf, M_RCTL);
1870 	return (error);
1871 }
1872 
1873 int
1874 sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
1875 {
1876 	struct rctl_rule *rule;
1877 	char *inputstr;
1878 	int error;
1879 
1880 	if (!racct_enable)
1881 		return (ENOSYS);
1882 
1883 	error = priv_check(td, PRIV_RCTL_ADD_RULE);
1884 	if (error != 0)
1885 		return (error);
1886 
1887 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1888 	if (error != 0)
1889 		return (error);
1890 
1891 	sx_slock(&allproc_lock);
1892 	error = rctl_string_to_rule(inputstr, &rule);
1893 	free(inputstr, M_RCTL);
1894 	if (error != 0) {
1895 		sx_sunlock(&allproc_lock);
1896 		return (error);
1897 	}
1898 	/*
1899 	 * The 'per' part of a rule is optional.
1900 	 */
1901 	if (rule->rr_per == RCTL_SUBJECT_TYPE_UNDEFINED &&
1902 	    rule->rr_subject_type != RCTL_SUBJECT_TYPE_UNDEFINED)
1903 		rule->rr_per = rule->rr_subject_type;
1904 
1905 	if (!rctl_rule_fully_specified(rule)) {
1906 		error = EINVAL;
1907 		goto out;
1908 	}
1909 
1910 	error = rctl_rule_add(rule);
1911 
1912 out:
1913 	rctl_rule_release(rule);
1914 	sx_sunlock(&allproc_lock);
1915 	return (error);
1916 }
1917 
1918 int
1919 sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
1920 {
1921 	struct rctl_rule *filter;
1922 	char *inputstr;
1923 	int error;
1924 
1925 	if (!racct_enable)
1926 		return (ENOSYS);
1927 
1928 	error = priv_check(td, PRIV_RCTL_REMOVE_RULE);
1929 	if (error != 0)
1930 		return (error);
1931 
1932 	error = rctl_read_inbuf(&inputstr, uap->inbufp, uap->inbuflen);
1933 	if (error != 0)
1934 		return (error);
1935 
1936 	sx_slock(&allproc_lock);
1937 	error = rctl_string_to_rule(inputstr, &filter);
1938 	free(inputstr, M_RCTL);
1939 	if (error != 0) {
1940 		sx_sunlock(&allproc_lock);
1941 		return (error);
1942 	}
1943 
1944 	error = rctl_rule_remove(filter);
1945 	rctl_rule_release(filter);
1946 	sx_sunlock(&allproc_lock);
1947 
1948 	return (error);
1949 }
1950 
1951 /*
1952  * Update RCTL rule list after credential change.
1953  */
1954 void
1955 rctl_proc_ucred_changed(struct proc *p, struct ucred *newcred)
1956 {
1957 	LIST_HEAD(, rctl_rule_link) newrules;
1958 	struct rctl_rule_link *link, *newlink;
1959 	struct uidinfo *newuip;
1960 	struct loginclass *newlc;
1961 	struct prison_racct *newprr;
1962 	int rulecnt, i;
1963 
1964 	if (!racct_enable)
1965 		return;
1966 
1967 	PROC_LOCK_ASSERT(p, MA_NOTOWNED);
1968 
1969 	newuip = newcred->cr_ruidinfo;
1970 	newlc = newcred->cr_loginclass;
1971 	newprr = newcred->cr_prison->pr_prison_racct;
1972 
1973 	LIST_INIT(&newrules);
1974 
1975 again:
1976 	/*
1977 	 * First, count the rules that apply to the process with new
1978 	 * credentials.
1979 	 */
1980 	rulecnt = 0;
1981 	RACCT_LOCK();
1982 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
1983 		if (link->rrl_rule->rr_subject_type ==
1984 		    RCTL_SUBJECT_TYPE_PROCESS)
1985 			rulecnt++;
1986 	}
1987 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next)
1988 		rulecnt++;
1989 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next)
1990 		rulecnt++;
1991 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next)
1992 		rulecnt++;
1993 	RACCT_UNLOCK();
1994 
1995 	/*
1996 	 * Create temporary list.  We've dropped the rctl_lock in order
1997 	 * to use M_WAITOK.
1998 	 */
1999 	for (i = 0; i < rulecnt; i++) {
2000 		newlink = uma_zalloc(rctl_rule_link_zone, M_WAITOK);
2001 		newlink->rrl_rule = NULL;
2002 		newlink->rrl_exceeded = 0;
2003 		LIST_INSERT_HEAD(&newrules, newlink, rrl_next);
2004 	}
2005 
2006 	newlink = LIST_FIRST(&newrules);
2007 
2008 	/*
2009 	 * Assign rules to the newly allocated list entries.
2010 	 */
2011 	RACCT_LOCK();
2012 	LIST_FOREACH(link, &p->p_racct->r_rule_links, rrl_next) {
2013 		if (link->rrl_rule->rr_subject_type ==
2014 		    RCTL_SUBJECT_TYPE_PROCESS) {
2015 			if (newlink == NULL)
2016 				goto goaround;
2017 			rctl_rule_acquire(link->rrl_rule);
2018 			newlink->rrl_rule = link->rrl_rule;
2019 			newlink->rrl_exceeded = link->rrl_exceeded;
2020 			newlink = LIST_NEXT(newlink, rrl_next);
2021 			rulecnt--;
2022 		}
2023 	}
2024 
2025 	LIST_FOREACH(link, &newuip->ui_racct->r_rule_links, rrl_next) {
2026 		if (newlink == NULL)
2027 			goto goaround;
2028 		rctl_rule_acquire(link->rrl_rule);
2029 		newlink->rrl_rule = link->rrl_rule;
2030 		newlink->rrl_exceeded = link->rrl_exceeded;
2031 		newlink = LIST_NEXT(newlink, rrl_next);
2032 		rulecnt--;
2033 	}
2034 
2035 	LIST_FOREACH(link, &newlc->lc_racct->r_rule_links, rrl_next) {
2036 		if (newlink == NULL)
2037 			goto goaround;
2038 		rctl_rule_acquire(link->rrl_rule);
2039 		newlink->rrl_rule = link->rrl_rule;
2040 		newlink->rrl_exceeded = link->rrl_exceeded;
2041 		newlink = LIST_NEXT(newlink, rrl_next);
2042 		rulecnt--;
2043 	}
2044 
2045 	LIST_FOREACH(link, &newprr->prr_racct->r_rule_links, rrl_next) {
2046 		if (newlink == NULL)
2047 			goto goaround;
2048 		rctl_rule_acquire(link->rrl_rule);
2049 		newlink->rrl_rule = link->rrl_rule;
2050 		newlink->rrl_exceeded = link->rrl_exceeded;
2051 		newlink = LIST_NEXT(newlink, rrl_next);
2052 		rulecnt--;
2053 	}
2054 
2055 	if (rulecnt == 0) {
2056 		/*
2057 		 * Free the old rule list.
2058 		 */
2059 		while (!LIST_EMPTY(&p->p_racct->r_rule_links)) {
2060 			link = LIST_FIRST(&p->p_racct->r_rule_links);
2061 			LIST_REMOVE(link, rrl_next);
2062 			rctl_rule_release(link->rrl_rule);
2063 			uma_zfree(rctl_rule_link_zone, link);
2064 		}
2065 
2066 		/*
2067 		 * Replace lists and we're done.
2068 		 *
2069 		 * XXX: Is there any way to switch list heads instead
2070 		 *      of iterating here?
2071 		 */
2072 		while (!LIST_EMPTY(&newrules)) {
2073 			newlink = LIST_FIRST(&newrules);
2074 			LIST_REMOVE(newlink, rrl_next);
2075 			LIST_INSERT_HEAD(&p->p_racct->r_rule_links,
2076 			    newlink, rrl_next);
2077 		}
2078 
2079 		RACCT_UNLOCK();
2080 
2081 		return;
2082 	}
2083 
2084 goaround:
2085 	RACCT_UNLOCK();
2086 
2087 	/*
2088 	 * Rule list changed while we were not holding the rctl_lock.
2089 	 * Free the new list and try again.
2090 	 */
2091 	while (!LIST_EMPTY(&newrules)) {
2092 		newlink = LIST_FIRST(&newrules);
2093 		LIST_REMOVE(newlink, rrl_next);
2094 		if (newlink->rrl_rule != NULL)
2095 			rctl_rule_release(newlink->rrl_rule);
2096 		uma_zfree(rctl_rule_link_zone, newlink);
2097 	}
2098 
2099 	goto again;
2100 }
2101 
2102 /*
2103  * Assign RCTL rules to the newly created process.
2104  */
2105 int
2106 rctl_proc_fork(struct proc *parent, struct proc *child)
2107 {
2108 	struct rctl_rule *rule;
2109 	struct rctl_rule_link *link;
2110 	int error;
2111 
2112 	ASSERT_RACCT_ENABLED();
2113 	RACCT_LOCK_ASSERT();
2114 	KASSERT(parent->p_racct != NULL, ("process without racct; p = %p", parent));
2115 
2116 	LIST_INIT(&child->p_racct->r_rule_links);
2117 
2118 	/*
2119 	 * Go through limits applicable to the parent and assign them
2120 	 * to the child.  Rules with 'process' subject have to be duplicated
2121 	 * in order to make their rr_subject point to the new process.
2122 	 */
2123 	LIST_FOREACH(link, &parent->p_racct->r_rule_links, rrl_next) {
2124 		if (link->rrl_rule->rr_subject_type ==
2125 		    RCTL_SUBJECT_TYPE_PROCESS) {
2126 			rule = rctl_rule_duplicate(link->rrl_rule, M_NOWAIT);
2127 			if (rule == NULL)
2128 				goto fail;
2129 			KASSERT(rule->rr_subject.rs_proc == parent,
2130 			    ("rule->rr_subject.rs_proc != parent"));
2131 			rule->rr_subject.rs_proc = child;
2132 			error = rctl_racct_add_rule_locked(child->p_racct,
2133 			    rule);
2134 			rctl_rule_release(rule);
2135 			if (error != 0)
2136 				goto fail;
2137 		} else {
2138 			error = rctl_racct_add_rule_locked(child->p_racct,
2139 			    link->rrl_rule);
2140 			if (error != 0)
2141 				goto fail;
2142 		}
2143 	}
2144 
2145 	return (0);
2146 
2147 fail:
2148 	while (!LIST_EMPTY(&child->p_racct->r_rule_links)) {
2149 		link = LIST_FIRST(&child->p_racct->r_rule_links);
2150 		LIST_REMOVE(link, rrl_next);
2151 		rctl_rule_release(link->rrl_rule);
2152 		uma_zfree(rctl_rule_link_zone, link);
2153 	}
2154 
2155 	return (EAGAIN);
2156 }
2157 
2158 /*
2159  * Release rules attached to the racct.
2160  */
2161 void
2162 rctl_racct_release(struct racct *racct)
2163 {
2164 	struct rctl_rule_link *link;
2165 
2166 	ASSERT_RACCT_ENABLED();
2167 	RACCT_LOCK_ASSERT();
2168 
2169 	while (!LIST_EMPTY(&racct->r_rule_links)) {
2170 		link = LIST_FIRST(&racct->r_rule_links);
2171 		LIST_REMOVE(link, rrl_next);
2172 		rctl_rule_release(link->rrl_rule);
2173 		uma_zfree(rctl_rule_link_zone, link);
2174 	}
2175 }
2176 
2177 static void
2178 rctl_init(void)
2179 {
2180 
2181 	if (!racct_enable)
2182 		return;
2183 
2184 	rctl_rule_zone = uma_zcreate("rctl_rule", sizeof(struct rctl_rule),
2185 	    NULL, NULL, NULL, NULL, UMA_ALIGN_PTR, 0);
2186 	rctl_rule_link_zone = uma_zcreate("rctl_rule_link",
2187 	    sizeof(struct rctl_rule_link), NULL, NULL, NULL, NULL,
2188 	    UMA_ALIGN_PTR, 0);
2189 
2190 	/*
2191 	 * Set default values, making sure not to overwrite the ones
2192 	 * fetched from tunables.  Most of those could be set at the
2193 	 * declaration, except for the rctl_throttle_max - we cannot
2194 	 * set it there due to hz not being compile time constant.
2195 	 */
2196 	if (rctl_throttle_min < 1)
2197 		rctl_throttle_min = 1;
2198 	if (rctl_throttle_max < rctl_throttle_min)
2199 		rctl_throttle_max = 2 * hz;
2200 	if (rctl_throttle_pct < 0)
2201 		rctl_throttle_pct = 100;
2202 	if (rctl_throttle_pct2 < 0)
2203 		rctl_throttle_pct2 = 100;
2204 }
2205 
2206 #else /* !RCTL */
2207 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_get_racct(struct thread *td, struct rctl_get_racct_args *uap)
{

	/* Both arguments are deliberately ignored. */
	(void)td;
	(void)uap;

	return (ENOSYS);
}
2214 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_get_rules(struct thread *td, struct rctl_get_rules_args *uap)
{

	/* Both arguments are deliberately ignored. */
	(void)td;
	(void)uap;

	return (ENOSYS);
}
2221 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_get_limits(struct thread *td, struct rctl_get_limits_args *uap)
{

	/* Both arguments are deliberately ignored. */
	(void)td;
	(void)uap;

	return (ENOSYS);
}
2228 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_add_rule(struct thread *td, struct rctl_add_rule_args *uap)
{

	/* Both arguments are deliberately ignored. */
	(void)td;
	(void)uap;

	return (ENOSYS);
}
2235 
/*
 * Stub for kernels built without RCTL: always fails with ENOSYS.
 */
int
sys_rctl_remove_rule(struct thread *td, struct rctl_remove_rule_args *uap)
{

	/* Both arguments are deliberately ignored. */
	(void)td;
	(void)uap;

	return (ENOSYS);
}
2242 
2243 #endif /* !RCTL */
2244