xref: /illumos-gate/usr/src/cmd/rcap/rcapd/rcapd_main.c (revision 3db86aab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * rcapd is a long-running daemon enforcing project-based resource caps (see
30  * rcapd(1M)).  Each instance of a process aggregate (project or, generically,
31  * "collection") may have a memory cap.  A single thread monitors the resource
32  * utilization of capped collections, enforces caps when they are exceeded (and
33  * other conditions are met), and incorporates changes in configuration or
34  * caps.  Each of these actions occurs not more frequently than the rate
35  * specified with rcapadm(1M).
36  */
37 
38 #include <sys/priocntl.h>
39 #include <sys/proc.h>
40 #include <sys/resource.h>
41 #include <sys/sysinfo.h>
42 #include <sys/stat.h>
43 #include <sys/sysmacros.h>
44 #include <sys/time.h>
45 #include <sys/types.h>
46 #include <dirent.h>
47 #include <errno.h>
48 #include <fcntl.h>
49 #include <kstat.h>
50 #include <libintl.h>
51 #include <limits.h>
52 #include <locale.h>
53 #include <priv.h>
54 #include <signal.h>
55 #include <stdarg.h>
56 #include <stdio.h>
57 #include <stdio_ext.h>
58 #include <stdlib.h>
59 #include <strings.h>
60 #include <time.h>
61 #include <unistd.h>
62 #include <zone.h>
63 #include <assert.h>
64 #include "rcapd.h"
65 #include "rcapd_mapping.h"
66 #include "rcapd_rfd.h"
67 #include "rcapd_stat.h"
68 #include "utils.h"
69 
/*
 * POSITIVE_MIN() yields the smaller of x and y, except that an operand which
 * is zero or negative is disregarded in favor of the other one.  If x is not
 * positive the result is y, whatever y's value.  Operands may be evaluated
 * more than once.
 */
#define	POSITIVE_MIN(x, y) \
	(((x) <= 0) ? (y) : ((y) <= 0) ? (x) : MIN(x, y))

/*
 * NEXT_EVENT_TIME() yields the time `seconds' seconds after `base', or zero
 * when the interval is not positive.  A zero event time never fires (see
 * EVENT_TIME()).  Both arguments are fully parenthesized so that compound
 * expressions such as "a + b" expand correctly.
 */
#define	NEXT_EVENT_TIME(base, seconds) \
	(((int)(seconds) > 0) ? \
	    ((base) + (hrtime_t)(seconds) * (hrtime_t)NANOSEC) : (hrtime_t)0)

/*
 * NEXT_REPORT_EVENT_TIME() yields the time of the next statistics report, or
 * zero if no statistics file is configured.  Note that the interval is
 * measured from gethrtime() at the point of expansion; the `base' argument is
 * currently unused.
 */
#define	NEXT_REPORT_EVENT_TIME(base, seconds) \
	((rcfg.rcfg_stat_file[0] != 0) ?  \
	    NEXT_EVENT_TIME(gethrtime(), (seconds)) : (hrtime_t)0)

/*
 * EVENT_TIME() is true when `time' is past `eventtime' and the event is
 * enabled (its time is nonzero).
 */
#define	EVENT_TIME(time, eventtime) \
	(((time) > (eventtime)) && (eventtime) != 0)

#define	STAT_TEMPLATE_SUFFIX	".XXXXXX"	/* suffix of mkstemp() arg */
#define	DAEMON_UID		1		/* uid to use */
82 
/*
 * Argument handed to soft_scan_cb().
 */
typedef struct soft_scan_arg {
	uint64_t	ssa_sum_excess;	/* divisor: sum of excesses */
	int64_t		ssa_scan_goal;	/* total to be apportioned among */
					/* collections by their excess */
} soft_scan_arg_t;
87 
88 static int debug_mode = 0;		/* debug mode flag */
89 static pid_t rcapd_pid;			/* rcapd's pid to ensure it's not */
90 					/* scanned */
91 static kstat_ctl_t *kctl;		/* kstat chain */
92 static uint64_t new_sp = 0, old_sp = 0;	/* measure delta in page scan count */
93 static int enforce_caps = 0;		/* cap enforcement flag, dependent on */
94 					/* enforce_soft_caps and */
95 					/* global_scanner_running */
96 static int enforce_soft_caps = 0;	/* soft cap enforcement flag, */
97 					/* depending on memory pressure */
98 static int memory_pressure = 0;		/* physical memory utilization (%) */
99 static int memory_pressure_sample = 0;	/* count of samples */
100 static int global_scanner_running = 0;	/* global scanning flag, to avoid */
101 					/* interference with kernel's page */
102 					/* scanner */
103 static hrtime_t next_report;		/* time of next report */
104 static int termination_signal = 0;	/* terminating signal */
105 
106 rcfg_t rcfg;
107 
108 /*
109  * Flags.
110  */
111 static int ever_ran;
112 int should_run;
113 static int should_reconfigure;
114 
115 static int verify_statistics(void);
116 static int update_statistics(void);
117 
118 /*
119  * Checks if a process is marked 'system'.  Returns zero only when it is not.
120  */
121 static int
122 proc_issystem(pid_t pid)
123 {
124 	char pc_clname[PC_CLNMSZ];
125 
126 	if (priocntl(P_PID, pid, PC_GETXPARMS, NULL, PC_KY_CLNAME, pc_clname,
127 	    PC_KY_NULL) != -1) {
128 		return (strcmp(pc_clname, "SYS") == 0);
129 	} else {
130 		debug("cannot get class-specific scheduling parameters; "
131 		    "assuming system process");
132 		return (-1);
133 	}
134 }
135 
136 /*
137  * fname is the process name, for debugging messages, and unscannable is a flag
138  * indicating whether the process should be scanned.
139  */
140 static void
141 lprocess_insert_mark(pid_t pid, id_t colid, char *fname, int unscannable)
142 {
143 	lcollection_t *lcol;
144 	lprocess_t *lproc;
145 
146 	if ((lcol = lcollection_find(colid)) == NULL)
147 		return;
148 
149 	/*
150 	 * If the process is already being tracked, update the unscannable flag,
151 	 * as determined by the caller, from the process's psinfo.
152 	 */
153 	lproc = lcol->lcol_lprocess;
154 	while (lproc != NULL) {
155 		if (lproc->lpc_pid == pid) {
156 			lproc->lpc_mark = 1;
157 			if (unscannable != 0 && lproc->lpc_unscannable == 0) {
158 				debug("process %d: became unscannable\n",
159 				    (int)lproc->lpc_pid);
160 				lproc->lpc_unscannable = 1;
161 			}
162 			return;
163 		}
164 		lproc = lproc->lpc_next;
165 	}
166 
167 	/*
168 	 * We've fallen off the list without finding our current process;
169 	 * insert it at the list head.
170 	 */
171 	if ((lproc = malloc(sizeof (*lproc))) == NULL)
172 		debug("insufficient memory to track new process %d", (int)pid);
173 	else {
174 		(void) bzero(lproc, sizeof (*lproc));
175 		lproc->lpc_pid = pid;
176 		lproc->lpc_mark = 1;
177 		lproc->lpc_collection = lcol;
178 		lproc->lpc_psinfo_fd = -1;
179 		lproc->lpc_pgdata_fd = -1;
180 		lproc->lpc_xmap_fd = -1;
181 
182 		/*
183 		 * If the caller didn't flag this process as unscannable
184 		 * already, do some more checking.
185 		 */
186 		lproc->lpc_unscannable = unscannable || proc_issystem(pid);
187 
188 #ifdef DEBUG
189 		/*
190 		 * Verify the sanity of lprocess.  It should not contain the
191 		 * process we are about to prepend.
192 		 */
193 		if (lcollection_member(lcol, lproc)) {
194 			lprocess_t *cur = lcol->lcol_lprocess;
195 			debug("The collection %lld already has these members, "
196 			    "including me, %d!\n", (long long)lcol->lcol_id,
197 			    (int)lproc->lpc_pid);
198 			while (cur != NULL) {
199 				debug("\t%d\n", (int)cur->lpc_pid);
200 				cur = cur->lpc_next;
201 			}
202 			info(gettext("process already on lprocess\n"));
203 			abort();
204 		}
205 #endif /* DEBUG */
206 		lproc->lpc_next = lcol->lcol_lprocess;
207 		if (lproc->lpc_next != NULL)
208 			lproc->lpc_next->lpc_prev = lproc;
209 		lproc->lpc_prev = NULL;
210 		lcol->lcol_lprocess = lproc;
211 
212 		debug("tracking %d %d %s%s\n", (int)colid, (int)pid, fname,
213 		    (lproc->lpc_unscannable != 0) ? " (not scannable)" : "");
214 		lcol->lcol_stat.lcols_proc_in++;
215 	}
216 }
217 
218 static int
219 list_walk_process_cb(lcollection_t *lcol, void *arg)
220 {
221 	int (*cb)(lcollection_t *, lprocess_t *) =
222 	    (int(*)(lcollection_t *, lprocess_t *))arg;
223 	lprocess_t *member;
224 	lprocess_t *next;
225 
226 	member = lcol->lcol_lprocess;
227 	while (member != NULL) {
228 		pid_t pid = member->lpc_pid;
229 		next = member->lpc_next;
230 
231 		debug_high("list_walk_all lpc %d\n", (int)pid);
232 		if (cb(lcol, member) != 0) {
233 			debug_high("list_walk_all aborted at lpc %d\n",
234 			    (int)pid);
235 			return (1);
236 		}
237 		member = next;
238 	}
239 
240 	return (0);
241 }
242 
243 /*
244  * Invoke the given callback for each process in each collection.  Callbacks
245  * are allowed to change the linkage of the process on which they act.
246  */
247 static void
248 list_walk_all(int (*cb)(lcollection_t *, lprocess_t *))
249 {
250 	list_walk_collection(list_walk_process_cb, (void *)cb);
251 }
252 
253 static void
254 revoke_psinfo(rfd_t *rfd)
255 {
256 	lprocess_t *lpc = (lprocess_t *)rfd->rfd_data;
257 
258 	if (lpc != NULL) {
259 		debug("revoking psinfo fd for process %d\n", (int)lpc->lpc_pid);
260 		ASSERT(lpc->lpc_psinfo_fd != -1);
261 		lpc->lpc_psinfo_fd = -1;
262 	} else
263 		debug("revoking psinfo fd for unknown process\n");
264 }
265 
266 /*
267  * Retrieve a process's psinfo via an already-opened or new file descriptor.
268  * The supplied descriptor will be closed on failure.  An optional callback
269  * will be invoked with the last descriptor tried, and a supplied callback
270  * argument, as its arguments, such that the new descriptor may be cached, or
271  * an old one may be invalidated.  If the result of the callback is zero, the
272  * the caller is to assume responsibility for the file descriptor, to close it
273  * with rfd_close().
274  *
275  * On failure, a nonzero value is returned.
276  */
277 int
278 get_psinfo(pid_t pid, psinfo_t *psinfo, int cached_fd,
279     int(*fd_update_cb)(void *, int), void *arg, lprocess_t *lpc)
280 {
281 	int fd;
282 	int can_try_uncached;
283 
284 	ASSERT(!(cached_fd > 0 && fd_update_cb == NULL));
285 
286 	do {
287 		if (cached_fd >= 0) {
288 			fd = cached_fd;
289 			can_try_uncached = 1;
290 			debug_high("%d/psinfo, trying cached fd %d\n",
291 			    (int)pid, fd);
292 		} else {
293 			char pathbuf[PROC_PATH_MAX];
294 
295 			can_try_uncached = 0;
296 			(void) snprintf(pathbuf, sizeof (pathbuf),
297 			    "/proc/%d/psinfo", (int)pid);
298 			if ((fd = rfd_open(pathbuf, 1, RFD_PSINFO,
299 			    revoke_psinfo, lpc, O_RDONLY, 0000)) < 0) {
300 				debug("cannot open %s", pathbuf);
301 				break;
302 			} else
303 				debug_high("opened %s, fd %d\n", pathbuf, fd);
304 		}
305 
306 		if (pread(fd, psinfo, sizeof (*psinfo), 0) ==
307 		    sizeof (*psinfo) && psinfo->pr_pid == pid)
308 			break;
309 		else {
310 			debug_high("closed fd %d\n", fd);
311 			if (rfd_close(fd) != 0)
312 				debug("could not close fd %d", fd);
313 			fd = cached_fd = -1;
314 		}
315 	} while (can_try_uncached == 1);
316 
317 	if (fd_update_cb == NULL || fd_update_cb(arg, fd) != 0)
318 		if (fd >= 0) {
319 			debug_high("closed %s fd %d\n", fd_update_cb == NULL ?
320 			    "uncached" : "cached", fd);
321 			if (rfd_close(fd) != 0)
322 				debug("could not close fd %d", fd);
323 		}
324 
325 	debug_high("get_psinfo ret %d, fd %d, %s\n", ((fd >= 0) ? 0 : -1), fd,
326 	    fd_update_cb != NULL ? "cached" : "uncached");
327 	return ((fd >= 0) ? 0 : -1);
328 }
329 
330 /*
331  * Retrieve the collection membership of all processes in our zone, and update
332  * the psinfo of those non-system, non-zombie ones in collections.
333  */
334 static void
335 proc_cb(const pid_t pid)
336 {
337 	static zoneid_t ours = (zoneid_t)-1;
338 	psinfo_t psinfo;
339 
340 	if (ours == (zoneid_t)-1)
341 		ours = getzoneid();
342 
343 	if (get_psinfo(pid, &psinfo, -1, NULL, NULL, NULL) == 0 &&
344 	    psinfo.pr_zoneid == ours)
345 		lprocess_insert_mark(psinfo.pr_pid, rc_getidbypsinfo(&psinfo),
346 		    psinfo.pr_psargs, psinfo.pr_nlwp == 0);
347 }
348 
349 /*
350  * Cache the process' psinfo fd, taking responsibility for freeing it.
351  */
352 int
353 lprocess_update_psinfo_fd_cb(void *arg, int fd)
354 {
355 	lprocess_t *lpc = arg;
356 
357 	lpc->lpc_psinfo_fd = fd;
358 	return (0);
359 }
360 
361 /*
362  * Update the RSS of processes in monitored collections.
363  */
364 /*ARGSUSED*/
365 static int
366 mem_sample_cb(lcollection_t *lcol, lprocess_t *lpc)
367 {
368 	psinfo_t psinfo;
369 
370 	if (get_psinfo(lpc->lpc_pid, &psinfo, lpc->lpc_psinfo_fd,
371 	    lprocess_update_psinfo_fd_cb, lpc, lpc) == 0) {
372 		lpc->lpc_rss = psinfo.pr_rssize;
373 		lpc->lpc_size = psinfo.pr_size;
374 	} else {
375 		if (errno == ENOENT)
376 			debug("process %d finished\n", (int)lpc->lpc_pid);
377 		else
378 			debug("process %d: cannot read psinfo",
379 			    (int)lpc->lpc_pid);
380 		lprocess_free(lpc);
381 	}
382 
383 	return (0);
384 }
385 
386 /*
387  * Sample the collection RSS, updating the collection's statistics with the
388  * results.
389  */
390 /*ARGSUSED*/
391 static int
392 rss_sample_col_cb(lcollection_t *lcol, void *arg)
393 {
394 	int64_t excess;
395 	uint64_t rss;
396 
397 	/*
398 	 * If updating statistics for a new interval, reset the affected
399 	 * counters.
400 	 */
401 	if (lcol->lcol_stat_invalidate != 0) {
402 		lcol->lcol_stat_old = lcol->lcol_stat;
403 		lcol->lcol_stat.lcols_min_rss = (int64_t)-1;
404 		lcol->lcol_stat.lcols_max_rss = 0;
405 		lcol->lcol_stat_invalidate = 0;
406 	}
407 
408 	lcol->lcol_stat.lcols_rss_sample++;
409 	excess = lcol->lcol_rss - lcol->lcol_rss_cap;
410 	rss = lcol->lcol_rss;
411 	if (excess > 0)
412 		lcol->lcol_stat.lcols_rss_act_sum += rss;
413 	lcol->lcol_stat.lcols_rss_sum += rss;
414 
415 	if (lcol->lcol_stat.lcols_min_rss > rss)
416 		lcol->lcol_stat.lcols_min_rss = rss;
417 	if (lcol->lcol_stat.lcols_max_rss < rss)
418 		lcol->lcol_stat.lcols_max_rss = rss;
419 
420 	return (0);
421 }
422 
423 /*
424  * Open /proc and walk entries.
425  */
426 static void
427 proc_walk_all(void (*cb)(const pid_t))
428 {
429 	DIR *pdir;
430 	struct dirent *dirent;
431 	pid_t pid;
432 
433 	(void) rfd_reserve(1);
434 	if ((pdir = opendir("/proc")) == NULL)
435 		die(gettext("couldn't open /proc!"));
436 
437 	while ((dirent = readdir(pdir)) != NULL) {
438 		if (strcmp(".", dirent->d_name) == 0 ||
439 		    strcmp("..", dirent->d_name) == 0)
440 			continue;
441 		pid = atoi(dirent->d_name);
442 		ASSERT(pid != 0 || strcmp(dirent->d_name, "0") == 0);
443 		if (pid == rcapd_pid)
444 			continue;
445 		else
446 			cb(pid);
447 	}
448 	(void) closedir(pdir);
449 }
450 
451 /*
452  * Memory update callback.
453  */
454 static int
455 memory_all_cb(lcollection_t *lcol, lprocess_t *lpc)
456 {
457 	debug_high("%s %s, pid %d: rss += %llu/%llu\n", rcfg.rcfg_mode_name,
458 	    lcol->lcol_name, (int)lpc->lpc_pid,
459 	    (unsigned long long)lpc->lpc_rss,
460 	    (unsigned long long)lpc->lpc_size);
461 	ASSERT(lpc->lpc_rss <= lpc->lpc_size);
462 	lcol->lcol_rss += lpc->lpc_rss;
463 	lcol->lcol_image_size += lpc->lpc_size;
464 
465 	return (0);
466 }
467 
468 /*
469  * Clear unmarked callback.
470  */
471 /*ARGSUSED*/
472 static int
473 sweep_process_cb(lcollection_t *lcol, lprocess_t *lpc)
474 {
475 	if (lpc->lpc_mark) {
476 		lpc->lpc_mark = 0;
477 	} else {
478 		debug("process %d finished\n", (int)lpc->lpc_pid);
479 		lprocess_free(lpc);
480 	}
481 
482 	return (0);
483 }
484 
485 /*
486  * Memory clear callback.
487  */
488 /*ARGSUSED*/
489 static int
490 collection_zero_mem_cb(lcollection_t *lcol, void *arg)
491 {
492 	lcol->lcol_rss = 0;
493 	lcol->lcol_image_size = 0;
494 
495 	return (0);
496 }
497 
498 /*
499  * Print, for debugging purposes, a collection's recently-sampled RSS and
500  * excess.
501  */
502 /*ARGSUSED*/
503 static int
504 excess_print_cb(lcollection_t *lcol, void *arg)
505 {
506 	int64_t excess = lcol->lcol_rss - lcol->lcol_rss_cap;
507 
508 	debug("%s %s rss/cap: %llu/%llu, excess = %lld kB\n",
509 	    rcfg.rcfg_mode_name, lcol->lcol_name,
510 	    (unsigned long long)lcol->lcol_rss,
511 	    (unsigned long long)lcol->lcol_rss_cap,
512 	    (long long)excess);
513 
514 	return (0);
515 }
516 
517 /*
518  * Scan those collections which have exceeded their caps.
519  */
520 /*ARGSUSED*/
521 static int
522 scan_cb(lcollection_t *lcol, void *arg)
523 {
524 	int64_t excess;
525 
526 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
527 		scan(lcol, excess);
528 		lcol->lcol_stat.lcols_scan++;
529 	}
530 
531 	return (0);
532 }
533 
534 /*
535  * Do a soft scan of those collections which have excesses.  A soft scan is one
536  * in which the cap enforcement pressure is taken into account.  The difference
537  * between the utilized physical memory and the cap enforcement pressure will
538  * be scanned-for, and each collection will be scanned proportionally by their
539  * present excesses.
540  */
541 static int
542 soft_scan_cb(lcollection_t *lcol, void *a)
543 {
544 	int64_t excess;
545 	soft_scan_arg_t *arg = a;
546 
547 	if ((excess = lcol->lcol_rss - lcol->lcol_rss_cap) > 0) {
548 		debug("col %lld excess %lld scan_goal %lld sum_excess %llu, "
549 		    "scanning %lld\n", (long long)lcol->lcol_id,
550 		    (long long)excess, (long long)arg->ssa_scan_goal,
551 		    (unsigned long long)arg->ssa_sum_excess,
552 		    (long long)(excess * arg->ssa_scan_goal /
553 		    arg->ssa_sum_excess));
554 
555 		scan(lcol, (int64_t)(excess * arg->ssa_scan_goal /
556 		    arg->ssa_sum_excess));
557 		lcol->lcol_stat.lcols_scan++;
558 	}
559 
560 	return (0);
561 }
562 
563 /*
564  * When a scan could happen, but caps aren't enforced tick the
565  * lcols_unenforced_cap counter.
566  */
567 /*ARGSUSED*/
568 static int
569 unenforced_cap_cb(lcollection_t *lcol, void *arg)
570 {
571 	lcol->lcol_stat.lcols_unenforced_cap++;
572 
573 	return (0);
574 }
575 
576 /*
577  * Update the count of physically installed memory.
578  */
579 static void
580 update_phys_total(void)
581 {
582 	uint64_t old_phys_total;
583 
584 	old_phys_total = phys_total;
585 	phys_total = (uint64_t)sysconf(_SC_PHYS_PAGES) * sysconf(_SC_PAGESIZE)
586 	    / 1024;
587 	if (phys_total != old_phys_total)
588 		debug("physical memory%s: %lluM\n", (old_phys_total == 0 ?
589 		    "" : " adjusted"), (unsigned long long)(phys_total / 1024));
590 }
591 
592 /*
593  * Unlink a process from its collection, updating relevant statistics, and
594  * freeing its associated memory.
595  */
596 void
597 lprocess_free(lprocess_t *lpc)
598 {
599 	pid_t pid;
600 
601 	lpc->lpc_collection->lcol_stat.lcols_proc_out++;
602 
603 	if (lpc->lpc_prev != NULL)
604 		lpc->lpc_prev->lpc_next = lpc->lpc_next;
605 	if (lpc->lpc_next != NULL)
606 		lpc->lpc_next->lpc_prev = lpc->lpc_prev;
607 	if (lpc->lpc_collection->lcol_lprocess == lpc)
608 		lpc->lpc_collection->lcol_lprocess = (lpc->lpc_next !=
609 		    lpc ? lpc->lpc_next : NULL);
610 	lpc->lpc_next = lpc->lpc_prev = NULL;
611 
612 	if (lpc->lpc_prpageheader != NULL)
613 		free(lpc->lpc_prpageheader);
614 	if (lpc->lpc_xmap != NULL)
615 		free(lpc->lpc_xmap);
616 	if (lpc->lpc_psinfo_fd >= 0) {
617 		if (rfd_close(lpc->lpc_psinfo_fd) != 0)
618 			debug("could not close %d lpc_psinfo_fd %d",
619 			    (int)lpc->lpc_pid, lpc->lpc_psinfo_fd);
620 		lpc->lpc_psinfo_fd = -1;
621 	}
622 	if (lpc->lpc_pgdata_fd >= 0) {
623 		if (rfd_close(lpc->lpc_pgdata_fd) != 0)
624 			debug("could not close %d lpc_pgdata_fd %d",
625 			    (int)lpc->lpc_pid, lpc->lpc_pgdata_fd);
626 		lpc->lpc_pgdata_fd = -1;
627 	}
628 	if (lpc->lpc_xmap_fd >= 0) {
629 		if (rfd_close(lpc->lpc_xmap_fd) != 0)
630 			debug("could not close %d lpc_xmap_fd %d",
631 			    (int)lpc->lpc_pid, lpc->lpc_xmap_fd);
632 		lpc->lpc_xmap_fd = -1;
633 	}
634 	if (lpc->lpc_ignore != NULL)
635 		lmapping_free(&lpc->lpc_ignore);
636 	pid = lpc->lpc_pid;
637 	free(lpc);
638 	debug_high("process %d freed\n", (int)pid);
639 }
640 
641 /*
642  * Collection clear callback.
643  */
644 /*ARGSUSED*/
645 static int
646 collection_clear_cb(lcollection_t *lcol, void *arg)
647 {
648 	lcol->lcol_mark = 0;
649 
650 	return (0);
651 }
652 
653 /*
654  * Respond to a terminating signal by setting a termination flag.
655  */
656 /*ARGSUSED*/
657 static void
658 terminate_signal(int signal)
659 {
660 	if (termination_signal == 0)
661 		termination_signal = signal;
662 	should_run = 0;
663 }
664 
/*
 * Handler for signals, synchronous or asynchronous, which would ordinarily
 * cause the process to abort.  The scanner is first given the opportunity to
 * resume any processes it has stopped, and then rcapd aborts.
 */
/*ARGSUSED*/
static void
abort_signal(int signal)
{
	scan_abort();	/* last-ditch attempt to resume stopped processes */
	abort();
}
680 
681 /*
682  * Clean up collections which have been removed due to configuration.  Unlink
683  * the collection from lcollection and free it.
684  */
685 /*ARGSUSED*/
686 static int
687 collection_sweep_cb(lcollection_t *lcol, void *arg)
688 {
689 	if (lcol->lcol_mark == 0) {
690 		debug("freeing %s %s\n", rcfg.rcfg_mode_name, lcol->lcol_name);
691 		lcollection_free(lcol);
692 	}
693 
694 	return (0);
695 }
696 
697 /*
698  * Set those variables which depend on the global configuration.
699  */
700 static void
701 finish_configuration(void)
702 {
703 	/*
704 	 * Warn that any lnode (or non-project) mode specification (by an SRM
705 	 * 1.3 configuration file, for example) is ignored.
706 	 */
707 	if (strcmp(rcfg.rcfg_mode_name, "project") != 0) {
708 		warn(gettext("%s mode specification ignored -- using project"
709 		    " mode\n"), rcfg.rcfg_mode_name);
710 		rcfg.rcfg_mode_name = "project";
711 		rcfg.rcfg_mode = rctype_project;
712 	}
713 
714 	lcollection_set_type(rcfg.rcfg_mode);
715 }
716 
717 /*
718  * Cause the configuration file to be reread and applied.
719  */
720 static void
721 reread_configuration_file(void)
722 {
723 	rcfg_t rcfg_new;
724 	struct stat st;
725 
726 	if (stat(rcfg.rcfg_filename, &st) == 0 && st.st_mtime ==
727 	    rcfg.rcfg_last_modification)
728 		return;
729 
730 	if (rcfg_read(rcfg.rcfg_filename, rcfg.rcfg_fd, &rcfg_new,
731 	    update_statistics) != 0)
732 		warn(gettext("can't reread configuration"));
733 	else {
734 		/*
735 		 * The configuration file has been read.  Remove existing
736 		 * collections in case there is a change in collection type.
737 		 */
738 		if (rcfg.rcfg_mode != rcfg_new.rcfg_mode) {
739 			list_walk_collection(collection_clear_cb, NULL);
740 			list_walk_collection(collection_sweep_cb, NULL);
741 		}
742 
743 		/*
744 		 * Make the newly-read configuration the global one, and update
745 		 * any variables that depend on it.
746 		 */
747 		rcfg = rcfg_new;
748 		finish_configuration();
749 	}
750 }
751 
752 /*
753  * Reread the configuration filex, then examine changes, additions, and
754  * deletions to cap definitions.
755  */
756 static void
757 reconfigure(void)
758 {
759 	debug("reconfigure...\n");
760 
761 	/*
762 	 * Reread the configuration data.
763 	 */
764 	reread_configuration_file();
765 
766 	/*
767 	 * Walk the lcollection, marking active collections so inactive ones
768 	 * can be freed.
769 	 */
770 	list_walk_collection(collection_clear_cb, NULL);
771 	lcollection_update(LCU_ACTIVE_ONLY); /* mark */
772 	list_walk_collection(collection_sweep_cb, NULL);
773 }
774 
775 /*
776  * Respond to SIGHUP by triggering the rereading the configuration file and cap
777  * definitions.
778  */
779 /*ARGSUSED*/
780 static void
781 sighup(int signal)
782 {
783 	should_reconfigure = 1;
784 }
785 
786 /*
787  * Print, for debugging purposes, each collection's interval statistics.
788  */
789 /*ARGSUSED*/
790 static int
791 simple_report_collection_cb(lcollection_t *lcol, void *arg)
792 {
793 #define	DELTA(field) \
794 	(unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \
795 	    (lcol->lcol_stat.field - lcol->lcol_stat_old.field))
796 #define	VALID(field) \
797 	(unsigned long long)(lcol->lcol_stat_invalidate ? 0 : \
798 	    lcol->lcol_stat.field)
799 
800 	debug("%s %s status: succeeded/attempted (k): %llu/%llu, "
801 	    "ineffective/scans/unenforced/samplings:  %llu/%llu/%llu/%llu, RSS "
802 	    "min/max (k): %llu/%llu, cap %llu kB, processes/thpt: %llu/%llu, "
803 	    "%llu scans over %llu ms\n", rcfg.rcfg_mode_name, lcol->lcol_name,
804 	    DELTA(lcols_pg_eff), DELTA(lcols_pg_att),
805 	    DELTA(lcols_scan_ineffective), DELTA(lcols_scan),
806 	    DELTA(lcols_unenforced_cap), DELTA(lcols_rss_sample),
807 	    VALID(lcols_min_rss), VALID(lcols_max_rss),
808 	    (unsigned long long)lcol->lcol_rss_cap,
809 	    (unsigned long long)(lcol->lcol_stat.lcols_proc_in -
810 	    lcol->lcol_stat.lcols_proc_out), DELTA(lcols_proc_out),
811 	    DELTA(lcols_scan_count), DELTA(lcols_scan_time_complete) / (NANOSEC
812 	    / MILLISEC));
813 
814 #undef DELTA
815 #undef VALID
816 
817 	return (0);
818 }
819 
820 /*
821  * Record each collection's interval statistics in the statistics file.
822  */
823 static int
824 report_collection_cb(lcollection_t *lcol, void *arg)
825 {
826 	lcollection_report_t dc;
827 	int fd = (intptr_t)arg;
828 
829 	/*
830 	 * Copy the relevant fields to the collection's record.
831 	 */
832 	bzero(&dc, sizeof (dc));
833 	dc.lcol_id = lcol->lcol_id;
834 	(void) strcpy(dc.lcol_name, lcol->lcol_name);
835 	dc.lcol_rss = lcol->lcol_rss;
836 	dc.lcol_image_size = lcol->lcol_image_size;
837 	dc.lcol_rss_cap = lcol->lcol_rss_cap;
838 	dc.lcol_stat = lcol->lcol_stat;
839 
840 	if (write(fd, &dc, sizeof (dc)) == sizeof (dc)) {
841 		/*
842 		 * Set a flag to indicate that the exported interval snapshot
843 		 * values should be reset at the next sample.
844 		 */
845 		lcol->lcol_stat_invalidate = 1;
846 	} else {
847 		debug("can't write %s %s statistics", rcfg.rcfg_mode_name,
848 		    lcol->lcol_name);
849 	}
850 
851 	return (0);
852 }
853 
854 /*
855  * Determine the count of pages scanned by the global page scanner, obtained
856  * from the cpu_stat:*::scan kstats.  Return zero on success.
857  */
858 static int
859 get_globally_scanned_pages(uint64_t *scannedp)
860 {
861 	kstat_t *ksp;
862 	uint64_t scanned = 0;
863 
864 	if (kstat_chain_update(kctl) == -1) {
865 		warn(gettext("can't update kstat chain"));
866 		return (0);
867 	}
868 
869 	for (ksp = kctl->kc_chain; ksp != NULL; ksp = ksp->ks_next) {
870 		if (strcmp(ksp->ks_module, "cpu_stat") == 0) {
871 			if (kstat_read(kctl, ksp, NULL) != -1) {
872 				scanned += ((cpu_stat_t *)
873 				    ksp->ks_data)->cpu_vminfo.scan;
874 			} else
875 				return (-1);
876 		}
877 	}
878 
879 	*scannedp = scanned;
880 	return (0);
881 }
882 
883 /*
884  * Update the shared statistics file with each collection's current statistics.
885  * Return zero on success.
886  */
887 static int
888 update_statistics(void)
889 {
890 	int fd, res;
891 	static char template[LINELEN];
892 
893 	/*
894 	 * Try to create a directory irrespective of whether it is existing
895 	 * or not. If it is not there then it will create. Otherwise any way
896 	 * it will fail at mkstemp call below.
897 	 */
898 	(void) mkdir(STAT_FILE_DIR, 0755);
899 
900 	/*
901 	 * Create a temporary file.
902 	 */
903 	if (sizeof (template) < (strlen(rcfg.rcfg_stat_file) +
904 	    strlen(STAT_TEMPLATE_SUFFIX) + 1)) {
905 		debug("temporary file template size too small\n");
906 		return (-1);
907 	}
908 	(void) strcpy(template, rcfg.rcfg_stat_file);
909 	(void) strcat(template, STAT_TEMPLATE_SUFFIX);
910 	(void) rfd_reserve(1);
911 	fd = mkstemp(template);
912 
913 	/*
914 	 * Write the header and per-collection statistics.
915 	 */
916 	if (fd >= 0) {
917 		rcapd_stat_hdr_t rs;
918 
919 		rs.rs_pid = rcapd_pid;
920 		rs.rs_time = gethrtime();
921 		ASSERT(sizeof (rs.rs_mode) > strlen(rcfg.rcfg_mode_name));
922 		(void) strcpy(rs.rs_mode, rcfg.rcfg_mode_name);
923 		rs.rs_pressure_cur = memory_pressure;
924 		rs.rs_pressure_cap = rcfg.rcfg_memory_cap_enforcement_pressure;
925 		rs.rs_pressure_sample = memory_pressure_sample;
926 
927 		if (fchmod(fd, 0644) == 0 && write(fd, &rs, sizeof (rs)) ==
928 		    sizeof (rs)) {
929 			list_walk_collection(report_collection_cb,
930 				(void *)(intptr_t)fd);
931 			/*
932 			 * Replace the existing statistics file with this new
933 			 * one.
934 			 */
935 			res = rename(template, rcfg.rcfg_stat_file);
936 		} else
937 			res = -1;
938 		(void) close(fd);
939 	} else
940 		res = -1;
941 
942 	return (res);
943 }
944 
945 /*
946  * Verify the statistics file can be created and written to, and die if an
947  * existing file may be in use by another rcapd.
948  */
949 static int
950 verify_statistics(void)
951 {
952 	pid_t pid;
953 
954 	/*
955 	 * Warn if another instance of rcapd might be active.
956 	 */
957 	(void) rfd_reserve(1);
958 	pid = stat_get_rcapd_pid(rcfg.rcfg_stat_file);
959 	if (pid != rcapd_pid && pid != -1)
960 		die(gettext("%s exists; rcapd may already be active\n"),
961 		    rcfg.rcfg_stat_file);
962 
963 	return (update_statistics());
964 }
965 
966 static int
967 sum_excess_cb(lcollection_t *lcol, void *arg)
968 {
969 	uint64_t *sum_excess = arg;
970 
971 	*sum_excess += MAX((int64_t)0, (int64_t)(lcol->lcol_rss -
972 	    lcol->lcol_rss_cap));
973 	return (0);
974 }
975 
/*
 * Print a brief usage message.
 */
static void
rcapd_usage(void)
{
	const char *msg = gettext("usage: rcapd [-d]\n");

	info(msg);
}
981 
982 void
983 check_update_statistics(void)
984 {
985 	hrtime_t now = gethrtime();
986 
987 	if (EVENT_TIME(now, next_report)) {
988 		debug("updating statistics...\n");
989 		list_walk_collection(simple_report_collection_cb, NULL);
990 		if (update_statistics() != 0)
991 			debug("couldn't update statistics");
992 		next_report = NEXT_REPORT_EVENT_TIME(now,
993 		    rcfg.rcfg_report_interval);
994 	}
995 }
996 
997 static void
998 verify_and_set_privileges(void)
999 {
1000 	priv_set_t *required =
1001 	    priv_str_to_set("zone,sys_resource,proc_owner", ",", NULL);
1002 
1003 	/*
1004 	 * Ensure the required privileges, suitable for controlling processes,
1005 	 * are possessed.
1006 	 */
1007 	if (setppriv(PRIV_SET, PRIV_PERMITTED, required) != 0 || setppriv(
1008 	    PRIV_SET, PRIV_EFFECTIVE, required) != 0)
1009 		die(gettext("can't set requisite privileges"));
1010 
1011 	/*
1012 	 * Ensure access to /var/run/daemon.
1013 	 */
1014 	if (setreuid(DAEMON_UID, DAEMON_UID) != 0)
1015 		die(gettext("cannot become user daemon"));
1016 
1017 	priv_freeset(required);
1018 }
1019 
1020 int
1021 main(int argc, char *argv[])
1022 {
1023 	int res;
1024 	int should_fork = 1;	/* fork flag */
1025 	hrtime_t now;		/* current time */
1026 	hrtime_t next;		/* time of next event */
1027 	int sig;		/* signal iteration */
1028 	struct rlimit rl;
1029 	hrtime_t next_proc_walk;	/* time of next /proc scan */
1030 	hrtime_t next_configuration;	/* time of next configuration */
1031 	hrtime_t next_rss_sample;	/* (latest) time of next RSS sample */
1032 	int old_enforce_caps;		/* track changes in enforcement */
1033 					/* conditions */
1034 	soft_scan_arg_t arg;
1035 
1036 	(void) set_message_priority(RCM_INFO);
1037 	(void) setprogname("rcapd");
1038 	rcapd_pid = getpid();
1039 	(void) chdir("/");
1040 	should_run = 1;
1041 	ever_ran = 0;
1042 
1043 	(void) setlocale(LC_ALL, "");
1044 	(void) textdomain(TEXT_DOMAIN);
1045 
1046 	/*
1047 	 * Parse command-line options.
1048 	 */
1049 	while ((res = getopt(argc, argv, "dF")) > 0)
1050 		switch (res) {
1051 		case 'd':
1052 			should_fork = 0;
1053 			if (debug_mode == 0) {
1054 				debug_mode = 1;
1055 				(void) set_message_priority(RCM_DEBUG);
1056 			} else
1057 				(void) set_message_priority(RCM_DEBUG_HIGH);
1058 			break;
1059 		case 'F':
1060 			should_fork = 0;
1061 			break;
1062 		default:
1063 			rcapd_usage();
1064 			return (E_USAGE);
1065 			/*NOTREACHED*/
1066 		}
1067 
1068 	/*
1069 	 * If not debugging, fork and continue operating, changing the
1070 	 * destination of messages to syslog().
1071 	 */
1072 	if (should_fork == 1) {
1073 		pid_t child;
1074 		debug("forking\n");
1075 		child = fork();
1076 		if (child == -1)
1077 			die(gettext("cannot fork"));
1078 		if (child > 0)
1079 			return (0);
1080 		else {
1081 			rcapd_pid = getpid();
1082 			(void) set_message_destination(RCD_SYSLOG);
1083 			(void) fclose(stdin);
1084 			(void) fclose(stdout);
1085 			(void) fclose(stderr);
1086 		}
1087 		/*
1088 		 * Start a new session and detatch from the controlling tty.
1089 		 */
1090 		if (setsid() == (pid_t)-1)
1091 			debug(gettext("setsid() failed; cannot detach from "
1092 			    "terminal"));
1093 	}
1094 
1095 	/*
1096 	 * Read the configuration file.
1097 	 */
1098 	if (rcfg_read(RCAPD_DEFAULT_CONF_FILE, -1, &rcfg, verify_statistics)
1099 	    != 0)
1100 		die(gettext("invalid configuration: %s"),
1101 		    RCAPD_DEFAULT_CONF_FILE);
1102 	finish_configuration();
1103 	should_reconfigure = 0;
1104 
1105 	/*
1106 	 * Check that required privileges are possessed.
1107 	 */
1108 	verify_and_set_privileges();
1109 
1110 	now = next_report = next_proc_walk = next_rss_sample = gethrtime();
1111 	next_configuration = NEXT_EVENT_TIME(gethrtime(),
1112 	    rcfg.rcfg_reconfiguration_interval);
1113 
1114 	if (rcfg.rcfg_memory_cap_enforcement_pressure == 0) {
1115 		/*
1116 		 * Always enforce caps when strict caps are used.
1117 		 */
1118 		enforce_caps = 1;
1119 	}
1120 
1121 	/*
1122 	 * Open the kstat chain.
1123 	 */
1124 	kctl = kstat_open();
1125 	if (kctl == NULL)
1126 		die(gettext("can't open kstats"));
1127 
1128 	/*
1129 	 * Set RLIMIT_NOFILE as high as practical, so roughly 10K processes can
1130 	 * be effectively managed without revoking descriptors (at 3 per
1131 	 * process).
1132 	 */
1133 	rl.rlim_cur = 32 * 1024;
1134 	rl.rlim_max = 32 * 1024;
1135 	if (setrlimit(RLIMIT_NOFILE, &rl) != 0 &&
1136 	    getrlimit(RLIMIT_NOFILE, &rl) == 0) {
1137 		rl.rlim_cur = rl.rlim_max;
1138 		(void) setrlimit(RLIMIT_NOFILE, &rl);
1139 	}
1140 	(void) enable_extended_FILE_stdio(-1, -1);
1141 
1142 	if (getrlimit(RLIMIT_NOFILE, &rl) == 0)
1143 		debug("fd limit: %lu\n", rl.rlim_cur);
1144 	else
1145 		debug("fd limit: unknown\n");
1146 
1147 	/*
1148 	 * Handle those signals whose (default) exit disposition
1149 	 * prevents rcapd from finishing scanning before terminating.
1150 	 */
1151 	(void) sigset(SIGINT, terminate_signal);
1152 	(void) sigset(SIGQUIT, abort_signal);
1153 	(void) sigset(SIGILL, abort_signal);
1154 	(void) sigset(SIGEMT, abort_signal);
1155 	(void) sigset(SIGFPE, abort_signal);
1156 	(void) sigset(SIGBUS, abort_signal);
1157 	(void) sigset(SIGSEGV, abort_signal);
1158 	(void) sigset(SIGSYS, abort_signal);
1159 	(void) sigset(SIGPIPE, terminate_signal);
1160 	(void) sigset(SIGALRM, terminate_signal);
1161 	(void) sigset(SIGTERM, terminate_signal);
1162 	(void) sigset(SIGUSR1, terminate_signal);
1163 	(void) sigset(SIGUSR2, terminate_signal);
1164 	(void) sigset(SIGPOLL, terminate_signal);
1165 	(void) sigset(SIGVTALRM, terminate_signal);
1166 	(void) sigset(SIGXCPU, abort_signal);
1167 	(void) sigset(SIGXFSZ, abort_signal);
1168 	for (sig = SIGRTMIN; sig <= SIGRTMAX; sig++)
1169 		(void) sigset(sig, terminate_signal);
1170 
1171 	/*
1172 	 * Install a signal handler for reconfiguration processing.
1173 	 */
1174 	(void) sigset(SIGHUP, sighup);
1175 
1176 	/*
1177 	 * Determine which process collections to cap.
1178 	 */
1179 	lcollection_update(LCU_COMPLETE);
1180 
1181 	/*
1182 	 * Loop forever, monitoring collections' resident set sizes and
1183 	 * enforcing their caps.  Look for changes in caps and process
1184 	 * membership, as well as responding to requests to reread the
1185 	 * configuration.  Update per-collection statistics periodically.
1186 	 */
1187 	while (should_run != 0) {
1188 		struct timespec ts;
1189 
1190 		/*
1191 		 * Announce that rcapd is starting.
1192 		 */
1193 		if (ever_ran == 0) {
1194 			info(gettext("starting\n"));
1195 			ever_ran = 1;
1196 		}
1197 
1198 		/*
1199 		 * Update the process list once every proc_walk_interval.  The
1200 		 * condition of global memory pressure is also checked at the
1201 		 * same frequency, if strict caps are in use.
1202 		 */
1203 		now = gethrtime();
1204 
1205 		/*
1206 		 * Detect configuration and cap changes at every
1207 		 * reconfiguration_interval, or when SIGHUP has been received.
1208 		 */
1209 		if (EVENT_TIME(now, next_configuration) ||
1210 		    should_reconfigure == 1) {
1211 			reconfigure();
1212 			next_configuration = NEXT_EVENT_TIME(now,
1213 			    rcfg.rcfg_reconfiguration_interval);
1214 
1215 			/*
1216 			 * Reset each event time to the shorter of the
1217 			 * previous and new intervals.
1218 			 */
1219 			if (next_report == 0 &&
1220 			    rcfg.rcfg_report_interval > 0)
1221 				next_report = now;
1222 			else
1223 				next_report = POSITIVE_MIN(next_report,
1224 				    NEXT_REPORT_EVENT_TIME(now,
1225 				    rcfg.rcfg_report_interval));
1226 			if (next_proc_walk == 0 &&
1227 			    rcfg.rcfg_proc_walk_interval > 0)
1228 				next_proc_walk = now;
1229 			else
1230 				next_proc_walk = POSITIVE_MIN(next_proc_walk,
1231 				    NEXT_EVENT_TIME(now,
1232 				    rcfg.rcfg_proc_walk_interval));
1233 			if (next_rss_sample == 0 &&
1234 			    rcfg.rcfg_rss_sample_interval > 0)
1235 				next_rss_sample = now;
1236 			else
1237 				next_rss_sample = POSITIVE_MIN(next_rss_sample,
1238 				    NEXT_EVENT_TIME(now,
1239 				    rcfg.rcfg_rss_sample_interval));
1240 
1241 			should_reconfigure = 0;
1242 			continue;
1243 		}
1244 
1245 		if (EVENT_TIME(now, next_proc_walk)) {
1246 			debug("scanning process list...\n");
1247 			proc_walk_all(proc_cb); /* mark */
1248 			list_walk_all(sweep_process_cb);
1249 			next_proc_walk = NEXT_EVENT_TIME(now,
1250 			    rcfg.rcfg_proc_walk_interval);
1251 		}
1252 
1253 		if (EVENT_TIME(now, next_rss_sample)) {
1254 			/*
1255 			 * Check for changes to the amount of installed
1256 			 * physical memory, to compute the current memory
1257 			 * pressure.
1258 			 */
1259 			update_phys_total();
1260 
1261 			/*
1262 			 * If soft caps are in use, determine if global memory
1263 			 * pressure exceeds the configured maximum above which
1264 			 * soft caps are enforced.
1265 			 */
1266 			memory_pressure = 100 -
1267 			    (int)((sysconf(_SC_AVPHYS_PAGES) *
1268 			    (sysconf(_SC_PAGESIZE) / 1024)) * 100.0 /
1269 			    phys_total);
1270 			memory_pressure_sample++;
1271 			if (rcfg.rcfg_memory_cap_enforcement_pressure > 0) {
1272 				if (memory_pressure >
1273 				    rcfg.rcfg_memory_cap_enforcement_pressure) {
1274 					if (enforce_soft_caps == 0) {
1275 						debug("memory pressure %d%%\n",
1276 						    memory_pressure);
1277 						enforce_soft_caps = 1;
1278 					}
1279 				} else {
1280 					if (enforce_soft_caps == 1)
1281 						enforce_soft_caps = 0;
1282 				}
1283 			}
1284 
1285 			/*
1286 			 * Determine if the global page scanner is running,
1287 			 * during which no memory caps should be enforced, to
1288 			 * prevent interference with the global page scanner.
1289 			 */
1290 			if (get_globally_scanned_pages(&new_sp) == 0) {
1291 				if (old_sp == 0)
1292 					/*EMPTY*/
1293 					;
1294 				else if ((new_sp - old_sp) > 0) {
1295 					if (global_scanner_running == 0) {
1296 						debug("global memory pressure "
1297 						    "detected (%llu pages "
1298 						    "scanned since last "
1299 						    "interval)\n",
1300 						    (unsigned long long)
1301 						    (new_sp - old_sp));
1302 						global_scanner_running = 1;
1303 					}
1304 				} else if (global_scanner_running == 1) {
1305 					debug("global memory pressure "
1306 					    "relieved\n");
1307 					global_scanner_running = 0;
1308 				}
1309 				old_sp = new_sp;
1310 			} else {
1311 				warn(gettext("kstat_read() failed"));
1312 				new_sp = old_sp;
1313 			}
1314 
1315 			/*
1316 			 * Cap enforcement is determined by the previous two
1317 			 * conditions.
1318 			 */
1319 			old_enforce_caps = enforce_caps;
1320 			enforce_caps =
1321 			    (rcfg.rcfg_memory_cap_enforcement_pressure ==
1322 			    0 || enforce_soft_caps == 1) &&
1323 			    !global_scanner_running;
1324 			if (old_enforce_caps != enforce_caps)
1325 				debug("%senforcing caps\n", enforce_caps == 0 ?
1326 				    "not " : "");
1327 
1328 			/*
1329 			 * Sample collections' member processes' RSSes and
1330 			 * recompute collections' excess.
1331 			 */
1332 			list_walk_all(mem_sample_cb);
1333 			list_walk_collection(collection_zero_mem_cb, NULL);
1334 			list_walk_all(memory_all_cb);
1335 			list_walk_collection(rss_sample_col_cb, NULL);
1336 			if (rcfg.rcfg_memory_cap_enforcement_pressure > 0)
1337 				debug("memory pressure %d%%\n",
1338 				    memory_pressure);
1339 			list_walk_collection(excess_print_cb, NULL);
1340 
1341 			/*
1342 			 * If soft caps are in use, determine the size of the
1343 			 * portion from each collection to scan for.
1344 			 */
1345 			if (enforce_soft_caps == 1) {
1346 				/*
1347 				 * Compute the sum of the collections'
1348 				 * excesses, which will be the denominator.
1349 				 */
1350 				arg.ssa_sum_excess = 0;
1351 				list_walk_collection(sum_excess_cb,
1352 				    &arg.ssa_sum_excess);
1353 
1354 				/*
1355 				 * Compute the quantity of memory (in
1356 				 * kilobytes) above the cap enforcement
1357 				 * pressure.  Set the scan goal to that
1358 				 * quantity (or at most the excess).
1359 				 */
1360 				arg.ssa_scan_goal = MIN((
1361 				    sysconf(_SC_PHYS_PAGES) * (100 -
1362 				    rcfg.rcfg_memory_cap_enforcement_pressure)
1363 				    / 100 - sysconf(_SC_AVPHYS_PAGES)) *
1364 				    (sysconf(_SC_PAGESIZE) / 1024),
1365 				    arg.ssa_sum_excess);
1366 			}
1367 
1368 			/*
1369 			 * Victimize offending collections.
1370 			 */
1371 			if (enforce_caps == 1 && ((enforce_soft_caps == 1 &&
1372 			    arg.ssa_scan_goal > 0 && arg.ssa_sum_excess > 0) ||
1373 			    (enforce_soft_caps == 0)))
1374 				if (enforce_soft_caps == 1) {
1375 					debug("scan goal is %lldKB\n",
1376 					    (long long)arg.ssa_scan_goal);
1377 					list_walk_collection(soft_scan_cb,
1378 					    &arg);
1379 				} else
1380 					list_walk_collection(scan_cb, NULL);
1381 			else
1382 				list_walk_collection(unenforced_cap_cb, NULL);
1383 
1384 			next_rss_sample = NEXT_EVENT_TIME(now,
1385 			    rcfg.rcfg_rss_sample_interval);
1386 		}
1387 
1388 		/*
1389 		 * Update the statistics file, if it's time.
1390 		 */
1391 		check_update_statistics();
1392 
1393 		/*
1394 		 * Sleep for some time before repeating.
1395 		 */
1396 		now = gethrtime();
1397 		next = next_configuration;
1398 		next = POSITIVE_MIN(next, next_proc_walk);
1399 		next = POSITIVE_MIN(next, next_report);
1400 		next = POSITIVE_MIN(next, next_rss_sample);
1401 		if (next > now && should_run != 0) {
1402 			debug("sleeping %-4.2f seconds\n", (float)(next -
1403 			    now) / (float)NANOSEC);
1404 			hrt2ts(next - now, &ts);
1405 			(void) nanosleep(&ts, NULL);
1406 		}
1407 	}
1408 	if (termination_signal != 0)
1409 		debug("exiting due to signal %d\n", termination_signal);
1410 	if (ever_ran != 0)
1411 		info(gettext("exiting\n"));
1412 
1413 	/*
1414 	 * Unlink the statistics file before exiting.
1415 	 */
1416 	if (rcfg.rcfg_stat_file[0] != 0)
1417 		(void) unlink(rcfg.rcfg_stat_file);
1418 
1419 	return (E_SUCCESS);
1420 }
1421