/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/disp.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpucaps_impl.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/errno.h>
/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. A zone CPU cap limits the
 * CPU usage of all projects running inside the zone. If the zone CPU cap is
 * set below a project CPU cap, the latter has no effect.
 *
 * When the CPU usage of a project or zone reaches its cap, threads in it do
 * not get scheduled and instead are placed on wait queues associated with the
 * cap. Such threads start running again only when CPU usage drops below the
 * cap level. Each zone and each project has its own wait queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
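 *
 * Cap values are expressed as a percentage of a single CPU: for example, a
 * cap of 100 corresponds to one full CPU and a cap of 150 to one and a half
 * CPUs.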
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped zone
 * is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is considered
 * over-capped. Every user thread caught running in an over-capped project or
 * zone is marked by setting the TS_PROJWAITQ or TS_ZONEWAITQ flag in the
 * thread's t_schedflag field and is requested to surrender its CPU. This
 * causes the scheduling class specific CL_PREEMPT() callback to be invoked.
 * The callback function places threads marked as TS_PROJWAITQ on a wait queue
 * and calls swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * CPU. Scheduling classes also add new on-CPU time for any thread leaving CPU.
 * New time means time since it was last accounted for. On-CPU times greater
 * than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project.
 * Zone CPU usage is the sum of usages for all projects within the zone. Zone
 * CPU usage is calculated on every clock tick by walking the list of projects
 * and adding their usage together.
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine, which is called once on
 * every clock tick. It walks the list of project caps and decays their usages
 * by one percent. If CPU usage drops below the cap level, threads on the wait
 * queue are made runnable again, one thread per clock tick.
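 *
 * As a worked example, assume hz = 100 (a 10ms tick). A thread that stays on
 * CPU charges its project 10ms of usage per tick, while the decay removes
 * usage/100 per tick; the two balance when the usage equals 100 ticks worth
 * of CPU time, so the decayed usage converges on the recent CPU consumption
 * rate expressed in per-tick units.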
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 * cpucaps_project_add(kproject_t *)
 *
 * Notifies the framework of a new project. It is put on the capped_projects
 * list if its zone has a cap.
 *
 * cpucaps_project_remove(kproject_t *)
 *
 * Removes the association between the specified project and its cap.
 * Called right before the project is destroyed.
 *
 * cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 * Sets the cap of the specified project to the specified value. Setting the
 * value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 * Sets the cap of the specified zone to the specified value. Setting the
 * value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_remove(zone_t *)
 *
 * Removes the association between the zone and its cap.
 *
 * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 * Charges the specified thread's project the amount of on-CPU time that it
 * used. If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 * Otherwise returns True if the thread should be penalized because its
 * project or zone is exceeding its cap, and also sets the TS_PROJWAITQ or
 * TS_ZONEWAITQ bit in t_schedflag in this case.
 *
 * CPUCAPS_ENFORCE(kthread_id_t)
 *
 * Enforces CPU caps for the specified thread. Places LWPs running in
 * LWP_USER state on project or zone wait queues, as requested by the
 * TS_PROJWAITQ or TS_ZONEWAITQ bits in t_schedflag. Returns True if the
 * thread was placed on a wait queue and False otherwise.
 *
 * cpucaps_sc_init(caps_sc_t *)
 *
 * Initializes the scheduling-class specific CPU Caps data for a thread.
 *
 * LOCKS
 * =====
 *
 * All the individual cap structures and their lists are protected by the
 * global caps_lock mutex. The lock is grabbed either by clock() or by events
 * modifying caps, so it is usually uncontended. We avoid all blocking memory
 * allocations while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
 * association cannot change while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-PIL spin lock cap_usagelock.
 * It is grabbed by scheduling classes already holding the thread lock at high
 * PIL and by the clock thread performing usage decay. We should do as little
 * work as possible while holding the lock, since it may be very hot. All
 * threads in a project contend for the same cache line when doing cap usage
 * updates.
 */

/*
 * caps_lock protects the list of capped projects and zones, changes in the
 * cap state, and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This could be a per-zone flag, but we don't keep any
 * per-zone cap state for now.
 */
static kmutex_t caps_lock;		/* lock to protect: */
static list_t capped_zones;		/* - list of zones with caps */
static list_t capped_projects;		/* - list of projects with caps */
boolean_t cpucaps_enabled;		/* - are there any caps defined? */
boolean_t cpucaps_busy;			/* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick, which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;
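
/*
 * For example, with the common hz = 100 clock rate a tick is 10ms, so
 * cap_tick_cost is 10,000,000ns, and cap values and usages are kept in
 * multiples of it.
 */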

/*
 * How much of the usage value is decayed every clock tick: one percent of
 * the value per tick.
 */
#define	CAP_DECAY_FACTOR	100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y)	(((x) + (y) / 2) / (y))
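
/*
 * For example, ROUND_SCALE(1234, 100) is (1234 + 50) / 100 == 12, while
 * ROUND_SCALE(1250, 100) rounds up to 13.
 */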

static void caps_update(void);

/*
 * CAP kstats.
 */
struct cap_kstat {
	kstat_named_t	cap_value;
	kstat_named_t	cap_usage;
	kstat_named_t	cap_nwait;
	kstat_named_t	cap_below;
	kstat_named_t	cap_above;
	kstat_named_t	cap_maxusage;
	kstat_named_t	cap_zonename;
} cap_kstat = {
	{ "value",	KSTAT_DATA_UINT64 },
	{ "usage",	KSTAT_DATA_UINT64 },
	{ "nwait",	KSTAT_DATA_UINT64 },
	{ "below_sec",	KSTAT_DATA_UINT64 },
	{ "above_sec",	KSTAT_DATA_UINT64 },
	{ "maxusage",	KSTAT_DATA_UINT64 },
	{ "zonename",	KSTAT_DATA_STRING },
};

static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize CPU caps infrastructure.
 * - Initialize lists of capped zones and capped projects.
 * - Set cpucaps_clock_callout to NULL.
 */
void
cpucaps_init(void)
{
	/*
	 * Initialize global variables.
	 */
	cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

	list_create(&capped_zones, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));
	list_create(&capped_projects, sizeof (cpucap_t),
	    offsetof(cpucap_t, cap_link));

	cpucaps_enabled = B_FALSE;
	cpucaps_busy = B_FALSE;
	cpucaps_clock_callout = NULL;
}

/*
 * Initialize scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
	csc->csc_cputime = 0;
}

/*
 * Allocate and initialize a cpucap structure.
 */
static cpucap_t *
cap_alloc(void)
{
	cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

	DISP_LOCK_INIT(&cap->cap_usagelock);
	waitq_init(&cap->cap_waitq);

	return (cap);
}

/*
 * Free a cpucap structure.
 */
static void
cap_free(cpucap_t *cap)
{
	if (cap == NULL)
		return;

	/*
	 * This cap should not be active.
	 */
	ASSERT(!list_link_active(&cap->cap_link));
	ASSERT(cap->cap_value == 0);
	ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

	waitq_fini(&cap->cap_waitq);
	DISP_LOCK_DESTROY(&cap->cap_usagelock);

	kmem_free(cap, sizeof (cpucap_t));
}

/*
 * Activate a cap: insert it into the active list and unblock its
 * wait queue. Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	/*
	 * The cap cannot be already enabled.
	 */
	ASSERT(!CAP_ENABLED(cap));
	ASSERT(!list_link_active(&cap->cap_link));

	list_insert_tail(l, cap);
	cap->cap_below = cap->cap_above = 0;
	cap->cap_maxusage = 0;
	cap->cap_usage = 0;
	cap->cap_value = value;
	waitq_unblock(&cap->cap_waitq);
	if (CPUCAPS_OFF()) {
		cpucaps_enabled = B_TRUE;
		cpucaps_clock_callout = caps_update;
	}
}

/*
 * Deactivate a cap.
 * - Block its wait queue. This prevents any new threads from being
 *   enqueued there and moves all enqueued threads to the run queue.
 * - Remove the cap from list l.
 * - Disable CPU caps globally if there are no capped projects or zones.
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
	ASSERT(MUTEX_HELD(&caps_lock));
	/*
	 * The cap should be currently active.
	 */
	ASSERT(CPUCAPS_ON());
	ASSERT(list_link_active(&cap->cap_link));
	ASSERT(CAP_ENABLED(cap));

	waitq_block(&cap->cap_waitq);
	list_remove(l, cap);
	if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
		cpucaps_enabled = B_FALSE;
		cpucaps_clock_callout = NULL;
	}
	cap->cap_value = 0;
	cap->cap_project = NULL;
	cap->cap_zone = NULL;
	if (cap->cap_kstat != NULL) {
		kstat_delete(cap->cap_kstat);
		cap->cap_kstat = NULL;
	}
}

/*
 * Enable a cap for project kpj.
 * It is safe to enable an already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_projects, cap, value);
		cap->cap_project = kpj;
		cap->cap_zone = kpj->kpj_zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable a project cap.
 * It is safe to disable an already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
	cpucap_t *cap = kpj->kpj_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_project == kpj);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_projects, cap);
}

/*
 * Enable a cap for a zone.
 * It is safe to enable an already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);

	if (CAP_DISABLED(cap)) {
		ASSERT(cap->cap_kstat == NULL);
		cap_enable(&capped_zones, cap, value);
		cap->cap_zone = zone;

		/*
		 * Create cap kstats.
		 */
		if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
		    KSTAT_TYPE_NAMED,
		    sizeof (cap_kstat) / sizeof (kstat_named_t),
		    KSTAT_FLAG_VIRTUAL)) != NULL) {
			cap->cap_kstat->ks_data_size +=
			    strlen(cap->cap_zone->zone_name) + 1;
			cap->cap_kstat->ks_lock = &cap_kstat_lock;
			cap->cap_kstat->ks_data = &cap_kstat;
			cap->cap_kstat->ks_update = cap_kstat_update;
			cap->cap_kstat->ks_private = cap;
			kstat_install(cap->cap_kstat);
		}
	}
}

/*
 * Disable a zone cap.
 * It is safe to disable an already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
	cpucap_t *cap = zone->zone_cpucap;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap != NULL);
	ASSERT(cap->cap_zone == zone);

	if (CAP_ENABLED(cap))
		cap_disable(&capped_zones, cap);
}

/*
 * Apply the specified callback to all caps contained in the list `l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
{
	static uint64_t cpucap_walk_gen;
	cpucap_t *cap;

	ASSERT(MUTEX_HELD(&caps_lock));

	for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
		(*cb)(cap, cpucap_walk_gen);
	}

	atomic_inc_64(&cpucap_walk_gen);
}

/*
 * If the cap limit is not reached, make one thread from the wait queue
 * runnable. The waitq_isempty check is performed without the waitq lock. If a
 * new thread is placed on the waitq right after the check, it will be picked
 * up during the next invocation of cap_poke_waitq().
 */
/* ARGSUSED */
static void
cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
	ASSERT(MUTEX_HELD(&caps_lock));

	if (cap->cap_usage >= cap->cap_value) {
		cap->cap_above++;
	} else {
		waitq_t *wq = &cap->cap_waitq;

		cap->cap_below++;

		if (!waitq_isempty(wq))
			waitq_runone(wq);
	}
}

/*
 * The callback function called for every cap on the capped_projects list.
 * - Decay the cap usage by CAP_DECAY_FACTOR.
 * - Add this project's usage to its zone's usage.
 * - Kick one thread off the cap waitq if the cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap, int64_t gen)
{
	zone_t		*zone = cap->cap_zone;
	hrtime_t	cap_usage = cap->cap_usage;

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(cap->cap_project->kpj_cpucap == cap);
	ASSERT(zone == cap->cap_project->kpj_zone);
	ASSERT(CAP_ENABLED(cap));

	/*
	 * Update the above/below statistics and make one waiting thread
	 * runnable if the project's cap is not reached.
	 */
	cap_poke_waitq(cap, 0);

	/*
	 * Add the project's CPU usage to our zone's CPU usage.
	 */
	if (ZONE_IS_CAPPED(zone)) {
		cpucap_t *zcap = zone->zone_cpucap;

		ASSERT(zcap->cap_zone == zone);

		/*
		 * If we haven't reset this zone's usage during this clock tick
		 * yet, then do it now. The cap_gen field is used to check
		 * whether this is the first of the zone's projects we see
		 * during this tick or a subsequent one.
		 */
		if (zcap->cap_gen != gen) {
			if (zcap->cap_usage > zcap->cap_maxusage)
				zcap->cap_maxusage = zcap->cap_usage;
			zcap->cap_usage = 0;
			zcap->cap_gen = gen;
		}
		DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
		    hrtime_t, cap_usage);
		zcap->cap_usage += cap_usage;
		/* Check for overflows */
		if (zcap->cap_usage < 0)
			zcap->cap_usage = MAX_USAGE - 1;
	}

	/*
	 * Decay project usage.
	 */
	disp_lock_enter(&cap->cap_usagelock);
	cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
	disp_lock_exit(&cap->cap_usagelock);
}

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from the wait queue to the run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update(void)
{
	mutex_enter(&caps_lock);
	cap_walk(&capped_projects, cap_project_usage_walker);
	cap_walk(&capped_zones, cap_poke_waitq);
	mutex_exit(&caps_lock);
}

/*
 * This function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled, and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated, it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
 * is performed without holding caps_lock to avoid using a KM_SLEEP allocation
 * with caps_lock held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
	cpucap_t *project_cap = NULL;
	cpucap_t *zone_cap = (cpucap_t *)arg;

	ASSERT(zone_cap != NULL);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This is the first time any cap was established for this
		 * project. Allocate a new cpucap structure for it.
		 */
		project_cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check that kpj_cpucap is still NULL - now with caps_lock
	 * held - and assign the newly allocated cpucap structure to it.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = project_cap;
	} else if (project_cap != NULL) {
		cap_free(project_cap);
	}

	project_cap = kpj->kpj_cpucap;

	if (CAP_DISABLED(zone_cap)) {
		/*
		 * Remove all projects in this zone without their own caps
		 * from the capped_projects list.
		 */
		if (project_cap->cap_value == MAX_USAGE) {
			cap_project_disable(kpj);
		}
	} else if (CAP_DISABLED(project_cap)) {
		/*
		 * Add the project to the capped_projects list.
		 */
		ASSERT(project_cap->cap_value == 0);
		cap_project_enable(kpj, MAX_USAGE);
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Set the zone cap to cap_val.
 * If cap_val is equal to NOCAP, disable the zone cap.
 *
 * If this is the first time a cap is set on a zone, allocate the cpucap
 * structure without holding caps_lock to avoid a KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a cap on a zone when caps are off
	 * or on a zone which does not have a cap yet.
	 */
	if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
		return (0);

	if (zone->zone_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);

	if (cpucaps_busy) {
		mutex_exit(&caps_lock);
		return (EBUSY);
	}

	/*
	 * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
	 * held. If it is still NULL, assign a newly allocated cpucap to it.
	 */
	if (zone->zone_cpucap == NULL) {
		zone->zone_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	cap = zone->zone_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;
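
	/*
	 * For example, with hz = 100 (cap_tick_cost of 10ms), a cap_val of
	 * 150 (one and a half CPUs) converts to a value of 1.5s, the same
	 * nanosecond units in which the decayed usage is accumulated. The
	 * (value < 0) check catches multiplication overflow.
	 */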

	/* Nothing to do if the value is staying the same */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;

	if (cap_val == NOCAP) {
		if (CAP_ENABLED(cap)) {
			/*
			 * Remove the cap for the zone.
			 */
			cap_zone_disable(zone);
			cpucaps_busy = B_TRUE;
			mutex_exit(&caps_lock);
			/*
			 * Disable caps for all projects belonging to this
			 * zone unless they have their own cap.
			 */
			(void) project_walk_all(zone->zone_id,
			    cap_project_zone_modify_walker, cap);

			mutex_enter(&caps_lock);
			cpucaps_busy = B_FALSE;
		}
	} else if (CAP_DISABLED(cap)) {
		/*
		 * Set a cap on a zone which previously was not capped.
		 */
		cap_zone_enable(zone, value);
		cpucaps_busy = B_TRUE;
		mutex_exit(&caps_lock);

		/*
		 * Enable the cap for all projects belonging to this zone.
		 */
		(void) project_walk_all(zone->zone_id,
		    cap_project_zone_modify_walker, cap);

		mutex_enter(&caps_lock);
		cpucaps_busy = B_FALSE;
	} else {
		/*
		 * No state transitions, just change the value.
		 */
		cap->cap_value = value;
	}

	ASSERT(MUTEX_HELD(&caps_lock));
	ASSERT(!cpucaps_busy);
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * The project is going away, so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
	mutex_enter(&caps_lock);
	if (PROJECT_IS_CAPPED(kpj))
		cap_project_disable(kpj);
	if (kpj->kpj_cpucap != NULL) {
		cap_free(kpj->kpj_cpucap);
		kpj->kpj_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
	mutex_enter(&caps_lock);
	while (ZONE_IS_CAPPED(zone)) {
		mutex_exit(&caps_lock);
		(void) cpucaps_zone_set(zone, NOCAP);
		mutex_enter(&caps_lock);
	}
	if (zone->zone_cpucap != NULL) {
		cap_free(zone->zone_cpucap);
		zone->zone_cpucap = NULL;
	}
	mutex_exit(&caps_lock);
}

/*
 * A new project was created. It should be put on the capped_projects list if
 * its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
	cpucap_t *cap = NULL;

	if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
		return;

	/*
	 * This project was never capped before, so allocate its cap structure.
	 */
	if (kpj->kpj_cpucap == NULL)
		cap = cap_alloc();

	mutex_enter(&caps_lock);
	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	if (ZONE_IS_CAPPED(kpj->kpj_zone))
		cap_project_enable(kpj, MAX_USAGE);

	mutex_exit(&caps_lock);
}

/*
 * Set the project cap to cap_val.
 * If cap_val is equal to NOCAP, disable the project cap.
 *
 * If this is the first time a cap is set on a project, allocate the cpucap
 * structure without holding caps_lock to avoid a KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
	cpucap_t *cap = NULL;
	hrtime_t value;

	if (cap_val == 0)
		return (EINVAL);

	ASSERT(cap_val <= MAXCAP);
	if (cap_val > MAXCAP)
		cap_val = MAXCAP;

	/*
	 * Nothing to do if trying to disable a project cap when caps are not
	 * enabled, or if trying to disable a cap on a project that does not
	 * have a cap enabled.
	 */
	if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
		return (0);

	if (kpj->kpj_cpucap == NULL) {
		/*
		 * This project was never capped before, so allocate its cap
		 * structure.
		 */
		cap = cap_alloc();
	}

	mutex_enter(&caps_lock);

	/*
	 * Double-check with caps_lock held.
	 */
	if (kpj->kpj_cpucap == NULL) {
		kpj->kpj_cpucap = cap;
	} else if (cap != NULL) {
		cap_free(cap);
	}

	/*
	 * Get the actual pointer to the project cap.
	 */
	cap = kpj->kpj_cpucap;
	value = cap_val * cap_tick_cost;
	if (value < 0)
		value = MAX_USAGE;

	/*
	 * Nothing to do if the value is not changing.
	 */
	if (value == cap->cap_value) {
		mutex_exit(&caps_lock);
		return (0);
	}

	/*
	 * Clear cap statistics since the cap value itself changes.
	 */
	cap->cap_above = cap->cap_below = 0;
	cap->cap_maxusage = 0;

	if (cap_val != NOCAP) {
		/*
		 * Enable this cap if it is not already enabled.
		 */
		if (CAP_DISABLED(cap))
			cap_project_enable(kpj, value);
		else
			cap->cap_value = value;
	} else if (CAP_ENABLED(cap)) {
		/*
		 * The user requested to drop the cap on the project. If it is
		 * part of a capped zone, keep the cap and set the value to
		 * MAX_USAGE, otherwise disable the cap.
		 */
		if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
			cap->cap_value = MAX_USAGE;
		} else {
			cap_project_disable(kpj);
		}
	}
	mutex_exit(&caps_lock);

	return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
	return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}
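
/*
 * Dividing by cap_tick_cost converts the nanosecond usage back into the
 * percent-of-a-CPU units in which caps are set: for example, a project that
 * has recently been consuming half a CPU reports a usage of about 50.
 */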

/*
 * Get current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
	return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
	return (cap_get(zone->zone_cpucap));
}

/*
 * Charge the project of thread t for the time it spent on CPU since the last
 * adjustment.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick worth of time.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between the
 * thread and its project is protected by the thread lock.
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
	kproject_t	*kpj = ttoproj(t);
	hrtime_t	new_usage;
	hrtime_t	usage_delta;

	ASSERT(THREAD_LOCK_HELD(t));
	ASSERT(kpj->kpj_cpucap != NULL);

	/* Get on-CPU time since the birth of the thread */
	new_usage = mstate_thread_onproc_time(t);

	/* Time spent on CPU since last checked */
	usage_delta = new_usage - csc->csc_cputime;

	/* Save the accumulated on-CPU time */
	csc->csc_cputime = new_usage;

	/* Charge at most one tick worth of on-CPU time */
	if (usage_delta > cap_tick_cost)
		usage_delta = cap_tick_cost;
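
	/*
	 * For example, with hz = 100 a thread that was last charged 25ms ago
	 * is charged only cap_tick_cost (10ms) here, since clock() and the
	 * scheduling classes are expected to call in at least once per tick.
	 */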

	/* Add usage_delta to the project usage value. */
	if (usage_delta > 0) {
		cpucap_t *cap = kpj->kpj_cpucap;

		DTRACE_PROBE2(cpucaps__project__charge,
		    kthread_id_t, t, hrtime_t, usage_delta);

		disp_lock_enter_high(&cap->cap_usagelock);
		cap->cap_usage += usage_delta;

		/* Check for overflows */
		if (cap->cap_usage < 0)
			cap->cap_usage = MAX_USAGE - 1;

		disp_lock_exit_high(&cap->cap_usagelock);

		/*
		 * cap_maxusage is only kept for observability. Move it outside
		 * the lock to reduce the time spent while holding the lock.
		 */
		if (cap->cap_usage > cap->cap_maxusage)
			cap->cap_maxusage = cap->cap_usage;
	}
}

/*
 * Charge the thread's project and return True if the project or zone should
 * be penalized because it is exceeding its cap. Also set the TS_PROJWAITQ or
 * TS_ZONEWAITQ bit in t_schedflag in this case.
 *
 * It is possible that the project cap is being disabled while this routine is
 * executed. This should not cause any issues since the association between the
 * thread and its project is protected by the thread lock. It will still set
 * TS_PROJWAITQ/TS_ZONEWAITQ in this case, but cpucaps_enforce() will not place
 * anything on the blocked wait queue.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
	kproject_t	*kpj = ttoproj(t);
	klwp_t		*lwp = t->t_lwp;
	zone_t		*zone;
	cpucap_t	*project_cap;
	boolean_t	rc = B_FALSE;

	ASSERT(THREAD_LOCK_HELD(t));

	/* Nothing to do for projects that are not capped. */
	if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
		return (B_FALSE);

	caps_charge_adjust(t, csc);

	/*
	 * The caller only requested that the project usage be charged,
	 * without the enforcement part.
	 */
	if (charge_type == CPUCAPS_CHARGE_ONLY)
		return (B_FALSE);

	project_cap = kpj->kpj_cpucap;

	if (project_cap->cap_usage >= project_cap->cap_value) {
		t->t_schedflag |= TS_PROJWAITQ;
		rc = B_TRUE;
	} else if (t->t_schedflag & TS_PROJWAITQ) {
		t->t_schedflag &= ~TS_PROJWAITQ;
	}

	zone = ttozone(t);
	if (!ZONE_IS_CAPPED(zone)) {
		if (t->t_schedflag & TS_ZONEWAITQ)
			t->t_schedflag &= ~TS_ZONEWAITQ;
	} else {
		cpucap_t *zone_cap = zone->zone_cpucap;

		if (zone_cap->cap_usage >= zone_cap->cap_value) {
			t->t_schedflag |= TS_ZONEWAITQ;
			rc = B_TRUE;
		} else if (t->t_schedflag & TS_ZONEWAITQ) {
			t->t_schedflag &= ~TS_ZONEWAITQ;
		}
	}

	return (rc);
}

/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that it
 * does not hold any kernel locks, so enqueue it on the waitq, if needed.
 *
 * CPU Caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads flagged with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
	klwp_t *lwp = t->t_lwp;

	ASSERT(THREAD_LOCK_HELD(t));

	if (lwp != NULL && lwp->lwp_state == LWP_USER) {
		if (t->t_schedflag & TS_PROJWAITQ) {
			ASSERT(ttoproj(t)->kpj_cpucap != NULL);
			t->t_schedflag &= ~TS_ANYWAITQ;
			if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
		if (t->t_schedflag & TS_ZONEWAITQ) {
			ASSERT(ttozone(t)->zone_cpucap != NULL);
			t->t_schedflag &= ~TS_ZONEWAITQ;
			if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
			    t)) {
				return (B_TRUE);
			}
		}
	}

	/*
	 * The thread is not enqueued on the wait queue.
	 */
	return (B_FALSE);
}

/*
 * Convert internal cap statistics into values exported by the cap kstat.
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
	struct cap_kstat *capsp = &cap_kstat;
	cpucap_t *cap = ksp->ks_private;
	clock_t tick_sec = SEC_TO_TICK(1);
	char *zonename = cap->cap_zone->zone_name;

	if (rw == KSTAT_WRITE)
		return (EACCES);

	capsp->cap_value.value.ui64 =
	    ROUND_SCALE(cap->cap_value, cap_tick_cost);
	capsp->cap_usage.value.ui64 =
	    ROUND_SCALE(cap->cap_usage, cap_tick_cost);
	capsp->cap_maxusage.value.ui64 =
	    ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
	capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
	capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
	capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
	kstat_named_setstr(&capsp->cap_zonename, zonename);

	return (0);
}