/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

#include <sys/disp.h>
#include <sys/param.h>
#include <sys/systm.h>
#include <sys/sysmacros.h>
#include <sys/atomic.h>
#include <sys/cpucaps_impl.h>
#include <sys/dtrace.h>
#include <sys/sdt.h>
#include <sys/debug.h>
#include <sys/rctl.h>
#include <sys/errno.h>

/*
 * CPU Caps implementation
 * =======================
 *
 * A CPU cap can be set on any project or any zone. A zone CPU cap limits the
 * aggregate CPU usage of all projects running inside the zone. If the zone
 * CPU cap is set below the project CPU cap, the latter will have no effect.
 *
 * When the CPU usage of projects and/or zones reaches the specified caps,
 * threads in them do not get scheduled and instead are placed on wait queues
 * associated with a cap. Such threads start running again only when CPU
 * usage drops below the cap level. Each zone and each project has its own
 * wait queue.
 *
 * When a CPU cap is set, the kernel continuously keeps track of the CPU time
 * used by capped zones and/or projects over a short time interval and
 * calculates their current CPU usage as a percentage. When the accumulated
 * usage reaches the CPU cap, LWPs running in user-land (when they are not
 * holding any critical kernel locks) are placed on special wait queues until
 * their project's or zone's CPU usage drops below the cap.
 *
 * The system maintains a list of all capped projects and all capped zones. On
 * every clock tick every active thread belonging to a capped project adds its
 * CPU usage to its project. Usage from all projects belonging to a capped zone
 * is aggregated to get the zone usage.
 *
 * When the current CPU usage is above the cap, a project or zone is considered
 * over-capped. Every user thread caught running in an over-capped project or
 * zone is marked by setting the TS_PROJWAITQ flag in the thread's t_schedflag
 * field and is requested to surrender its CPU. This causes the scheduling
 * class specific CL_PREEMPT() callback to be invoked. The callback function
 * places threads marked as TS_PROJWAITQ on a wait queue and calls swtch().
 *
 * Threads are only placed on wait queues after trapping from user-land
 * (they could be holding some user locks, but no kernel locks) and while
 * returning from the trap back to user-land when no kernel locks are held.
 * Putting threads on wait queues in random places while running in the
 * kernel might lead to all kinds of locking problems.
 *
 * Accounting
 * ==========
 *
 * Accounting of CPU usage is based on per-thread micro-state accounting data.
 * On every clock tick clock() adds new on-CPU time for every thread found on
 * a CPU. Scheduling classes also add new on-CPU time for any thread leaving a
 * CPU. "New time" means the time since the thread was last accounted for.
 * On-CPU times greater than 1 tick are truncated to 1 tick.
 *
 * Project CPU usage is aggregated from all threads within the project.
 * Zone CPU usage is the sum of usages for all projects within the zone. Zone
 * CPU usage is calculated on every clock tick by walking the list of projects
 * and adding their usage together.
 *
 * Decay
 * =====
 *
 * CPU usage is decayed by the caps_update() routine which is called once per
 * clock tick. It walks the list of project caps and decays their usages by
 * one percent. If CPU usage drops below cap levels, threads on the wait queue
 * are made runnable again, one thread per clock tick. (A worked example of
 * the decay arithmetic follows this comment block.)
 *
 * Interfaces
 * ==========
 *
 * The CPU Caps facility provides the following interfaces to the rest of the
 * system:
 *
 * cpucaps_project_add(kproject_t *)
 *
 * Notifies the framework of a new project. It should be put on the
 * capped_projects list if its zone has a cap.
 *
 * cpucaps_project_remove(kproject_t *)
 *
 * Removes the association between the specified project and its cap.
 * Called right before the project is destroyed.
 *
 * cpucaps_project_set(kproject_t *, rctl_qty_t)
 *
 * Sets the cap of the specified project to the specified value. Setting the
 * value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_set(zone_t *, rctl_qty_t)
 *
 * Sets the cap of the specified zone to the specified value. Setting the
 * value to NOCAP is equivalent to removing the cap.
 *
 * cpucaps_zone_remove(zone_t *)
 *
 * Removes the association between the zone and its cap.
 *
 * cpucaps_charge(kthread_id_t, caps_sc_t *, cpucaps_charge_t)
 *
 * Charges the specified thread's project the amount of on-CPU time that it
 * used. If the third argument is CPUCAPS_CHARGE_ONLY, returns False.
 * Otherwise returns True if the thread should be penalized because its
 * project or zone is exceeding its cap, and also sets the TS_PROJWAITQ or
 * TS_ZONEWAITQ bits in t_schedflag in this case.
 *
 * CPUCAPS_ENFORCE(kthread_id_t *)
 *
 * Enforces CPU caps for the specified thread. Places LWPs running in
 * LWP_USER state on project or zone wait queues, as requested by the
 * TS_PROJWAITQ or TS_ZONEWAITQ bits in t_schedflag. Returns True if the
 * thread was placed on a wait queue or False otherwise.
 *
 * cpucaps_sc_init(caps_sc_t *)
 *
 * Initializes the scheduling-class specific CPU Caps data for a thread.
 *
 * LOCKS
 * =====
 *
 * All the individual cap structures and their lists are protected by a global
 * caps_lock mutex. The lock is grabbed either by clock() or by events
 * modifying caps, so it is usually uncontended. We avoid all blocking memory
 * allocations while holding caps_lock to prevent clock() from blocking.
 *
 * Thread state is protected by the thread lock. It protects the association
 * between a thread and its project and, as a consequence, its zone. The
 * association cannot break while the thread lock is held, so the project or
 * zone cap is not going to disappear while the thread lock is held.
 *
 * The cap usage field is protected by the high-PIL spin lock cap_usagelock.
 * It is grabbed by scheduling classes already holding the thread lock at high
 * PIL and by the clock thread performing usage decay. We should do as little
 * work as possible while holding the lock since it may be very hot. All
 * threads in a project contend for the same cache line doing cap usage
 * updates.
 */
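
/*
 * Worked example (illustrative only; assumes hz = 100, i.e. a 10ms tick):
 *
 *      cap_tick_cost = TICK_TO_NSEC((hrtime_t)1) = 10,000,000 ns
 *      cap_value for a 150% cap = 150 * cap_tick_cost = 1,500,000,000 ns
 *
 * A project that keeps exactly one CPU busy is charged cap_tick_cost per
 * tick, while caps_update() decays the accumulated usage by one percent per
 * tick. Usage therefore settles where charge and decay balance:
 *
 *      usage / CAP_DECAY_FACTOR = cap_tick_cost
 *      usage = 100 * cap_tick_cost = 1,000,000,000 ns
 *
 * which ROUND_SCALE(usage, cap_tick_cost) reports as 100, i.e. 100% of one
 * CPU, safely below the 150% cap above.
 */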

/*
 * caps_lock protects the list of capped projects and zones, changes in the
 * cap state, and changes of the global cpucaps_enabled flag.
 *
 * Changing zone caps also sets cpucaps_busy to avoid races when a zone cap is
 * modified in parallel. This could be a per-zone cap flag, but we don't keep
 * any cap state for now.
 */
static kmutex_t caps_lock;              /* lock to protect: */
static list_t capped_zones;             /* - list of zones with caps */
static list_t capped_projects;          /* - list of projects with caps */
boolean_t cpucaps_enabled;              /* - are there any caps defined? */
boolean_t cpucaps_busy;                 /* - is framework busy? */

/*
 * The accounting is based on the number of nanoseconds threads spend running
 * during a tick, which is kept in the cap_tick_cost variable.
 */
static hrtime_t cap_tick_cost;

/*
 * How much of the usage value is decayed every clock tick:
 * decay one percent of the value per tick.
 */
#define	CAP_DECAY_FACTOR	100

/*
 * Scale the value and round it to the closest integer value.
 */
#define	ROUND_SCALE(x, y)	(((x) + (y) / 2) / (y))

static void caps_update();

/*
 * CAP kstats.
 */
struct cap_kstat {
        kstat_named_t   cap_value;
        kstat_named_t   cap_usage;
        kstat_named_t   cap_nwait;
        kstat_named_t   cap_below;
        kstat_named_t   cap_above;
        kstat_named_t   cap_maxusage;
        kstat_named_t   cap_zonename;
} cap_kstat = {
        { "value",      KSTAT_DATA_UINT64 },
        { "usage",      KSTAT_DATA_UINT64 },
        { "nwait",      KSTAT_DATA_UINT64 },
        { "below_sec",  KSTAT_DATA_UINT64 },
        { "above_sec",  KSTAT_DATA_UINT64 },
        { "maxusage",   KSTAT_DATA_UINT64 },
        { "zonename",   KSTAT_DATA_STRING },
};

static kmutex_t cap_kstat_lock;
static int cap_kstat_update(kstat_t *, int);

/*
 * Initialize the CPU caps infrastructure:
 * - Initialize the lists of capped zones and capped projects.
 * - Set cpucaps_clock_callout to NULL.
 */
void
cpucaps_init()
{
        /*
         * Initialize global variables.
         */
        cap_tick_cost = TICK_TO_NSEC((hrtime_t)1);

        list_create(&capped_zones, sizeof (cpucap_t),
            offsetof(cpucap_t, cap_link));
        list_create(&capped_projects, sizeof (cpucap_t),
            offsetof(cpucap_t, cap_link));

        cpucaps_enabled = B_FALSE;
        cpucaps_busy = B_FALSE;
        cpucaps_clock_callout = NULL;
}
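
/*
 * A sketch of the unit conversions used throughout this file (illustrative
 * values, again assuming hz = 100 so that cap_tick_cost == 10,000,000 ns):
 *
 *      ROUND_SCALE(24, 10) == 2        (24/10 rounds down)
 *      ROUND_SCALE(25, 10) == 3        (ties round up)
 *
 * Externally caps are expressed as a percentage of a single CPU; internally
 * they are stored as nanoseconds per tick interval:
 *
 *      value = cap_val * cap_tick_cost;                50% -> 500,000,000 ns
 *      percent = ROUND_SCALE(usage, cap_tick_cost);    back to a percentage
 */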

/*
 * Initialize the scheduling-class specific CPU Caps data.
 */
void
cpucaps_sc_init(caps_sc_t *csc)
{
        csc->csc_cputime = 0;
}

/*
 * Allocate and initialize a cpucap structure.
 */
static cpucap_t *
cap_alloc(void)
{
        cpucap_t *cap = kmem_zalloc(sizeof (cpucap_t), KM_SLEEP);

        DISP_LOCK_INIT(&cap->cap_usagelock);
        waitq_init(&cap->cap_waitq);

        return (cap);
}

/*
 * Free a cpucap structure.
 */
static void
cap_free(cpucap_t *cap)
{
        if (cap == NULL)
                return;

        /*
         * This cap should not be active.
         */
        ASSERT(!list_link_active(&cap->cap_link));
        ASSERT(cap->cap_value == 0);
        ASSERT(!DISP_LOCK_HELD(&cap->cap_usagelock));

        waitq_fini(&cap->cap_waitq);
        DISP_LOCK_DESTROY(&cap->cap_usagelock);

        kmem_free(cap, sizeof (cpucap_t));
}

/*
 * Activate a cap: insert it into the active list and unblock its wait queue.
 * Should be called with caps_lock held.
 * The cap_value field is set to the value supplied.
 */
static void
cap_enable(list_t *l, cpucap_t *cap, hrtime_t value)
{
        ASSERT(MUTEX_HELD(&caps_lock));

        /*
         * The cap must not already be enabled.
         */
        ASSERT(!CAP_ENABLED(cap));
        ASSERT(!list_link_active(&cap->cap_link));

        list_insert_tail(l, cap);
        cap->cap_below = cap->cap_above = 0;
        cap->cap_maxusage = 0;
        cap->cap_usage = 0;
        cap->cap_value = value;
        waitq_unblock(&cap->cap_waitq);
        if (CPUCAPS_OFF()) {
                cpucaps_enabled = B_TRUE;
                cpucaps_clock_callout = caps_update;
        }
}

/*
 * Deactivate a cap:
 * - Block its wait queue. This prevents any new threads from being
 *   enqueued there and moves all enqueued threads to the run queue.
 * - Remove the cap from list l.
 * - Disable CPU caps globally if there are no capped projects or zones.
 *
 * Should be called with caps_lock held.
 */
static void
cap_disable(list_t *l, cpucap_t *cap)
{
        ASSERT(MUTEX_HELD(&caps_lock));
        /*
         * The cap should be currently active.
         */
        ASSERT(CPUCAPS_ON());
        ASSERT(list_link_active(&cap->cap_link));
        ASSERT(CAP_ENABLED(cap));

        waitq_block(&cap->cap_waitq);
        list_remove(l, cap);
        if (list_is_empty(&capped_projects) && list_is_empty(&capped_zones)) {
                cpucaps_enabled = B_FALSE;
                cpucaps_clock_callout = NULL;
        }
        cap->cap_value = 0;
        cap->cap_project = NULL;
        cap->cap_zone = NULL;
        if (cap->cap_kstat != NULL) {
                kstat_delete(cap->cap_kstat);
                cap->cap_kstat = NULL;
        }
}
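
/*
 * Lifecycle sketch: the first cap_enable() call switches the framework on
 * globally and the last cap_disable() switches it off again, so clock()
 * pays nothing for this feature while no caps are defined:
 *
 *      cap_enable():   CPUCAPS_OFF()        -> cpucaps_enabled = B_TRUE;
 *                                              cpucaps_clock_callout =
 *                                                  caps_update;
 *      cap_disable():  both cap lists empty -> cpucaps_enabled = B_FALSE;
 *                                              cpucaps_clock_callout = NULL;
 */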

/*
 * Enable a cap for project kpj.
 * It is safe to enable an already enabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_enable(kproject_t *kpj, hrtime_t value)
{
        cpucap_t *cap = kpj->kpj_cpucap;

        ASSERT(MUTEX_HELD(&caps_lock));
        ASSERT(cap != NULL);

        if (CAP_DISABLED(cap)) {
                ASSERT(cap->cap_kstat == NULL);
                cap_enable(&capped_projects, cap, value);
                cap->cap_project = kpj;
                cap->cap_zone = kpj->kpj_zone;

                /*
                 * Create cap kstats.
                 */
                if ((cap->cap_kstat = rctl_kstat_create_project(kpj, "cpucaps",
                    KSTAT_TYPE_NAMED,
                    sizeof (cap_kstat) / sizeof (kstat_named_t),
                    KSTAT_FLAG_VIRTUAL)) != NULL) {
                        cap->cap_kstat->ks_data_size +=
                            strlen(cap->cap_zone->zone_name) + 1;
                        cap->cap_kstat->ks_lock = &cap_kstat_lock;
                        cap->cap_kstat->ks_data = &cap_kstat;
                        cap->cap_kstat->ks_update = cap_kstat_update;
                        cap->cap_kstat->ks_private = cap;
                        kstat_install(cap->cap_kstat);
                }
        }
}

/*
 * Disable a project cap.
 * It is safe to disable an already disabled project cap.
 * Should be called with caps_lock held.
 */
static void
cap_project_disable(kproject_t *kpj)
{
        cpucap_t *cap = kpj->kpj_cpucap;

        ASSERT(MUTEX_HELD(&caps_lock));
        ASSERT(cap != NULL);
        ASSERT(cap->cap_project == kpj);

        if (CAP_ENABLED(cap))
                cap_disable(&capped_projects, cap);
}

/*
 * Enable a cap for a zone.
 * It is safe to enable an already enabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_enable(zone_t *zone, hrtime_t value)
{
        cpucap_t *cap = zone->zone_cpucap;

        ASSERT(MUTEX_HELD(&caps_lock));
        ASSERT(cap != NULL);

        if (CAP_DISABLED(cap)) {
                ASSERT(cap->cap_kstat == NULL);
                cap_enable(&capped_zones, cap, value);
                cap->cap_zone = zone;

                /*
                 * Create cap kstats.
                 */
                if ((cap->cap_kstat = rctl_kstat_create_zone(zone, "cpucaps",
                    KSTAT_TYPE_NAMED,
                    sizeof (cap_kstat) / sizeof (kstat_named_t),
                    KSTAT_FLAG_VIRTUAL)) != NULL) {
                        cap->cap_kstat->ks_data_size +=
                            strlen(cap->cap_zone->zone_name) + 1;
                        cap->cap_kstat->ks_lock = &cap_kstat_lock;
                        cap->cap_kstat->ks_data = &cap_kstat;
                        cap->cap_kstat->ks_update = cap_kstat_update;
                        cap->cap_kstat->ks_private = cap;
                        kstat_install(cap->cap_kstat);
                }
        }
}

/*
 * Disable a zone cap.
 * It is safe to disable an already disabled zone cap.
 * Should be called with caps_lock held.
 */
static void
cap_zone_disable(zone_t *zone)
{
        cpucap_t *cap = zone->zone_cpucap;

        ASSERT(MUTEX_HELD(&caps_lock));
        ASSERT(cap != NULL);
        ASSERT(cap->cap_zone == zone);

        if (CAP_ENABLED(cap))
                cap_disable(&capped_zones, cap);
}

/*
 * Apply the specified callback to all caps contained in the list `l'.
 */
static void
cap_walk(list_t *l, void (*cb)(cpucap_t *, int64_t))
{
        static uint64_t cpucap_walk_gen;
        cpucap_t *cap;

        ASSERT(MUTEX_HELD(&caps_lock));

        for (cap = list_head(l); cap != NULL; cap = list_next(l, cap)) {
                (*cb)(cap, cpucap_walk_gen);
        }

        atomic_inc_64(&cpucap_walk_gen);
}
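
/*
 * The generation counter handed to the walker callbacks lets a callback
 * recognize the first time it runs within a single walk. An illustrative
 * trace for a capped zone containing two capped projects A and B:
 *
 *      tick N, gen == G:
 *          project A: zcap->cap_gen != G, so reset the zone usage to 0,
 *                     set cap_gen = G, then add A's usage
 *          project B: zcap->cap_gen == G, so just add B's usage
 *      tick N+1, gen == G+1: the reset happens again at the first project.
 *
 * (See cap_project_usage_walker() below for the actual use.)
 */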

/*
 * If the cap limit is not reached, make one thread from the wait queue
 * runnable. The waitq_isempty check is performed without the waitq lock. If a
 * new thread is placed on the waitq right after the check, it will be picked
 * up during the next invocation of cap_poke_waitq().
 */
/* ARGSUSED */
static void
cap_poke_waitq(cpucap_t *cap, int64_t gen)
{
        ASSERT(MUTEX_HELD(&caps_lock));

        if (cap->cap_usage >= cap->cap_value) {
                cap->cap_above++;
        } else {
                waitq_t *wq = &cap->cap_waitq;

                cap->cap_below++;

                if (!waitq_isempty(wq))
                        waitq_runone(wq);
        }
}

/*
 * The callback function invoked for every cap on the capped_projects list:
 * - Decay the cap usage by CAP_DECAY_FACTOR.
 * - Add this cap's project usage to its zone usage.
 * - Wake up a thread from the cap waitq if the cap is not reached.
 */
static void
cap_project_usage_walker(cpucap_t *cap, int64_t gen)
{
        zone_t *zone = cap->cap_zone;
        hrtime_t cap_usage = cap->cap_usage;

        ASSERT(MUTEX_HELD(&caps_lock));
        ASSERT(cap->cap_project->kpj_cpucap == cap);
        ASSERT(zone == cap->cap_project->kpj_zone);
        ASSERT(CAP_ENABLED(cap));

        /*
         * Update the above/below statistics and, if the project is under its
         * cap, make one thread from its wait queue runnable.
         */
        cap_poke_waitq(cap, 0);

        /*
         * Add the project's CPU usage to its zone's CPU usage.
         */
        if (ZONE_IS_CAPPED(zone)) {
                cpucap_t *zcap = zone->zone_cpucap;

                ASSERT(zcap->cap_zone == zone);

                /*
                 * If we haven't reset this zone's usage during this clock tick
                 * yet, then do it now. The cap_gen field is used to check
                 * whether this is the first of the zone's projects we see
                 * during this tick or a subsequent one.
                 */
                if (zcap->cap_gen != gen) {
                        if (zcap->cap_usage > zcap->cap_maxusage)
                                zcap->cap_maxusage = zcap->cap_usage;
                        zcap->cap_usage = 0;
                        zcap->cap_gen = gen;
                }
                DTRACE_PROBE2(cpucaps__zusage, cpucap_t *, zcap,
                    hrtime_t, cap_usage);
                zcap->cap_usage += cap_usage;
                /* Check for overflows */
                if (zcap->cap_usage < 0)
                        zcap->cap_usage = MAX_USAGE - 1;
        }

        /*
         * Decay project usage.
         */
        disp_lock_enter(&cap->cap_usagelock);
        cap->cap_usage -= ROUND_SCALE(cap_usage, CAP_DECAY_FACTOR);
        disp_lock_exit(&cap->cap_usagelock);
}

/*
 * On every clock tick walk the list of project caps and update the CPU usage.
 * Also walk the list of zone caps checking whether any threads should
 * transition from wait queue to run queue.
 *
 * This function gets called by the clock thread directly when there are any
 * defined caps. The only lock that it grabs is caps_lock. Nothing else grabs
 * caps_lock for long periods of time, so there should be almost no contention
 * for it.
 */
static void
caps_update()
{
        mutex_enter(&caps_lock);
        cap_walk(&capped_projects, cap_project_usage_walker);
        cap_walk(&capped_zones, cap_poke_waitq);
        mutex_exit(&caps_lock);
}
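
/*
 * Per-tick control flow, summarized (a sketch of the code above):
 *
 *      clock()
 *        -> (*cpucaps_clock_callout)()   == caps_update() when any caps
 *                                           are defined
 *             -> cap_walk(&capped_projects, cap_project_usage_walker)
 *                  decay project usage, aggregate zone usage, poke the
 *                  project wait queues
 *             -> cap_walk(&capped_zones, cap_poke_waitq)
 *                  wake one waiter for every zone below its cap
 */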

/*
 * This function is called for each project in a zone when the zone cap is
 * modified. It enables project caps if the zone cap is enabled, and disables
 * them if the zone cap is disabled and the project doesn't have its own cap.
 *
 * For each project that does not have a cpucap structure allocated yet, it
 * allocates a new structure and assigns it to kpj->kpj_cpucap. The allocation
 * is performed without holding caps_lock to avoid using a KM_SLEEP allocation
 * with caps_lock held.
 */
static int
cap_project_zone_modify_walker(kproject_t *kpj, void *arg)
{
        cpucap_t *project_cap = NULL;
        cpucap_t *zone_cap = (cpucap_t *)arg;

        ASSERT(zone_cap != NULL);

        if (kpj->kpj_cpucap == NULL) {
                /*
                 * This is the first time any cap was established for this
                 * project. Allocate a new cpucap structure for it.
                 */
                project_cap = cap_alloc();
        }

        mutex_enter(&caps_lock);

        /*
         * Double-check that kpj_cpucap is still NULL - now with caps_lock
         * held - and assign the newly allocated cpucap structure to it.
         */
        if (kpj->kpj_cpucap == NULL) {
                kpj->kpj_cpucap = project_cap;
        } else if (project_cap != NULL) {
                cap_free(project_cap);
        }

        project_cap = kpj->kpj_cpucap;

        if (CAP_DISABLED(zone_cap)) {
                /*
                 * Remove all projects in this zone without their own caps
                 * from the capped_projects list.
                 */
                if (project_cap->cap_value == MAX_USAGE) {
                        cap_project_disable(kpj);
                }
        } else if (CAP_DISABLED(project_cap)) {
                /*
                 * Add the project to the capped_projects list.
                 */
                ASSERT(project_cap->cap_value == 0);
                cap_project_enable(kpj, MAX_USAGE);
        }
        mutex_exit(&caps_lock);

        return (0);
}
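
/*
 * Project cap states after the walker runs (a sketch; MAX_USAGE acts as a
 * "track only" sentinel value that can never be reached):
 *
 *      zone capped, project capped:    cap_value == the project's own cap
 *      zone capped, project uncapped:  cap_value == MAX_USAGE; the project
 *                                      is never throttled on its own, but
 *                                      its usage still feeds the zone
 *                                      aggregate
 *      zone uncapped, project uncapped: removed from capped_projects
 */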

/*
 * Set the zone cap to cap_val.
 * If cap_val is equal to NOCAP, disable the zone cap.
 *
 * If this is the first time a cap is set on the zone, allocate the cpucap
 * structure without holding caps_lock to avoid a KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_zone_set(zone_t *zone, rctl_qty_t cap_val)
{
        cpucap_t *cap = NULL;
        hrtime_t value;

        if (cap_val == 0)
                return (EINVAL);

        ASSERT(cap_val <= MAXCAP);
        if (cap_val > MAXCAP)
                cap_val = MAXCAP;

        /*
         * Nothing to do if trying to disable a cap on a zone when caps are
         * off or on a zone which does not have a cap yet.
         */
        if ((CPUCAPS_OFF() || !ZONE_IS_CAPPED(zone)) && (cap_val == NOCAP))
                return (0);

        if (zone->zone_cpucap == NULL)
                cap = cap_alloc();

        mutex_enter(&caps_lock);

        if (cpucaps_busy) {
                mutex_exit(&caps_lock);
                return (EBUSY);
        }

        /*
         * Double-check whether zone->zone_cpucap is NULL, now with caps_lock
         * held. If it is still NULL, assign the newly allocated cpucap to it.
         */
        if (zone->zone_cpucap == NULL) {
                zone->zone_cpucap = cap;
        } else if (cap != NULL) {
                cap_free(cap);
        }

        cap = zone->zone_cpucap;
        value = cap_val * cap_tick_cost;
        if (value < 0)
                value = MAX_USAGE;

        /* Nothing to do if the value is staying the same */
        if (value == cap->cap_value) {
                mutex_exit(&caps_lock);
                return (0);
        }

        /*
         * Clear cap statistics since the cap value itself changes.
         */
        cap->cap_above = cap->cap_below = 0;

        if (cap_val == NOCAP) {
                if (CAP_ENABLED(cap)) {
                        /*
                         * Remove the cap for the zone.
                         */
                        cap_zone_disable(zone);
                        cpucaps_busy = B_TRUE;
                        mutex_exit(&caps_lock);
                        /*
                         * Disable caps for all projects belonging to this
                         * zone unless they have their own cap.
                         */
                        (void) project_walk_all(zone->zone_id,
                            cap_project_zone_modify_walker, cap);

                        mutex_enter(&caps_lock);
                        cpucaps_busy = B_FALSE;
                }
        } else if (CAP_DISABLED(cap)) {
                /*
                 * Set a cap on a zone which previously was not capped.
                 */
                cap_zone_enable(zone, value);
                cpucaps_busy = B_TRUE;
                mutex_exit(&caps_lock);

                /*
                 * Enable caps for all projects belonging to this zone.
                 */
                (void) project_walk_all(zone->zone_id,
                    cap_project_zone_modify_walker, cap);

                mutex_enter(&caps_lock);
                cpucaps_busy = B_FALSE;
        } else {
                /*
                 * No state transitions, just change the value.
                 */
                cap->cap_value = value;
        }

        ASSERT(MUTEX_HELD(&caps_lock));
        ASSERT(!cpucaps_busy);
        mutex_exit(&caps_lock);

        return (0);
}

/*
 * The project is going away so disable its cap.
 */
void
cpucaps_project_remove(kproject_t *kpj)
{
        mutex_enter(&caps_lock);
        if (PROJECT_IS_CAPPED(kpj))
                cap_project_disable(kpj);
        if (kpj->kpj_cpucap != NULL) {
                cap_free(kpj->kpj_cpucap);
                kpj->kpj_cpucap = NULL;
        }
        mutex_exit(&caps_lock);
}

/*
 * The zone is going away, so disable its cap.
 */
void
cpucaps_zone_remove(zone_t *zone)
{
        mutex_enter(&caps_lock);
        while (ZONE_IS_CAPPED(zone)) {
                mutex_exit(&caps_lock);
                (void) cpucaps_zone_set(zone, NOCAP);
                mutex_enter(&caps_lock);
        }
        if (zone->zone_cpucap != NULL) {
                cap_free(zone->zone_cpucap);
                zone->zone_cpucap = NULL;
        }
        mutex_exit(&caps_lock);
}

/*
 * A new project was created. It should be put on the capped_projects list if
 * its zone has a cap.
 */
void
cpucaps_project_add(kproject_t *kpj)
{
        cpucap_t *cap = NULL;

        if (CPUCAPS_OFF() || !ZONE_IS_CAPPED(kpj->kpj_zone))
                return;

        /*
         * This project was never capped before, so allocate its cap
         * structure.
         */
        if (kpj->kpj_cpucap == NULL)
                cap = cap_alloc();

        mutex_enter(&caps_lock);
        /*
         * Double-check with caps_lock held.
         */
        if (kpj->kpj_cpucap == NULL) {
                kpj->kpj_cpucap = cap;
        } else if (cap != NULL) {
                cap_free(cap);
        }

        if (ZONE_IS_CAPPED(kpj->kpj_zone))
                cap_project_enable(kpj, MAX_USAGE);

        mutex_exit(&caps_lock);
}
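
/*
 * Illustrative call sequences for the set interfaces (hypothetical values;
 * in practice these entry points are driven by the zone.cpu-cap and
 * project.cpu-cap resource controls):
 *
 *      (void) cpucaps_zone_set(zone, 150);     cap the zone at 1.5 CPUs
 *      (void) cpucaps_project_set(kpj, 50);    cap the project at 0.5 CPU
 *      (void) cpucaps_project_set(kpj, NOCAP); drop the project cap
 *      (void) cpucaps_zone_set(zone, NOCAP);   drop the zone cap
 */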

/*
 * Set the project cap to cap_val.
 * If cap_val is equal to NOCAP, disable the project cap.
 *
 * If this is the first time a cap is set on the project, allocate the cpucap
 * structure without holding caps_lock to avoid a KM_SLEEP allocation with
 * caps_lock held.
 */
int
cpucaps_project_set(kproject_t *kpj, rctl_qty_t cap_val)
{
        cpucap_t *cap = NULL;
        hrtime_t value;

        if (cap_val == 0)
                return (EINVAL);

        ASSERT(cap_val <= MAXCAP);
        if (cap_val > MAXCAP)
                cap_val = MAXCAP;

        /*
         * Nothing to do if trying to disable the project cap and caps are
         * not enabled, or if trying to disable the cap on a project that
         * does not have a cap enabled.
         */
        if ((cap_val == NOCAP) && (CPUCAPS_OFF() || !PROJECT_IS_CAPPED(kpj)))
                return (0);

        if (kpj->kpj_cpucap == NULL) {
                /*
                 * This project was never capped before, so allocate its cap
                 * structure.
                 */
                cap = cap_alloc();
        }

        mutex_enter(&caps_lock);

        /*
         * Double-check with caps_lock held.
         */
        if (kpj->kpj_cpucap == NULL) {
                kpj->kpj_cpucap = cap;
        } else if (cap != NULL) {
                cap_free(cap);
        }

        /*
         * Get the actual pointer to the project cap.
         */
        cap = kpj->kpj_cpucap;
        value = cap_val * cap_tick_cost;
        if (value < 0)
                value = MAX_USAGE;

        /*
         * Nothing to do if the value is not changing.
         */
        if (value == cap->cap_value) {
                mutex_exit(&caps_lock);
                return (0);
        }

        /*
         * Clear cap statistics since the cap value itself changes.
         */
        cap->cap_above = cap->cap_below = 0;
        cap->cap_maxusage = 0;

        if (cap_val != NOCAP) {
                /*
                 * Enable this cap if it is not already enabled.
                 */
                if (CAP_DISABLED(cap))
                        cap_project_enable(kpj, value);
                else
                        cap->cap_value = value;
        } else if (CAP_ENABLED(cap)) {
                /*
                 * The user requested to drop the cap on the project. If it
                 * is part of a capped zone, keep the cap and set the value
                 * to MAX_USAGE, otherwise disable the cap.
                 */
                if (ZONE_IS_CAPPED(kpj->kpj_zone)) {
                        cap->cap_value = MAX_USAGE;
                } else {
                        cap_project_disable(kpj);
                }
        }
        mutex_exit(&caps_lock);

        return (0);
}

/*
 * Get cap usage.
 */
static rctl_qty_t
cap_get(cpucap_t *cap)
{
        return (cap != NULL ? (rctl_qty_t)(cap->cap_usage / cap_tick_cost) : 0);
}

/*
 * Get current project usage.
 */
rctl_qty_t
cpucaps_project_get(kproject_t *kpj)
{
        return (cap_get(kpj->kpj_cpucap));
}

/*
 * Get current zone usage.
 */
rctl_qty_t
cpucaps_zone_get(zone_t *zone)
{
        return (cap_get(zone->zone_cpucap));
}

/*
 * Charge the project of thread t for the time it spent on CPU since the last
 * adjustment.
 *
 * Record the current on-CPU time in the csc structure.
 *
 * Do not adjust for more than one tick worth of time.
 *
 * It is possible that the project cap is being disabled while this routine
 * is executed. This should not cause any issues since the association
 * between the thread and its project is protected by the thread lock.
 */
static void
caps_charge_adjust(kthread_id_t t, caps_sc_t *csc)
{
        kproject_t *kpj = ttoproj(t);
        hrtime_t new_usage;
        hrtime_t usage_delta;

        ASSERT(THREAD_LOCK_HELD(t));
        ASSERT(kpj->kpj_cpucap != NULL);

        /* Get on-CPU time since the birth of the thread */
        new_usage = mstate_thread_onproc_time(t);

        /* Time spent on CPU since last checked */
        usage_delta = new_usage - csc->csc_cputime;

        /* Save the accumulated on-CPU time */
        csc->csc_cputime = new_usage;

        /* Charge at most one tick worth of on-CPU time */
        if (usage_delta > cap_tick_cost)
                usage_delta = cap_tick_cost;

        /* Add usage_delta to the project usage value. */
        if (usage_delta > 0) {
                cpucap_t *cap = kpj->kpj_cpucap;

                DTRACE_PROBE2(cpucaps__project__charge,
                    kthread_id_t, t, hrtime_t, usage_delta);

                disp_lock_enter_high(&cap->cap_usagelock);
                cap->cap_usage += usage_delta;

                /* Check for overflows */
                if (cap->cap_usage < 0)
                        cap->cap_usage = MAX_USAGE - 1;

                disp_lock_exit_high(&cap->cap_usagelock);

                /*
                 * cap_maxusage is only kept for observability. Move it
                 * outside the lock to reduce the time spent while holding
                 * the lock.
                 */
                if (cap->cap_usage > cap->cap_maxusage)
                        cap->cap_maxusage = cap->cap_usage;
        }
}
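
/*
 * Charging examples (illustrative; hz = 100, so cap_tick_cost == 10ms):
 *
 *      thread ran 3ms since the last charge  -> usage_delta = 3,000,000 ns
 *      thread ran 27ms since the last charge -> clamped to cap_tick_cost,
 *                                               usage_delta = 10,000,000 ns
 *
 * The clamp keeps any single charge within one tick's worth of time, which
 * is the granularity at which usage is accounted and decayed.
 */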

/*
 * Charge the thread's project and return True if the thread should be
 * penalized because its project or zone is exceeding its cap. Also sets
 * TS_PROJWAITQ or TS_ZONEWAITQ in this case.
 *
 * It is possible that the project cap is being disabled while this routine
 * is executed. This should not cause any issues since the association
 * between the thread and its project is protected by the thread lock. It
 * will still set TS_PROJWAITQ/TS_ZONEWAITQ in this case, but
 * cpucaps_enforce() will not place anything on the blocked wait queue.
 */
boolean_t
cpucaps_charge(kthread_id_t t, caps_sc_t *csc, cpucaps_charge_t charge_type)
{
        kproject_t *kpj = ttoproj(t);
        klwp_t *lwp = t->t_lwp;
        zone_t *zone;
        cpucap_t *project_cap;
        boolean_t rc = B_FALSE;

        ASSERT(THREAD_LOCK_HELD(t));

        /* Nothing to do for projects that are not capped. */
        if (lwp == NULL || !PROJECT_IS_CAPPED(kpj))
                return (B_FALSE);

        caps_charge_adjust(t, csc);

        /*
         * The caller only requested that we charge the project's usage; skip
         * the enforcement part.
         */
        if (charge_type == CPUCAPS_CHARGE_ONLY)
                return (B_FALSE);

        project_cap = kpj->kpj_cpucap;

        if (project_cap->cap_usage >= project_cap->cap_value) {
                t->t_schedflag |= TS_PROJWAITQ;
                rc = B_TRUE;
        } else if (t->t_schedflag & TS_PROJWAITQ) {
                t->t_schedflag &= ~TS_PROJWAITQ;
        }

        zone = ttozone(t);
        if (!ZONE_IS_CAPPED(zone)) {
                if (t->t_schedflag & TS_ZONEWAITQ)
                        t->t_schedflag &= ~TS_ZONEWAITQ;
        } else {
                cpucap_t *zone_cap = zone->zone_cpucap;

                if (zone_cap->cap_usage >= zone_cap->cap_value) {
                        t->t_schedflag |= TS_ZONEWAITQ;
                        rc = B_TRUE;
                } else if (t->t_schedflag & TS_ZONEWAITQ) {
                        t->t_schedflag &= ~TS_ZONEWAITQ;
                }
        }

        return (rc);
}

/*
 * Enforce CPU caps. If the thread got preempted in user-land, we know that
 * it does not hold any kernel locks, so enqueue it on the waitq, if needed.
 *
 * CPU Caps are only enforced for user threads.
 *
 * Threads flagged with TS_PROJWAITQ are placed on their project wait queues
 * and threads marked with TS_ZONEWAITQ are placed on their zone wait queue.
 *
 * It is possible that by the time we enter cpucaps_enforce() the cap is
 * already disabled. In this case waitq_enqueue() fails and doesn't enqueue
 * anything. We still clear the TS_PROJWAITQ/TS_ZONEWAITQ flags in this case
 * since they no longer apply.
 */
boolean_t
cpucaps_enforce(kthread_t *t)
{
        klwp_t *lwp = t->t_lwp;

        ASSERT(THREAD_LOCK_HELD(t));

        if (lwp != NULL && lwp->lwp_state == LWP_USER) {
                if (t->t_schedflag & TS_PROJWAITQ) {
                        ASSERT(ttoproj(t)->kpj_cpucap != NULL);
                        t->t_schedflag &= ~TS_ANYWAITQ;
                        if (waitq_enqueue(&(ttoproj(t)->kpj_cpucap->cap_waitq),
                            t)) {
                                return (B_TRUE);
                        }
                }
                if (t->t_schedflag & TS_ZONEWAITQ) {
                        ASSERT(ttozone(t)->zone_cpucap != NULL);
                        t->t_schedflag &= ~TS_ZONEWAITQ;
                        if (waitq_enqueue(&(ttozone(t)->zone_cpucap->cap_waitq),
                            t)) {
                                return (B_TRUE);
                        }
                }
        }

        /*
         * The thread is not enqueued on the wait queue.
         */
        return (B_FALSE);
}
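
/*
 * How a scheduling class typically drives the charge/enforce pair (a minimal
 * sketch following the pattern of the TS class; "tspp" and its ts_caps field
 * stand in for the class-private per-thread data and are hypothetical here):
 *
 *      thread_lock(t);
 *      ...
 *      if (CPUCAPS_CHARGE(t, &tspp->ts_caps, CPUCAPS_CHARGE_ENFORCE)) {
 *              ... the thread is over its cap: make it surrender the CPU,
 *                  which eventually leads to CL_PREEMPT() and then to
 *                  CPUCAPS_ENFORCE() placing it on a wait queue ...
 *      }
 */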

/*
 * Convert internal cap statistics into values exported by the cap kstat.
 */
static int
cap_kstat_update(kstat_t *ksp, int rw)
{
        struct cap_kstat *capsp = &cap_kstat;
        cpucap_t *cap = ksp->ks_private;
        clock_t tick_sec = SEC_TO_TICK(1);
        char *zonename = cap->cap_zone->zone_name;

        if (rw == KSTAT_WRITE)
                return (EACCES);

        capsp->cap_value.value.ui64 =
            ROUND_SCALE(cap->cap_value, cap_tick_cost);
        capsp->cap_usage.value.ui64 =
            ROUND_SCALE(cap->cap_usage, cap_tick_cost);
        capsp->cap_maxusage.value.ui64 =
            ROUND_SCALE(cap->cap_maxusage, cap_tick_cost);
        capsp->cap_nwait.value.ui64 = cap->cap_waitq.wq_count;
        capsp->cap_below.value.ui64 = ROUND_SCALE(cap->cap_below, tick_sec);
        capsp->cap_above.value.ui64 = ROUND_SCALE(cap->cap_above, tick_sec);
        kstat_named_setstr(&capsp->cap_zonename, zonename);

        return (0);
}
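
/*
 * Units of the exported values, with an illustrative reading: if
 * cap_usage == 1,500,000,000 ns and cap_tick_cost == 10,000,000 ns, the
 * "usage" kstat reads 150, i.e. 150% of a single CPU. "above_sec" and
 * "below_sec" are tick counts scaled to seconds, and "nwait" is the
 * instantaneous wait queue depth.
 */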