/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */

/*
 * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
 * Use is subject to license terms.
 */

/*
 * vm_usage
 *
 * This file implements the getvmusage() private system call.
 * getvmusage() counts the amount of resident memory pages and swap
 * reserved by the specified process collective.  A "process collective" is
 * the set of processes owned by a particular zone, project, task, or user.
 *
 * rss and swap are counted so that for a given process collective, a page is
 * only counted once.  For example, this means that if multiple processes in
 * the same project map the same page, then the project will only be charged
 * once for that page.  On the other hand, if two processes in different
 * projects map the same page, then both projects will be charged
 * for the page.
 *
 * The vm_getusage() calculation is implemented so that the first thread
 * performs the rss/swap counting.  Other callers will wait for that thread to
 * finish, copying the results.  This enables multiple rcapds and prstats to
 * consume data from the same calculation.  The results are also cached so that
 * a caller interested in recent results can just copy them instead of starting
 * a new calculation.  The caller passes the maximum age (in seconds) of the
 * data.  If the cached data is young enough, the cache is copied, otherwise,
 * a new calculation is executed and the cache is replaced with the new
 * data.
 *
 * The rss calculation for each process collective is as follows:
 *
 *   - Inspect flags, determine if counting rss for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *	- Figure out proc's collectives (zone, project, task, and/or user).
 *	- For each seg in proc's address space:
 *		- If seg is private:
 *			- Lookup anons in the amp.
 *			- For incore pages not previously visited for each of
 *			  the proc's collectives, add incore pagesize to each
 *			  collective.
 *			  Anons with a refcnt of 1 can be assumed to have not
 *			  been previously visited.
 *			- For address ranges without anons in the amp:
 *				- Lookup pages in underlying vnode.
 *				- For incore pages not previously visited for
 *				  each of the proc's collectives, add incore
 *				  pagesize to each collective.
 *		- If seg is shared:
 *			- Lookup pages in the shared amp or vnode.
 *			- For incore pages not previously visited for each of
 *			  the proc's collectives, add incore pagesize to each
 *			  collective.
 *
 * Swap is reserved by private segments, and shared anonymous segments.
 * The only shared anon segments which do not reserve swap are ISM segments
 * and schedctl segments, both of which can be identified by having
 * amp->swresv == 0.
 *
 * The swap calculation for each collective is as follows:
 *
 *   - Inspect flags, determine if counting swap for zones, projects, tasks,
 *     and/or users.
 *   - For each proc:
 *	- Figure out proc's collectives (zone, project, task, and/or user).
 *	- For each seg in proc's address space:
 *		- If seg is private:
 *			- Add svd->swresv pages to swap count for each of the
 *			  proc's collectives.
 *		- If seg is anon, shared, and amp->swresv != 0
 *			- For address ranges in amp not previously visited for
 *			  each of the proc's collectives, add size of address
 *			  range to the swap count for each collective.
 *
 * These two calculations are done simultaneously, with most of the work
 * being done in vmu_calculate_seg().  The results of the calculation are
 * copied into "vmu_data.vmu_cache->vmc_results".
 *
 * To perform the calculation, various things are tracked and cached:
 *
 *    - incore/not-incore page ranges for all vnodes.
 *	(vmu_data.vmu_all_vnodes_hash)
 *	This eliminates looking up the same page more than once.
 *
 *    - incore/not-incore page ranges for all shared amps.
 *	(vmu_data.vmu_all_amps_hash)
 *	This eliminates looking up the same page more than once.
 *
 *    - visited page ranges for each collective.
 *	- per vnode (entity->vme_vnode_hash)
 *	- per shared amp (entity->vme_amp_hash)
 *	For accurate counting of map-shared and COW-shared pages.
 *
 *    - visited private anons (refcnt > 1) for each collective.
 *	(entity->vme_anon_hash)
 *	For accurate counting of COW-shared pages.
 *
 * The common accounting structure is the vmu_entity_t, which represents
 * collectives:
 *
 *    - A zone.
 *    - A project, task, or user within a zone.
 *    - The entire system (vmu_data.vmu_system).
 *    - Each collapsed (col) project and user.  This means a given projid or
 *	uid, regardless of which zone the process is in.  For instance,
 *	project 0 in the global zone and project 0 in a non-global zone are
 *	the same collapsed project.
 *
 * Each entity structure tracks which pages have been already visited for
 * that entity (via previously inspected processes) so that these pages are
 * not double counted.
 */
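
/*
 * For illustration, a userland consumer such as prstat(1M) or rcapd(1M)
 * typically reaches this code through the getvmusage(2) wrapper; a call
 * along the lines of:
 *
 *	vmusage_t res[32];
 *	size_t nres = 32;
 *
 *	if (getvmusage(VMUSAGE_ZONE | VMUSAGE_PROJECTS, 30, res, &nres) == 0)
 *		... the first nres entries of res[] are valid ...
 *
 * asks for per-zone and per-project usage and accepts cached results up to
 * 30 seconds old; if the cache is older, a new calculation is performed.
 */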

#include <sys/errno.h>
#include <sys/types.h>
#include <sys/zone.h>
#include <sys/proc.h>
#include <sys/project.h>
#include <sys/task.h>
#include <sys/thread.h>
#include <sys/time.h>
#include <sys/mman.h>
#include <sys/modhash.h>
#include <sys/modhash_impl.h>
#include <sys/shm.h>
#include <sys/swap.h>
#include <sys/synch.h>
#include <sys/systm.h>
#include <sys/var.h>
#include <sys/vm_usage.h>
#include <sys/zone.h>
#include <sys/sunddi.h>
#include <sys/avl.h>
#include <vm/anon.h>
#include <vm/as.h>
#include <vm/seg_vn.h>
#include <vm/seg_spt.h>

#define	VMUSAGE_HASH_SIZE		512

#define	VMUSAGE_TYPE_VNODE		1
#define	VMUSAGE_TYPE_AMP		2
#define	VMUSAGE_TYPE_ANON		3

#define	VMUSAGE_BOUND_UNKNOWN		0
#define	VMUSAGE_BOUND_INCORE		1
#define	VMUSAGE_BOUND_NOT_INCORE	2

#define	ISWITHIN(node, addr)	((node)->vmb_start <= addr && \
				    (node)->vmb_end >= addr ? 1 : 0)

/*
 * bounds for vnodes and shared amps
 * Each bound is either entirely incore, entirely not in core, or
 * entirely unknown.  bounds are stored in an avl tree sorted by start member
 * when in use, otherwise (free or temporary lists) they're strung
 * together off of vmb_next.
 */
typedef struct vmu_bound {
	avl_node_t vmb_node;
	struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
	pgcnt_t vmb_start;  /* page offset in vnode/amp on which bound starts */
	pgcnt_t	vmb_end;    /* page offset in vnode/amp on which bound ends */
	char	vmb_type;   /* One of VMUSAGE_BOUND_* */
} vmu_bound_t;

/*
 * hash of visited objects (vnodes or shared amps)
 * key is address of vnode or amp.  Bounds lists known incore/non-incore
 * bounds for vnode/amp.
 */
typedef struct vmu_object {
	struct vmu_object	*vmo_next;	/* free list */
	caddr_t		vmo_key;
	short		vmo_type;
	avl_tree_t	vmo_bounds;
} vmu_object_t;
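
/*
 * As an example of this representation, a vnode whose pages 0 through 9 have
 * been found resident and whose pages 10 through 24 have been found not
 * resident would be described in its vmu_object_t by two bounds in
 * vmo_bounds:  [0, 9] of type VMUSAGE_BOUND_INCORE and [10, 24] of type
 * VMUSAGE_BOUND_NOT_INCORE.
 */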

/*
 * Entity by which to count results.
 *
 * The entity structure keeps the current rss/swap counts for each entity
 * (zone, project, etc), and hashes of vm structures that have already
 * been visited for the entity.
 *
 * vme_next:	links the list of all entities currently being counted by
 *		vmu_calculate().
 *
 * vme_next_calc: links the list of entities related to the current process
 *		  being counted by vmu_calculate_proc().
 *
 * vmu_calculate_proc() walks all processes.  For each process, it makes a
 * list of the entities related to that process using vme_next_calc.  This
 * list changes each time vmu_calculate_proc() is called.
 *
 */
typedef struct vmu_entity {
	struct vmu_entity *vme_next;
	struct vmu_entity *vme_next_calc;
	mod_hash_t	*vme_vnode_hash; /* vnodes visited for entity */
	mod_hash_t	*vme_amp_hash;	 /* shared amps visited for entity */
	mod_hash_t	*vme_anon_hash;	 /* COW anons visited for entity */
	vmusage_t	vme_result;	 /* identifies entity and results */
} vmu_entity_t;

/*
 * Hash of entities visited within a zone, and an entity for the zone
 * itself.
 */
typedef struct vmu_zone {
	struct vmu_zone	*vmz_next;	/* free list */
	id_t		vmz_id;
	vmu_entity_t	*vmz_zone;
	mod_hash_t	*vmz_projects_hash;
	mod_hash_t	*vmz_tasks_hash;
	mod_hash_t	*vmz_rusers_hash;
	mod_hash_t	*vmz_eusers_hash;
} vmu_zone_t;

/*
 * Cache of results from last calculation
 */
typedef struct vmu_cache {
	vmusage_t	*vmc_results;	/* Results from last call to */
					/* vm_getusage(). */
	uint64_t	vmc_nresults;	/* Count of cached results */
	uint64_t	vmc_refcnt;	/* refcnt for free */
	uint_t		vmc_flags;	/* Flags for vm_getusage() */
	hrtime_t	vmc_timestamp;	/* when cache was created */
} vmu_cache_t;

/*
 * top level rss info for the system
 */
typedef struct vmu_data {
	kmutex_t	vmu_lock;		/* Protects vmu_data */
	kcondvar_t	vmu_cv;			/* Used to signal threads */
						/* waiting for the rss calc */
						/* thread to finish */
	vmu_entity_t	*vmu_system;		/* Entity for tracking */
						/* rss/swap for all processes */
						/* in all zones */
	mod_hash_t	*vmu_zones_hash;	/* Zones visited */
	mod_hash_t	*vmu_projects_col_hash; /* These *_col_hash hashes */
	mod_hash_t	*vmu_rusers_col_hash;	/* keep track of entities, */
	mod_hash_t	*vmu_eusers_col_hash;	/* ignoring zoneid, in order */
						/* to implement VMUSAGE_COL_* */
						/* flags, which aggregate by */
						/* project or user regardless */
						/* of zoneid. */
	mod_hash_t	*vmu_all_vnodes_hash;	/* System wide visited vnodes */
						/* to track incore/not-incore */
	mod_hash_t	*vmu_all_amps_hash;	/* System wide visited shared */
						/* amps to track incore/not- */
						/* incore */
	vmu_entity_t	*vmu_entities;		/* Linked list of entities */
	size_t		vmu_nentities;		/* Count of entities in list */
	vmu_cache_t	*vmu_cache;		/* Cached results */
	kthread_t	*vmu_calc_thread;	/* NULL, or thread running */
						/* vmu_calculate() */
	uint_t		vmu_calc_flags;		/* Flags being used by */
						/* currently running calc */
						/* thread */
	uint_t		vmu_pending_flags;	/* Flags of vm_getusage() */
						/* threads waiting for */
						/* calc thread to finish */
	uint_t		vmu_pending_waiters;	/* Number of threads waiting */
						/* for calc thread */
	vmu_bound_t	*vmu_free_bounds;
	vmu_object_t	*vmu_free_objects;
	vmu_entity_t	*vmu_free_entities;
	vmu_zone_t	*vmu_free_zones;
} vmu_data_t;

extern struct as kas;
extern proc_t *practive;
extern zone_t *global_zone;
extern struct seg_ops segvn_ops;
extern struct seg_ops segspt_shmops;

static vmu_data_t vmu_data;
static kmem_cache_t *vmu_bound_cache;
static kmem_cache_t *vmu_object_cache;

/*
 * Comparison routine for AVL tree.  We base our comparison on vmb_start.
 */
static int
bounds_cmp(const void *bnd1, const void *bnd2)
{
	const vmu_bound_t *bound1 = bnd1;
	const vmu_bound_t *bound2 = bnd2;

	if (bound1->vmb_start == bound2->vmb_start) {
		return (0);
	}
	if (bound1->vmb_start < bound2->vmb_start) {
		return (-1);
	}

	return (1);
}

/*
 * Save a bound on the free list.
 */
static void
vmu_free_bound(vmu_bound_t *bound)
{
	bound->vmb_next = vmu_data.vmu_free_bounds;
	bound->vmb_start = 0;
	bound->vmb_end = 0;
	bound->vmb_type = 0;
	vmu_data.vmu_free_bounds = bound;
}

/*
 * Free an object, and all visited bound info.
 */
static void
vmu_free_object(mod_hash_val_t val)
{
	vmu_object_t *obj = (vmu_object_t *)val;
	avl_tree_t *tree = &(obj->vmo_bounds);
	vmu_bound_t *bound;
	void *cookie = NULL;

	while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
		vmu_free_bound(bound);
	avl_destroy(tree);

	obj->vmo_type = 0;
	obj->vmo_next = vmu_data.vmu_free_objects;
	vmu_data.vmu_free_objects = obj;
}

/*
 * Free an entity, and hashes of visited objects for that entity.
 */
static void
vmu_free_entity(mod_hash_val_t val)
{
	vmu_entity_t *entity = (vmu_entity_t *)val;

	if (entity->vme_vnode_hash != NULL)
		i_mod_hash_clear_nosync(entity->vme_vnode_hash);
	if (entity->vme_amp_hash != NULL)
		i_mod_hash_clear_nosync(entity->vme_amp_hash);
	if (entity->vme_anon_hash != NULL)
		i_mod_hash_clear_nosync(entity->vme_anon_hash);

	entity->vme_next = vmu_data.vmu_free_entities;
	vmu_data.vmu_free_entities = entity;
}

/*
 * Free zone entity, and all hashes of entities inside that zone,
 * which are projects, tasks, and users.
 */
static void
vmu_free_zone(mod_hash_val_t val)
{
	vmu_zone_t *zone = (vmu_zone_t *)val;

	if (zone->vmz_zone != NULL) {
		vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
		zone->vmz_zone = NULL;
	}
	if (zone->vmz_projects_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_projects_hash);
	if (zone->vmz_tasks_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
	if (zone->vmz_rusers_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
	if (zone->vmz_eusers_hash != NULL)
		i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
	zone->vmz_next = vmu_data.vmu_free_zones;
	vmu_data.vmu_free_zones = zone;
}

/*
 * Initialize synchronization primitives and hashes for system-wide tracking
 * of visited vnodes and shared amps.  Initialize results cache.
 */
void
vm_usage_init()
{
	mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
	cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);

	vmu_data.vmu_system = NULL;
	vmu_data.vmu_zones_hash = NULL;
	vmu_data.vmu_projects_col_hash = NULL;
	vmu_data.vmu_rusers_col_hash = NULL;
	vmu_data.vmu_eusers_col_hash = NULL;

	vmu_data.vmu_free_bounds = NULL;
	vmu_data.vmu_free_objects = NULL;
	vmu_data.vmu_free_entities = NULL;
	vmu_data.vmu_free_zones = NULL;

	vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
	    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
	    sizeof (vnode_t));
	vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
	    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
	    sizeof (struct anon_map));
	vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
	    "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
	    vmu_free_entity);
	vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
	    "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
	    vmu_free_entity);
	vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
	    "vmusage collapsed euser hash", VMUSAGE_HASH_SIZE,
	    vmu_free_entity);
	vmu_data.vmu_zones_hash = mod_hash_create_idhash(
	    "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);

	vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
	    sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
	vmu_object_cache = kmem_cache_create("vmu_object_cache",
	    sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);

	vmu_data.vmu_entities = NULL;
	vmu_data.vmu_nentities = 0;

	vmu_data.vmu_cache = NULL;
	vmu_data.vmu_calc_thread = NULL;
	vmu_data.vmu_calc_flags = 0;
	vmu_data.vmu_pending_flags = 0;
	vmu_data.vmu_pending_waiters = 0;
}

/*
 * Allocate hashes for tracking vm objects visited for an entity.
 * Update list of entities.
 */
static vmu_entity_t *
vmu_alloc_entity(id_t id, int type, id_t zoneid)
{
	vmu_entity_t *entity;

	if (vmu_data.vmu_free_entities != NULL) {
		entity = vmu_data.vmu_free_entities;
		vmu_data.vmu_free_entities =
		    vmu_data.vmu_free_entities->vme_next;
		bzero(&entity->vme_result, sizeof (vmusage_t));
	} else {
		entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
	}
	entity->vme_result.vmu_id = id;
	entity->vme_result.vmu_zoneid = zoneid;
	entity->vme_result.vmu_type = type;

	if (entity->vme_vnode_hash == NULL)
		entity->vme_vnode_hash = mod_hash_create_ptrhash(
		    "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
		    sizeof (vnode_t));

	if (entity->vme_amp_hash == NULL)
		entity->vme_amp_hash = mod_hash_create_ptrhash(
		    "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
		    sizeof (struct anon_map));

	if (entity->vme_anon_hash == NULL)
		entity->vme_anon_hash = mod_hash_create_ptrhash(
		    "vmusage anon hash", VMUSAGE_HASH_SIZE,
		    mod_hash_null_valdtor, sizeof (struct anon));

	entity->vme_next = vmu_data.vmu_entities;
	vmu_data.vmu_entities = entity;
	vmu_data.vmu_nentities++;

	return (entity);
}

/*
 * Allocate a zone entity, and hashes for tracking visited vm objects
 * for projects, tasks, and users within that zone.
 */
static vmu_zone_t *
vmu_alloc_zone(id_t id)
{
	vmu_zone_t *zone;

	if (vmu_data.vmu_free_zones != NULL) {
		zone = vmu_data.vmu_free_zones;
		vmu_data.vmu_free_zones =
		    vmu_data.vmu_free_zones->vmz_next;
		zone->vmz_next = NULL;
		zone->vmz_zone = NULL;
	} else {
		zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
	}

	zone->vmz_id = id;

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
		zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
	    VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
		zone->vmz_projects_hash = mod_hash_create_idhash(
		    "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
	    != 0 && zone->vmz_tasks_hash == NULL)
		zone->vmz_tasks_hash = mod_hash_create_idhash(
		    "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
	    != 0 && zone->vmz_rusers_hash == NULL)
		zone->vmz_rusers_hash = mod_hash_create_idhash(
		    "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
	    != 0 && zone->vmz_eusers_hash == NULL)
		zone->vmz_eusers_hash = mod_hash_create_idhash(
		    "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);

	return (zone);
}

/*
 * Allocate a structure for tracking visited bounds for a vm object.
 */
static vmu_object_t *
vmu_alloc_object(caddr_t key, int type)
{
	vmu_object_t *object;

	if (vmu_data.vmu_free_objects != NULL) {
		object = vmu_data.vmu_free_objects;
		vmu_data.vmu_free_objects =
		    vmu_data.vmu_free_objects->vmo_next;
	} else {
		object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
	}

	object->vmo_next = NULL;
	object->vmo_key = key;
	object->vmo_type = type;
	avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);

	return (object);
}

/*
 * Allocate and return a bound structure.
 */
static vmu_bound_t *
vmu_alloc_bound()
{
	vmu_bound_t *bound;

	if (vmu_data.vmu_free_bounds != NULL) {
		bound = vmu_data.vmu_free_bounds;
		vmu_data.vmu_free_bounds =
		    vmu_data.vmu_free_bounds->vmb_next;
	} else {
		bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
	}

	bound->vmb_next = NULL;
	bound->vmb_start = 0;
	bound->vmb_end = 0;
	bound->vmb_type = 0;
	return (bound);
}

/*
 * vmu_find_insert_* functions implement hash lookup or allocate and
 * insert operations.  vmu_find_insert_anon() returns 0 if the anon was
 * already present and 1 if it was newly inserted.
 */
static vmu_object_t *
vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
{
	int ret;
	vmu_object_t *object;

	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&object);
	if (ret != 0) {
		object = vmu_alloc_object(key, type);
		ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
		    (mod_hash_val_t)object, (mod_hash_hndl_t)0);
		ASSERT(ret == 0);
	}
	return (object);
}

static int
vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
{
	int ret;
	caddr_t val;

	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
	    (mod_hash_val_t *)&val);

	if (ret == 0)
		return (0);

	ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
	    (mod_hash_val_t)key, (mod_hash_hndl_t)0);

	ASSERT(ret == 0);

	return (1);
}

static vmu_entity_t *
vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
{
	int ret;
	vmu_entity_t *entity;

	ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
	    (mod_hash_val_t *)&entity);
	if (ret != 0) {
		entity = vmu_alloc_entity(id, type, zoneid);
		ret = i_mod_hash_insert_nosync(hash,
		    (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
		    (mod_hash_hndl_t)0);
		ASSERT(ret == 0);
	}
	return (entity);
}

/*
 * Returns list of object bounds between start and end.  New bounds inserted
 * by this call are given type.
 *
 * Returns the number of pages covered if new bounds are created.  Returns 0
 * if region between start/end consists of all existing bounds.
 */
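/*
 * For example, if the tree already contains a single bound [5, 10] and this
 * routine is asked for the range [0, 15], two new bounds [0, 4] and [11, 15]
 * of the given type are inserted, *first/*last span the resulting three
 * bounds, and 10 (the number of pages in the newly created bounds) is
 * returned.
 */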
static pgcnt_t
vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
    end, char type, vmu_bound_t **first, vmu_bound_t **last)
{
	avl_tree_t	*tree = &(ro->vmo_bounds);
	avl_index_t	where;
	vmu_bound_t	*walker, *tmp;
	pgcnt_t		ret = 0;

	ASSERT(start <= end);

	*first = *last = NULL;

	tmp = vmu_alloc_bound();
	tmp->vmb_start = start;
	tmp->vmb_type = type;

	/* Hopelessly optimistic case. */
	if ((walker = avl_find(tree, tmp, &where)) != NULL) {
		/* We got lucky. */
		vmu_free_bound(tmp);
		*first = walker;
	}

	if (walker == NULL) {
		/* Is start in the previous node? */
		walker = avl_nearest(tree, where, AVL_BEFORE);
		if (walker != NULL) {
			if (ISWITHIN(walker, start)) {
				/* We found start. */
				vmu_free_bound(tmp);
				*first = walker;
			}
		}
	}

	/*
	 * At this point, if *first is still NULL, then we
	 * didn't get a direct hit and start isn't covered
	 * by the previous node.  We know that the next node
	 * must have a greater start value than we require
	 * because avl_find tells us where the AVL routines would
	 * insert our new node.  We have some gap between the
	 * start we want and the next node.
	 */
	if (*first == NULL) {
		walker = avl_nearest(tree, where, AVL_AFTER);
		if (walker != NULL && walker->vmb_start <= end) {
			/* Fill the gap. */
			tmp->vmb_end = walker->vmb_start - 1;
			*first = tmp;
		} else {
			/* We have a gap over [start, end]. */
			tmp->vmb_end = end;
			*first = *last = tmp;
		}
		ret += tmp->vmb_end - tmp->vmb_start + 1;
		avl_insert(tree, tmp, where);
	}

	ASSERT(*first != NULL);

	if (*last != NULL) {
		/* We're done. */
		return (ret);
	}

	/*
	 * If we are here we still need to set *last and
	 * that may involve filling in some gaps.
	 */
	*last = *first;
	for (;;) {
		if (ISWITHIN(*last, end)) {
			/* We're done. */
			break;
		}
		walker = AVL_NEXT(tree, *last);
		if (walker == NULL || walker->vmb_start > end) {
			/* Bottom or mid tree with gap. */
			tmp = vmu_alloc_bound();
			tmp->vmb_start = (*last)->vmb_end + 1;
			tmp->vmb_end = end;
			tmp->vmb_type = type;
			ret += tmp->vmb_end - tmp->vmb_start + 1;
			avl_insert_here(tree, tmp, *last, AVL_AFTER);
			*last = tmp;
			break;
		} else {
			if ((*last)->vmb_end + 1 != walker->vmb_start) {
				/* Non-contiguous. */
				tmp = vmu_alloc_bound();
				tmp->vmb_start = (*last)->vmb_end + 1;
				tmp->vmb_end = walker->vmb_start - 1;
				tmp->vmb_type = type;
				ret += tmp->vmb_end - tmp->vmb_start + 1;
				avl_insert_here(tree, tmp, *last, AVL_AFTER);
				*last = tmp;
			} else {
				*last = walker;
			}
		}
	}

	return (ret);
}

/*
 * vmu_update_bounds()
 *
 * tree: avl_tree in which first and last hang.
 *
 * first, last:	list of contiguous bounds, of which zero or more are of
 *		type VMUSAGE_BOUND_UNKNOWN.
 *
 * new_tree: avl_tree in which new_first and new_last hang.
 *
 * new_first, new_last:	list of contiguous bounds, of which none are of
 *			type VMUSAGE_BOUND_UNKNOWN.  These bounds are used to
 *			update the types of bounds in (first,last) with
 *			type VMUSAGE_BOUND_UNKNOWN.
 *
 * For the list of bounds (first,last), this function updates any bounds
 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
 * the list (new_first, new_last).
 *
 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
 * (new_first, new_last), it will be split into multiple bounds.
 *
 * Return value:
 *	The number of pages in the list of bounds (first,last) that were of
 *	type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
 *	VMUSAGE_BOUND_INCORE.
 *
 */
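/*
 * For example, if (first,last) is a single UNKNOWN bound [0, 9] and
 * (new_first,new_last) holds [0, 3] INCORE followed by [4, 9] NOT_INCORE,
 * the bound is split into [0, 3] INCORE and [4, 9] NOT_INCORE and 4 is
 * returned, since only the pages that became INCORE are counted.
 */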
static pgcnt_t
vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
    avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
{
	vmu_bound_t *next, *new_next, *tmp;
	pgcnt_t rss = 0;

	next = *first;
	new_next = new_first;

	/*
	 * Verify first and last bound are covered by new bounds if they
	 * have unknown type.
	 */
	ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
	    (*first)->vmb_start >= new_first->vmb_start);
	ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
	    (*last)->vmb_end <= new_last->vmb_end);
	for (;;) {
		/* If bound already has type, proceed to next bound. */
		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
			if (next == *last)
				break;
			next = AVL_NEXT(tree, next);
			continue;
		}
		while (new_next->vmb_end < next->vmb_start)
			new_next = AVL_NEXT(new_tree, new_next);
		ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
		next->vmb_type = new_next->vmb_type;
		if (new_next->vmb_end < next->vmb_end) {
			/* need to split bound */
			tmp = vmu_alloc_bound();
			tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
			tmp->vmb_start = new_next->vmb_end + 1;
			tmp->vmb_end = next->vmb_end;
			avl_insert_here(tree, tmp, next, AVL_AFTER);
			next->vmb_end = new_next->vmb_end;
			if (*last == next)
				*last = tmp;
			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
				rss += next->vmb_end - next->vmb_start + 1;
			next = tmp;
		} else {
			if (next->vmb_type == VMUSAGE_BOUND_INCORE)
				rss += next->vmb_end - next->vmb_start + 1;
			if (next == *last)
				break;
			next = AVL_NEXT(tree, next);
		}
	}
	return (rss);
}

/*
 * Merges adjacent bounds with same type between first and last bound.
 * After merge, last pointer may point to a different bound, as (incoming)
 * last bound may have been merged away.
 */
static void
vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
{
	vmu_bound_t *current;
	vmu_bound_t *next;

	ASSERT(tree != NULL);
	ASSERT(*first != NULL);
	ASSERT(*last != NULL);

	current = *first;
	while (current != *last) {
		next = AVL_NEXT(tree, current);
		if ((current->vmb_end + 1) == next->vmb_start &&
		    current->vmb_type == next->vmb_type) {
			current->vmb_end = next->vmb_end;
			avl_remove(tree, next);
			vmu_free_bound(next);
			if (next == *last) {
				*last = current;
			}
		} else {
			current = AVL_NEXT(tree, current);
		}
	}
}

/*
 * Given an amp and a list of bounds, updates each bound's type with
 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
 *
 * If a bound is partially incore, it will be split into two bounds.
 * first and last may be modified, as bounds may be split into multiple
 * bounds if they are partially incore/not-incore.
 *
 * Set incore to non-zero if bounds are already known to be incore.
 *
 */
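/*
 * Both this routine and vmu_vnode_update_incore_bounds() below walk the
 * range one page index at a time; when a large page is found, the index is
 * advanced to the first page past that large page.  For instance, with
 * pgcnt = 8 (so pgmsk = 7) and index = 11, the next index examined is
 * (11 & ~7) + 8 = 16.
 */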
static void
vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
    vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
{
	vmu_bound_t *next;
	vmu_bound_t *tmp;
	pgcnt_t index;
	short bound_type;
	short page_type;
	vnode_t *vn;
	anoff_t off;
	struct anon *ap;

	next = *first;
	/* Shared anon slots don't change once set. */
	ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
	for (;;) {
		if (incore == B_TRUE)
			next->vmb_type = VMUSAGE_BOUND_INCORE;

		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
			if (next == *last)
				break;
			next = AVL_NEXT(tree, next);
			continue;
		}
		bound_type = next->vmb_type;
		index = next->vmb_start;
		while (index <= next->vmb_end) {

			/*
			 * These are used to determine how much to increment
			 * index when a large page is found.
			 */
			page_t *page;
			pgcnt_t pgcnt = 1;
			uint_t pgshft;
			pgcnt_t pgmsk;

			ap = anon_get_ptr(amp->ahp, index);
			if (ap != NULL)
				swap_xlate(ap, &vn, &off);

			if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
			    (page = page_exists(vn, off)) != NULL) {
				page_type = VMUSAGE_BOUND_INCORE;
				if (page->p_szc > 0) {
					pgcnt = page_get_pagecnt(page->p_szc);
					pgshft = page_get_shift(page->p_szc);
					pgmsk = (0x1 << (pgshft - PAGESHIFT))
					    - 1;
				}
			} else {
				page_type = VMUSAGE_BOUND_NOT_INCORE;
			}
			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
				next->vmb_type = page_type;
			} else if (next->vmb_type != page_type) {
				/*
				 * If current bound type does not match page
				 * type, need to split off new bound.
				 */
				tmp = vmu_alloc_bound();
				tmp->vmb_type = page_type;
				tmp->vmb_start = index;
				tmp->vmb_end = next->vmb_end;
				avl_insert_here(tree, tmp, next, AVL_AFTER);
				next->vmb_end = index - 1;
				if (*last == next)
					*last = tmp;
				next = tmp;
			}
			if (pgcnt > 1) {
				/*
				 * If inside large page, jump to next large
				 * page
				 */
				index = (index & ~pgmsk) + pgcnt;
			} else {
				index++;
			}
		}
		if (next == *last) {
			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
			break;
		} else
			next = AVL_NEXT(tree, next);
	}
	ANON_LOCK_EXIT(&amp->a_rwlock);
}

/*
 * Same as vmu_amp_update_incore_bounds(), except for tracking
 * incore-/not-incore for vnodes.
 */
static void
vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
    vmu_bound_t **first, vmu_bound_t **last)
{
	vmu_bound_t *next;
	vmu_bound_t *tmp;
	pgcnt_t index;
	short bound_type;
	short page_type;

	next = *first;
	for (;;) {
		if (vnode->v_pages == NULL)
			next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;

		if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
			if (next == *last)
				break;
			next = AVL_NEXT(tree, next);
			continue;
		}

		bound_type = next->vmb_type;
		index = next->vmb_start;
		while (index <= next->vmb_end) {

			/*
			 * These are used to determine how much to increment
			 * index when a large page is found.
			 */
			page_t *page;
			pgcnt_t pgcnt = 1;
			uint_t pgshft;
			pgcnt_t pgmsk;

			if (vnode->v_pages != NULL &&
			    (page = page_exists(vnode, ptob(index))) != NULL) {
				page_type = VMUSAGE_BOUND_INCORE;
				if (page->p_szc > 0) {
					pgcnt = page_get_pagecnt(page->p_szc);
					pgshft = page_get_shift(page->p_szc);
					pgmsk = (0x1 << (pgshft - PAGESHIFT))
					    - 1;
				}
			} else {
				page_type = VMUSAGE_BOUND_NOT_INCORE;
			}
			if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
				next->vmb_type = page_type;
			} else if (next->vmb_type != page_type) {
				/*
				 * If current bound type does not match page
				 * type, need to split off new bound.
				 */
				tmp = vmu_alloc_bound();
				tmp->vmb_type = page_type;
				tmp->vmb_start = index;
				tmp->vmb_end = next->vmb_end;
				avl_insert_here(tree, tmp, next, AVL_AFTER);
				next->vmb_end = index - 1;
				if (*last == next)
					*last = tmp;
				next = tmp;
			}
			if (pgcnt > 1) {
				/*
				 * If inside large page, jump to next large
				 * page
				 */
				index = (index & ~pgmsk) + pgcnt;
			} else {
				index++;
			}
		}
		if (next == *last) {
			ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
			break;
		} else
			next = AVL_NEXT(tree, next);
	}
}

/*
 * Calculate the rss and swap consumed by a segment.  vmu_entities is the
 * list of entities to visit.  For shared segments, the vnode or amp
 * is looked up in each entity to see if it has been already counted.  Private
 * anon pages are checked per entity to ensure that COW pages are not
 * double counted.
 *
 * For private mapped files, first the amp is checked for private pages.
 * Bounds not backed by the amp are looked up in the vnode for each entity
 * to avoid double counting of private COW vnode pages.
 */
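/*
 * For example, a COW page shared by two processes in the same project is
 * charged to that project only once (the anon is remembered in
 * vme_anon_hash) and is counted as vmu_rss_private rather than
 * vmu_rss_shared, since the segment itself is a private mapping.
 */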
1134 */ 1135 if (svd->tr_state == SEGVN_TR_OFF && 1136 svd->amp != NULL) { 1137 private_amp = svd->amp; 1138 p_start = svd->anon_index; 1139 p_end = svd->anon_index + 1140 btop(seg->s_size) - 1; 1141 } 1142 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock); 1143 } 1144 } 1145 if (svd->vp != NULL) { 1146 file = 1; 1147 shared_object = vmu_find_insert_object( 1148 vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp, 1149 VMUSAGE_TYPE_VNODE); 1150 s_start = btop(svd->offset); 1151 s_end = btop(svd->offset + seg->s_size) - 1; 1152 } 1153 if (svd->amp != NULL && svd->type == MAP_SHARED) { 1154 ASSERT(shared_object == NULL); 1155 shared_object = vmu_find_insert_object( 1156 vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp, 1157 VMUSAGE_TYPE_AMP); 1158 s_start = svd->anon_index; 1159 s_end = svd->anon_index + btop(seg->s_size) - 1; 1160 /* schedctl mappings are always in core */ 1161 if (svd->amp->swresv == 0) 1162 incore = B_TRUE; 1163 } 1164 } else if (seg->s_ops == &segspt_shmops) { 1165 shared = B_TRUE; 1166 shmd = (struct shm_data *)seg->s_data; 1167 shared_object = vmu_find_insert_object( 1168 vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp, 1169 VMUSAGE_TYPE_AMP); 1170 s_start = 0; 1171 s_end = btop(seg->s_size) - 1; 1172 sptd = shmd->shm_sptseg->s_data; 1173 1174 /* ism segments are always incore and do not reserve swap */ 1175 if (sptd->spt_flags & SHM_SHARE_MMU) 1176 incore = B_TRUE; 1177 1178 } else { 1179 return; 1180 } 1181 1182 /* 1183 * If there is a private amp, count anon pages that exist. If an 1184 * anon has a refcnt > 1 (COW sharing), then save the anon in a 1185 * hash so that it is not double counted. 1186 * 1187 * If there is also a shared object, then figure out the bounds 1188 * which are not mapped by the private amp. 1189 */ 1190 if (private_amp != NULL) { 1191 1192 /* Enter as writer to prevent COW anons from being freed */ 1193 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER); 1194 1195 p_index = p_start; 1196 s_index = s_start; 1197 1198 while (p_index <= p_end) { 1199 1200 pgcnt_t p_index_next; 1201 pgcnt_t p_bound_size; 1202 int cnt; 1203 anoff_t off; 1204 struct vnode *vn; 1205 struct anon *ap; 1206 page_t *page; /* For handling of large */ 1207 pgcnt_t pgcnt = 1; /* pages */ 1208 pgcnt_t pgstart; 1209 pgcnt_t pgend; 1210 uint_t pgshft; 1211 pgcnt_t pgmsk; 1212 1213 p_index_next = p_index; 1214 ap = anon_get_next_ptr(private_amp->ahp, 1215 &p_index_next); 1216 1217 /* 1218 * If next anon is past end of mapping, simulate 1219 * end of anon so loop terminates. 1220 */ 1221 if (p_index_next > p_end) { 1222 p_index_next = p_end + 1; 1223 ap = NULL; 1224 } 1225 /* 1226 * For COW segments, keep track of bounds not 1227 * backed by private amp so they can be looked 1228 * up in the backing vnode 1229 */ 1230 if (p_index_next != p_index) { 1231 1232 /* 1233 * Compute index difference between anon and 1234 * previous anon. 
				p_bound_size = p_index_next - p_index - 1;

				if (shared_object != NULL) {
					cur = vmu_alloc_bound();
					cur->vmb_start = s_index;
					cur->vmb_end = s_index + p_bound_size;
					cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
					if (first == NULL) {
						first = cur;
						last = cur;
					} else {
						last->vmb_next = cur;
						last = cur;
					}
				}
				p_index = p_index + p_bound_size + 1;
				s_index = s_index + p_bound_size + 1;
			}

			/* Detect end of anons in amp */
			if (ap == NULL)
				break;

			cnt = ap->an_refcnt;
			swap_xlate(ap, &vn, &off);

			if (vn == NULL || vn->v_pages == NULL ||
			    (page = page_exists(vn, off)) == NULL) {
				p_index++;
				s_index++;
				continue;
			}

			/*
			 * If large page is found, compute portion of large
			 * page in mapping, and increment indices to the next
			 * large page.
			 */
			if (page->p_szc > 0) {

				pgcnt = page_get_pagecnt(page->p_szc);
				pgshft = page_get_shift(page->p_szc);
				pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;

				/* First page in large page */
				pgstart = p_index & ~pgmsk;
				/* Last page in large page */
				pgend = pgstart + pgcnt - 1;
				/*
				 * Artificially end page if page extends past
				 * end of mapping.
				 */
				if (pgend > p_end)
					pgend = p_end;

				/*
				 * Compute number of pages from large page
				 * which are mapped.
				 */
				pgcnt = pgend - p_index + 1;

				/*
				 * Point indices at page after large page,
				 * or at page after end of mapping.
				 */
				p_index += pgcnt;
				s_index += pgcnt;
			} else {
				p_index++;
				s_index++;
			}

			/*
			 * Assume anon structs with a refcnt
			 * of 1 are not COW shared, so there
			 * is no reason to track them per entity.
			 */
			if (cnt == 1) {
				panon += pgcnt;
				continue;
			}
			for (entity = vmu_entities; entity != NULL;
			    entity = entity->vme_next_calc) {

				result = &entity->vme_result;
				/*
				 * Track COW anons per entity so
				 * they are not double counted.
				 */
				if (vmu_find_insert_anon(entity->vme_anon_hash,
				    (caddr_t)ap) == 0)
					continue;

				result->vmu_rss_all += (pgcnt << PAGESHIFT);
				result->vmu_rss_private +=
				    (pgcnt << PAGESHIFT);
			}
		}
		ANON_LOCK_EXIT(&private_amp->a_rwlock);
	}

	/* Add up resident anon and swap reserved for private mappings */
	if (swresv > 0 || panon > 0) {
		for (entity = vmu_entities; entity != NULL;
		    entity = entity->vme_next_calc) {
			result = &entity->vme_result;
			result->vmu_swap_all += swresv;
			result->vmu_swap_private += swresv;
			result->vmu_rss_all += (panon << PAGESHIFT);
			result->vmu_rss_private += (panon << PAGESHIFT);
		}
	}

	/* Compute resident pages backing shared amp or named vnode */
	if (shared_object != NULL) {
		avl_tree_t *tree = &(shared_object->vmo_bounds);

		if (first == NULL) {
			/*
			 * No private amp, or private amp has no anon
			 * structs.  This means entire segment is backed by
			 * the shared object.
			 */
			first = vmu_alloc_bound();
			first->vmb_start = s_start;
			first->vmb_end = s_end;
			first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
		}
		/*
		 * Iterate bounds not backed by private amp, and compute
		 * resident pages.
		 */
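		/*
		 * Note that from here on cur walks the temporary list of
		 * bounds built above (linked through vmb_next and freed as
		 * they are consumed), while first and last are reused by
		 * vmu_insert_lookup_object_bounds() to refer to bounds that
		 * live in the shared object's AVL tree.
		 */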
1367 */ 1368 cur = first; 1369 while (cur != NULL) { 1370 1371 if (vmu_insert_lookup_object_bounds(shared_object, 1372 cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN, 1373 &first, &last) > 0) { 1374 /* new bounds, find incore/not-incore */ 1375 if (shared_object->vmo_type == 1376 VMUSAGE_TYPE_VNODE) { 1377 vmu_vnode_update_incore_bounds( 1378 tree, 1379 (vnode_t *) 1380 shared_object->vmo_key, &first, 1381 &last); 1382 } else { 1383 vmu_amp_update_incore_bounds( 1384 tree, 1385 (struct anon_map *) 1386 shared_object->vmo_key, &first, 1387 &last, incore); 1388 } 1389 vmu_merge_bounds(tree, &first, &last); 1390 } 1391 for (entity = vmu_entities; entity != NULL; 1392 entity = entity->vme_next_calc) { 1393 avl_tree_t *e_tree; 1394 1395 result = &entity->vme_result; 1396 1397 entity_object = vmu_find_insert_object( 1398 shared_object->vmo_type == 1399 VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash: 1400 entity->vme_amp_hash, 1401 shared_object->vmo_key, 1402 shared_object->vmo_type); 1403 1404 virt = vmu_insert_lookup_object_bounds( 1405 entity_object, cur->vmb_start, cur->vmb_end, 1406 VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last); 1407 1408 if (virt == 0) 1409 continue; 1410 /* 1411 * Range visited for this entity 1412 */ 1413 e_tree = &(entity_object->vmo_bounds); 1414 rss = vmu_update_bounds(e_tree, &e_first, 1415 &e_last, tree, first, last); 1416 result->vmu_rss_all += (rss << PAGESHIFT); 1417 if (shared == B_TRUE && file == B_FALSE) { 1418 /* shared anon mapping */ 1419 result->vmu_swap_all += 1420 (virt << PAGESHIFT); 1421 result->vmu_swap_shared += 1422 (virt << PAGESHIFT); 1423 result->vmu_rss_shared += 1424 (rss << PAGESHIFT); 1425 } else if (shared == B_TRUE && file == B_TRUE) { 1426 /* shared file mapping */ 1427 result->vmu_rss_shared += 1428 (rss << PAGESHIFT); 1429 } else if (shared == B_FALSE && 1430 file == B_TRUE) { 1431 /* private file mapping */ 1432 result->vmu_rss_private += 1433 (rss << PAGESHIFT); 1434 } 1435 vmu_merge_bounds(e_tree, &e_first, &e_last); 1436 } 1437 tmp = cur; 1438 cur = cur->vmb_next; 1439 vmu_free_bound(tmp); 1440 } 1441 } 1442 } 1443 1444 /* 1445 * Based on the current calculation flags, find the relevant entities 1446 * which are relative to the process. Then calculate each segment 1447 * in the process'es address space for each relevant entity. 
static void
vmu_calculate_proc(proc_t *p)
{
	vmu_entity_t *entities = NULL;
	vmu_zone_t *zone;
	vmu_entity_t *tmp;
	struct as *as;
	struct seg *seg;
	int ret;

	/* Figure out which entities are being computed */
	if ((vmu_data.vmu_system) != NULL) {
		tmp = vmu_data.vmu_system;
		tmp->vme_next_calc = entities;
		entities = tmp;
	}
	if (vmu_data.vmu_calc_flags &
	    (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
	    VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
	    VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
	    VMUSAGE_ALL_EUSERS)) {
		ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
		    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
		    (mod_hash_val_t *)&zone);
		if (ret != 0) {
			zone = vmu_alloc_zone(p->p_zone->zone_id);
			ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
			    (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
			    (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
			ASSERT(ret == 0);
		}
		if (zone->vmz_zone != NULL) {
			tmp = zone->vmz_zone;
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
			tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
			    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
			    zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
			tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
			    p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
			tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
			    crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
		if (vmu_data.vmu_calc_flags &
		    (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
			tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
			    crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
			tmp->vme_next_calc = entities;
			entities = tmp;
		}
	}
	/* Entities which collapse projects and users for all zones */
	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
		tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
		    p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
		tmp->vme_next_calc = entities;
		entities = tmp;
	}
	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
		tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
		    crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
		tmp->vme_next_calc = entities;
		entities = tmp;
	}
	if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
		tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
		    crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
		tmp->vme_next_calc = entities;
		entities = tmp;
	}

	ASSERT(entities != NULL);
	/* process all segs in process's address space */
	as = p->p_as;
	AS_LOCK_ENTER(as, RW_READER);
	for (seg = AS_SEGFIRST(as); seg != NULL;
	    seg = AS_SEGNEXT(as, seg)) {
		vmu_calculate_seg(entities, seg);
	}
	AS_LOCK_EXIT(as);
}
1548 */ 1549 static void 1550 vmu_clear_calc() 1551 { 1552 if (vmu_data.vmu_system != NULL) { 1553 vmu_free_entity(vmu_data.vmu_system); 1554 vmu_data.vmu_system = NULL; 1555 } 1556 if (vmu_data.vmu_zones_hash != NULL) 1557 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash); 1558 if (vmu_data.vmu_projects_col_hash != NULL) 1559 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash); 1560 if (vmu_data.vmu_rusers_col_hash != NULL) 1561 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash); 1562 if (vmu_data.vmu_eusers_col_hash != NULL) 1563 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash); 1564 1565 i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash); 1566 i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash); 1567 } 1568 1569 /* 1570 * Free unused data structures. These can result if the system workload 1571 * decreases between calculations. 1572 */ 1573 static void 1574 vmu_free_extra() 1575 { 1576 vmu_bound_t *tb; 1577 vmu_object_t *to; 1578 vmu_entity_t *te; 1579 vmu_zone_t *tz; 1580 1581 while (vmu_data.vmu_free_bounds != NULL) { 1582 tb = vmu_data.vmu_free_bounds; 1583 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next; 1584 kmem_cache_free(vmu_bound_cache, tb); 1585 } 1586 while (vmu_data.vmu_free_objects != NULL) { 1587 to = vmu_data.vmu_free_objects; 1588 vmu_data.vmu_free_objects = 1589 vmu_data.vmu_free_objects->vmo_next; 1590 kmem_cache_free(vmu_object_cache, to); 1591 } 1592 while (vmu_data.vmu_free_entities != NULL) { 1593 te = vmu_data.vmu_free_entities; 1594 vmu_data.vmu_free_entities = 1595 vmu_data.vmu_free_entities->vme_next; 1596 if (te->vme_vnode_hash != NULL) 1597 mod_hash_destroy_hash(te->vme_vnode_hash); 1598 if (te->vme_amp_hash != NULL) 1599 mod_hash_destroy_hash(te->vme_amp_hash); 1600 if (te->vme_anon_hash != NULL) 1601 mod_hash_destroy_hash(te->vme_anon_hash); 1602 kmem_free(te, sizeof (vmu_entity_t)); 1603 } 1604 while (vmu_data.vmu_free_zones != NULL) { 1605 tz = vmu_data.vmu_free_zones; 1606 vmu_data.vmu_free_zones = 1607 vmu_data.vmu_free_zones->vmz_next; 1608 if (tz->vmz_projects_hash != NULL) 1609 mod_hash_destroy_hash(tz->vmz_projects_hash); 1610 if (tz->vmz_tasks_hash != NULL) 1611 mod_hash_destroy_hash(tz->vmz_tasks_hash); 1612 if (tz->vmz_rusers_hash != NULL) 1613 mod_hash_destroy_hash(tz->vmz_rusers_hash); 1614 if (tz->vmz_eusers_hash != NULL) 1615 mod_hash_destroy_hash(tz->vmz_eusers_hash); 1616 kmem_free(tz, sizeof (vmu_zone_t)); 1617 } 1618 } 1619 1620 extern kcondvar_t *pr_pid_cv; 1621 1622 /* 1623 * Determine which entity types are relevant and allocate the hashes to 1624 * track them. Then walk the process table and count rss and swap 1625 * for each process'es address space. Address space object such as 1626 * vnodes, amps and anons are tracked per entity, so that they are 1627 * not double counted in the results. 1628 * 1629 */ 1630 static void 1631 vmu_calculate() 1632 { 1633 int i = 0; 1634 int ret; 1635 proc_t *p; 1636 1637 vmu_clear_calc(); 1638 1639 if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM) 1640 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM, 1641 ALL_ZONES); 1642 1643 /* 1644 * Walk process table and calculate rss of each proc. 1645 * 1646 * Pidlock and p_lock cannot be held while doing the rss calculation. 1647 * This is because: 1648 * 1. The calculation allocates using KM_SLEEP. 1649 * 2. The calculation grabs a_lock, which cannot be grabbed 1650 * after p_lock. 1651 * 1652 * Since pidlock must be dropped, we cannot simply just walk the 1653 * practive list. 
Instead, we walk the process table, and sprlock 1654 * each process to ensure that it does not exit during the 1655 * calculation. 1656 */ 1657 1658 mutex_enter(&pidlock); 1659 for (i = 0; i < v.v_proc; i++) { 1660 again: 1661 p = pid_entry(i); 1662 if (p == NULL) 1663 continue; 1664 1665 mutex_enter(&p->p_lock); 1666 mutex_exit(&pidlock); 1667 1668 if (panicstr) { 1669 mutex_exit(&p->p_lock); 1670 return; 1671 } 1672 1673 /* Try to set P_PR_LOCK */ 1674 ret = sprtrylock_proc(p); 1675 if (ret == -1) { 1676 /* Process in invalid state */ 1677 mutex_exit(&p->p_lock); 1678 mutex_enter(&pidlock); 1679 continue; 1680 } else if (ret == 1) { 1681 /* 1682 * P_PR_LOCK is already set. Wait and try again. 1683 * This also drops p_lock. 1684 */ 1685 sprwaitlock_proc(p); 1686 mutex_enter(&pidlock); 1687 goto again; 1688 } 1689 mutex_exit(&p->p_lock); 1690 1691 vmu_calculate_proc(p); 1692 1693 mutex_enter(&p->p_lock); 1694 sprunlock(p); 1695 mutex_enter(&pidlock); 1696 } 1697 mutex_exit(&pidlock); 1698 1699 vmu_free_extra(); 1700 } 1701 1702 /* 1703 * allocate a new cache for N results satisfying flags 1704 */ 1705 vmu_cache_t * 1706 vmu_cache_alloc(size_t nres, uint_t flags) 1707 { 1708 vmu_cache_t *cache; 1709 1710 cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP); 1711 cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP); 1712 cache->vmc_nresults = nres; 1713 cache->vmc_flags = flags; 1714 cache->vmc_refcnt = 1; 1715 return (cache); 1716 } 1717 1718 /* 1719 * Make sure cached results are not freed 1720 */ 1721 static void 1722 vmu_cache_hold(vmu_cache_t *cache) 1723 { 1724 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); 1725 cache->vmc_refcnt++; 1726 } 1727 1728 /* 1729 * free cache data 1730 */ 1731 static void 1732 vmu_cache_rele(vmu_cache_t *cache) 1733 { 1734 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock)); 1735 ASSERT(cache->vmc_refcnt > 0); 1736 cache->vmc_refcnt--; 1737 if (cache->vmc_refcnt == 0) { 1738 kmem_free(cache->vmc_results, sizeof (vmusage_t) * 1739 cache->vmc_nresults); 1740 kmem_free(cache, sizeof (vmu_cache_t)); 1741 } 1742 } 1743 1744 /* 1745 * Copy out the cached results to a caller. Inspect the callers flags 1746 * and zone to determine which cached results should be copied. 1747 */ 1748 static int 1749 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres, 1750 uint_t flags, int cpflg) 1751 { 1752 vmusage_t *result, *out_result; 1753 vmusage_t dummy; 1754 size_t i, count = 0; 1755 size_t bufsize; 1756 int ret = 0; 1757 uint_t types = 0; 1758 1759 if (nres != NULL) { 1760 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg)) 1761 return (set_errno(EFAULT)); 1762 } else { 1763 bufsize = 0; 1764 } 1765 1766 /* figure out what results the caller is interested in. 
static int
vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
    uint_t flags, int cpflg)
{
	vmusage_t *result, *out_result;
	vmusage_t dummy;
	size_t i, count = 0;
	size_t bufsize;
	int ret = 0;
	uint_t types = 0;

	if (nres != NULL) {
		if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
			return (set_errno(EFAULT));
	} else {
		bufsize = 0;
	}

	/* figure out what results the caller is interested in. */
	if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
		types |= VMUSAGE_SYSTEM;
	if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
		types |= VMUSAGE_ZONE;
	if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
	    VMUSAGE_COL_PROJECTS))
		types |= VMUSAGE_PROJECTS;
	if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
		types |= VMUSAGE_TASKS;
	if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
		types |= VMUSAGE_RUSERS;
	if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
		types |= VMUSAGE_EUSERS;

	/* count results for current zone */
	out_result = buf;
	for (result = cache->vmc_results, i = 0;
	    i < cache->vmc_nresults; result++, i++) {

		/* Do not return "other-zone" results to non-global zones */
		if (curproc->p_zone != global_zone &&
		    curproc->p_zone->zone_id != result->vmu_zoneid)
			continue;

		/*
		 * If non-global zone requests VMUSAGE_SYSTEM, fake
		 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
		 */
		if (curproc->p_zone != global_zone &&
		    (flags & VMUSAGE_SYSTEM) != 0 &&
		    result->vmu_type == VMUSAGE_ZONE) {
			count++;
			if (out_result != NULL) {
				if (bufsize < count) {
					ret = set_errno(EOVERFLOW);
				} else {
					dummy = *result;
					dummy.vmu_zoneid = ALL_ZONES;
					dummy.vmu_id = 0;
					dummy.vmu_type = VMUSAGE_SYSTEM;
					if (ddi_copyout(&dummy, out_result,
					    sizeof (vmusage_t), cpflg))
						return (set_errno(EFAULT));
					out_result++;
				}
			}
		}

		/* Skip results that do not match requested type */
		if ((result->vmu_type & types) == 0)
			continue;

		/* Skip collated results if not requested */
		if (result->vmu_zoneid == ALL_ZONES) {
			if (result->vmu_type == VMUSAGE_PROJECTS &&
			    (flags & VMUSAGE_COL_PROJECTS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_EUSERS &&
			    (flags & VMUSAGE_COL_EUSERS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_RUSERS &&
			    (flags & VMUSAGE_COL_RUSERS) == 0)
				continue;
		}

		/* Skip "other zone" results if not requested */
		if (result->vmu_zoneid != curproc->p_zone->zone_id) {
			if (result->vmu_type == VMUSAGE_ZONE &&
			    (flags & VMUSAGE_ALL_ZONES) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_PROJECTS &&
			    (flags & (VMUSAGE_ALL_PROJECTS |
			    VMUSAGE_COL_PROJECTS)) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_TASKS &&
			    (flags & VMUSAGE_ALL_TASKS) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_RUSERS &&
			    (flags & (VMUSAGE_ALL_RUSERS |
			    VMUSAGE_COL_RUSERS)) == 0)
				continue;
			if (result->vmu_type == VMUSAGE_EUSERS &&
			    (flags & (VMUSAGE_ALL_EUSERS |
			    VMUSAGE_COL_EUSERS)) == 0)
				continue;
		}
		count++;
		if (out_result != NULL) {
			if (bufsize < count) {
				ret = set_errno(EOVERFLOW);
			} else {
				if (ddi_copyout(result, out_result,
				    sizeof (vmusage_t), cpflg))
					return (set_errno(EFAULT));
				out_result++;
			}
		}
	}
	if (nres != NULL)
		if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
			return (set_errno(EFAULT));

	return (ret);
}

/*
 * vm_getusage()
 *
 * Counts rss and swap by zone, project, task, and/or user.  The flags argument
 * determines the type of results structures returned.  Flags requesting
 * results from more than one zone are "flattened" to the local zone if the
 * caller is not the global zone.
 *
 * args:
 *	flags:	bitmap consisting of one or more of VMUSAGE_*.
 *	age:	maximum allowable age (time since counting was done) in
 *		seconds of the results.  Results from previous callers are
 *		cached in kernel.
 *	buf:	pointer to buffer array of vmusage_t.  If NULL, then only nres
 *		is set on success.
 *	nres:	Set to number of vmusage_t structures pointed to by buf
 *		before calling vm_getusage().
 *		On return 0 (success) or EOVERFLOW, is set to the number of
 *		result structures returned or attempted to return.
 *
 * returns 0 on success, -1 on failure:
 *	EINTR (interrupted)
 *	EOVERFLOW (nres too small for results, nres set to needed value
 *	for success)
 *	EINVAL (flags invalid)
 *	EFAULT (bad address for buf or nres)
 */
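/*
 * A typical caller sizes the buffer in two passes:  first call with
 * buf == NULL (or with nres too small) to learn the number of results, then
 * allocate that many vmusage_t structures and call again with the same
 * flags and age.
 */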
int
vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
{
	vmu_entity_t *entity;
	vmusage_t *result;
	int ret = 0;
	int cacherecent = 0;
	hrtime_t now;
	uint_t flags_orig;

	/*
	 * Non-global zones cannot request system wide and/or collated
	 * results, or the system result, so munge the flags accordingly.
	 */
	flags_orig = flags;
	if (curproc->p_zone != global_zone) {
		if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
			flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
			flags |= VMUSAGE_PROJECTS;
		}
		if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
			flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
			flags |= VMUSAGE_RUSERS;
		}
		if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
			flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
			flags |= VMUSAGE_EUSERS;
		}
		if (flags & VMUSAGE_SYSTEM) {
			flags &= ~VMUSAGE_SYSTEM;
			flags |= VMUSAGE_ZONE;
		}
	}

	/* Check for unknown flags */
	if ((flags & (~VMUSAGE_MASK)) != 0)
		return (set_errno(EINVAL));

	/* Check for no flags */
	if ((flags & VMUSAGE_MASK) == 0)
		return (set_errno(EINVAL));

	mutex_enter(&vmu_data.vmu_lock);
	now = gethrtime();

start:
	if (vmu_data.vmu_cache != NULL) {

		vmu_cache_t *cache;

		if ((vmu_data.vmu_cache->vmc_timestamp +
		    ((hrtime_t)age * NANOSEC)) > now)
			cacherecent = 1;

		if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
		    cacherecent == 1) {
			cache = vmu_data.vmu_cache;
			vmu_cache_hold(cache);
			mutex_exit(&vmu_data.vmu_lock);

			ret = vmu_copyout_results(cache, buf, nres, flags_orig,
			    cpflg);
			mutex_enter(&vmu_data.vmu_lock);
			vmu_cache_rele(cache);
			if (vmu_data.vmu_pending_waiters > 0)
				cv_broadcast(&vmu_data.vmu_cv);
			mutex_exit(&vmu_data.vmu_lock);
			return (ret);
		}
		/*
		 * If the cache is recent, it is likely that there are other
		 * consumers of vm_getusage running, so add their flags to the
		 * desired flags for the calculation.
		 */
		if (cacherecent == 1)
			flags = vmu_data.vmu_cache->vmc_flags | flags;
	}
	if (vmu_data.vmu_calc_thread == NULL) {

		vmu_cache_t *cache;

		vmu_data.vmu_calc_thread = curthread;
		vmu_data.vmu_calc_flags = flags;
		vmu_data.vmu_entities = NULL;
		vmu_data.vmu_nentities = 0;
		if (vmu_data.vmu_pending_waiters > 0)
			vmu_data.vmu_calc_flags |=
			    vmu_data.vmu_pending_flags;

		vmu_data.vmu_pending_flags = 0;
		mutex_exit(&vmu_data.vmu_lock);
		vmu_calculate();
		mutex_enter(&vmu_data.vmu_lock);
		/* copy results to cache */
		if (vmu_data.vmu_cache != NULL)
			vmu_cache_rele(vmu_data.vmu_cache);
		cache = vmu_data.vmu_cache =
		    vmu_cache_alloc(vmu_data.vmu_nentities,
		    vmu_data.vmu_calc_flags);

		result = cache->vmc_results;
		for (entity = vmu_data.vmu_entities; entity != NULL;
		    entity = entity->vme_next) {
			*result = entity->vme_result;
			result++;
		}
		cache->vmc_timestamp = gethrtime();
		vmu_cache_hold(cache);

		vmu_data.vmu_calc_flags = 0;
		vmu_data.vmu_calc_thread = NULL;

		if (vmu_data.vmu_pending_waiters > 0)
			cv_broadcast(&vmu_data.vmu_cv);

		mutex_exit(&vmu_data.vmu_lock);

		/* copy cache */
		ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
		mutex_enter(&vmu_data.vmu_lock);
		vmu_cache_rele(cache);
		mutex_exit(&vmu_data.vmu_lock);

		return (ret);
	}
	vmu_data.vmu_pending_flags |= flags;
	vmu_data.vmu_pending_waiters++;
	while (vmu_data.vmu_calc_thread != NULL) {
		if (cv_wait_sig(&vmu_data.vmu_cv,
		    &vmu_data.vmu_lock) == 0) {
			vmu_data.vmu_pending_waiters--;
			mutex_exit(&vmu_data.vmu_lock);
			return (set_errno(EINTR));
		}
	}
	vmu_data.vmu_pending_waiters--;
	goto start;
}