1 /*
2 * CDDL HEADER START
3 *
4 * The contents of this file are subject to the terms of the
5 * Common Development and Distribution License (the "License").
6 * You may not use this file except in compliance with the License.
7 *
8 * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9 * or http://www.opensolaris.org/os/licensing.
10 * See the License for the specific language governing permissions
11 * and limitations under the License.
12 *
13 * When distributing Covered Code, include this CDDL HEADER in each
14 * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15 * If applicable, add the following below this CDDL HEADER, with the
16 * fields enclosed by brackets "[]" replaced with your own identifying
17 * information: Portions Copyright [yyyy] [name of copyright owner]
18 *
19 * CDDL HEADER END
20 */
21
22 /*
23 * Copyright 2009 Sun Microsystems, Inc. All rights reserved.
24 * Use is subject to license terms.
25 */
26
27 /*
28 * vm_usage
29 *
30 * This file implements the getvmusage() private system call.
31 * getvmusage() counts the number of resident memory pages and the swap
32 * reserved by the specified process collective. A "process collective" is
33 * the set of processes owned by a particular zone, project, task, or user.
34 *
35 * rss and swap are counted so that for a given process collective, a page is
36 * only counted once. For example, this means that if multiple processes in
37 * the same project map the same page, then the project will only be charged
38 * once for that page. On the other hand, if two processes in different
39 * projects map the same page, then both projects will be charged
40 * for the page.
41 *
42 * The vm_getusage() calculation is implemented so that the first thread
43 * performs the rss/swap counting. Other callers wait for that thread to
44 * finish and then copy its results. This enables multiple rcapds and prstats
45 * consume data from the same calculation. The results are also cached so that
46 * a caller interested in recent results can just copy them instead of starting
47 * a new calculation. The caller passes the maximum age (in seconds) of the
48 * data. If the cached data is young enough, the cache is copied, otherwise,
49 * a new calculation is executed and the cache is replaced with the new
50 * data.
51 *
52 * The rss calculation for each process collective is as follows:
53 *
54 * - Inspect flags, determine if counting rss for zones, projects, tasks,
55 * and/or users.
56 * - For each proc:
57 * - Figure out proc's collectives (zone, project, task, and/or user).
58 * - For each seg in proc's address space:
59 * - If seg is private:
60 * - Lookup anons in the amp.
61 * - For incore pages not previously visited for each of the
62 * proc's collectives, add the incore pagesize to each
63 * collective.
64 * Anons with a refcnt of 1 can be assumed to be not
65 * previously visited.
66 * - For address ranges without anons in the amp:
67 * - Lookup pages in underlying vnode.
68 * - For incore pages not previously visited for
69 * each of the proc's collectives, add incore
70 * pagesize to each collective.
71 * - If seg is shared:
72 * - Lookup pages in the shared amp or vnode.
73 * - For incore pages not previously visited for each of
74 * the proc's collectives, add incore pagesize to each
75 * collective.
76 *
77 * Swap is reserved by private segments, and shared anonymous segments.
78 * The only shared anon segments which do not reserve swap are ISM segments
79 * and schedctl segments, both of which can be identified by having
80 * amp->swresv == 0.
81 *
82 * The swap calculation for each collective is as follows:
83 *
84 * - Inspect flags, determine if counting swap for zones, projects, tasks,
85 * and/or users.
86 * - For each proc:
87 * - Figure out proc's collectives (zone, project, task, and/or user).
88 * - For each seg in proc's address space:
89 * - If seg is private:
90 * - Add svd->swresv pages to swap count for each of the
91 * proc's collectives.
92 * - If seg is anon, shared, and amp->swresv != 0
93 * - For address ranges in amp not previously visited for
94 * each of the proc's collectives, add size of address
95 * range to the swap count for each collective.
96 *
97 * These two calculations are done simultaneously, with most of the work
98 * being done in vmu_calculate_seg(). The results of the calculation are
99 * copied into "vmu_data.vmu_cache_results".
100 *
101 * To perform the calculation, various things are tracked and cached:
102 *
103 * - incore/not-incore page ranges for all vnodes.
104 * (vmu_data.vmu_all_vnodes_hash)
105 * This eliminates looking up the same page more than once.
106 *
107 * - incore/not-incore page ranges for all shared amps.
108 * (vmu_data.vmu_all_amps_hash)
109 * This eliminates looking up the same page more than once.
110 *
111 * - visited page ranges for each collective.
112 * - per vnode (entity->vme_vnode_hash)
113 * - per shared amp (entity->vme_amp_hash)
114 * For accurate counting of map-shared and COW-shared pages.
115 *
116 * - visited private anons (refcnt > 1) for each collective.
117 * (entity->vme_anon_hash)
118 * For accurate counting of COW-shared pages.
119 *
120 * The common accounting structure is the vmu_entity_t, which represents
121 * collectives:
122 *
123 * - A zone.
124 * - A project, task, or user within a zone.
125 * - The entire system (vmu_data.vmu_system).
126 * - Each collapsed (col) project and user. This means a given projid or
127 * uid, regardless of which zone the process is in. For instance,
128 * project 0 in the global zone and project 0 in a non-global zone are
129 * the same collapsed project.
130 *
131 * Each entity structure tracks which pages have been already visited for
132 * that entity (via previously inspected processes) so that these pages are
133 * not double counted.
134 */
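
/*
 * A minimal userland sketch of how this facility is typically consumed,
 * via the getvmusage(2) wrapper, which shares the flags/age/buf/nres
 * contract of vm_getusage() below. The error handling is illustrative
 * only, and it assumes the caller runs in the global zone so that
 * all-zone results are visible. The first call sizes the result buffer;
 * the second fills it.
 *
 *	#include <sys/vm_usage.h>
 *	#include <stdio.h>
 *	#include <stdlib.h>
 *
 *	size_t nres = 0;
 *	size_t i;
 *	vmusage_t *buf;
 *
 *	(void) getvmusage(VMUSAGE_ALL_ZONES, 30, NULL, &nres);
 *	buf = malloc(nres * sizeof (vmusage_t));
 *	if (buf != NULL &&
 *	    getvmusage(VMUSAGE_ALL_ZONES, 30, buf, &nres) == 0) {
 *		for (i = 0; i < nres; i++)
 *			(void) printf("id %d: rss %llu swap %llu\n",
 *			    (int)buf[i].vmu_id,
 *			    (u_longlong_t)buf[i].vmu_rss_all,
 *			    (u_longlong_t)buf[i].vmu_swap_all);
 *	}
 */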
135
136 #include <sys/errno.h>
137 #include <sys/types.h>
138 #include <sys/zone.h>
139 #include <sys/proc.h>
140 #include <sys/project.h>
141 #include <sys/task.h>
142 #include <sys/thread.h>
143 #include <sys/time.h>
144 #include <sys/mman.h>
145 #include <sys/modhash.h>
146 #include <sys/modhash_impl.h>
147 #include <sys/shm.h>
148 #include <sys/swap.h>
149 #include <sys/synch.h>
150 #include <sys/systm.h>
151 #include <sys/var.h>
152 #include <sys/vm_usage.h>
153 #include <sys/zone.h>
154 #include <sys/sunddi.h>
155 #include <sys/avl.h>
156 #include <vm/anon.h>
157 #include <vm/as.h>
158 #include <vm/seg_vn.h>
159 #include <vm/seg_spt.h>
160
161 #define VMUSAGE_HASH_SIZE 512
162
163 #define VMUSAGE_TYPE_VNODE 1
164 #define VMUSAGE_TYPE_AMP 2
165 #define VMUSAGE_TYPE_ANON 3
166
167 #define VMUSAGE_BOUND_UNKNOWN 0
168 #define VMUSAGE_BOUND_INCORE 1
169 #define VMUSAGE_BOUND_NOT_INCORE 2
170
171 #define ISWITHIN(node, addr) ((node)->vmb_start <= addr && \
172 (node)->vmb_end >= addr ? 1 : 0)
173
174 /*
175 * bounds for vnodes and shared amps
176 * Each bound is either entirely incore, entirely not in core, or
177 * entirely unknown. Bounds are stored in an AVL tree sorted by the start
178 * member when in use; otherwise (free or temporary lists) they're strung
179 * together off of vmb_next.
180 */
181 typedef struct vmu_bound {
182 avl_node_t vmb_node;
183 struct vmu_bound *vmb_next; /* NULL in tree else on free or temp list */
184 pgcnt_t vmb_start; /* page offset in vnode/amp on which bound starts */
185 pgcnt_t vmb_end; /* page offset in vnode/amp on which bound ends */
186 char vmb_type; /* One of VMUSAGE_BOUND_* */
187 } vmu_bound_t;
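
/*
 * For example, if pages [0, 3] of an object have been found resident and
 * pages [4, 9] have been found not resident, the object's bound tree holds
 * {0, 3, INCORE} and {4, 9, NOT_INCORE}. A later lookup over [5, 12] adds
 * {10, 12, UNKNOWN}, which is then resolved against the page cache and
 * merged with its neighbors where the types match.
 */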
188
189 /*
190 * hash of visited objects (vnodes or shared amps)
191 * key is the address of the vnode or amp. The bounds tree tracks known
192 * incore/not-incore bounds for the vnode/amp.
193 */
194 typedef struct vmu_object {
195 struct vmu_object *vmo_next; /* free list */
196 caddr_t vmo_key;
197 short vmo_type;
198 avl_tree_t vmo_bounds;
199 } vmu_object_t;
200
201 /*
202 * Entity by which to count results.
203 *
204 * The entity structure keeps the current rss/swap counts for each entity
205 * (zone, project, etc), and hashes of vm structures that have already
206 * been visited for the entity.
207 *
208 * vme_next: links the list of all entities currently being counted by
209 * vmu_calculate().
210 *
211 * vme_next_calc: links the list of entities related to the current process
212 * being counted by vmu_calculate_proc().
213 *
214 * vmu_calculate_proc() walks all processes. For each process, it makes a
215 * list of the entities related to that process using vme_next_calc. This
216 * list changes each time vmu_calculate_proc() is called.
217 *
218 */
219 typedef struct vmu_entity {
220 struct vmu_entity *vme_next;
221 struct vmu_entity *vme_next_calc;
222 mod_hash_t *vme_vnode_hash; /* vnodes visited for entity */
223 mod_hash_t *vme_amp_hash; /* shared amps visited for entity */
224 mod_hash_t *vme_anon_hash; /* COW anons visited for entity */
225 vmusage_t vme_result; /* identifies entity and results */
226 } vmu_entity_t;
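
/*
 * As a concrete example: with VMUSAGE_ZONE | VMUSAGE_PROJECTS requested, a
 * process in zone 2, project 10 has the zone-2 entity and the project-10
 * entity linked through vme_next_calc while that process is being counted;
 * pages found for the process are charged to each entity on that short
 * list, subject to the per-entity visited hashes.
 */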
227
228 /*
229 * Hash of entities visited within a zone, and an entity for the zone
230 * itself.
231 */
232 typedef struct vmu_zone {
233 struct vmu_zone *vmz_next; /* free list */
234 id_t vmz_id;
235 vmu_entity_t *vmz_zone;
236 mod_hash_t *vmz_projects_hash;
237 mod_hash_t *vmz_tasks_hash;
238 mod_hash_t *vmz_rusers_hash;
239 mod_hash_t *vmz_eusers_hash;
240 } vmu_zone_t;
241
242 /*
243 * Cache of results from last calculation
244 */
245 typedef struct vmu_cache {
246 vmusage_t *vmc_results; /* Results from last call to */
247 /* vm_getusage(). */
248 uint64_t vmc_nresults; /* Count of cached results */
249 uint64_t vmc_refcnt; /* refcnt for free */
250 uint_t vmc_flags; /* Flags for vm_getusage() */
251 hrtime_t vmc_timestamp; /* when cache was created */
252 } vmu_cache_t;
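
/*
 * For example, a caller passing an age of 30 seconds whose flags are a
 * subset of vmc_flags copies these cached results if vmc_timestamp is
 * within the last 30 seconds; otherwise it triggers (or waits on) a new
 * calculation.
 */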
253
254 /*
255 * top level rss info for the system
256 */
257 typedef struct vmu_data {
258 kmutex_t vmu_lock; /* Protects vmu_data */
259 kcondvar_t vmu_cv; /* Used to signal threads */
260 /* Waiting for */
261 /* Rss_calc_thread to finish */
262 vmu_entity_t *vmu_system; /* Entity for tracking */
263 /* rss/swap for all processes */
264 /* in all zones */
265 mod_hash_t *vmu_zones_hash; /* Zones visited */
266 mod_hash_t *vmu_projects_col_hash; /* These *_col_hash hashes */
267 mod_hash_t *vmu_rusers_col_hash; /* keep track of entities, */
268 mod_hash_t *vmu_eusers_col_hash; /* ignoring zoneid, in order */
269 /* to implement VMUSAGE_COL_* */
270 /* flags, which aggregate by */
271 /* project or user regardless */
272 /* of zoneid. */
273 mod_hash_t *vmu_all_vnodes_hash; /* System wide visited vnodes */
274 /* to track incore/not-incore */
275 mod_hash_t *vmu_all_amps_hash; /* System wide visited shared */
276 /* amps to track incore/not- */
277 /* incore */
278 vmu_entity_t *vmu_entities; /* Linked list of entities */
279 size_t vmu_nentities; /* Count of entities in list */
280 vmu_cache_t *vmu_cache; /* Cached results */
281 kthread_t *vmu_calc_thread; /* NULL, or thread running */
282 /* vmu_calculate() */
283 uint_t vmu_calc_flags; /* Flags being used by */
284 /* currently running calc */
285 /* thread */
286 uint_t vmu_pending_flags; /* Flags of vm_getusage() */
287 /* threads waiting for */
288 /* calc thread to finish */
289 uint_t vmu_pending_waiters; /* Number of threads waiting */
290 /* for calc thread */
291 vmu_bound_t *vmu_free_bounds;
292 vmu_object_t *vmu_free_objects;
293 vmu_entity_t *vmu_free_entities;
294 vmu_zone_t *vmu_free_zones;
295 } vmu_data_t;
296
297 extern struct as kas;
298 extern proc_t *practive;
299 extern zone_t *global_zone;
300 extern struct seg_ops segvn_ops;
301 extern struct seg_ops segspt_shmops;
302
303 static vmu_data_t vmu_data;
304 static kmem_cache_t *vmu_bound_cache;
305 static kmem_cache_t *vmu_object_cache;
306
307 /*
308 * Comparison routine for AVL tree. We base our comparison on vmb_start.
309 */
310 static int
311 bounds_cmp(const void *bnd1, const void *bnd2)
312 {
313 const vmu_bound_t *bound1 = bnd1;
314 const vmu_bound_t *bound2 = bnd2;
315
316 if (bound1->vmb_start == bound2->vmb_start) {
317 return (0);
318 }
319 if (bound1->vmb_start < bound2->vmb_start) {
320 return (-1);
321 }
322
323 return (1);
324 }
325
326 /*
327 * Save a bound on the free list.
328 */
329 static void
330 vmu_free_bound(vmu_bound_t *bound)
331 {
332 bound->vmb_next = vmu_data.vmu_free_bounds;
333 bound->vmb_start = 0;
334 bound->vmb_end = 0;
335 bound->vmb_type = 0;
336 vmu_data.vmu_free_bounds = bound;
337 }
338
339 /*
340 * Free an object, and all visited bound info.
341 */
342 static void
343 vmu_free_object(mod_hash_val_t val)
344 {
345 vmu_object_t *obj = (vmu_object_t *)val;
346 avl_tree_t *tree = &(obj->vmo_bounds);
347 vmu_bound_t *bound;
348 void *cookie = NULL;
349
350 while ((bound = avl_destroy_nodes(tree, &cookie)) != NULL)
351 vmu_free_bound(bound);
352 avl_destroy(tree);
353
354 obj->vmo_type = 0;
355 obj->vmo_next = vmu_data.vmu_free_objects;
356 vmu_data.vmu_free_objects = obj;
357 }
358
359 /*
360 * Free an entity, and hashes of visited objects for that entity.
361 */
362 static void
363 vmu_free_entity(mod_hash_val_t val)
364 {
365 vmu_entity_t *entity = (vmu_entity_t *)val;
366
367 if (entity->vme_vnode_hash != NULL)
368 i_mod_hash_clear_nosync(entity->vme_vnode_hash);
369 if (entity->vme_amp_hash != NULL)
370 i_mod_hash_clear_nosync(entity->vme_amp_hash);
371 if (entity->vme_anon_hash != NULL)
372 i_mod_hash_clear_nosync(entity->vme_anon_hash);
373
374 entity->vme_next = vmu_data.vmu_free_entities;
375 vmu_data.vmu_free_entities = entity;
376 }
377
378 /*
379 * Free zone entity, and all hashes of entities inside that zone,
380 * which are projects, tasks, and users.
381 */
382 static void
383 vmu_free_zone(mod_hash_val_t val)
384 {
385 vmu_zone_t *zone = (vmu_zone_t *)val;
386
387 if (zone->vmz_zone != NULL) {
388 vmu_free_entity((mod_hash_val_t)zone->vmz_zone);
389 zone->vmz_zone = NULL;
390 }
391 if (zone->vmz_projects_hash != NULL)
392 i_mod_hash_clear_nosync(zone->vmz_projects_hash);
393 if (zone->vmz_tasks_hash != NULL)
394 i_mod_hash_clear_nosync(zone->vmz_tasks_hash);
395 if (zone->vmz_rusers_hash != NULL)
396 i_mod_hash_clear_nosync(zone->vmz_rusers_hash);
397 if (zone->vmz_eusers_hash != NULL)
398 i_mod_hash_clear_nosync(zone->vmz_eusers_hash);
399 zone->vmz_next = vmu_data.vmu_free_zones;
400 vmu_data.vmu_free_zones = zone;
401 }
402
403 /*
404 * Initialize synchronization primitives and hashes for system-wide tracking
405 * of visited vnodes and shared amps. Initialize results cache.
406 */
407 void
408 vm_usage_init()
409 {
410 mutex_init(&vmu_data.vmu_lock, NULL, MUTEX_DEFAULT, NULL);
411 cv_init(&vmu_data.vmu_cv, NULL, CV_DEFAULT, NULL);
412
413 vmu_data.vmu_system = NULL;
414 vmu_data.vmu_zones_hash = NULL;
415 vmu_data.vmu_projects_col_hash = NULL;
416 vmu_data.vmu_rusers_col_hash = NULL;
417 vmu_data.vmu_eusers_col_hash = NULL;
418
419 vmu_data.vmu_free_bounds = NULL;
420 vmu_data.vmu_free_objects = NULL;
421 vmu_data.vmu_free_entities = NULL;
422 vmu_data.vmu_free_zones = NULL;
423
424 vmu_data.vmu_all_vnodes_hash = mod_hash_create_ptrhash(
425 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
426 sizeof (vnode_t));
427 vmu_data.vmu_all_amps_hash = mod_hash_create_ptrhash(
428 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
429 sizeof (struct anon_map));
430 vmu_data.vmu_projects_col_hash = mod_hash_create_idhash(
431 "vmusage collapsed project hash", VMUSAGE_HASH_SIZE,
432 vmu_free_entity);
433 vmu_data.vmu_rusers_col_hash = mod_hash_create_idhash(
434 "vmusage collapsed ruser hash", VMUSAGE_HASH_SIZE,
435 vmu_free_entity);
436 vmu_data.vmu_eusers_col_hash = mod_hash_create_idhash(
437 "vmusage collpased euser hash", VMUSAGE_HASH_SIZE,
438 vmu_free_entity);
439 vmu_data.vmu_zones_hash = mod_hash_create_idhash(
440 "vmusage zone hash", VMUSAGE_HASH_SIZE, vmu_free_zone);
441
442 vmu_bound_cache = kmem_cache_create("vmu_bound_cache",
443 sizeof (vmu_bound_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
444 vmu_object_cache = kmem_cache_create("vmu_object_cache",
445 sizeof (vmu_object_t), 0, NULL, NULL, NULL, NULL, NULL, 0);
446
447 vmu_data.vmu_entities = NULL;
448 vmu_data.vmu_nentities = 0;
449
450 vmu_data.vmu_cache = NULL;
451 vmu_data.vmu_calc_thread = NULL;
452 vmu_data.vmu_calc_flags = 0;
453 vmu_data.vmu_pending_flags = 0;
454 vmu_data.vmu_pending_waiters = 0;
455 }
456
457 /*
458 * Allocate hashes for tracking vm objects visited for an entity.
459 * Update list of entities.
460 */
461 static vmu_entity_t *
462 vmu_alloc_entity(id_t id, int type, id_t zoneid)
463 {
464 vmu_entity_t *entity;
465
466 if (vmu_data.vmu_free_entities != NULL) {
467 entity = vmu_data.vmu_free_entities;
468 vmu_data.vmu_free_entities =
469 vmu_data.vmu_free_entities->vme_next;
470 bzero(&entity->vme_result, sizeof (vmusage_t));
471 } else {
472 entity = kmem_zalloc(sizeof (vmu_entity_t), KM_SLEEP);
473 }
474 entity->vme_result.vmu_id = id;
475 entity->vme_result.vmu_zoneid = zoneid;
476 entity->vme_result.vmu_type = type;
477
478 if (entity->vme_vnode_hash == NULL)
479 entity->vme_vnode_hash = mod_hash_create_ptrhash(
480 "vmusage vnode hash", VMUSAGE_HASH_SIZE, vmu_free_object,
481 sizeof (vnode_t));
482
483 if (entity->vme_amp_hash == NULL)
484 entity->vme_amp_hash = mod_hash_create_ptrhash(
485 "vmusage amp hash", VMUSAGE_HASH_SIZE, vmu_free_object,
486 sizeof (struct anon_map));
487
488 if (entity->vme_anon_hash == NULL)
489 entity->vme_anon_hash = mod_hash_create_ptrhash(
490 "vmusage anon hash", VMUSAGE_HASH_SIZE,
491 mod_hash_null_valdtor, sizeof (struct anon));
492
493 entity->vme_next = vmu_data.vmu_entities;
494 vmu_data.vmu_entities = entity;
495 vmu_data.vmu_nentities++;
496
497 return (entity);
498 }
499
500 /*
501 * Allocate a zone entity, and hashes for tracking visited vm objects
502 * for projects, tasks, and users within that zone.
503 */
504 static vmu_zone_t *
505 vmu_alloc_zone(id_t id)
506 {
507 vmu_zone_t *zone;
508
509 if (vmu_data.vmu_free_zones != NULL) {
510 zone = vmu_data.vmu_free_zones;
511 vmu_data.vmu_free_zones =
512 vmu_data.vmu_free_zones->vmz_next;
513 zone->vmz_next = NULL;
514 zone->vmz_zone = NULL;
515 } else {
516 zone = kmem_zalloc(sizeof (vmu_zone_t), KM_SLEEP);
517 }
518
519 zone->vmz_id = id;
520
521 if ((vmu_data.vmu_calc_flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES)) != 0)
522 zone->vmz_zone = vmu_alloc_entity(id, VMUSAGE_ZONE, id);
523
524 if ((vmu_data.vmu_calc_flags & (VMUSAGE_PROJECTS |
525 VMUSAGE_ALL_PROJECTS)) != 0 && zone->vmz_projects_hash == NULL)
526 zone->vmz_projects_hash = mod_hash_create_idhash(
527 "vmusage project hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
528
529 if ((vmu_data.vmu_calc_flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
530 != 0 && zone->vmz_tasks_hash == NULL)
531 zone->vmz_tasks_hash = mod_hash_create_idhash(
532 "vmusage task hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
533
534 if ((vmu_data.vmu_calc_flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS))
535 != 0 && zone->vmz_rusers_hash == NULL)
536 zone->vmz_rusers_hash = mod_hash_create_idhash(
537 "vmusage ruser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
538
539 if ((vmu_data.vmu_calc_flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS))
540 != 0 && zone->vmz_eusers_hash == NULL)
541 zone->vmz_eusers_hash = mod_hash_create_idhash(
542 "vmusage euser hash", VMUSAGE_HASH_SIZE, vmu_free_entity);
543
544 return (zone);
545 }
546
547 /*
548 * Allocate a structure for tracking visited bounds for a vm object.
549 */
550 static vmu_object_t *
551 vmu_alloc_object(caddr_t key, int type)
552 {
553 vmu_object_t *object;
554
555 if (vmu_data.vmu_free_objects != NULL) {
556 object = vmu_data.vmu_free_objects;
557 vmu_data.vmu_free_objects =
558 vmu_data.vmu_free_objects->vmo_next;
559 } else {
560 object = kmem_cache_alloc(vmu_object_cache, KM_SLEEP);
561 }
562
563 object->vmo_next = NULL;
564 object->vmo_key = key;
565 object->vmo_type = type;
566 avl_create(&(object->vmo_bounds), bounds_cmp, sizeof (vmu_bound_t), 0);
567
568 return (object);
569 }
570
571 /*
572 * Allocate and return a bound structure.
573 */
574 static vmu_bound_t *
575 vmu_alloc_bound()
576 {
577 vmu_bound_t *bound;
578
579 if (vmu_data.vmu_free_bounds != NULL) {
580 bound = vmu_data.vmu_free_bounds;
581 vmu_data.vmu_free_bounds =
582 vmu_data.vmu_free_bounds->vmb_next;
583 } else {
584 bound = kmem_cache_alloc(vmu_bound_cache, KM_SLEEP);
585 }
586
587 bound->vmb_next = NULL;
588 bound->vmb_start = 0;
589 bound->vmb_end = 0;
590 bound->vmb_type = 0;
591 return (bound);
592 }
593
594 /*
595 * vmu_find_insert_* functions implement hash lookup or allocate and
596 * insert operations.
597 */
598 static vmu_object_t *
599 vmu_find_insert_object(mod_hash_t *hash, caddr_t key, uint_t type)
600 {
601 int ret;
602 vmu_object_t *object;
603
604 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
605 (mod_hash_val_t *)&object);
606 if (ret != 0) {
607 object = vmu_alloc_object(key, type);
608 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
609 (mod_hash_val_t)object, (mod_hash_hndl_t)0);
610 ASSERT(ret == 0);
611 }
612 return (object);
613 }
614
615 static int
616 vmu_find_insert_anon(mod_hash_t *hash, caddr_t key)
617 {
618 int ret;
619 caddr_t val;
620
621 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)key,
622 (mod_hash_val_t *)&val);
623
624 if (ret == 0)
625 return (0);
626
627 ret = i_mod_hash_insert_nosync(hash, (mod_hash_key_t)key,
628 (mod_hash_val_t)key, (mod_hash_hndl_t)0);
629
630 ASSERT(ret == 0);
631
632 return (1);
633 }
634
635 static vmu_entity_t *
636 vmu_find_insert_entity(mod_hash_t *hash, id_t id, uint_t type, id_t zoneid)
637 {
638 int ret;
639 vmu_entity_t *entity;
640
641 ret = i_mod_hash_find_nosync(hash, (mod_hash_key_t)(uintptr_t)id,
642 (mod_hash_val_t *)&entity);
643 if (ret != 0) {
644 entity = vmu_alloc_entity(id, type, zoneid);
645 ret = i_mod_hash_insert_nosync(hash,
646 (mod_hash_key_t)(uintptr_t)id, (mod_hash_val_t)entity,
647 (mod_hash_hndl_t)0);
648 ASSERT(ret == 0);
649 }
650 return (entity);
651 }
652
653
654
655
656 /*
657 * Returns list of object bounds between start and end. New bounds inserted
658 * by this call are given type.
659 *
660 * Returns the number of pages covered if new bounds are created. Returns 0
661 * if region between start/end consists of all existing bounds.
662 */
663 static pgcnt_t
664 vmu_insert_lookup_object_bounds(vmu_object_t *ro, pgcnt_t start, pgcnt_t
665 end, char type, vmu_bound_t **first, vmu_bound_t **last)
666 {
667 avl_tree_t *tree = &(ro->vmo_bounds);
668 avl_index_t where;
669 vmu_bound_t *walker, *tmp;
670 pgcnt_t ret = 0;
671
672 ASSERT(start <= end);
673
674 *first = *last = NULL;
675
676 tmp = vmu_alloc_bound();
677 tmp->vmb_start = start;
678 tmp->vmb_type = type;
679
680 /* Hopelessly optimistic case. */
681 if (walker = avl_find(tree, tmp, &where)) {
682 /* We got lucky. */
683 vmu_free_bound(tmp);
684 *first = walker;
685 }
686
687 if (walker == NULL) {
688 /* Is start in the previous node? */
689 walker = avl_nearest(tree, where, AVL_BEFORE);
690 if (walker != NULL) {
691 if (ISWITHIN(walker, start)) {
692 /* We found start. */
693 vmu_free_bound(tmp);
694 *first = walker;
695 }
696 }
697 }
698
699 /*
700 * At this point, if *first is still NULL, then we
701 * didn't get a direct hit and start isn't covered
702 * by the previous node. We know that the next node
703 * must have a greater start value than we require
704 * because avl_find tells us where the AVL routines would
705 * insert our new node. We have some gap between the
706 * start we want and the next node.
707 */
708 if (*first == NULL) {
709 walker = avl_nearest(tree, where, AVL_AFTER);
710 if (walker != NULL && walker->vmb_start <= end) {
711 /* Fill the gap. */
712 tmp->vmb_end = walker->vmb_start - 1;
713 *first = tmp;
714 } else {
715 /* We have a gap over [start, end]. */
716 tmp->vmb_end = end;
717 *first = *last = tmp;
718 }
719 ret += tmp->vmb_end - tmp->vmb_start + 1;
720 avl_insert(tree, tmp, where);
721 }
722
723 ASSERT(*first != NULL);
724
725 if (*last != NULL) {
726 /* We're done. */
727 return (ret);
728 }
729
730 /*
731 * If we are here we still need to set *last and
732 * that may involve filling in some gaps.
733 */
734 *last = *first;
735 for (;;) {
736 if (ISWITHIN(*last, end)) {
737 /* We're done. */
738 break;
739 }
740 walker = AVL_NEXT(tree, *last);
741 if (walker == NULL || walker->vmb_start > end) {
742 /* Bottom or mid tree with gap. */
743 tmp = vmu_alloc_bound();
744 tmp->vmb_start = (*last)->vmb_end + 1;
745 tmp->vmb_end = end;
746 tmp->vmb_type = type;
747 ret += tmp->vmb_end - tmp->vmb_start + 1;
748 avl_insert_here(tree, tmp, *last, AVL_AFTER);
749 *last = tmp;
750 break;
751 } else {
752 if ((*last)->vmb_end + 1 != walker->vmb_start) {
753 /* Non-contiguous. */
754 tmp = vmu_alloc_bound();
755 tmp->vmb_start = (*last)->vmb_end + 1;
756 tmp->vmb_end = walker->vmb_start - 1;
757 tmp->vmb_type = type;
758 ret += tmp->vmb_end - tmp->vmb_start + 1;
759 avl_insert_here(tree, tmp, *last, AVL_AFTER);
760 *last = tmp;
761 } else {
762 *last = walker;
763 }
764 }
765 }
766
767 return (ret);
768 }
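
/*
 * A worked example of the above: if the tree already holds {0, 3, INCORE}
 * and {4, 9, NOT_INCORE}, a call over [2, 12] with type UNKNOWN sets *first
 * to {0, 3}, walks through {4, 9}, appends a new bound {10, 12, UNKNOWN} as
 * *last, and returns 3, the number of pages newly covered. A call over
 * [2, 8] returns 0, since that range is already fully covered.
 */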
769
770 /*
771 * vmu_update_bounds()
772 *
773 * tree: avl_tree in which first and last hang.
774 *
775 * first, last: list of contiguous bounds, of which zero or more are of
776 * type VMUSAGE_BOUND_UNKNOWN.
777 *
778 * new_tree: avl_tree in which new_first and new_last hang.
779 *
780 * new_first, new_last: list of contiguous bounds, of which none are of
781 * type VMUSAGE_BOUND_UNKNOWN. These bounds are used to
782 * update the types of bounds in (first,last) with
783 * type VMUSAGE_BOUND_UNKNOWN.
784 *
785 * For the list of bounds (first,last), this function updates any bounds
786 * with type VMUSAGE_BOUND_UNKNOWN using the type of the corresponding bound in
787 * the list (new_first, new_last).
788 *
789 * If a bound of type VMUSAGE_BOUND_UNKNOWN spans multiple bounds in the list
790 * (new_first, new_last), it will be split into multiple bounds.
791 *
792 * Return value:
793 * The number of pages in the list of bounds (first,last) that were of
794 * type VMUSAGE_BOUND_UNKNOWN, which have been updated to be of type
795 * VMUSAGE_BOUND_INCORE.
796 *
797 */
798 static pgcnt_t
799 vmu_update_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last,
800 avl_tree_t *new_tree, vmu_bound_t *new_first, vmu_bound_t *new_last)
801 {
802 vmu_bound_t *next, *new_next, *tmp;
803 pgcnt_t rss = 0;
804
805 next = *first;
806 new_next = new_first;
807
808 /*
809 * Verify first and last bound are covered by new bounds if they
810 * have unknown type.
811 */
812 ASSERT((*first)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
813 (*first)->vmb_start >= new_first->vmb_start);
814 ASSERT((*last)->vmb_type != VMUSAGE_BOUND_UNKNOWN ||
815 (*last)->vmb_end <= new_last->vmb_end);
816 for (;;) {
817 /* If bound already has type, proceed to next bound. */
818 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
819 if (next == *last)
820 break;
821 next = AVL_NEXT(tree, next);
822 continue;
823 }
824 while (new_next->vmb_end < next->vmb_start)
825 new_next = AVL_NEXT(new_tree, new_next);
826 ASSERT(new_next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
827 next->vmb_type = new_next->vmb_type;
828 if (new_next->vmb_end < next->vmb_end) {
829 /* need to split bound */
830 tmp = vmu_alloc_bound();
831 tmp->vmb_type = VMUSAGE_BOUND_UNKNOWN;
832 tmp->vmb_start = new_next->vmb_end + 1;
833 tmp->vmb_end = next->vmb_end;
834 avl_insert_here(tree, tmp, next, AVL_AFTER);
835 next->vmb_end = new_next->vmb_end;
836 if (*last == next)
837 *last = tmp;
838 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
839 rss += next->vmb_end - next->vmb_start + 1;
840 next = tmp;
841 } else {
842 if (next->vmb_type == VMUSAGE_BOUND_INCORE)
843 rss += next->vmb_end - next->vmb_start + 1;
844 if (next == *last)
845 break;
846 next = AVL_NEXT(tree, next);
847 }
848 }
849 return (rss);
850 }
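
/*
 * A worked example of the above: if an entity's bounds are the single bound
 * {0, 9, UNKNOWN} and the object's bounds are {0, 3, INCORE} and
 * {4, 9, NOT_INCORE}, the entity bound is split into {0, 3, INCORE} and
 * {4, 9, NOT_INCORE}, and the function returns 4, the number of pages that
 * became known-incore.
 */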
851
852 /*
853 * Merges adjacent bounds with same type between first and last bound.
854 * After merge, last pointer may point to a different bound, as (incoming)
855 * last bound may have been merged away.
856 */
857 static void
858 vmu_merge_bounds(avl_tree_t *tree, vmu_bound_t **first, vmu_bound_t **last)
859 {
860 vmu_bound_t *current;
861 vmu_bound_t *next;
862
863 ASSERT(tree != NULL);
864 ASSERT(*first != NULL);
865 ASSERT(*last != NULL);
866
867 current = *first;
868 while (current != *last) {
869 next = AVL_NEXT(tree, current);
870 if ((current->vmb_end + 1) == next->vmb_start &&
871 current->vmb_type == next->vmb_type) {
872 current->vmb_end = next->vmb_end;
873 avl_remove(tree, next);
874 vmu_free_bound(next);
875 if (next == *last) {
876 *last = current;
877 }
878 } else {
879 current = AVL_NEXT(tree, current);
880 }
881 }
882 }
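
/*
 * For example, {0, 3, INCORE} followed by {4, 9, INCORE} collapses into the
 * single bound {0, 9, INCORE}; an adjacent bound of a different type is
 * left alone.
 */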
883
884 /*
885 * Given an amp and a list of bounds, updates each bound's type with
886 * VMUSAGE_BOUND_INCORE or VMUSAGE_BOUND_NOT_INCORE.
887 *
888 * If a bound is partially incore, it will be split into two bounds.
889 * first and last may be modified, as bounds may be split into multiple
890 * bounds if they are partially incore/not-incore.
891 *
892 * Set incore to B_TRUE if the bounds are already known to be incore.
893 *
894 */
895 static void
896 vmu_amp_update_incore_bounds(avl_tree_t *tree, struct anon_map *amp,
897 vmu_bound_t **first, vmu_bound_t **last, boolean_t incore)
898 {
899 vmu_bound_t *next;
900 vmu_bound_t *tmp;
901 pgcnt_t index;
902 short bound_type;
903 short page_type;
904 vnode_t *vn;
905 anoff_t off;
906 struct anon *ap;
907
908 next = *first;
909 /* Shared anon slots don't change once set. */
910 ANON_LOCK_ENTER(&amp->a_rwlock, RW_READER);
911 for (;;) {
912 if (incore == B_TRUE)
913 next->vmb_type = VMUSAGE_BOUND_INCORE;
914
915 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
916 if (next == *last)
917 break;
918 next = AVL_NEXT(tree, next);
919 continue;
920 }
921 bound_type = next->vmb_type;
922 index = next->vmb_start;
923 while (index <= next->vmb_end) {
924
925 /*
926 * These are used to determine how much to increment
927 * index when a large page is found.
928 */
929 page_t *page;
930 pgcnt_t pgcnt = 1;
931 uint_t pgshft;
932 pgcnt_t pgmsk;
933
934 ap = anon_get_ptr(amp->ahp, index);
935 if (ap != NULL)
936 swap_xlate(ap, &vn, &off);
937
938 if (ap != NULL && vn != NULL && vn->v_pages != NULL &&
939 (page = page_exists(vn, off)) != NULL) {
940 page_type = VMUSAGE_BOUND_INCORE;
941 if (page->p_szc > 0) {
942 pgcnt = page_get_pagecnt(page->p_szc);
943 pgshft = page_get_shift(page->p_szc);
944 pgmsk = (0x1 << (pgshft - PAGESHIFT))
945 - 1;
946 }
947 } else {
948 page_type = VMUSAGE_BOUND_NOT_INCORE;
949 }
950 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
951 next->vmb_type = page_type;
952 } else if (next->vmb_type != page_type) {
953 /*
954 * If current bound type does not match page
955 * type, need to split off new bound.
956 */
957 tmp = vmu_alloc_bound();
958 tmp->vmb_type = page_type;
959 tmp->vmb_start = index;
960 tmp->vmb_end = next->vmb_end;
961 avl_insert_here(tree, tmp, next, AVL_AFTER);
962 next->vmb_end = index - 1;
963 if (*last == next)
964 *last = tmp;
965 next = tmp;
966 }
967 if (pgcnt > 1) {
968 /*
969 * If inside large page, jump to next large
970 * page
971 */
972 index = (index & ~pgmsk) + pgcnt;
973 } else {
974 index++;
975 }
976 }
977 if (next == *last) {
978 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
979 break;
980 } else
981 next = AVL_NEXT(tree, next);
982 }
983 ANON_LOCK_EXIT(&amp->a_rwlock);
984 }
985
986 /*
987 * Same as vmu_amp_update_incore_bounds(), except for tracking
988 * incore-/not-incore for vnodes.
989 */
990 static void
991 vmu_vnode_update_incore_bounds(avl_tree_t *tree, vnode_t *vnode,
992 vmu_bound_t **first, vmu_bound_t **last)
993 {
994 vmu_bound_t *next;
995 vmu_bound_t *tmp;
996 pgcnt_t index;
997 short bound_type;
998 short page_type;
999
1000 next = *first;
1001 for (;;) {
1002 if (vnode->v_pages == NULL)
1003 next->vmb_type = VMUSAGE_BOUND_NOT_INCORE;
1004
1005 if (next->vmb_type != VMUSAGE_BOUND_UNKNOWN) {
1006 if (next == *last)
1007 break;
1008 next = AVL_NEXT(tree, next);
1009 continue;
1010 }
1011
1012 bound_type = next->vmb_type;
1013 index = next->vmb_start;
1014 while (index <= next->vmb_end) {
1015
1016 /*
1017 * These are used to determine how much to increment
1018 * index when a large page is found.
1019 */
1020 page_t *page;
1021 pgcnt_t pgcnt = 1;
1022 uint_t pgshft;
1023 pgcnt_t pgmsk;
1024
1025 if (vnode->v_pages != NULL &&
1026 (page = page_exists(vnode, ptob(index))) != NULL) {
1027 page_type = VMUSAGE_BOUND_INCORE;
1028 if (page->p_szc > 0) {
1029 pgcnt = page_get_pagecnt(page->p_szc);
1030 pgshft = page_get_shift(page->p_szc);
1031 pgmsk = (0x1 << (pgshft - PAGESHIFT))
1032 - 1;
1033 }
1034 } else {
1035 page_type = VMUSAGE_BOUND_NOT_INCORE;
1036 }
1037 if (bound_type == VMUSAGE_BOUND_UNKNOWN) {
1038 next->vmb_type = page_type;
1039 } else if (next->vmb_type != page_type) {
1040 /*
1041 * If current bound type does not match page
1042 * type, need to split off new bound.
1043 */
1044 tmp = vmu_alloc_bound();
1045 tmp->vmb_type = page_type;
1046 tmp->vmb_start = index;
1047 tmp->vmb_end = next->vmb_end;
1048 avl_insert_here(tree, tmp, next, AVL_AFTER);
1049 next->vmb_end = index - 1;
1050 if (*last == next)
1051 *last = tmp;
1052 next = tmp;
1053 }
1054 if (pgcnt > 1) {
1055 /*
1056 * If inside large page, jump to next large
1057 * page
1058 */
1059 index = (index & ~pgmsk) + pgcnt;
1060 } else {
1061 index++;
1062 }
1063 }
1064 if (next == *last) {
1065 ASSERT(next->vmb_type != VMUSAGE_BOUND_UNKNOWN);
1066 break;
1067 } else
1068 next = AVL_NEXT(tree, next);
1069 }
1070 }
1071
1072 /*
1073 * Calculate the rss and swap consumed by a segment. vmu_entities is the
1074 * list of entities to visit. For shared segments, the vnode or amp
1075 * is looked up in each entity to see if it has already been counted. Private
1076 * anon pages are checked per entity to ensure that COW pages are not
1077 * double counted.
1078 *
1079 * For private mapped files, first the amp is checked for private pages.
1080 * Bounds not backed by the amp are looked up in the vnode for each entity
1081 * to avoid double counting of private COW vnode pages.
1082 */
1083 static void
1084 vmu_calculate_seg(vmu_entity_t *vmu_entities, struct seg *seg)
1085 {
1086 struct segvn_data *svd;
1087 struct shm_data *shmd;
1088 struct spt_data *sptd;
1089 vmu_object_t *shared_object = NULL;
1090 vmu_object_t *entity_object = NULL;
1091 vmu_entity_t *entity;
1092 vmusage_t *result;
1093 vmu_bound_t *first = NULL;
1094 vmu_bound_t *last = NULL;
1095 vmu_bound_t *cur = NULL;
1096 vmu_bound_t *e_first = NULL;
1097 vmu_bound_t *e_last = NULL;
1098 vmu_bound_t *tmp;
1099 pgcnt_t p_index, s_index, p_start, p_end, s_start, s_end, rss, virt;
1100 struct anon_map *private_amp = NULL;
1101 boolean_t incore = B_FALSE;
1102 boolean_t shared = B_FALSE;
1103 int file = 0;
1104 pgcnt_t swresv = 0;
1105 pgcnt_t panon = 0;
1106
1107 s_start = 0;
1108 p_end = 0;
1109 /* Can zero-length segments exist? Not sure, so paranoia. */
1110 if (seg->s_size <= 0)
1111 return;
1112
1113 /*
1114 * Figure out if there is a shared object (such as a named vnode or
1115 * a shared amp), then figure out if there is a private amp, which
1116 * identifies private pages.
1117 */
1118 if (seg->s_ops == &segvn_ops) {
1119 svd = (struct segvn_data *)seg->s_data;
1120 if (svd->type == MAP_SHARED) {
1121 shared = B_TRUE;
1122 } else {
1123 swresv = svd->swresv;
1124
1125 if (SEGVN_LOCK_TRYENTER(seg->s_as, &svd->lock,
1126 RW_READER) != 0) {
1127 /*
1128 * Text replication anon maps can be shared
1129 * across all zones. Space used for text
1130 * replication is typically capped as a small %
1131 * of memory. To keep it simple for now we
1132 * don't account for swap and memory space used
1133 * for text replication.
1134 */
1135 if (svd->tr_state == SEGVN_TR_OFF &&
1136 svd->amp != NULL) {
1137 private_amp = svd->amp;
1138 p_start = svd->anon_index;
1139 p_end = svd->anon_index +
1140 btop(seg->s_size) - 1;
1141 }
1142 SEGVN_LOCK_EXIT(seg->s_as, &svd->lock);
1143 }
1144 }
1145 if (svd->vp != NULL) {
1146 file = 1;
1147 shared_object = vmu_find_insert_object(
1148 vmu_data.vmu_all_vnodes_hash, (caddr_t)svd->vp,
1149 VMUSAGE_TYPE_VNODE);
1150 s_start = btop(svd->offset);
1151 s_end = btop(svd->offset + seg->s_size) - 1;
1152 }
1153 if (svd->amp != NULL && svd->type == MAP_SHARED) {
1154 ASSERT(shared_object == NULL);
1155 shared_object = vmu_find_insert_object(
1156 vmu_data.vmu_all_amps_hash, (caddr_t)svd->amp,
1157 VMUSAGE_TYPE_AMP);
1158 s_start = svd->anon_index;
1159 s_end = svd->anon_index + btop(seg->s_size) - 1;
1160 /* schedctl mappings are always in core */
1161 if (svd->amp->swresv == 0)
1162 incore = B_TRUE;
1163 }
1164 } else if (seg->s_ops == &segspt_shmops) {
1165 shared = B_TRUE;
1166 shmd = (struct shm_data *)seg->s_data;
1167 shared_object = vmu_find_insert_object(
1168 vmu_data.vmu_all_amps_hash, (caddr_t)shmd->shm_amp,
1169 VMUSAGE_TYPE_AMP);
1170 s_start = 0;
1171 s_end = btop(seg->s_size) - 1;
1172 sptd = shmd->shm_sptseg->s_data;
1173
1174 /* ism segments are always incore and do not reserve swap */
1175 if (sptd->spt_flags & SHM_SHARE_MMU)
1176 incore = B_TRUE;
1177
1178 } else {
1179 return;
1180 }
1181
1182 /*
1183 * If there is a private amp, count anon pages that exist. If an
1184 * anon has a refcnt > 1 (COW sharing), then save the anon in a
1185 * hash so that it is not double counted.
1186 *
1187 * If there is also a shared object, then figure out the bounds
1188 * which are not mapped by the private amp.
1189 */
1190 if (private_amp != NULL) {
1191
1192 /* Enter as writer to prevent COW anons from being freed */
1193 ANON_LOCK_ENTER(&private_amp->a_rwlock, RW_WRITER);
1194
1195 p_index = p_start;
1196 s_index = s_start;
1197
1198 while (p_index <= p_end) {
1199
1200 pgcnt_t p_index_next;
1201 pgcnt_t p_bound_size;
1202 int cnt;
1203 anoff_t off;
1204 struct vnode *vn;
1205 struct anon *ap;
1206 page_t *page; /* For handling of large */
1207 pgcnt_t pgcnt = 1; /* pages */
1208 pgcnt_t pgstart;
1209 pgcnt_t pgend;
1210 uint_t pgshft;
1211 pgcnt_t pgmsk;
1212
1213 p_index_next = p_index;
1214 ap = anon_get_next_ptr(private_amp->ahp,
1215 &p_index_next);
1216
1217 /*
1218 * If next anon is past end of mapping, simulate
1219 * end of anon so loop terminates.
1220 */
1221 if (p_index_next > p_end) {
1222 p_index_next = p_end + 1;
1223 ap = NULL;
1224 }
1225 /*
1226 * For COW segments, keep track of bounds not
1227 * backed by private amp so they can be looked
1228 * up in the backing vnode
1229 */
1230 if (p_index_next != p_index) {
1231
1232 /*
1233 * Compute index difference between anon and
1234 * previous anon.
1235 */
1236 p_bound_size = p_index_next - p_index - 1;
1237
1238 if (shared_object != NULL) {
1239 cur = vmu_alloc_bound();
1240 cur->vmb_start = s_index;
1241 cur->vmb_end = s_index + p_bound_size;
1242 cur->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1243 if (first == NULL) {
1244 first = cur;
1245 last = cur;
1246 } else {
1247 last->vmb_next = cur;
1248 last = cur;
1249 }
1250 }
1251 p_index = p_index + p_bound_size + 1;
1252 s_index = s_index + p_bound_size + 1;
1253 }
1254
1255 /* Detect end of anons in amp */
1256 if (ap == NULL)
1257 break;
1258
1259 cnt = ap->an_refcnt;
1260 swap_xlate(ap, &vn, &off);
1261
1262 if (vn == NULL || vn->v_pages == NULL ||
1263 (page = page_exists(vn, off)) == NULL) {
1264 p_index++;
1265 s_index++;
1266 continue;
1267 }
1268
1269 /*
1270 * If large page is found, compute portion of large
1271 * page in mapping, and increment indices to the next
1272 * large page.
1273 */
1274 if (page->p_szc > 0) {
1275
1276 pgcnt = page_get_pagecnt(page->p_szc);
1277 pgshft = page_get_shift(page->p_szc);
1278 pgmsk = (0x1 << (pgshft - PAGESHIFT)) - 1;
1279
1280 /* First page in large page */
1281 pgstart = p_index & ~pgmsk;
1282 /* Last page in large page */
1283 pgend = pgstart + pgcnt - 1;
1284 /*
1285 * Artificially end page if page extends past
1286 * end of mapping.
1287 */
1288 if (pgend > p_end)
1289 pgend = p_end;
1290
1291 /*
1292 * Compute number of pages from large page
1293 * which are mapped.
1294 */
1295 pgcnt = pgend - p_index + 1;
1296
1297 /*
1298 * Point indices at page after large page,
1299 * or at page after end of mapping.
1300 */
1301 p_index += pgcnt;
1302 s_index += pgcnt;
1303 } else {
1304 p_index++;
1305 s_index++;
1306 }
1307
1308 /*
1309 * Assume anon structs with a refcnt
1310 * of 1 are not COW shared, so there
1311 * is no reason to track them per entity.
1312 */
1313 if (cnt == 1) {
1314 panon += pgcnt;
1315 continue;
1316 }
1317 for (entity = vmu_entities; entity != NULL;
1318 entity = entity->vme_next_calc) {
1319
1320 result = &entity->vme_result;
1321 /*
1322 * Track COW anons per entity so
1323 * they are not double counted.
1324 */
1325 if (vmu_find_insert_anon(entity->vme_anon_hash,
1326 (caddr_t)ap) == 0)
1327 continue;
1328
1329 result->vmu_rss_all += (pgcnt << PAGESHIFT);
1330 result->vmu_rss_private +=
1331 (pgcnt << PAGESHIFT);
1332 }
1333 }
1334 ANON_LOCK_EXIT(&private_amp->a_rwlock);
1335 }
1336
1337 /* Add up resident anon and swap reserved for private mappings */
1338 if (swresv > 0 || panon > 0) {
1339 for (entity = vmu_entities; entity != NULL;
1340 entity = entity->vme_next_calc) {
1341 result = &entity->vme_result;
1342 result->vmu_swap_all += swresv;
1343 result->vmu_swap_private += swresv;
1344 result->vmu_rss_all += (panon << PAGESHIFT);
1345 result->vmu_rss_private += (panon << PAGESHIFT);
1346 }
1347 }
1348
1349 /* Compute resident pages backing shared amp or named vnode */
1350 if (shared_object != NULL) {
1351 avl_tree_t *tree = &(shared_object->vmo_bounds);
1352
1353 if (first == NULL) {
1354 /*
1355 * No private amp, or private amp has no anon
1356 * structs. This means entire segment is backed by
1357 * the shared object.
1358 */
1359 first = vmu_alloc_bound();
1360 first->vmb_start = s_start;
1361 first->vmb_end = s_end;
1362 first->vmb_type = VMUSAGE_BOUND_UNKNOWN;
1363 }
1364 /*
1365 * Iterate bounds not backed by private amp, and compute
1366 * resident pages.
1367 */
1368 cur = first;
1369 while (cur != NULL) {
1370
1371 if (vmu_insert_lookup_object_bounds(shared_object,
1372 cur->vmb_start, cur->vmb_end, VMUSAGE_BOUND_UNKNOWN,
1373 &first, &last) > 0) {
1374 /* new bounds, find incore/not-incore */
1375 if (shared_object->vmo_type ==
1376 VMUSAGE_TYPE_VNODE) {
1377 vmu_vnode_update_incore_bounds(
1378 tree,
1379 (vnode_t *)
1380 shared_object->vmo_key, &first,
1381 &last);
1382 } else {
1383 vmu_amp_update_incore_bounds(
1384 tree,
1385 (struct anon_map *)
1386 shared_object->vmo_key, &first,
1387 &last, incore);
1388 }
1389 vmu_merge_bounds(tree, &first, &last);
1390 }
1391 for (entity = vmu_entities; entity != NULL;
1392 entity = entity->vme_next_calc) {
1393 avl_tree_t *e_tree;
1394
1395 result = &entity->vme_result;
1396
1397 entity_object = vmu_find_insert_object(
1398 shared_object->vmo_type ==
1399 VMUSAGE_TYPE_VNODE ? entity->vme_vnode_hash:
1400 entity->vme_amp_hash,
1401 shared_object->vmo_key,
1402 shared_object->vmo_type);
1403
1404 virt = vmu_insert_lookup_object_bounds(
1405 entity_object, cur->vmb_start, cur->vmb_end,
1406 VMUSAGE_BOUND_UNKNOWN, &e_first, &e_last);
1407
1408 if (virt == 0)
1409 continue;
1410 /*
1411 * Range visited for this entity
1412 */
1413 e_tree = &(entity_object->vmo_bounds);
1414 rss = vmu_update_bounds(e_tree, &e_first,
1415 &e_last, tree, first, last);
1416 result->vmu_rss_all += (rss << PAGESHIFT);
1417 if (shared == B_TRUE && file == B_FALSE) {
1418 /* shared anon mapping */
1419 result->vmu_swap_all +=
1420 (virt << PAGESHIFT);
1421 result->vmu_swap_shared +=
1422 (virt << PAGESHIFT);
1423 result->vmu_rss_shared +=
1424 (rss << PAGESHIFT);
1425 } else if (shared == B_TRUE && file == B_TRUE) {
1426 /* shared file mapping */
1427 result->vmu_rss_shared +=
1428 (rss << PAGESHIFT);
1429 } else if (shared == B_FALSE &&
1430 file == B_TRUE) {
1431 /* private file mapping */
1432 result->vmu_rss_private +=
1433 (rss << PAGESHIFT);
1434 }
1435 vmu_merge_bounds(e_tree, &e_first, &e_last);
1436 }
1437 tmp = cur;
1438 cur = cur->vmb_next;
1439 vmu_free_bound(tmp);
1440 }
1441 }
1442 }
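
/*
 * For example, a MAP_PRIVATE mapping of a file in which some pages have
 * been modified (COW) is charged in two parts: resident anon pages found
 * in the private amp count toward vmu_rss_private, while the untouched
 * ranges are looked up in the backing vnode per entity, so that the same
 * page is not charged to an entity twice.
 */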
1443
1444 /*
1445 * Based on the current calculation flags, find the entities which are
1446 * relevant to the process. Then calculate each segment in the
1447 * process's address space for each relevant entity.
1448 */
1449 static void
1450 vmu_calculate_proc(proc_t *p)
1451 {
1452 vmu_entity_t *entities = NULL;
1453 vmu_zone_t *zone;
1454 vmu_entity_t *tmp;
1455 struct as *as;
1456 struct seg *seg;
1457 int ret;
1458
1459 /* Figure out which entities are being computed */
1460 if ((vmu_data.vmu_system) != NULL) {
1461 tmp = vmu_data.vmu_system;
1462 tmp->vme_next_calc = entities;
1463 entities = tmp;
1464 }
1465 if (vmu_data.vmu_calc_flags &
1466 (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES | VMUSAGE_PROJECTS |
1467 VMUSAGE_ALL_PROJECTS | VMUSAGE_TASKS | VMUSAGE_ALL_TASKS |
1468 VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_EUSERS |
1469 VMUSAGE_ALL_EUSERS)) {
1470 ret = i_mod_hash_find_nosync(vmu_data.vmu_zones_hash,
1471 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1472 (mod_hash_val_t *)&zone);
1473 if (ret != 0) {
1474 zone = vmu_alloc_zone(p->p_zone->zone_id);
1475 ret = i_mod_hash_insert_nosync(vmu_data.vmu_zones_hash,
1476 (mod_hash_key_t)(uintptr_t)p->p_zone->zone_id,
1477 (mod_hash_val_t)zone, (mod_hash_hndl_t)0);
1478 ASSERT(ret == 0);
1479 }
1480 if (zone->vmz_zone != NULL) {
1481 tmp = zone->vmz_zone;
1482 tmp->vme_next_calc = entities;
1483 entities = tmp;
1484 }
1485 if (vmu_data.vmu_calc_flags &
1486 (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS)) {
1487 tmp = vmu_find_insert_entity(zone->vmz_projects_hash,
1488 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS,
1489 zone->vmz_id);
1490 tmp->vme_next_calc = entities;
1491 entities = tmp;
1492 }
1493 if (vmu_data.vmu_calc_flags &
1494 (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS)) {
1495 tmp = vmu_find_insert_entity(zone->vmz_tasks_hash,
1496 p->p_task->tk_tkid, VMUSAGE_TASKS, zone->vmz_id);
1497 tmp->vme_next_calc = entities;
1498 entities = tmp;
1499 }
1500 if (vmu_data.vmu_calc_flags &
1501 (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS)) {
1502 tmp = vmu_find_insert_entity(zone->vmz_rusers_hash,
1503 crgetruid(p->p_cred), VMUSAGE_RUSERS, zone->vmz_id);
1504 tmp->vme_next_calc = entities;
1505 entities = tmp;
1506 }
1507 if (vmu_data.vmu_calc_flags &
1508 (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS)) {
1509 tmp = vmu_find_insert_entity(zone->vmz_eusers_hash,
1510 crgetuid(p->p_cred), VMUSAGE_EUSERS, zone->vmz_id);
1511 tmp->vme_next_calc = entities;
1512 entities = tmp;
1513 }
1514 }
1515 /* Entities which collapse projects and users for all zones */
1516 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_PROJECTS) {
1517 tmp = vmu_find_insert_entity(vmu_data.vmu_projects_col_hash,
1518 p->p_task->tk_proj->kpj_id, VMUSAGE_PROJECTS, ALL_ZONES);
1519 tmp->vme_next_calc = entities;
1520 entities = tmp;
1521 }
1522 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_RUSERS) {
1523 tmp = vmu_find_insert_entity(vmu_data.vmu_rusers_col_hash,
1524 crgetruid(p->p_cred), VMUSAGE_RUSERS, ALL_ZONES);
1525 tmp->vme_next_calc = entities;
1526 entities = tmp;
1527 }
1528 if (vmu_data.vmu_calc_flags & VMUSAGE_COL_EUSERS) {
1529 tmp = vmu_find_insert_entity(vmu_data.vmu_eusers_col_hash,
1530 crgetuid(p->p_cred), VMUSAGE_EUSERS, ALL_ZONES);
1531 tmp->vme_next_calc = entities;
1532 entities = tmp;
1533 }
1534
1535 ASSERT(entities != NULL);
1536 /* process all segs in process's address space */
1537 as = p->p_as;
1538 AS_LOCK_ENTER(as, RW_READER);
1539 for (seg = AS_SEGFIRST(as); seg != NULL;
1540 seg = AS_SEGNEXT(as, seg)) {
1541 vmu_calculate_seg(entities, seg);
1542 }
1543 AS_LOCK_EXIT(as);
1544 }
1545
1546 /*
1547 * Free data created by previous call to vmu_calculate().
1548 */
1549 static void
1550 vmu_clear_calc()
1551 {
1552 if (vmu_data.vmu_system != NULL) {
1553 vmu_free_entity(vmu_data.vmu_system);
1554 vmu_data.vmu_system = NULL;
1555 }
1556 if (vmu_data.vmu_zones_hash != NULL)
1557 i_mod_hash_clear_nosync(vmu_data.vmu_zones_hash);
1558 if (vmu_data.vmu_projects_col_hash != NULL)
1559 i_mod_hash_clear_nosync(vmu_data.vmu_projects_col_hash);
1560 if (vmu_data.vmu_rusers_col_hash != NULL)
1561 i_mod_hash_clear_nosync(vmu_data.vmu_rusers_col_hash);
1562 if (vmu_data.vmu_eusers_col_hash != NULL)
1563 i_mod_hash_clear_nosync(vmu_data.vmu_eusers_col_hash);
1564
1565 i_mod_hash_clear_nosync(vmu_data.vmu_all_vnodes_hash);
1566 i_mod_hash_clear_nosync(vmu_data.vmu_all_amps_hash);
1567 }
1568
1569 /*
1570 * Free unused data structures. These can result if the system workload
1571 * decreases between calculations.
1572 */
1573 static void
1574 vmu_free_extra()
1575 {
1576 vmu_bound_t *tb;
1577 vmu_object_t *to;
1578 vmu_entity_t *te;
1579 vmu_zone_t *tz;
1580
1581 while (vmu_data.vmu_free_bounds != NULL) {
1582 tb = vmu_data.vmu_free_bounds;
1583 vmu_data.vmu_free_bounds = vmu_data.vmu_free_bounds->vmb_next;
1584 kmem_cache_free(vmu_bound_cache, tb);
1585 }
1586 while (vmu_data.vmu_free_objects != NULL) {
1587 to = vmu_data.vmu_free_objects;
1588 vmu_data.vmu_free_objects =
1589 vmu_data.vmu_free_objects->vmo_next;
1590 kmem_cache_free(vmu_object_cache, to);
1591 }
1592 while (vmu_data.vmu_free_entities != NULL) {
1593 te = vmu_data.vmu_free_entities;
1594 vmu_data.vmu_free_entities =
1595 vmu_data.vmu_free_entities->vme_next;
1596 if (te->vme_vnode_hash != NULL)
1597 mod_hash_destroy_hash(te->vme_vnode_hash);
1598 if (te->vme_amp_hash != NULL)
1599 mod_hash_destroy_hash(te->vme_amp_hash);
1600 if (te->vme_anon_hash != NULL)
1601 mod_hash_destroy_hash(te->vme_anon_hash);
1602 kmem_free(te, sizeof (vmu_entity_t));
1603 }
1604 while (vmu_data.vmu_free_zones != NULL) {
1605 tz = vmu_data.vmu_free_zones;
1606 vmu_data.vmu_free_zones =
1607 vmu_data.vmu_free_zones->vmz_next;
1608 if (tz->vmz_projects_hash != NULL)
1609 mod_hash_destroy_hash(tz->vmz_projects_hash);
1610 if (tz->vmz_tasks_hash != NULL)
1611 mod_hash_destroy_hash(tz->vmz_tasks_hash);
1612 if (tz->vmz_rusers_hash != NULL)
1613 mod_hash_destroy_hash(tz->vmz_rusers_hash);
1614 if (tz->vmz_eusers_hash != NULL)
1615 mod_hash_destroy_hash(tz->vmz_eusers_hash);
1616 kmem_free(tz, sizeof (vmu_zone_t));
1617 }
1618 }
1619
1620 extern kcondvar_t *pr_pid_cv;
1621
1622 /*
1623 * Determine which entity types are relevant and allocate the hashes to
1624 * track them. Then walk the process table and count rss and swap
1625 * for each process'es address space. Address space object such as
1626 * vnodes, amps and anons are tracked per entity, so that they are
1627 * not double counted in the results.
1628 *
1629 */
1630 static void
1631 vmu_calculate()
1632 {
1633 int i = 0;
1634 int ret;
1635 proc_t *p;
1636
1637 vmu_clear_calc();
1638
1639 if (vmu_data.vmu_calc_flags & VMUSAGE_SYSTEM)
1640 vmu_data.vmu_system = vmu_alloc_entity(0, VMUSAGE_SYSTEM,
1641 ALL_ZONES);
1642
1643 /*
1644 * Walk process table and calculate rss of each proc.
1645 *
1646 * Pidlock and p_lock cannot be held while doing the rss calculation.
1647 * This is because:
1648 * 1. The calculation allocates using KM_SLEEP.
1649 * 2. The calculation grabs a_lock, which cannot be grabbed
1650 * after p_lock.
1651 *
1652 * Since pidlock must be dropped, we cannot simply walk the
1653 * practive list. Instead, we walk the process table, and sprlock
1654 * each process to ensure that it does not exit during the
1655 * calculation.
1656 */
1657
1658 mutex_enter(&pidlock);
1659 for (i = 0; i < v.v_proc; i++) {
1660 again:
1661 p = pid_entry(i);
1662 if (p == NULL)
1663 continue;
1664
1665 mutex_enter(&p->p_lock);
1666 mutex_exit(&pidlock);
1667
1668 if (panicstr) {
1669 mutex_exit(&p->p_lock);
1670 return;
1671 }
1672
1673 /* Try to set P_PR_LOCK */
1674 ret = sprtrylock_proc(p);
1675 if (ret == -1) {
1676 /* Process in invalid state */
1677 mutex_exit(&p->p_lock);
1678 mutex_enter(&pidlock);
1679 continue;
1680 } else if (ret == 1) {
1681 /*
1682 * P_PR_LOCK is already set. Wait and try again.
1683 * This also drops p_lock.
1684 */
1685 sprwaitlock_proc(p);
1686 mutex_enter(&pidlock);
1687 goto again;
1688 }
1689 mutex_exit(&p->p_lock);
1690
1691 vmu_calculate_proc(p);
1692
1693 mutex_enter(&p->p_lock);
1694 sprunlock(p);
1695 mutex_enter(&pidlock);
1696 }
1697 mutex_exit(&pidlock);
1698
1699 vmu_free_extra();
1700 }
1701
1702 /*
1703 * allocate a new cache for N results satisfying flags
1704 */
1705 vmu_cache_t *
1706 vmu_cache_alloc(size_t nres, uint_t flags)
1707 {
1708 vmu_cache_t *cache;
1709
1710 cache = kmem_zalloc(sizeof (vmu_cache_t), KM_SLEEP);
1711 cache->vmc_results = kmem_zalloc(sizeof (vmusage_t) * nres, KM_SLEEP);
1712 cache->vmc_nresults = nres;
1713 cache->vmc_flags = flags;
1714 cache->vmc_refcnt = 1;
1715 return (cache);
1716 }
1717
1718 /*
1719 * Make sure cached results are not freed
1720 */
1721 static void
1722 vmu_cache_hold(vmu_cache_t *cache)
1723 {
1724 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1725 cache->vmc_refcnt++;
1726 }
1727
1728 /*
1729 * free cache data
1730 */
1731 static void
1732 vmu_cache_rele(vmu_cache_t *cache)
1733 {
1734 ASSERT(MUTEX_HELD(&vmu_data.vmu_lock));
1735 ASSERT(cache->vmc_refcnt > 0);
1736 cache->vmc_refcnt--;
1737 if (cache->vmc_refcnt == 0) {
1738 kmem_free(cache->vmc_results, sizeof (vmusage_t) *
1739 cache->vmc_nresults);
1740 kmem_free(cache, sizeof (vmu_cache_t));
1741 }
1742 }
1743
1744 /*
1745 * Copy out the cached results to a caller. Inspect the caller's flags
1746 * and zone to determine which cached results should be copied.
1747 */
1748 static int
1749 vmu_copyout_results(vmu_cache_t *cache, vmusage_t *buf, size_t *nres,
1750 uint_t flags, int cpflg)
1751 {
1752 vmusage_t *result, *out_result;
1753 vmusage_t dummy;
1754 size_t i, count = 0;
1755 size_t bufsize;
1756 int ret = 0;
1757 uint_t types = 0;
1758
1759 if (nres != NULL) {
1760 if (ddi_copyin((caddr_t)nres, &bufsize, sizeof (size_t), cpflg))
1761 return (set_errno(EFAULT));
1762 } else {
1763 bufsize = 0;
1764 }
1765
1766 /* figure out what results the caller is interested in. */
1767 if ((flags & VMUSAGE_SYSTEM) && curproc->p_zone == global_zone)
1768 types |= VMUSAGE_SYSTEM;
1769 if (flags & (VMUSAGE_ZONE | VMUSAGE_ALL_ZONES))
1770 types |= VMUSAGE_ZONE;
1771 if (flags & (VMUSAGE_PROJECTS | VMUSAGE_ALL_PROJECTS |
1772 VMUSAGE_COL_PROJECTS))
1773 types |= VMUSAGE_PROJECTS;
1774 if (flags & (VMUSAGE_TASKS | VMUSAGE_ALL_TASKS))
1775 types |= VMUSAGE_TASKS;
1776 if (flags & (VMUSAGE_RUSERS | VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS))
1777 types |= VMUSAGE_RUSERS;
1778 if (flags & (VMUSAGE_EUSERS | VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS))
1779 types |= VMUSAGE_EUSERS;
1780
1781 /* count results for current zone */
1782 out_result = buf;
1783 for (result = cache->vmc_results, i = 0;
1784 i < cache->vmc_nresults; result++, i++) {
1785
1786 /* Do not return "other-zone" results to non-global zones */
1787 if (curproc->p_zone != global_zone &&
1788 curproc->p_zone->zone_id != result->vmu_zoneid)
1789 continue;
1790
1791 /*
1792 * If non-global zone requests VMUSAGE_SYSTEM, fake
1793 * up VMUSAGE_ZONE result as VMUSAGE_SYSTEM result.
1794 */
1795 if (curproc->p_zone != global_zone &&
1796 (flags & VMUSAGE_SYSTEM) != 0 &&
1797 result->vmu_type == VMUSAGE_ZONE) {
1798 count++;
1799 if (out_result != NULL) {
1800 if (bufsize < count) {
1801 ret = set_errno(EOVERFLOW);
1802 } else {
1803 dummy = *result;
1804 dummy.vmu_zoneid = ALL_ZONES;
1805 dummy.vmu_id = 0;
1806 dummy.vmu_type = VMUSAGE_SYSTEM;
1807 if (ddi_copyout(&dummy, out_result,
1808 sizeof (vmusage_t), cpflg))
1809 return (set_errno(EFAULT));
1810 out_result++;
1811 }
1812 }
1813 }
1814
1815 /* Skip results that do not match requested type */
1816 if ((result->vmu_type & types) == 0)
1817 continue;
1818
1819 /* Skip collated results if not requested */
1820 if (result->vmu_zoneid == ALL_ZONES) {
1821 if (result->vmu_type == VMUSAGE_PROJECTS &&
1822 (flags & VMUSAGE_COL_PROJECTS) == 0)
1823 continue;
1824 if (result->vmu_type == VMUSAGE_EUSERS &&
1825 (flags & VMUSAGE_COL_EUSERS) == 0)
1826 continue;
1827 if (result->vmu_type == VMUSAGE_RUSERS &&
1828 (flags & VMUSAGE_COL_RUSERS) == 0)
1829 continue;
1830 }
1831
1832 /* Skip "other zone" results if not requested */
1833 if (result->vmu_zoneid != curproc->p_zone->zone_id) {
1834 if (result->vmu_type == VMUSAGE_ZONE &&
1835 (flags & VMUSAGE_ALL_ZONES) == 0)
1836 continue;
1837 if (result->vmu_type == VMUSAGE_PROJECTS &&
1838 (flags & (VMUSAGE_ALL_PROJECTS |
1839 VMUSAGE_COL_PROJECTS)) == 0)
1840 continue;
1841 if (result->vmu_type == VMUSAGE_TASKS &&
1842 (flags & VMUSAGE_ALL_TASKS) == 0)
1843 continue;
1844 if (result->vmu_type == VMUSAGE_RUSERS &&
1845 (flags & (VMUSAGE_ALL_RUSERS |
1846 VMUSAGE_COL_RUSERS)) == 0)
1847 continue;
1848 if (result->vmu_type == VMUSAGE_EUSERS &&
1849 (flags & (VMUSAGE_ALL_EUSERS |
1850 VMUSAGE_COL_EUSERS)) == 0)
1851 continue;
1852 }
1853 count++;
1854 if (out_result != NULL) {
1855 if (bufsize < count) {
1856 ret = set_errno(EOVERFLOW);
1857 } else {
1858 if (ddi_copyout(result, out_result,
1859 sizeof (vmusage_t), cpflg))
1860 return (set_errno(EFAULT));
1861 out_result++;
1862 }
1863 }
1864 }
1865 if (nres != NULL)
1866 if (ddi_copyout(&count, (void *)nres, sizeof (size_t), cpflg))
1867 return (set_errno(EFAULT));
1868
1869 return (ret);
1870 }
1871
1872 /*
1873 * vm_getusage()
1874 *
1875 * Counts rss and swap by zone, project, task, and/or user. The flags argument
1876 * determines the type of results structures returned. Flags requesting
1877 * results from more than one zone are "flattened" to the local zone if the
1878 * caller is not the global zone.
1879 *
1880 * args:
1881 * flags: bitmap consisting of one or more of VMUSAGE_*.
1882 * age: maximum allowable age (time since counting was done) in
1883 * seconds of the results. Results from previous callers are
1884 * cached in kernel.
1885 * buf: pointer to buffer array of vmusage_t. If NULL, then only nres
1886 * set on success.
1887 * nres: Set to number of vmusage_t structures pointed to by buf
1888 * before calling vm_getusage().
1889 * On a return of 0 (success) or EOVERFLOW, it is set to the number
1890 * of result structures returned or that would have been returned.
1891 *
1892 * returns 0 on success, -1 on failure:
1893 * EINTR (interrupted)
1894 * EOVERFLOW (nres too small for results; nres set to needed value for success)
1895 * EINVAL (flags invalid)
1896 * EFAULT (bad address for buf or nres)
1897 */
1898 int
1899 vm_getusage(uint_t flags, time_t age, vmusage_t *buf, size_t *nres, int cpflg)
1900 {
1901 vmu_entity_t *entity;
1902 vmusage_t *result;
1903 int ret = 0;
1904 int cacherecent = 0;
1905 hrtime_t now;
1906 uint_t flags_orig;
1907
1908 /*
1909 * Non-global zones cannot request system wide and/or collated
1910 * results, or the system result, so munge the flags accordingly.
1911 */
1912 flags_orig = flags;
1913 if (curproc->p_zone != global_zone) {
1914 if (flags & (VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS)) {
1915 flags &= ~(VMUSAGE_ALL_PROJECTS | VMUSAGE_COL_PROJECTS);
1916 flags |= VMUSAGE_PROJECTS;
1917 }
1918 if (flags & (VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS)) {
1919 flags &= ~(VMUSAGE_ALL_RUSERS | VMUSAGE_COL_RUSERS);
1920 flags |= VMUSAGE_RUSERS;
1921 }
1922 if (flags & (VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS)) {
1923 flags &= ~(VMUSAGE_ALL_EUSERS | VMUSAGE_COL_EUSERS);
1924 flags |= VMUSAGE_EUSERS;
1925 }
1926 if (flags & VMUSAGE_SYSTEM) {
1927 flags &= ~VMUSAGE_SYSTEM;
1928 flags |= VMUSAGE_ZONE;
1929 }
1930 }
1931
1932 /* Check for unknown flags */
1933 if ((flags & (~VMUSAGE_MASK)) != 0)
1934 return (set_errno(EINVAL));
1935
1936 /* Check for no flags */
1937 if ((flags & VMUSAGE_MASK) == 0)
1938 return (set_errno(EINVAL));
1939
1940 mutex_enter(&vmu_data.vmu_lock);
1941 now = gethrtime();
1942
1943 start:
1944 if (vmu_data.vmu_cache != NULL) {
1945
1946 vmu_cache_t *cache;
1947
1948 if ((vmu_data.vmu_cache->vmc_timestamp +
1949 ((hrtime_t)age * NANOSEC)) > now)
1950 cacherecent = 1;
1951
1952 if ((vmu_data.vmu_cache->vmc_flags & flags) == flags &&
1953 cacherecent == 1) {
1954 cache = vmu_data.vmu_cache;
1955 vmu_cache_hold(cache);
1956 mutex_exit(&vmu_data.vmu_lock);
1957
1958 ret = vmu_copyout_results(cache, buf, nres, flags_orig,
1959 cpflg);
1960 mutex_enter(&vmu_data.vmu_lock);
1961 vmu_cache_rele(cache);
1962 if (vmu_data.vmu_pending_waiters > 0)
1963 cv_broadcast(&vmu_data.vmu_cv);
1964 mutex_exit(&vmu_data.vmu_lock);
1965 return (ret);
1966 }
1967 /*
1968 * If the cache is recent, it is likely that there are other
1969 * consumers of vm_getusage running, so add their flags to the
1970 * desired flags for the calculation.
1971 */
1972 if (cacherecent == 1)
1973 flags = vmu_data.vmu_cache->vmc_flags | flags;
1974 }
1975 if (vmu_data.vmu_calc_thread == NULL) {
1976
1977 vmu_cache_t *cache;
1978
1979 vmu_data.vmu_calc_thread = curthread;
1980 vmu_data.vmu_calc_flags = flags;
1981 vmu_data.vmu_entities = NULL;
1982 vmu_data.vmu_nentities = 0;
1983 if (vmu_data.vmu_pending_waiters > 0)
1984 vmu_data.vmu_calc_flags |=
1985 vmu_data.vmu_pending_flags;
1986
1987 vmu_data.vmu_pending_flags = 0;
1988 mutex_exit(&vmu_data.vmu_lock);
1989 vmu_calculate();
1990 mutex_enter(&vmu_data.vmu_lock);
1991 /* copy results to cache */
1992 if (vmu_data.vmu_cache != NULL)
1993 vmu_cache_rele(vmu_data.vmu_cache);
1994 cache = vmu_data.vmu_cache =
1995 vmu_cache_alloc(vmu_data.vmu_nentities,
1996 vmu_data.vmu_calc_flags);
1997
1998 result = cache->vmc_results;
1999 for (entity = vmu_data.vmu_entities; entity != NULL;
2000 entity = entity->vme_next) {
2001 *result = entity->vme_result;
2002 result++;
2003 }
2004 cache->vmc_timestamp = gethrtime();
2005 vmu_cache_hold(cache);
2006
2007 vmu_data.vmu_calc_flags = 0;
2008 vmu_data.vmu_calc_thread = NULL;
2009
2010 if (vmu_data.vmu_pending_waiters > 0)
2011 cv_broadcast(&vmu_data.vmu_cv);
2012
2013 mutex_exit(&vmu_data.vmu_lock);
2014
2015 /* copy cache */
2016 ret = vmu_copyout_results(cache, buf, nres, flags_orig, cpflg);
2017 mutex_enter(&vmu_data.vmu_lock);
2018 vmu_cache_rele(cache);
2019 mutex_exit(&vmu_data.vmu_lock);
2020
2021 return (ret);
2022 }
2023 vmu_data.vmu_pending_flags |= flags;
2024 vmu_data.vmu_pending_waiters++;
2025 while (vmu_data.vmu_calc_thread != NULL) {
2026 if (cv_wait_sig(&vmu_data.vmu_cv,
2027 &vmu_data.vmu_lock) == 0) {
2028 vmu_data.vmu_pending_waiters--;
2029 mutex_exit(&vmu_data.vmu_lock);
2030 return (set_errno(EINTR));
2031 }
2032 }
2033 vmu_data.vmu_pending_waiters--;
2034 goto start;
2035 }
2036