xref: /illumos-gate/usr/src/uts/common/os/lgrp.c (revision 55381082)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License, Version 1.0 only
6  * (the "License").  You may not use this file except in compliance
7  * with the License.
8  *
9  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
10  * or http://www.opensolaris.org/os/licensing.
11  * See the License for the specific language governing permissions
12  * and limitations under the License.
13  *
14  * When distributing Covered Code, include this CDDL HEADER in each
15  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
16  * If applicable, add the following below this CDDL HEADER, with the
17  * fields enclosed by brackets "[]" replaced with your own identifying
18  * information: Portions Copyright [yyyy] [name of copyright owner]
19  *
20  * CDDL HEADER END
21  */
22 /*
23  * Copyright 2005 Sun Microsystems, Inc.  All rights reserved.
24  * Use is subject to license terms.
25  */
26 
27 #pragma ident	"%Z%%M%	%I%	%E% SMI"
28 
29 /*
30  * Basic NUMA support in terms of locality groups
31  *
32  * Solaris needs to know which CPUs, memory, etc. are near each other to
33  * provide good performance on NUMA machines by optimizing for locality.
34  * In order to do this, a new abstraction called a "locality group (lgroup)"
35  * has been introduced to keep track of which CPU-like and memory-like hardware
36  * resources are close to each other.  Currently, latency is the only measure
37  * used to determine how to group hardware resources into lgroups, but this
38  * does not limit the groupings to be based solely on latency.  Other factors
39  * may be used to determine the groupings in the future.
40  *
41  * Lgroups are organized into a hierarchy or topology that represents the
42  * latency topology of the machine.  There is always at least a root lgroup in
43  * the system.  It represents all the hardware resources in the machine at a
44  * latency big enough that any hardware resource can at least access any other
45  * hardware resource within that latency.  A Uniform Memory Access (UMA)
46  * machine is represented with one lgroup (the root).  In contrast, a NUMA
47  * machine is represented at least by the root lgroup and some number of leaf
48  * lgroups where the leaf lgroups contain the hardware resources within the
49  * least latency of each other and the root lgroup still contains all the
50  * resources in the machine.  Some number of intermediate lgroups may exist
51  * which represent more levels of locality than just the local latency of the
52  * leaf lgroups and the system latency of the root lgroup.  Non-leaf lgroups
53  * (eg. root and intermediate lgroups) contain the next nearest resources to
54  * their child lgroups.  Thus, the lgroup hierarchy from a given leaf lgroup
55  * to the root lgroup shows the hardware resources from closest to farthest
56  * from the leaf lgroup such that each successive ancestor lgroup contains
57  * the next nearest resources at the next level of locality from the previous.
58  *
59  * The kernel uses the lgroup abstraction to know how to allocate resources
60  * near a given process/thread.  At fork() and lwp/thread_create() time, a
61  * "home" lgroup is chosen for a thread.  This is done by picking the lgroup
62  * with the lowest load average.  Binding to a processor or processor set will
63  * change the home lgroup for a thread.  The scheduler has been modified to try
64  * to dispatch a thread on a CPU in its home lgroup.  Physical memory
65  * allocation is lgroup aware too, so memory will be allocated from the current
66  * thread's home lgroup if possible.  If the desired resources are not
67  * available, the kernel traverses the lgroup hierarchy going to the parent
68  * lgroup to find resources at the next level of locality until it reaches the
69  * root lgroup.
70  */
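/*
 * Illustrative sketch (not actual kernel code) of the search described
 * above: starting from a hypothetical leaf lgroup ("leaf_lgrp"), a caller
 * widens its search for resources by following lgrp_parent links toward
 * the root, one level of locality at a time.
 *
 *	lgrp_t	*lgrp;
 *
 *	for (lgrp = leaf_lgrp; lgrp != NULL; lgrp = lgrp->lgrp_parent) {
 *		(try to allocate from the resources in lgrp->lgrp_set[])
 *		(stop as soon as the allocation succeeds)
 *	}
 */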
71 
72 #include <sys/lgrp.h>
73 #include <sys/lgrp_user.h>
74 #include <sys/types.h>
75 #include <sys/mman.h>
76 #include <sys/param.h>
77 #include <sys/var.h>
78 #include <sys/thread.h>
79 #include <sys/cpuvar.h>
80 #include <sys/cpupart.h>
81 #include <sys/kmem.h>
82 #include <vm/seg.h>
83 #include <vm/seg_kmem.h>
84 #include <vm/seg_spt.h>
85 #include <vm/seg_vn.h>
86 #include <vm/as.h>
87 #include <sys/atomic.h>
88 #include <sys/systm.h>
89 #include <sys/errno.h>
90 #include <sys/cmn_err.h>
91 #include <sys/kstat.h>
92 #include <sys/sysmacros.h>
93 #include <sys/chip.h>
94 #include <sys/promif.h>
95 #include <sys/sdt.h>
96 
97 lgrp_gen_t	lgrp_gen = 0;		/* generation of lgroup hierarchy */
98 lgrp_t *lgrp_table[NLGRPS_MAX]; /* table of all initialized lgrp_t structs */
99 				/* indexed by lgrp_id */
100 int	nlgrps;			/* number of lgroups in machine */
101 int	lgrp_alloc_hint = -1;	/* hint for where to try to allocate next */
102 int	lgrp_alloc_max = 0;	/* max lgroup ID allocated so far */
103 
104 /*
105  * Kstat data for lgroups.
106  *
107  * Actual kstat data is collected in lgrp_stats array.
108  * The lgrp_kstat_data array of named kstats is used to extract data from
109  * lgrp_stats and present it to the kstat framework. It is protected from parallel
110  * modifications by lgrp_kstat_mutex. This may cause some contention when
111  * several kstat commands run in parallel but this is not the
112  * performance-critical path.
113  */
114 extern struct lgrp_stats lgrp_stats[];	/* table of per-lgrp stats */
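/*
 * The kstats themselves are created in lgrp_kstat_create() under module
 * "lgrp" with the lgroup ID as the kstat instance and class "misc", so
 * they can be examined from userland with, for example:
 *
 *	kstat -m lgrp
 */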
115 
116 /*
117  * Declare kstat names statically for enums as defined in the header file.
118  */
119 LGRP_KSTAT_NAMES;
120 
121 static void	lgrp_kstat_init(void);
122 static int	lgrp_kstat_extract(kstat_t *, int);
123 static void	lgrp_kstat_reset(lgrp_id_t);
124 
125 static struct kstat_named lgrp_kstat_data[LGRP_NUM_STATS];
126 static kmutex_t lgrp_kstat_mutex;
127 
128 
129 /*
130  * max number of lgroups supported by the platform
131  */
132 int	nlgrpsmax = 0;
133 
134 /*
135  * The root lgroup. Represents the set of resources at the system wide
136  * level of locality.
137  */
138 lgrp_t		*lgrp_root = NULL;
139 
140 /*
141  * During system bootstrap cp_default does not contain the list of lgrp load
142  * averages (cp_lgrploads). The list is allocated after the first CPU is brought
143  * on-line when cp_default is initialized by cpupart_initialize_default().
144  * Configuring CPU0 may create a two-level topology with root and one leaf node
145  * containing CPU0. This topology is initially constructed in a special
146  * statically allocated 2-element lpl list lpl_bootstrap_list and later cloned
147  * to cp_default when cp_default is initialized. The lpl_bootstrap_list is used
148  * for all lpl operations until cp_default is fully constructed.
149  *
150  * The lpl_bootstrap_list is maintained by the code in lgrp.c. Every other
151  * consumer that needs a default lpl should use lpl_bootstrap, a pointer to
152  * the first element of lpl_bootstrap_list.
153  *
154  * CPUs that are added to the system, but have not yet been assigned to an
155  * lgrp will use lpl_bootstrap as a default lpl. This is necessary because
156  * on some architectures (x86) it's possible for the slave CPU startup thread
157  * to enter the dispatcher or allocate memory before calling lgrp_cpu_init().
158  */
159 #define	LPL_BOOTSTRAP_SIZE 2
160 static lpl_t	lpl_bootstrap_list[LPL_BOOTSTRAP_SIZE];
161 lpl_t		*lpl_bootstrap;
162 
163 /*
164  * If cp still references the bootstrap lpl, it has not yet been added to
165  * an lgrp. lgrp_mem_choose() uses this macro to detect the case where
166  * a thread is trying to allocate memory close to a CPU that has no lgrp.
167  */
168 #define	LGRP_CPU_HAS_NO_LGRP(cp)	((cp)->cpu_lpl == lpl_bootstrap)
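/*
 * Minimal usage sketch for the macro above (illustrative only): memory
 * placement code can fall back to the system-wide (root) resources while
 * a CPU is still on the bootstrap lpl and has no lgroup of its own.
 *
 *	if (LGRP_CPU_HAS_NO_LGRP(cp))
 *		lgrp = lgrp_root;
 *	else
 *		lgrp = lgrp_table[cp->cpu_lpl->lpl_lgrpid];
 */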
169 
170 static lgrp_t	lroot;
171 
172 
173 /*
174  * Size, in bytes, beyond which random memory allocation policy is applied
175  * to non-shared memory.  Default is the maximum size, so random memory
176  * allocation won't be used for non-shared memory by default.
177  */
178 size_t	lgrp_privm_random_thresh = (size_t)(-1);
179 
180 /*
181  * Size, in bytes, beyond which random memory allocation policy is applied to
182  * shared memory.  Default is 8MB (2 ISM pages).
183  */
184 size_t	lgrp_shm_random_thresh = 8*1024*1024;
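/*
 * Both thresholds above are plain global tunables.  A sketch of lowering
 * the shared memory threshold to 4MB at boot via /etc/system (the value
 * shown is purely an example):
 *
 *	set lgrp_shm_random_thresh = 0x400000
 */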
185 
186 /*
187  * Whether to do processor set aware memory allocation by default
188  */
189 int	lgrp_mem_pset_aware = 0;
190 
191 /*
192  * Set the default memory allocation policy for root lgroup
193  */
194 lgrp_mem_policy_t	lgrp_mem_policy_root = LGRP_MEM_POLICY_RANDOM;
195 
196 /*
197  * Set the default memory allocation policy.  For most platforms,
198  * next touch is sufficient, but some platforms may wish to override
199  * this.
200  */
201 lgrp_mem_policy_t	lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
202 
203 
204 /*
205  * lgroup CPU event handlers
206  */
207 static void	lgrp_cpu_init(struct cpu *);
208 static void	lgrp_cpu_fini(struct cpu *, lgrp_id_t);
209 static lgrp_t	*lgrp_cpu_to_lgrp(struct cpu *);
210 
211 static void	lgrp_latency_change(u_longlong_t, u_longlong_t);
212 
213 /*
214  * lgroup memory event handlers
215  */
216 static void	lgrp_mem_init(int, lgrp_handle_t, boolean_t);
217 static void	lgrp_mem_fini(int, lgrp_handle_t, boolean_t);
218 static void	lgrp_mem_rename(int, lgrp_handle_t, lgrp_handle_t);
219 
220 /*
221  * lgroup CPU partition event handlers
222  */
223 static void	lgrp_part_add_cpu(struct cpu *, lgrp_id_t);
224 static void	lgrp_part_del_cpu(struct cpu *);
225 
226 static void	lgrp_root_init(void);
227 
228 /*
229  * lpl topology
230  */
231 static void	lpl_init(lpl_t *, lpl_t *, lgrp_t *);
232 static void	lpl_clear(lpl_t *);
233 static void	lpl_leaf_insert(lpl_t *, struct cpupart *);
234 static void	lpl_leaf_remove(lpl_t *, struct cpupart *);
235 static void	lpl_rset_add(lpl_t *, lpl_t *);
236 static void	lpl_rset_del(lpl_t *, lpl_t *);
237 static int	lpl_rset_contains(lpl_t *, lpl_t *);
238 static void	lpl_cpu_adjcnt(lpl_act_t, struct cpu *);
239 static void	lpl_child_update(lpl_t *, struct cpupart *);
240 static int	lpl_pick(lpl_t *, lpl_t *);
241 static void	lpl_verify_wrapper(struct cpupart *);
242 
243 /*
244  * defines for lpl topology verifier return codes
245  */
246 
247 #define	LPL_TOPO_CORRECT			0
248 #define	LPL_TOPO_PART_HAS_NO_LPL		-1
249 #define	LPL_TOPO_CPUS_NOT_EMPTY			-2
250 #define	LPL_TOPO_LGRP_MISMATCH			-3
251 #define	LPL_TOPO_MISSING_PARENT			-4
252 #define	LPL_TOPO_PARENT_MISMATCH		-5
253 #define	LPL_TOPO_BAD_CPUCNT			-6
254 #define	LPL_TOPO_RSET_MISMATCH			-7
255 #define	LPL_TOPO_LPL_ORPHANED			-8
256 #define	LPL_TOPO_LPL_BAD_NCPU			-9
257 #define	LPL_TOPO_RSET_MSSNG_LF			-10
258 #define	LPL_TOPO_CPU_HAS_BAD_LPL		-11
259 #define	LPL_TOPO_BOGUS_HINT			-12
260 #define	LPL_TOPO_NONLEAF_HAS_CPUS		-13
261 #define	LPL_TOPO_LGRP_NOT_LEAF			-14
262 #define	LPL_TOPO_BAD_RSETCNT			-15
263 
264 /*
265  * Return whether lgroup optimizations should be enabled on this system
266  */
267 int
268 lgrp_optimizations(void)
269 {
270 	/*
271 	 * System must have more than 2 lgroups to enable lgroup optimizations
272 	 *
273 	 * XXX This assumes that a 2 lgroup system has an empty root lgroup
274 	 * with one child lgroup containing all the resources. A 2 lgroup
275 	 * system with a root lgroup directly containing CPUs or memory might
276 	 * need lgroup optimizations with its child lgroup, but there
277 	 * isn't such a machine for now....
278 	 */
279 	if (nlgrps > 2)
280 		return (1);
281 
282 	return (0);
283 }
284 
285 /*
286  * Build full lgroup topology
287  */
288 static void
289 lgrp_root_init(void)
290 {
291 	lgrp_handle_t	hand;
292 	int		i;
293 	lgrp_id_t	id;
294 
295 	/*
296 	 * Create the "root" lgroup
297 	 */
298 	ASSERT(nlgrps == 0);
299 	id = nlgrps++;
300 
301 	lgrp_root = &lroot;
302 
303 	lgrp_root->lgrp_cpu = NULL;
304 	lgrp_root->lgrp_mnodes = 0;
305 	lgrp_root->lgrp_nmnodes = 0;
306 	hand = lgrp_plat_root_hand();
307 	lgrp_root->lgrp_plathand = hand;
308 
309 	lgrp_root->lgrp_id = id;
310 	lgrp_root->lgrp_cpucnt = 0;
311 	lgrp_root->lgrp_childcnt = 0;
312 	klgrpset_clear(lgrp_root->lgrp_children);
313 	klgrpset_clear(lgrp_root->lgrp_leaves);
314 	lgrp_root->lgrp_parent = NULL;
315 	lgrp_root->lgrp_chips = NULL;
316 	lgrp_root->lgrp_chipcnt = 0;
317 	lgrp_root->lgrp_latency = lgrp_plat_latency(hand, hand);
318 
319 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
320 		klgrpset_clear(lgrp_root->lgrp_set[i]);
321 
322 	lgrp_root->lgrp_kstat = NULL;
323 
324 	lgrp_table[id] = lgrp_root;
325 
326 	/*
327 	 * Setup initial lpl list for CPU0 and initial t0 home.
328 	 * The only lpl space we have so far is lpl_bootstrap. It is used for
329 	 * all topology operations until cp_default is initialized at which
330 	 * point t0.t_lpl will be updated.
331 	 */
332 	lpl_bootstrap = lpl_bootstrap_list;
333 	t0.t_lpl = lpl_bootstrap;
334 	cp_default.cp_nlgrploads = LPL_BOOTSTRAP_SIZE;
335 	lpl_bootstrap_list[1].lpl_lgrpid = 1;
336 	cp_default.cp_lgrploads = lpl_bootstrap;
337 }
338 
339 /*
340  * Initialize the lgroup framework and allow the platform to do the same
341  */
342 void
343 lgrp_init(void)
344 {
345 	/*
346 	 * Initialize the platform
347 	 */
348 	lgrp_plat_init();
349 
350 	/*
351 	 * Set max number of lgroups supported on this platform which must be
352 	 * less than the max number of lgroups supported by the common lgroup
353 	 * framework (eg. NLGRPS_MAX is max elements in lgrp_table[], etc.)
354 	 */
355 	nlgrpsmax = lgrp_plat_max_lgrps();
356 	ASSERT(nlgrpsmax <= NLGRPS_MAX);
357 }
358 
359 /*
360  * Create the root and cpu0's lgroup, and set t0's home.
361  */
362 void
363 lgrp_setup(void)
364 {
365 	/*
366 	 * Setup the root lgroup
367 	 */
368 	lgrp_root_init();
369 
370 	/*
371 	 * Add cpu0 to an lgroup
372 	 */
373 	lgrp_config(LGRP_CONFIG_CPU_ADD, (uintptr_t)CPU, 0);
374 	lgrp_config(LGRP_CONFIG_CPU_ONLINE, (uintptr_t)CPU, 0);
375 }
376 
377 /*
378  * Lgroup initialization is split in two parts. The first part
379  * (lgrp_main_init()) is called right before start_other_cpus() in main. The
380  * second part (lgrp_main_mp_init()) is called right after start_other_cpus()
381  * when all CPUs are brought online and all distance information is available.
382  *
383  * When lgrp_main_init() is complete it sets lgrp_initialized. The
384  * lgrp_main_mp_init() sets lgrp_topo_initialized.
385  */
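/*
 * Rough sketch of the boot-time call ordering described above
 * (illustrative pseudo-code, not the actual startup path):
 *
 *	lgrp_init();		(platform init, max lgroup count)
 *	lgrp_setup();		(root lgroup, cpu0, t0 home)
 *	...
 *	lgrp_main_init();	(sets lgrp_initialized)
 *	start_other_cpus();
 *	lgrp_main_mp_init();	(sets lgrp_topo_initialized)
 */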
386 
387 /*
388  * true when lgrp initialization has been completed.
389  */
390 int	lgrp_initialized = 0;
391 
392 /*
393  * True when lgrp topology is constructed.
394  */
395 int	lgrp_topo_initialized = 0;
396 
397 /*
398  * Init routine called after startup(), /etc/system has been processed,
399  * and cpu0 has been added to an lgroup.
400  */
401 void
402 lgrp_main_init(void)
403 {
404 	cpu_t		*cp = CPU;
405 	lgrp_id_t	lgrpid;
406 	int		i;
407 	/*
408 	 * Enforce a valid lgrp_mem_default_policy
409 	 */
410 	if ((lgrp_mem_default_policy <= LGRP_MEM_POLICY_DEFAULT) ||
411 	    (lgrp_mem_default_policy >= LGRP_NUM_MEM_POLICIES))
412 		lgrp_mem_default_policy = LGRP_MEM_POLICY_NEXT;
413 
414 	/*
415 	 * See if mpo should be disabled.
416 	 * This may happen in the case of null proc LPA on Starcat.
417 	 * The platform won't be able to detect null proc LPA until after
418 	 * cpu0 and memory have already been added to lgroups.
419 	 * When and if it is detected, the Starcat platform will return
420 	 * a different platform handle for cpu0, which is what we check for
421 	 * here. If mpo should be disabled, move cpu0 to its rightful place
422 	 * (the root), and destroy the remaining lgroups. This effectively
423 	 * provides a UMA lgroup topology.
424 	 */
425 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
426 	if (lgrp_table[lgrpid]->lgrp_plathand !=
427 	    lgrp_plat_cpu_to_hand(cp->cpu_id)) {
428 		lgrp_part_del_cpu(cp);
429 		lgrp_cpu_fini(cp, lgrpid);
430 
431 		lgrp_cpu_init(cp);
432 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
433 
434 		ASSERT(cp->cpu_lpl->lpl_lgrpid == LGRP_ROOTID);
435 
436 		/*
437 		 * Destroy all lgroups except for root
438 		 */
439 		for (i = 0; i <= lgrp_alloc_max; i++) {
440 			if (LGRP_EXISTS(lgrp_table[i]) &&
441 			    lgrp_table[i] != lgrp_root)
442 				lgrp_destroy(lgrp_table[i]);
443 		}
444 
445 		/*
446 		 * Fix up root to point at itself for leaves and resources
447 		 * and not have any children
448 		 */
449 		lgrp_root->lgrp_childcnt = 0;
450 		klgrpset_clear(lgrp_root->lgrp_children);
451 		klgrpset_clear(lgrp_root->lgrp_leaves);
452 		klgrpset_add(lgrp_root->lgrp_leaves, LGRP_ROOTID);
453 		klgrpset_clear(lgrp_root->lgrp_set[LGRP_RSRC_MEM]);
454 		klgrpset_add(lgrp_root->lgrp_set[LGRP_RSRC_MEM], LGRP_ROOTID);
455 	}
456 
457 	/*
458 	 * Initialize kstats framework.
459 	 */
460 	lgrp_kstat_init();
461 	/*
462 	 * cpu0 is finally where it should be, so create its lgroup's kstats
463 	 */
464 	mutex_enter(&cpu_lock);
465 	lgrp_kstat_create(cp);
466 	mutex_exit(&cpu_lock);
467 
468 	lgrp_plat_main_init();
469 	lgrp_initialized = 1;
470 }
471 
472 /*
473  * Finish lgrp initialization after all CPUS are brought on-line.
474  * This routine is called after start_other_cpus().
475  */
476 void
477 lgrp_main_mp_init(void)
478 {
479 	klgrpset_t changed;
480 
481 	/*
482 	 * Update lgroup topology (if necessary)
483 	 */
484 	klgrpset_clear(changed);
485 	(void) lgrp_topo_update(lgrp_table, lgrp_alloc_max + 1, &changed);
486 	lgrp_topo_initialized = 1;
487 }
488 
489 /*
490  * Handle lgroup (re)configuration events (eg. addition of CPU, etc.)
491  */
492 void
493 lgrp_config(lgrp_config_flag_t event, uintptr_t resource, uintptr_t where)
494 {
495 	klgrpset_t	changed;
496 	cpu_t		*cp;
497 	lgrp_id_t	id;
498 	int		rc;
499 
500 	switch (event) {
501 	/*
502 	 * The following (re)configuration events are common code
503 	 * initiated. lgrp_plat_config() is called here to inform the
504 	 * platform of the reconfiguration event.
505 	 */
506 	case LGRP_CONFIG_CPU_ADD:
507 		cp = (cpu_t *)resource;
508 
509 		/*
510 		 * Initialize the new CPU's lgrp related next/prev
511 		 * links, and give it a bootstrap lpl so that it can
512 		 * survive should it need to enter the dispatcher.
513 		 */
514 		cp->cpu_next_lpl = cp;
515 		cp->cpu_prev_lpl = cp;
516 		cp->cpu_next_lgrp = cp;
517 		cp->cpu_prev_lgrp = cp;
518 		cp->cpu_lpl = lpl_bootstrap;
519 
520 		lgrp_plat_config(event, resource);
521 		atomic_add_32(&lgrp_gen, 1);
522 
523 		break;
524 	case LGRP_CONFIG_CPU_DEL:
525 		lgrp_plat_config(event, resource);
526 		atomic_add_32(&lgrp_gen, 1);
527 
528 		break;
529 	case LGRP_CONFIG_CPU_ONLINE:
530 		cp = (cpu_t *)resource;
531 		lgrp_cpu_init(cp);
532 		lgrp_part_add_cpu(cp, cp->cpu_lpl->lpl_lgrpid);
533 		rc = lpl_topo_verify(cp->cpu_part);
534 		if (rc != LPL_TOPO_CORRECT) {
535 			panic("lpl_topo_verify failed: %d", rc);
536 		}
537 		lgrp_plat_config(event, resource);
538 		atomic_add_32(&lgrp_gen, 1);
539 
540 		break;
541 	case LGRP_CONFIG_CPU_OFFLINE:
542 		cp = (cpu_t *)resource;
543 		id = cp->cpu_lpl->lpl_lgrpid;
544 		lgrp_part_del_cpu(cp);
545 		lgrp_cpu_fini(cp, id);
546 		rc = lpl_topo_verify(cp->cpu_part);
547 		if (rc != LPL_TOPO_CORRECT) {
548 			panic("lpl_topo_verify failed: %d", rc);
549 		}
550 		lgrp_plat_config(event, resource);
551 		atomic_add_32(&lgrp_gen, 1);
552 
553 		break;
554 	case LGRP_CONFIG_CPUPART_ADD:
555 		cp = (cpu_t *)resource;
556 		lgrp_part_add_cpu((cpu_t *)resource, (lgrp_id_t)where);
557 		rc = lpl_topo_verify(cp->cpu_part);
558 		if (rc != LPL_TOPO_CORRECT) {
559 			panic("lpl_topo_verify failed: %d", rc);
560 		}
561 		lgrp_plat_config(event, resource);
562 
563 		break;
564 	case LGRP_CONFIG_CPUPART_DEL:
565 		cp = (cpu_t *)resource;
566 		lgrp_part_del_cpu((cpu_t *)resource);
567 		rc = lpl_topo_verify(cp->cpu_part);
568 		if (rc != LPL_TOPO_CORRECT) {
569 			panic("lpl_topo_verify failed: %d", rc);
570 		}
571 		lgrp_plat_config(event, resource);
572 
573 		break;
574 	/*
575 	 * The following events are initiated by the memnode
576 	 * subsystem.
577 	 */
578 	case LGRP_CONFIG_MEM_ADD:
579 		lgrp_mem_init((int)resource, where, B_FALSE);
580 		atomic_add_32(&lgrp_gen, 1);
581 
582 		break;
583 	case LGRP_CONFIG_MEM_DEL:
584 		lgrp_mem_fini((int)resource, where, B_FALSE);
585 		atomic_add_32(&lgrp_gen, 1);
586 
587 		break;
588 	case LGRP_CONFIG_MEM_RENAME: {
589 		lgrp_config_mem_rename_t *ren_arg =
590 		    (lgrp_config_mem_rename_t *)where;
591 
592 		lgrp_mem_rename((int)resource,
593 		    ren_arg->lmem_rename_from,
594 		    ren_arg->lmem_rename_to);
595 		atomic_add_32(&lgrp_gen, 1);
596 
597 		break;
598 	}
599 	case LGRP_CONFIG_GEN_UPDATE:
600 		atomic_add_32(&lgrp_gen, 1);
601 
602 		break;
603 	case LGRP_CONFIG_FLATTEN:
604 		if (where == 0)
605 			lgrp_topo_levels = (int)resource;
606 		else
607 			(void) lgrp_topo_flatten(resource,
608 			    lgrp_table, lgrp_alloc_max, &changed);
609 
610 		break;
611 	/*
612 	 * Initiated by platform latency probing code
613 	 */
614 	case LGRP_CONFIG_LATENCY_CHANGE:
615 		lgrp_latency_change((u_longlong_t)resource,
616 		    (u_longlong_t)where);
617 
618 		break;
619 	case LGRP_CONFIG_NOP:
620 
621 		break;
622 	default:
623 		break;
624 	}
625 
626 }
627 
628 /*
629  * Called to add lgrp info into cpu structure from cpu_add_unit;
630  * do not assume cpu is in cpu[] yet!
631  *
632  * CPUs are brought online with all other CPUs paused so we can't
633  * allocate memory or we could deadlock the system, so we rely on
634  * the platform to statically allocate as much space as we need
635  * for the lgrp structs and stats.
636  */
637 static void
638 lgrp_cpu_init(struct cpu *cp)
639 {
640 	klgrpset_t	changed;
641 	int		count;
642 	lgrp_handle_t	hand;
643 	int		first_cpu;
644 	lgrp_t		*my_lgrp;
645 	lgrp_id_t	lgrpid;
646 	struct cpu	*cptr;
647 	struct chip	*chp;
648 
649 	/*
650 	 * This is the first time through if the resource set
651 	 * for the root lgroup is empty. After cpu0 has been
652 	 * initially added to an lgroup, the root's CPU resource
653 	 * set can never be empty, since the system's last CPU
654 	 * cannot be offlined.
655 	 */
656 	if (klgrpset_isempty(lgrp_root->lgrp_set[LGRP_RSRC_CPU])) {
657 		/*
658 		 * First time through.
659 		 */
660 		first_cpu = 1;
661 	} else {
662 		/*
663 		 * If cpu0 needs to move lgroups, we may come
664 		 * through here again, at which time cpu_lock won't
665 		 * be held, and lgrp_initialized will be false.
666 		 */
667 		ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
668 		ASSERT(cp->cpu_part != NULL);
669 		first_cpu = 0;
670 	}
671 
672 	hand = lgrp_plat_cpu_to_hand(cp->cpu_id);
673 	my_lgrp = lgrp_hand_to_lgrp(hand);
674 
675 	if (my_lgrp == NULL) {
676 		/*
677 		 * Create new lgrp and add it to lgroup topology
678 		 */
679 		my_lgrp = lgrp_create();
680 		my_lgrp->lgrp_plathand = hand;
681 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
682 		lgrpid = my_lgrp->lgrp_id;
683 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
684 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
685 
686 		count = 0;
687 		klgrpset_clear(changed);
688 		count += lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
689 		    &changed);
690 		/*
691 		 * May have added new intermediate lgroups, so need to add
692 		 * resources other than CPUs which are added below
693 		 */
694 		(void) lgrp_mnode_update(changed, NULL);
695 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
696 	    > 0) {
697 		/*
698 		 * Leaf lgroup was created, but latency wasn't available
699 		 * then.  So, set latency for it and fill in rest of lgroup
700 		 * then.  So, set latency for it and fill in the rest of the lgroup
701 		 * topology now that we know how far it is from other leaf
702 		 */
703 		lgrpid = my_lgrp->lgrp_id;
704 		klgrpset_clear(changed);
705 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
706 		    lgrpid))
707 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
708 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
709 		    &changed);
710 
711 		/*
712 		 * May have added new intermediate lgroups, so need to add
713 		 * resources other than CPUs which are added below
714 		 */
715 		(void) lgrp_mnode_update(changed, NULL);
716 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_CPU],
717 	    my_lgrp->lgrp_id)) {
718 		int	i;
719 
720 		/*
721 		 * Update existing lgroup and lgroups containing it with CPU
722 		 * resource
723 		 */
724 		lgrpid = my_lgrp->lgrp_id;
725 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
726 		for (i = 0; i <= lgrp_alloc_max; i++) {
727 			lgrp_t		*lgrp;
728 
729 			lgrp = lgrp_table[i];
730 			if (!LGRP_EXISTS(lgrp) ||
731 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
732 				continue;
733 
734 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
735 		}
736 	}
737 
738 	lgrpid = my_lgrp->lgrp_id;
739 	cp->cpu_lpl = &cp->cpu_part->cp_lgrploads[lgrpid];
740 
741 	/*
742 	 * For multi-lgroup systems, we need to set up the lpl for CPU0 or CPU0
743 	 * will end up in the lpl for lgroup 0 whether it is supposed to be
744 	 * there or not, since none of the lgroup IDs in the lpls have been set yet.
745 	 */
746 	if (first_cpu && nlgrpsmax > 1 && lgrpid != cp->cpu_lpl->lpl_lgrpid)
747 		cp->cpu_lpl->lpl_lgrpid = lgrpid;
748 
749 	/*
750 	 * link the CPU into the lgrp's CPU list
751 	 */
752 	if (my_lgrp->lgrp_cpucnt == 0) {
753 		my_lgrp->lgrp_cpu = cp;
754 		cp->cpu_next_lgrp = cp->cpu_prev_lgrp = cp;
755 	} else {
756 		cptr = my_lgrp->lgrp_cpu;
757 		cp->cpu_next_lgrp = cptr;
758 		cp->cpu_prev_lgrp = cptr->cpu_prev_lgrp;
759 		cptr->cpu_prev_lgrp->cpu_next_lgrp = cp;
760 		cptr->cpu_prev_lgrp = cp;
761 	}
762 	my_lgrp->lgrp_cpucnt++;
763 
764 	/*
765 	 * Add this cpu's chip to the per lgroup list
766 	 * if necessary
767 	 */
768 	if (cp->cpu_chip->chip_lgrp == NULL) {
769 		struct chip *lcpr;
770 
771 		chp = cp->cpu_chip;
772 
773 		if (my_lgrp->lgrp_chipcnt == 0) {
774 			my_lgrp->lgrp_chips = chp;
775 			chp->chip_next_lgrp =
776 			    chp->chip_prev_lgrp = chp;
777 		} else {
778 			lcpr = my_lgrp->lgrp_chips;
779 			chp->chip_next_lgrp = lcpr;
780 			chp->chip_prev_lgrp =
781 			    lcpr->chip_prev_lgrp;
782 			lcpr->chip_prev_lgrp->chip_next_lgrp =
783 			    chp;
784 			lcpr->chip_prev_lgrp = chp;
785 		}
786 		chp->chip_lgrp = my_lgrp;
787 		chp->chip_balance = chp->chip_next_lgrp;
788 		my_lgrp->lgrp_chipcnt++;
789 	}
790 }
791 
792 lgrp_t *
793 lgrp_create(void)
794 {
795 	lgrp_t		*my_lgrp;
796 	lgrp_id_t	lgrpid;
797 	int		i;
798 
799 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
800 
801 	/*
802 	 * Find an open slot in the lgroup table and recycle unused lgroup
803 	 * left there if any
804 	 */
805 	my_lgrp = NULL;
806 	if (lgrp_alloc_hint == -1)
807 		/*
808 		 * Allocate from end when hint not set yet because no lgroups
809 		 * have been deleted yet
810 		 */
811 		lgrpid = nlgrps++;
812 	else {
813 		/*
814 		 * Start looking for next open slot from hint and leave hint
815 		 * at slot allocated
816 		 */
817 		for (i = lgrp_alloc_hint; i < nlgrpsmax; i++) {
818 			my_lgrp = lgrp_table[i];
819 			if (!LGRP_EXISTS(my_lgrp)) {
820 				lgrpid = i;
821 				nlgrps++;
822 				break;
823 			}
824 		}
825 		lgrp_alloc_hint = lgrpid;
826 	}
827 
828 	/*
829 	 * Keep track of max lgroup ID allocated so far to cut down on searches
830 	 */
831 	if (lgrpid > lgrp_alloc_max)
832 		lgrp_alloc_max = lgrpid;
833 
834 	/*
835 	 * Need to allocate new lgroup if next open slot didn't have one
836 	 * for recycling
837 	 */
838 	if (my_lgrp == NULL)
839 		my_lgrp = lgrp_plat_alloc(lgrpid);
840 
841 	if (nlgrps > nlgrpsmax || my_lgrp == NULL)
842 		panic("Too many lgrps for platform (%d)", nlgrps);
843 
844 	my_lgrp->lgrp_id = lgrpid;
845 	my_lgrp->lgrp_latency = 0;
846 	my_lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
847 	my_lgrp->lgrp_parent = NULL;
848 	my_lgrp->lgrp_childcnt = 0;
849 	my_lgrp->lgrp_mnodes = (mnodeset_t)0;
850 	my_lgrp->lgrp_nmnodes = 0;
851 	klgrpset_clear(my_lgrp->lgrp_children);
852 	klgrpset_clear(my_lgrp->lgrp_leaves);
853 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
854 		klgrpset_clear(my_lgrp->lgrp_set[i]);
855 
856 	my_lgrp->lgrp_cpu = NULL;
857 	my_lgrp->lgrp_cpucnt = 0;
858 	my_lgrp->lgrp_chips = NULL;
859 	my_lgrp->lgrp_chipcnt = 0;
860 
861 	if (my_lgrp->lgrp_kstat != NULL)
862 		lgrp_kstat_reset(lgrpid);
863 
864 	lgrp_table[my_lgrp->lgrp_id] = my_lgrp;
865 
866 	return (my_lgrp);
867 }
868 
869 void
870 lgrp_destroy(lgrp_t *lgrp)
871 {
872 	int		i;
873 
874 	/*
875 	 * Unless this lgroup is being destroyed on behalf of
876 	 * the boot CPU, cpu_lock must be held
877 	 */
878 	ASSERT(!lgrp_initialized || MUTEX_HELD(&cpu_lock));
879 
880 	if (nlgrps == 1)
881 		cmn_err(CE_PANIC, "Can't destroy only lgroup!");
882 
883 	if (!LGRP_EXISTS(lgrp))
884 		return;
885 
886 	/*
887 	 * Set hint to lgroup being deleted and try to keep lower numbered
888 	 * hints to facilitate finding empty slots
889 	 */
890 	if (lgrp_alloc_hint == -1 || lgrp->lgrp_id < lgrp_alloc_hint)
891 		lgrp_alloc_hint = lgrp->lgrp_id;
892 
893 	/*
894 	 * Mark this lgroup to be recycled by setting its lgroup ID to
895 	 * LGRP_NONE and clear relevant fields
896 	 */
897 	lgrp->lgrp_id = LGRP_NONE;
898 	lgrp->lgrp_latency = 0;
899 	lgrp->lgrp_plathand = LGRP_NULL_HANDLE;
900 	lgrp->lgrp_parent = NULL;
901 	lgrp->lgrp_childcnt = 0;
902 
903 	klgrpset_clear(lgrp->lgrp_children);
904 	klgrpset_clear(lgrp->lgrp_leaves);
905 	for (i = 0; i < LGRP_RSRC_COUNT; i++)
906 		klgrpset_clear(lgrp->lgrp_set[i]);
907 
908 	lgrp->lgrp_mnodes = (mnodeset_t)0;
909 	lgrp->lgrp_nmnodes = 0;
910 
911 	lgrp->lgrp_cpu = NULL;
912 	lgrp->lgrp_cpucnt = 0;
913 	lgrp->lgrp_chipcnt = 0;
914 	lgrp->lgrp_chips = NULL;
915 
916 	nlgrps--;
917 }
918 
919 /*
920  * Initialize kstat data. Called from lgrp initialization code.
921  */
922 static void
923 lgrp_kstat_init(void)
924 {
925 	lgrp_stat_t	stat;
926 
927 	mutex_init(&lgrp_kstat_mutex, NULL, MUTEX_DEFAULT, NULL);
928 
929 	for (stat = 0; stat < LGRP_NUM_STATS; stat++)
930 		kstat_named_init(&lgrp_kstat_data[stat],
931 		    lgrp_kstat_names[stat], KSTAT_DATA_INT64);
932 }
933 
934 /*
935  * initialize an lgrp's kstats if needed
936  * called with cpu_lock held but not with cpus paused.
937  * we don't tear these down now because we don't know about
938  * memory leaving the lgrp yet...
939  */
940 
941 void
942 lgrp_kstat_create(cpu_t *cp)
943 {
944 	kstat_t		*lgrp_kstat;
945 	lgrp_id_t	lgrpid;
946 	lgrp_t		*my_lgrp;
947 
948 	ASSERT(MUTEX_HELD(&cpu_lock));
949 
950 	lgrpid = cp->cpu_lpl->lpl_lgrpid;
951 	my_lgrp = lgrp_table[lgrpid];
952 
953 	if (my_lgrp->lgrp_kstat != NULL)
954 		return; /* already initialized */
955 
956 	lgrp_kstat = kstat_create("lgrp", lgrpid, NULL, "misc",
957 	    KSTAT_TYPE_NAMED, LGRP_NUM_STATS,
958 	    KSTAT_FLAG_VIRTUAL | KSTAT_FLAG_WRITABLE);
959 
960 	if (lgrp_kstat != NULL) {
961 		lgrp_kstat->ks_lock = &lgrp_kstat_mutex;
962 		lgrp_kstat->ks_private = my_lgrp;
963 		lgrp_kstat->ks_data = &lgrp_kstat_data;
964 		lgrp_kstat->ks_update = lgrp_kstat_extract;
965 		my_lgrp->lgrp_kstat = lgrp_kstat;
966 		kstat_install(lgrp_kstat);
967 	}
968 }
969 
970 /*
971  * this will do something when we manage to remove now unused lgrps
972  */
973 
974 /* ARGSUSED */
975 void
976 lgrp_kstat_destroy(cpu_t *cp)
977 {
978 	ASSERT(MUTEX_HELD(&cpu_lock));
979 }
980 
981 /*
982  * Called when a CPU is off-lined.
983  */
984 static void
985 lgrp_cpu_fini(struct cpu *cp, lgrp_id_t lgrpid)
986 {
987 	lgrp_t *my_lgrp;
988 	struct cpu *prev;
989 	struct cpu *next;
990 	chip_t  *chp;
991 
992 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
993 
994 	prev = cp->cpu_prev_lgrp;
995 	next = cp->cpu_next_lgrp;
996 
997 	prev->cpu_next_lgrp = next;
998 	next->cpu_prev_lgrp = prev;
999 
1000 	/*
1001 	 * just because I'm paranoid doesn't mean...
1002 	 */
1003 
1004 	cp->cpu_next_lgrp = cp->cpu_prev_lgrp = NULL;
1005 
1006 	my_lgrp = lgrp_table[lgrpid];
1007 	my_lgrp->lgrp_cpucnt--;
1008 
1009 	/*
1010 	 * If the last CPU on its chip is being offlined
1011 	 * then remove this chip from the per lgroup list.
1012 	 *
1013 	 * This is also done for the boot CPU when it needs
1014 	 * to move between lgroups as a consequence of
1015 	 * null proc lpa.
1016 	 */
1017 	chp = cp->cpu_chip;
1018 	if (chp->chip_ncpu == 0 || !lgrp_initialized) {
1019 
1020 		chip_t	*chpp;
1021 
1022 		if (--my_lgrp->lgrp_chipcnt == 0)
1023 			my_lgrp->lgrp_chips = NULL;
1024 		else if (my_lgrp->lgrp_chips == chp)
1025 			my_lgrp->lgrp_chips = chp->chip_next_lgrp;
1026 
1027 		/*
1028 		 * Walk this lgroup's chip list looking for chips that
1029 		 * may try to balance against the one that's leaving
1030 		 */
1031 		for (chpp = chp->chip_next_lgrp; chpp != chp;
1032 		    chpp = chpp->chip_next_lgrp) {
1033 			if (chpp->chip_balance == chp)
1034 				chpp->chip_balance = chp->chip_next_lgrp;
1035 		}
1036 
1037 		chp->chip_prev_lgrp->chip_next_lgrp = chp->chip_next_lgrp;
1038 		chp->chip_next_lgrp->chip_prev_lgrp = chp->chip_prev_lgrp;
1039 
1040 		chp->chip_next_lgrp = chp->chip_prev_lgrp = NULL;
1041 		chp->chip_lgrp = NULL;
1042 		chp->chip_balance = NULL;
1043 	}
1044 
1045 	/*
1046 	 * Removing last CPU in lgroup, so update lgroup topology
1047 	 */
1048 	if (my_lgrp->lgrp_cpucnt == 0) {
1049 		klgrpset_t	changed;
1050 		int		count;
1051 		int		i;
1052 
1053 		my_lgrp->lgrp_cpu = NULL;
1054 
1055 		/*
1056 		 * Remove this lgroup from its lgroup CPU resources and remove
1057 		 * lgroup from lgroup topology if it doesn't have any more
1058 		 * resources in it now
1059 		 */
1060 		klgrpset_del(my_lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1061 		if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1062 			count = 0;
1063 			klgrpset_clear(changed);
1064 			count += lgrp_leaf_delete(my_lgrp, lgrp_table,
1065 			    lgrp_alloc_max + 1, &changed);
1066 			return;
1067 		}
1068 
1069 		/*
1070 		 * This lgroup isn't empty, so just remove it from CPU
1071 		 * resources of any lgroups that contain it as such
1072 		 */
1073 		for (i = 0; i <= lgrp_alloc_max; i++) {
1074 			lgrp_t		*lgrp;
1075 
1076 			lgrp = lgrp_table[i];
1077 			if (!LGRP_EXISTS(lgrp) ||
1078 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_CPU],
1079 			    lgrpid))
1080 				continue;
1081 
1082 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_CPU], lgrpid);
1083 		}
1084 		return;
1085 	}
1086 
1087 	if (my_lgrp->lgrp_cpu == cp)
1088 		my_lgrp->lgrp_cpu = next;
1089 
1090 }
1091 
1092 /*
1093  * Update memory nodes in target lgroups and return ones that get changed
1094  */
1095 int
1096 lgrp_mnode_update(klgrpset_t target, klgrpset_t *changed)
1097 {
1098 	int	count;
1099 	int	i;
1100 	int	j;
1101 	lgrp_t	*lgrp;
1102 	lgrp_t	*lgrp_rsrc;
1103 
1104 	count = 0;
1105 	if (changed)
1106 		klgrpset_clear(*changed);
1107 
1108 	if (klgrpset_isempty(target))
1109 		return (0);
1110 
1111 	/*
1112 	 * Find each lgroup in target lgroups
1113 	 */
1114 	for (i = 0; i <= lgrp_alloc_max; i++) {
1115 		/*
1116 		 * Skip any lgroups that don't exist or aren't in target group
1117 		 */
1118 		lgrp = lgrp_table[i];
1119 		if (!klgrpset_ismember(target, i) || !LGRP_EXISTS(lgrp)) {
1120 			continue;
1121 		}
1122 
1123 		/*
1124 		 * Initialize memnodes for intermediate lgroups to 0
1125 		 * and update them from scratch since they may have completely
1126 		 * changed
1127 		 */
1128 		if (lgrp->lgrp_childcnt && lgrp != lgrp_root) {
1129 			lgrp->lgrp_mnodes = (mnodeset_t)0;
1130 			lgrp->lgrp_nmnodes = 0;
1131 		}
1132 
1133 		/*
1134 		 * Update memory nodes of target lgroup with memory nodes
1135 		 * from each lgroup in its lgroup memory resource set
1136 		 */
1137 		for (j = 0; j <= lgrp_alloc_max; j++) {
1138 			int	k;
1139 
1140 			/*
1141 			 * Skip any lgroups that don't exist or aren't in
1142 			 * memory resources of target lgroup
1143 			 */
1144 			lgrp_rsrc = lgrp_table[j];
1145 			if (!LGRP_EXISTS(lgrp_rsrc) ||
1146 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1147 			    j))
1148 				continue;
1149 
1150 			/*
1151 			 * Update target lgroup's memnodes to include memnodes
1152 			 * of this lgroup
1153 			 */
1154 			for (k = 0; k < sizeof (mnodeset_t) * NBBY; k++) {
1155 				mnodeset_t	mnode_mask;
1156 
1157 				mnode_mask = (mnodeset_t)1 << k;
1158 				if ((lgrp_rsrc->lgrp_mnodes & mnode_mask) &&
1159 				    !(lgrp->lgrp_mnodes & mnode_mask)) {
1160 					lgrp->lgrp_mnodes |= mnode_mask;
1161 					lgrp->lgrp_nmnodes++;
1162 				}
1163 			}
1164 			count++;
1165 			if (changed)
1166 				klgrpset_add(*changed, lgrp->lgrp_id);
1167 		}
1168 	}
1169 
1170 	return (count);
1171 }
1172 
1173 /*
1174  * Memory copy-rename. Called when the "mnode" containing the kernel cage memory
1175  * is moved from one board to another. The "from" and "to" arguments specify the
1176  * source and the destination of the move.
1177  *
1178  * See plat_lgrp_config() for a detailed description of the copy-rename
1179  * semantics.
1180  *
1181  * The lgrp_mem_rename() is called by the platform copy-rename code to update
1182  * the lgroup topology which is changing as memory moves from one lgroup to
1183  * another. It removes the mnode from the source lgroup and re-inserts it in the
1184  * target lgroup.
1185  *
1186  * The lgrp_mem_rename() function passes a flag to lgrp_mem_init() and
1187  * lgrp_mem_fini() indicating that the insertion and deletion are part of a DR
1188  * copy-rename operation.
1189  *
1190  * There is one case which requires special handling. If the system contains
1191  * only two boards (mnodes), the lgrp_mem_fini() removes the only mnode from the
1192  * lgroup hierarchy. This mnode is soon re-inserted back in the hierarchy by
1193  * lgrp_mem_init(), but there is a window when the system has no memory in the
1194  * lgroup hierarchy. If another thread tries to allocate memory during this
1195  * window, the allocation will fail, although the system has physical memory.
1196  * This may cause a system panic or a deadlock (some sleeping memory allocations
1197  * happen with cpu_lock held which prevents lgrp_mem_init() from re-inserting
1198  * the mnode back).
1199  *
1200  * The lgrp_memnode_choose() function walks the lgroup hierarchy looking for the
1201  * lgrp with non-empty lgrp_mnodes. To deal with the special case above,
1202  * lgrp_mem_fini() does not remove the last mnode from lgrp_root->lgrp_mnodes,
1203  * but it updates the rest of the lgroup topology as if the mnode was actually
1204  * removed. The lgrp_mem_init() function recognizes that the mnode being
1205  * inserted represents such a special case and updates the topology
1206  * appropriately.
1207  */
1208 void
1209 lgrp_mem_rename(int mnode, lgrp_handle_t from, lgrp_handle_t to)
1210 {
1211 	/*
1212 	 * Remove the memory from the source node and add it to the destination
1213 	 * node.
1214 	 */
1215 	lgrp_mem_fini(mnode, from, B_TRUE);
1216 	lgrp_mem_init(mnode, to, B_TRUE);
1217 }
1218 
1219 /*
1220  * Called to indicate that the lgrp with platform handle "hand" now
1221  * contains the memory identified by "mnode".
1222  *
1223  * LOCKING for this routine is a bit tricky. Usually it is called without
1224  * cpu_lock and it must grab cpu_lock here to prevent racing with other
1225  * callers. During DR of the board containing the caged memory it may be called
1226  * with cpu_lock already held and CPUs paused.
1227  *
1228  * If the insertion is part of the DR copy-rename and the inserted mnode (and
1229  * only this mnode) is already present in the lgrp_root->lgrp_mnodes set, we are
1230  * dealing with the special case of DR copy-rename described in
1231  * lgrp_mem_rename().
1232  */
1233 void
1234 lgrp_mem_init(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1235 {
1236 	klgrpset_t	changed;
1237 	int		count;
1238 	int		i;
1239 	lgrp_t		*my_lgrp;
1240 	lgrp_id_t	lgrpid;
1241 	mnodeset_t	mnodes_mask = ((mnodeset_t)1 << mnode);
1242 	boolean_t	drop_lock = B_FALSE;
1243 	boolean_t	need_synch = B_FALSE;
1244 
1245 	/*
1246 	 * Grab CPU lock (if we haven't already)
1247 	 */
1248 	if (!MUTEX_HELD(&cpu_lock)) {
1249 		mutex_enter(&cpu_lock);
1250 		drop_lock = B_TRUE;
1251 	}
1252 
1253 	/*
1254 	 * This routine may be called from a context where we already
1255 	 * hold cpu_lock, and have already paused cpus.
1256 	 */
1257 	if (!cpus_paused())
1258 		need_synch = B_TRUE;
1259 
1260 	/*
1261 	 * Check if this mnode is already configured and return immediately if
1262 	 * it is.
1263 	 *
1264 	 * NOTE: in special case of copy-rename of the only remaining mnode,
1265 	 * lgrp_mem_fini() refuses to remove the last mnode from the root, so we
1266 	 * recognize this case and continue as usual, but skip the update to
1267 	 * the lgrp_mnodes and the lgrp_nmnodes. This removes the inconsistency
1268 	 * in topology temporarily introduced by lgrp_mem_fini().
1269 	 */
1270 	if (! (is_copy_rename && (lgrp_root->lgrp_mnodes == mnodes_mask)) &&
1271 	    lgrp_root->lgrp_mnodes & mnodes_mask) {
1272 		if (drop_lock)
1273 			mutex_exit(&cpu_lock);
1274 		return;
1275 	}
1276 
1277 	/*
1278 	 * Update lgroup topology with new memory resources, keeping track of
1279 	 * which lgroups change
1280 	 */
1281 	count = 0;
1282 	klgrpset_clear(changed);
1283 	my_lgrp = lgrp_hand_to_lgrp(hand);
1284 	if (my_lgrp == NULL) {
1285 		/* new lgrp */
1286 		my_lgrp = lgrp_create();
1287 		lgrpid = my_lgrp->lgrp_id;
1288 		my_lgrp->lgrp_plathand = hand;
1289 		my_lgrp->lgrp_latency = lgrp_plat_latency(hand, hand);
1290 		klgrpset_add(my_lgrp->lgrp_leaves, lgrpid);
1291 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1292 
1293 		if (need_synch)
1294 			pause_cpus(NULL);
1295 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1296 		    &changed);
1297 		if (need_synch)
1298 			start_cpus();
1299 	} else if (my_lgrp->lgrp_latency == 0 && lgrp_plat_latency(hand, hand)
1300 	    > 0) {
1301 		/*
1302 		 * Leaf lgroup was created, but latency wasn't available
1303 		 * then.  So, set latency for it and fill in the rest of the lgroup
1304 		 * topology now that we know how far it is from other leaf
1305 		 * lgroups.
1306 		 */
1307 		klgrpset_clear(changed);
1308 		lgrpid = my_lgrp->lgrp_id;
1309 		if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1310 		    lgrpid))
1311 			klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1312 		if (need_synch)
1313 			pause_cpus(NULL);
1314 		count = lgrp_leaf_add(my_lgrp, lgrp_table, lgrp_alloc_max + 1,
1315 		    &changed);
1316 		if (need_synch)
1317 			start_cpus();
1318 	} else if (!klgrpset_ismember(my_lgrp->lgrp_set[LGRP_RSRC_MEM],
1319 	    my_lgrp->lgrp_id)) {
1320 		/*
1321 		 * Add new lgroup memory resource to existing lgroup
1322 		 */
1323 		lgrpid = my_lgrp->lgrp_id;
1324 		klgrpset_add(my_lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1325 		klgrpset_add(changed, lgrpid);
1326 		count++;
1327 		for (i = 0; i <= lgrp_alloc_max; i++) {
1328 			lgrp_t		*lgrp;
1329 
1330 			lgrp = lgrp_table[i];
1331 			if (!LGRP_EXISTS(lgrp) ||
1332 			    !lgrp_rsets_member(lgrp->lgrp_set, lgrpid))
1333 				continue;
1334 
1335 			klgrpset_add(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1336 			klgrpset_add(changed, lgrp->lgrp_id);
1337 			count++;
1338 		}
1339 	}
1340 
1341 	/*
1342 	 * Add memory node to lgroup and remove lgroup from ones that need
1343 	 * to be updated
1344 	 */
1345 	if (!(my_lgrp->lgrp_mnodes & mnodes_mask)) {
1346 		my_lgrp->lgrp_mnodes |= mnodes_mask;
1347 		my_lgrp->lgrp_nmnodes++;
1348 	}
1349 	klgrpset_del(changed, lgrpid);
1350 
1351 	/*
1352 	 * Update memory node information for all lgroups that changed and
1353 	 * contain new memory node as a resource
1354 	 */
1355 	if (count)
1356 		(void) lgrp_mnode_update(changed, NULL);
1357 
1358 	if (drop_lock)
1359 		mutex_exit(&cpu_lock);
1360 }
1361 
1362 /*
1363  * Called to indicate that the lgroup associated with the platform
1364  * handle "hand" no longer contains given memory node
1365  *
1366  * LOCKING for this routine is a bit tricky. Usually it is called without
1367  * cpu_lock and it must grab cpu_lock here to prevent racing with other
1368  * callers. During DR of the board containing the caged memory it may be called
1369  * with cpu_lock already held and CPUs paused.
1370  *
1371  * If the deletion is part of the DR copy-rename and the deleted mnode is the
1372  * only one present in the lgrp_root->lgrp_mnodes, all the topology is updated,
1373  * but lgrp_root->lgrp_mnodes is left intact. Later, lgrp_mem_init() will insert
1374  * the same mnode back into the topology. See lgrp_mem_rename() and
1375  * lgrp_mem_init() for additional details.
1376  */
1377 void
1378 lgrp_mem_fini(int mnode, lgrp_handle_t hand, boolean_t is_copy_rename)
1379 {
1380 	klgrpset_t	changed;
1381 	int		count;
1382 	int		i;
1383 	lgrp_t		*my_lgrp;
1384 	lgrp_id_t	lgrpid;
1385 	mnodeset_t	mnodes_mask;
1386 	boolean_t	drop_lock = B_FALSE;
1387 	boolean_t	need_synch = B_FALSE;
1388 
1389 	/*
1390 	 * Grab CPU lock (if we haven't already)
1391 	 */
1392 	if (!MUTEX_HELD(&cpu_lock)) {
1393 		mutex_enter(&cpu_lock);
1394 		drop_lock = B_TRUE;
1395 	}
1396 
1397 	/*
1398 	 * This routine may be called from a context where we already
1399 	 * hold cpu_lock and have already paused cpus.
1400 	 */
1401 	if (!cpus_paused())
1402 		need_synch = B_TRUE;
1403 
1404 	my_lgrp = lgrp_hand_to_lgrp(hand);
1405 
1406 	/*
1407 	 * The lgrp *must* be pre-existing
1408 	 */
1409 	ASSERT(my_lgrp != NULL);
1410 
1411 	/*
1412 	 * Delete memory node from lgroups which contain it
1413 	 */
1414 	mnodes_mask = ((mnodeset_t)1 << mnode);
1415 	for (i = 0; i <= lgrp_alloc_max; i++) {
1416 		lgrp_t *lgrp = lgrp_table[i];
1417 		/*
1418 		 * Skip any non-existent lgroups and any lgroups that don't
1419 		 * contain the memory node being removed
1420 		 */
1421 		if (!LGRP_EXISTS(lgrp) ||
1422 		    !(lgrp->lgrp_mnodes & mnodes_mask))
1423 			continue;
1424 
1425 		/*
1426 		 * Avoid removing the last mnode from the root in the DR
1427 		 * copy-rename case. See lgrp_mem_rename() for details.
1428 		 */
1429 		if (is_copy_rename &&
1430 		    (lgrp == lgrp_root) && (lgrp->lgrp_mnodes == mnodes_mask))
1431 			continue;
1432 
1433 		/*
1434 		 * Remove memory node from lgroup.
1435 		 */
1436 		lgrp->lgrp_mnodes &= ~mnodes_mask;
1437 		lgrp->lgrp_nmnodes--;
1438 		ASSERT(lgrp->lgrp_nmnodes >= 0);
1439 	}
1440 	ASSERT(lgrp_root->lgrp_nmnodes > 0);
1441 
1442 	/*
1443 	 * Don't need to update lgroup topology if this lgroup still has memory.
1444 	 *
1445 	 * In the special case of DR copy-rename with the only mnode being
1446 	 * removed, the lgrp_mnodes for the root is always non-zero, but we
1447 	 * still need to update the lgroup topology.
1448 	 */
1449 	if ((my_lgrp->lgrp_nmnodes > 0) &&
1450 	    !(is_copy_rename &&
1451 		(my_lgrp == lgrp_root) &&
1452 		(my_lgrp->lgrp_mnodes == mnodes_mask))) {
1453 		if (drop_lock)
1454 			mutex_exit(&cpu_lock);
1455 		return;
1456 	}
1457 
1458 	/*
1459 	 * This lgroup does not contain any memory now
1460 	 */
1461 	klgrpset_clear(my_lgrp->lgrp_set[LGRP_RSRC_MEM]);
1462 
1463 	/*
1464 	 * Remove this lgroup from lgroup topology if it does not contain any
1465 	 * resources now
1466 	 */
1467 	lgrpid = my_lgrp->lgrp_id;
1468 	count = 0;
1469 	klgrpset_clear(changed);
1470 	if (lgrp_rsets_empty(my_lgrp->lgrp_set)) {
1471 		/*
1472 		 * Delete lgroup when no more resources
1473 		 */
1474 		if (need_synch)
1475 			pause_cpus(NULL);
1476 		count = lgrp_leaf_delete(my_lgrp, lgrp_table,
1477 		    lgrp_alloc_max + 1, &changed);
1478 		ASSERT(count > 0);
1479 		if (need_synch)
1480 			start_cpus();
1481 	} else {
1482 		/*
1483 		 * Remove lgroup from memory resources of any lgroups that
1484 		 * contain it as such
1485 		 */
1486 		for (i = 0; i <= lgrp_alloc_max; i++) {
1487 			lgrp_t		*lgrp;
1488 
1489 			lgrp = lgrp_table[i];
1490 			if (!LGRP_EXISTS(lgrp) ||
1491 			    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM],
1492 			    lgrpid))
1493 				continue;
1494 
1495 			klgrpset_del(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid);
1496 		}
1497 	}
1498 	if (drop_lock)
1499 		mutex_exit(&cpu_lock);
1500 }
1501 
1502 /*
1503  * Return lgroup with given platform handle
1504  */
1505 lgrp_t *
1506 lgrp_hand_to_lgrp(lgrp_handle_t hand)
1507 {
1508 	int	i;
1509 	lgrp_t	*lgrp;
1510 
1511 	if (hand == LGRP_NULL_HANDLE)
1512 		return (NULL);
1513 
1514 	for (i = 0; i <= lgrp_alloc_max; i++) {
1515 		lgrp = lgrp_table[i];
1516 		if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1517 			return (lgrp);
1518 	}
1519 	return (NULL);
1520 }
1521 
1522 /*
1523  * Return the home lgroup of the current thread.
1524  * We must do this with kernel preemption disabled, since we don't want our
1525  * thread to be re-homed while we're poking around with its lpl, and the lpl
1526  * should never be NULL.
1527  *
1528  * NOTE: Can't guarantee that lgroup will be valid once kernel preemption
1529  * is enabled because of DR.  Callers can disable kernel preemption
1530  * around this call to guarantee that the lgroup will be valid beyond this
1531  * routine, since kernel preemption can be recursive.
1532  */
1533 lgrp_t *
1534 lgrp_home_lgrp(void)
1535 {
1536 	lgrp_t	*lgrp;
1537 	lpl_t	*lpl;
1538 
1539 	kpreempt_disable();
1540 
1541 	lpl = curthread->t_lpl;
1542 	ASSERT(lpl != NULL);
1543 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1544 	ASSERT(LGRP_EXISTS(lgrp_table[lpl->lpl_lgrpid]));
1545 	lgrp = lgrp_table[lpl->lpl_lgrpid];
1546 
1547 	kpreempt_enable();
1548 
1549 	return (lgrp);
1550 }
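/*
 * Illustrative caller sketch (not from this file): since kernel preemption
 * disabling is recursive, a caller can wrap the call in its own
 * kpreempt_disable()/kpreempt_enable() pair to keep the returned lgroup
 * valid while it is being used.
 *
 *	kpreempt_disable();
 *	lgrp = lgrp_home_lgrp();
 *	(use lgrp; DR cannot make it invalid here)
 *	kpreempt_enable();
 */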
1551 
1552 /*
1553  * Return ID of home lgroup for given thread
1554  * (See comments for lgrp_home_lgrp() for special care and handling
1555  * instructions)
1556  */
1557 lgrp_id_t
1558 lgrp_home_id(kthread_t *t)
1559 {
1560 	lgrp_id_t	lgrp;
1561 	lpl_t		*lpl;
1562 
1563 	ASSERT(t != NULL);
1564 	/*
1565 	 * We'd like to ASSERT(MUTEX_HELD(&ttoproc(t)->p_lock)), but we
1566 	 * cannot since the HAT layer can call into this routine to
1567 	 * determine the locality for its data structures in the context
1568 	 * of a page fault.
1569 	 */
1570 
1571 	kpreempt_disable();
1572 
1573 	lpl = t->t_lpl;
1574 	ASSERT(lpl != NULL);
1575 	ASSERT(lpl->lpl_lgrpid >= 0 && lpl->lpl_lgrpid <= lgrp_alloc_max);
1576 	lgrp = lpl->lpl_lgrpid;
1577 
1578 	kpreempt_enable();
1579 
1580 	return (lgrp);
1581 }
1582 
1583 /*
1584  * Return lgroup containing the physical memory for the given page frame number
1585  */
1586 lgrp_t *
1587 lgrp_pfn_to_lgrp(pfn_t pfn)
1588 {
1589 	lgrp_handle_t	hand;
1590 	int		i;
1591 	lgrp_t		*lgrp;
1592 
1593 	hand = lgrp_plat_pfn_to_hand(pfn);
1594 	if (hand != LGRP_NULL_HANDLE)
1595 		for (i = 0; i <= lgrp_alloc_max; i++) {
1596 			lgrp = lgrp_table[i];
1597 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1598 				return (lgrp);
1599 		}
1600 	return (NULL);
1601 }
1602 
1603 /*
1604  * Return lgroup containing the physical memory for the given page frame number
1605  */
1606 lgrp_t *
1607 lgrp_phys_to_lgrp(u_longlong_t physaddr)
1608 {
1609 	lgrp_handle_t	hand;
1610 	int		i;
1611 	lgrp_t		*lgrp;
1612 	pfn_t		pfn;
1613 
1614 	pfn = btop(physaddr);
1615 	hand = lgrp_plat_pfn_to_hand(pfn);
1616 	if (hand != LGRP_NULL_HANDLE)
1617 		for (i = 0; i <= lgrp_alloc_max; i++) {
1618 			lgrp = lgrp_table[i];
1619 			if (LGRP_EXISTS(lgrp) && lgrp->lgrp_plathand == hand)
1620 				return (lgrp);
1621 		}
1622 	return (NULL);
1623 }
1624 
1625 /*
1626  * Return the leaf lgroup containing the given CPU
1627  *
1628  * The caller needs to take precautions necessary to prevent
1629  * "cpu" from going away across a call to this function.
1630  * hint: kpreempt_disable()/kpreempt_enable()
1631  */
1632 static lgrp_t *
1633 lgrp_cpu_to_lgrp(cpu_t *cpu)
1634 {
1635 	return (cpu->cpu_chip->chip_lgrp);
1636 }
1637 
1638 /*
1639  * Return the sum of the partition loads in an lgrp divided by
1640  * the number of CPUs in the lgrp.  This is our best approximation
1641  * of an 'lgroup load average' for a useful per-lgroup kstat.
1642  */
1643 static uint64_t
1644 lgrp_sum_loadavgs(lgrp_t *lgrp)
1645 {
1646 	cpu_t *cpu;
1647 	int ncpu;
1648 	uint64_t loads = 0;
1649 
1650 	mutex_enter(&cpu_lock);
1651 
1652 	cpu = lgrp->lgrp_cpu;
1653 	ncpu = lgrp->lgrp_cpucnt;
1654 
1655 	if (cpu == NULL || ncpu == 0) {
1656 		mutex_exit(&cpu_lock);
1657 		return (0ull);
1658 	}
1659 
1660 	do {
1661 		loads += cpu->cpu_lpl->lpl_loadavg;
1662 		cpu = cpu->cpu_next_lgrp;
1663 	} while (cpu != lgrp->lgrp_cpu);
1664 
1665 	mutex_exit(&cpu_lock);
1666 
1667 	return (loads / ncpu);
1668 }
1669 
1670 void
1671 lgrp_stat_add(lgrp_id_t lgrpid, lgrp_stat_t stat, int64_t val)
1672 {
1673 	struct lgrp_stats *pstats;
1674 
1675 	/*
1676 	 * Verify that the caller isn't trying to add to
1677 	 * a statistic for an lgroup that has gone away
1678 	 */
1679 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1680 		return;
1681 
1682 	pstats = &lgrp_stats[lgrpid];
1683 	atomic_add_64((uint64_t *)LGRP_STAT_WRITE_PTR(pstats, stat), val);
1684 }
1685 
1686 int64_t
1687 lgrp_stat_read(lgrp_id_t lgrpid, lgrp_stat_t stat)
1688 {
1689 	uint64_t val;
1690 	struct lgrp_stats *pstats;
1691 
1692 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1693 		return ((int64_t)0);
1694 
1695 	pstats = &lgrp_stats[lgrpid];
1696 	LGRP_STAT_READ(pstats, stat, val);
1697 	return (val);
1698 }
1699 
1700 /*
1701  * Reset all kstats for lgrp specified by its lgrpid.
1702  */
1703 static void
1704 lgrp_kstat_reset(lgrp_id_t lgrpid)
1705 {
1706 	lgrp_stat_t stat;
1707 
1708 	if (lgrpid < 0 || lgrpid > lgrp_alloc_max)
1709 		return;
1710 
1711 	for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1712 		LGRP_STAT_RESET(&lgrp_stats[lgrpid], stat);
1713 	}
1714 }
1715 
1716 /*
1717  * Collect all per-lgrp statistics for the lgrp associated with this
1718  * kstat, and store them in the ks_data array.
1719  *
1720  * The superuser can reset all the running counter statistics for an
1721  * lgrp by writing to any of the lgrp's stats.
1722  */
1723 static int
1724 lgrp_kstat_extract(kstat_t *ksp, int rw)
1725 {
1726 	lgrp_stat_t		stat;
1727 	struct kstat_named	*ksd;
1728 	lgrp_t			*lgrp;
1729 	lgrp_id_t		lgrpid;
1730 
1731 	lgrp = (lgrp_t *)ksp->ks_private;
1732 
1733 	ksd = (struct kstat_named *)ksp->ks_data;
1734 	ASSERT(ksd == (struct kstat_named *)&lgrp_kstat_data);
1735 
1736 	lgrpid = lgrp->lgrp_id;
1737 
1738 	if (lgrpid == LGRP_NONE) {
1739 		/*
1740 		 * Return all zeroes as stats for freed lgrp.
1741 		 */
1742 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1743 			ksd[stat].value.i64 = 0;
1744 		}
1745 		ksd[stat + LGRP_NUM_CPUS].value.i64 = 0;
1746 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 = 0;
1747 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 = 0;
1748 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 = 0;
1749 		ksd[stat + LGRP_LOADAVG].value.i64 = 0;
1750 	} else if (rw != KSTAT_WRITE) {
1751 		/*
1752 		 * Handle counter stats
1753 		 */
1754 		for (stat = 0; stat < LGRP_NUM_COUNTER_STATS; stat++) {
1755 			ksd[stat].value.i64 = lgrp_stat_read(lgrpid, stat);
1756 		}
1757 
1758 		/*
1759 		 * Handle kernel data snapshot stats
1760 		 */
1761 		ksd[stat + LGRP_NUM_CPUS].value.i64 = lgrp->lgrp_cpucnt;
1762 		ksd[stat + LGRP_NUM_PG_INSTALL].value.i64 =
1763 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
1764 		ksd[stat + LGRP_NUM_PG_AVAIL].value.i64 =
1765 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_AVAIL);
1766 		ksd[stat + LGRP_NUM_PG_FREE].value.i64 =
1767 		    lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
1768 		ksd[stat + LGRP_LOADAVG].value.i64 = lgrp_sum_loadavgs(lgrp);
1769 	} else {
1770 		lgrp_kstat_reset(lgrpid);
1771 	}
1772 
1773 	return (0);
1774 }
1775 
1776 int
1777 lgrp_query_cpu(processorid_t id, lgrp_id_t *lp)
1778 {
1779 	cpu_t	*cp;
1780 
1781 	mutex_enter(&cpu_lock);
1782 
1783 	if ((cp = cpu_get(id)) == NULL) {
1784 		mutex_exit(&cpu_lock);
1785 		return (EINVAL);
1786 	}
1787 
1788 	if (cpu_is_offline(cp) || cpu_is_poweredoff(cp)) {
1789 		mutex_exit(&cpu_lock);
1790 		return (EINVAL);
1791 	}
1792 
1793 	ASSERT(cp->cpu_lpl != NULL);
1794 
1795 	*lp = cp->cpu_lpl->lpl_lgrpid;
1796 
1797 	mutex_exit(&cpu_lock);
1798 
1799 	return (0);
1800 }
1801 
1802 int
1803 lgrp_query_load(processorid_t id, lgrp_load_t *lp)
1804 {
1805 	cpu_t *cp;
1806 
1807 	mutex_enter(&cpu_lock);
1808 
1809 	if ((cp = cpu_get(id)) == NULL) {
1810 		mutex_exit(&cpu_lock);
1811 		return (EINVAL);
1812 	}
1813 
1814 	ASSERT(cp->cpu_lpl != NULL);
1815 
1816 	*lp = cp->cpu_lpl->lpl_loadavg;
1817 
1818 	mutex_exit(&cpu_lock);
1819 
1820 	return (0);
1821 }
1822 
1823 void
1824 lgrp_latency_change(u_longlong_t oldtime, u_longlong_t newtime)
1825 {
1826 	lgrp_t		*lgrp;
1827 	int		i;
1828 
1829 	for (i = 0; i <= lgrp_alloc_max; i++) {
1830 		lgrp = lgrp_table[i];
1831 
1832 		if (LGRP_EXISTS(lgrp) && (lgrp->lgrp_latency == oldtime))
1833 			lgrp->lgrp_latency = (int)newtime;
1834 	}
1835 }
1836 
1837 /*
1838  * Add a resource named by lpl_leaf to rset of lpl_target
1839  *
1840  * This routine also adjusts ncpu and nrset if the call succeeds in adding a
1841  * resource.  They are adjusted here, as this is presently the only place we
1842  * can be certain a resource addition has succeeded.
1843  *
1844  * We keep the list of rsets sorted so that the dispatcher can quickly walk the
1845  * list in order until it reaches a NULL.  (The list is also required to be
1846  * NULL-terminated.)  This is done so that we can mark start pos + 1, so that
1847  * each lpl is traversed sequentially, but in a different order.  We hope this
1848  * will improve performance a bit (hopefully, less read-to-own traffic...).
1849  */
1850 
1851 void
1852 lpl_rset_add(lpl_t *lpl_target, lpl_t *lpl_leaf)
1853 {
1854 	int		i;
1855 	int		entry_slot = 0;
1856 
1857 	/* return if leaf is already present */
1858 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1859 		if (lpl_target->lpl_rset[i] == lpl_leaf) {
1860 			return;
1861 		}
1862 
1863 		if (lpl_target->lpl_rset[i]->lpl_lgrpid >
1864 		    lpl_leaf->lpl_lgrpid) {
1865 			break;
1866 		}
1867 	}
1868 
1869 	/* insert leaf, update counts */
1870 	entry_slot = i;
1871 	i = lpl_target->lpl_nrset++;
1872 	if (lpl_target->lpl_nrset >= LPL_RSET_MAX) {
1873 		panic("More leaf lgrps in system than are supported!\n");
1874 	}
1875 
1876 	/*
1877 	 * Start at the end of the rset array and work backwards towards the
1878 	 * slot into which the new lpl will be inserted. This effectively
1879 	 * preserves the current ordering by scooting everybody over one entry,
1880 	 * and placing the new entry into the space created.
1881 	 */
1882 
1883 	while (i-- > entry_slot) {
1884 		lpl_target->lpl_rset[i + 1] = lpl_target->lpl_rset[i];
1885 	}
1886 
1887 	lpl_target->lpl_rset[entry_slot] = lpl_leaf;
1888 	lpl_target->lpl_ncpu += lpl_leaf->lpl_ncpu;
1889 }
1890 
1891 /*
1892  * Update each of lpl_parent's children with a proper hint and
1893  * a reference to their parent.
1894  * The lgrp topology is used as the reference since it is fully
1895  * consistent and correct at this point.
1896  *
1897  * Each child's hint will reference an element in lpl_parent's
1898  * rset that designates where the child should start searching
1899  * for CPU resources. The hint selected is the highest order leaf present
1900  * in the child's lineage.
1901  *
1902  * This should be called after any potential change in lpl_parent's
1903  * rset.
1904  */
1905 static void
1906 lpl_child_update(lpl_t *lpl_parent, struct cpupart *cp)
1907 {
1908 	klgrpset_t	children, leaves;
1909 	lpl_t		*lpl;
1910 	int		hint;
1911 	int		i, j;
1912 
1913 	children = lgrp_table[lpl_parent->lpl_lgrpid]->lgrp_children;
1914 	if (klgrpset_isempty(children))
1915 		return; /* nothing to do */
1916 
1917 	for (i = 0; i <= lgrp_alloc_max; i++) {
1918 		if (klgrpset_ismember(children, i)) {
1919 
1920 			/*
1921 			 * Given the set of leaves in this child's lineage,
1922 			 * find the highest order leaf present in the parent's
1923 			 * rset. Select this as the hint for the child.
1924 			 */
1925 			leaves = lgrp_table[i]->lgrp_leaves;
1926 			hint = 0;
1927 			for (j = 0; j < lpl_parent->lpl_nrset; j++) {
1928 				lpl = lpl_parent->lpl_rset[j];
1929 				if (klgrpset_ismember(leaves, lpl->lpl_lgrpid))
1930 					hint = j;
1931 			}
1932 			cp->cp_lgrploads[i].lpl_hint = hint;
1933 
1934 			/*
1935 			 * (Re)set the parent. It may be incorrect if
1936 			 * lpl_parent is new in the topology.
1937 			 */
1938 			cp->cp_lgrploads[i].lpl_parent = lpl_parent;
1939 		}
1940 	}
1941 }
1942 
1943 /*
1944  * Delete resource lpl_leaf from rset of lpl_target, assuming it's there.
1945  *
1946  * This routine also adjusts ncpu and nrset if the call succeeds in deleting a
1947  * resource. The values are adjusted here, as this is the only place that we can
1948  * be certain a resource was successfully deleted.
1949  */
1950 void
1951 lpl_rset_del(lpl_t *lpl_target, lpl_t *lpl_leaf)
1952 {
1953 	int i;
1954 
1955 	/* find leaf in intermediate node */
1956 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1957 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1958 			break;
1959 	}
1960 
1961 	/* return if leaf not found */
1962 	if (lpl_target->lpl_rset[i] != lpl_leaf)
1963 		return;
1964 
1965 	/* prune leaf, compress array */
1966 	ASSERT(lpl_target->lpl_nrset < LPL_RSET_MAX);
1967 	lpl_target->lpl_rset[lpl_target->lpl_nrset--] = NULL;
1968 	lpl_target->lpl_ncpu--;
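	/*
	 * Shift the remaining entries left so that the rset stays sorted
	 * and NULL-terminated.
	 */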
1969 	do {
1970 		lpl_target->lpl_rset[i] = lpl_target->lpl_rset[i + 1];
1971 	} while (i++ < lpl_target->lpl_nrset);
1972 }
1973 
1974 /*
1975  * Check to see if the resource set of the target lpl contains the
1976  * supplied leaf lpl.  This returns 1 if the lpl is found, 0 if it is not.
1977  */
1978 
1979 int
1980 lpl_rset_contains(lpl_t *lpl_target, lpl_t *lpl_leaf)
1981 {
1982 	int i;
1983 
1984 	for (i = 0; i < lpl_target->lpl_nrset; i++) {
1985 		if (lpl_target->lpl_rset[i] == lpl_leaf)
1986 			return (1);
1987 	}
1988 
1989 	return (0);
1990 }
1991 
1992 /*
1993  * Called when we change cpu lpl membership.  This increments or decrements the
1994  * per-cpu counter in every lpl in which our leaf appears.
1995  */
1996 void
1997 lpl_cpu_adjcnt(lpl_act_t act, cpu_t *cp)
1998 {
1999 	cpupart_t	*cpupart;
2000 	lgrp_t		*lgrp_leaf;
2001 	lgrp_t		*lgrp_cur;
2002 	lpl_t		*lpl_leaf;
2003 	lpl_t		*lpl_cur;
2004 	int		i;
2005 
2006 	ASSERT(act == LPL_DECREMENT || act == LPL_INCREMENT);
2007 
2008 	cpupart = cp->cpu_part;
2009 	lpl_leaf = cp->cpu_lpl;
2010 	lgrp_leaf = lgrp_table[lpl_leaf->lpl_lgrpid];
2011 
2012 	for (i = 0; i <= lgrp_alloc_max; i++) {
2013 		lgrp_cur = lgrp_table[i];
2014 
2015 		/*
2016 		 * Don't adjust if the lgrp isn't there, if we're the leaf lpl
2017 		 * for the cpu in question, or if the current lgrp and leaf
2018 		 * don't share the same resources.
2019 		 */
2020 
2021 		if (!LGRP_EXISTS(lgrp_cur) || (lgrp_cur == lgrp_leaf) ||
2022 		    !klgrpset_intersects(lgrp_leaf->lgrp_set[LGRP_RSRC_CPU],
2023 		    lgrp_cur->lgrp_set[LGRP_RSRC_CPU]))
2024 			continue;
2025 
2026 
2027 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2028 
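		/* only adjust lpls that currently contain resources */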
2029 		if (lpl_cur->lpl_nrset > 0) {
2030 			if (act == LPL_INCREMENT) {
2031 				lpl_cur->lpl_ncpu++;
2032 			} else if (act == LPL_DECREMENT) {
2033 				lpl_cur->lpl_ncpu--;
2034 			}
2035 		}
2036 	}
2037 }
2038 
2039 /*
2040  * Initialize lpl with given resources and specified lgrp
2041  */
2042 
2043 void
2044 lpl_init(lpl_t *lpl, lpl_t *lpl_leaf, lgrp_t *lgrp)
2045 {
2046 	lpl->lpl_lgrpid = lgrp->lgrp_id;
2047 	lpl->lpl_loadavg = 0;
2048 	if (lpl == lpl_leaf)
2049 		lpl->lpl_ncpu = 1;
2050 	else
2051 		lpl->lpl_ncpu = lpl_leaf->lpl_ncpu;
2052 	lpl->lpl_nrset = 1;
2053 	lpl->lpl_rset[0] = lpl_leaf;
2054 	lpl->lpl_lgrp = lgrp;
2055 	lpl->lpl_parent = NULL; /* set by lpl_leaf_insert() */
2056 	lpl->lpl_cpus = NULL; /* set by lgrp_part_add_cpu() */
2057 }
2058 
2059 /*
2060  * Clear an unused lpl
2061  */
2062 
2063 void
2064 lpl_clear(lpl_t *lpl)
2065 {
2066 	lgrpid_t	lid;
2067 
2068 	/* save lid for debugging purposes */
2069 	lid = lpl->lpl_lgrpid;
2070 	bzero(lpl, sizeof (lpl_t));
2071 	lpl->lpl_lgrpid = lid;
2072 }
2073 
2074 /*
2075  * Given a CPU-partition, verify that the lpl topology in the CPU-partition
2076  * is in sync with the lgroup topology in the system.  The lpl topology may not
2077  * make full use of all of the lgroup topology, but this checks to make sure
2078  * that for the parts that it does use, it has correctly understood the
2079  * relationships that exist.  This function returns 0 if the topology is
2080  * correct, and a non-zero error code on non-DEBUG kernels if it is incorrect.
2081  * Asserts are spread throughout the code to aid in debugging on a DEBUG
2082  * kernel.
2083  */
2084 int
2085 lpl_topo_verify(cpupart_t *cpupart)
2086 {
2087 	lgrp_t		*lgrp;
2088 	lpl_t		*lpl;
2089 	klgrpset_t	rset;
2090 	klgrpset_t	cset;
2091 	cpu_t		*cpu;
2092 	cpu_t		*cp_start;
2093 	int		i;
2094 	int		j;
2095 	int		sum;
2096 
2097 	/* topology can't be incorrect if it doesn't exist */
2098 	if (!lgrp_topo_initialized || !lgrp_initialized)
2099 		return (LPL_TOPO_CORRECT);
2100 
2101 	ASSERT(cpupart != NULL);
2102 
2103 	for (i = 0; i <= lgrp_alloc_max; i++) {
2104 		lgrp = lgrp_table[i];
2105 		lpl = NULL;
2106 		/* make sure lpls are allocated */
2107 		ASSERT(cpupart->cp_lgrploads);
2108 		if (!cpupart->cp_lgrploads)
2109 			return (LPL_TOPO_PART_HAS_NO_LPL);
2110 
2111 		lpl = &cpupart->cp_lgrploads[i];
2112 		/* make sure our index is good */
2113 		ASSERT(i < cpupart->cp_nlgrploads);
2114 
2115 		/* if lgroup doesn't exist, make sure lpl is empty */
2116 		if (!LGRP_EXISTS(lgrp)) {
2117 			ASSERT(lpl->lpl_ncpu == 0);
2118 			if (lpl->lpl_ncpu > 0) {
2119 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2120 			} else {
2121 				continue;
2122 			}
2123 		}
2124 
2125 		/* verify that lgroup and lpl are identically numbered */
2126 		ASSERT(lgrp->lgrp_id == lpl->lpl_lgrpid);
2127 
2128 		/* if lgroup isn't in our partition, make sure lpl is empty */
2129 		if (!klgrpset_intersects(lgrp->lgrp_leaves,
2130 		    cpupart->cp_lgrpset)) {
2131 			ASSERT(lpl->lpl_ncpu == 0);
2132 			if (lpl->lpl_ncpu > 0) {
2133 				return (LPL_TOPO_CPUS_NOT_EMPTY);
2134 			}
2135 			/*
2136 			 * lpl is empty, and lgroup isn't in partition.  verify
2137 			 * that lpl doesn't show up in anyone else's rsets (in
2138 			 * this partition, anyway)
2139 			 */
2140 
2141 			for (j = 0; j < cpupart->cp_nlgrploads; j++) {
2142 				lpl_t *i_lpl; /* lpl we're iterating over */
2143 
2144 				i_lpl = &cpupart->cp_lgrploads[j];
2145 
2146 				ASSERT(!lpl_rset_contains(i_lpl, lpl));
2147 				if (lpl_rset_contains(i_lpl, lpl)) {
2148 					return (LPL_TOPO_LPL_ORPHANED);
2149 				}
2150 			}
2151 			/* lpl is empty, and everything is ok. continue */
2152 			continue;
2153 		}
2154 
2155 
2156 		/* lgroup is in this partition, now check it against lpl */
2157 
2158 		/* do both have matching lgrps? */
2159 		ASSERT(lgrp == lpl->lpl_lgrp);
2160 		if (lgrp != lpl->lpl_lgrp) {
2161 			return (LPL_TOPO_LGRP_MISMATCH);
2162 		}
2163 
2164 		/* do the parent lgroups exist and do they match? */
2165 		if (lgrp->lgrp_parent) {
2166 			ASSERT(lpl->lpl_parent);
2167 			ASSERT(lgrp->lgrp_parent->lgrp_id ==
2168 			    lpl->lpl_parent->lpl_lgrpid);
2169 
2170 			if (!lpl->lpl_parent) {
2171 				return (LPL_TOPO_MISSING_PARENT);
2172 			} else if (lgrp->lgrp_parent->lgrp_id !=
2173 			    lpl->lpl_parent->lpl_lgrpid) {
2174 				return (LPL_TOPO_PARENT_MISMATCH);
2175 			}
2176 		}
2177 
2178 		/* only leaf lgroups keep a cpucnt, only check leaves */
2179 		if ((lpl->lpl_nrset == 1) && (lpl == lpl->lpl_rset[0])) {
2180 
2181 			/* verify that lgrp is also a leaf */
2182 			ASSERT((lgrp->lgrp_childcnt == 0) &&
2183 			    (klgrpset_ismember(lgrp->lgrp_leaves,
2184 			    lpl->lpl_lgrpid)));
2185 
2186 			if ((lgrp->lgrp_childcnt > 0) ||
2187 			    (!klgrpset_ismember(lgrp->lgrp_leaves,
2188 			    lpl->lpl_lgrpid))) {
2189 				return (LPL_TOPO_LGRP_NOT_LEAF);
2190 			}
2191 
2192 			ASSERT((lgrp->lgrp_cpucnt >= lpl->lpl_ncpu) &&
2193 			    (lpl->lpl_ncpu > 0));
2194 			if ((lgrp->lgrp_cpucnt < lpl->lpl_ncpu) ||
2195 				(lpl->lpl_ncpu <= 0)) {
2196 				return (LPL_TOPO_BAD_CPUCNT);
2197 			}
2198 
2199 			/*
2200 			 * Check that lpl_ncpu also matches the number of
2201 			 * cpus in the lpl's linked list.  This only exists in
2202 			 * leaves, but they should always match.
2203 			 */
2204 			j = 0;
2205 			cpu = cp_start = lpl->lpl_cpus;
2206 			while (cpu != NULL) {
2207 				j++;
2208 
2209 				/* check to make sure cpu's lpl is leaf lpl */
2210 				ASSERT(cpu->cpu_lpl == lpl);
2211 				if (cpu->cpu_lpl != lpl) {
2212 					return (LPL_TOPO_CPU_HAS_BAD_LPL);
2213 				}
2214 
2215 				/* check next cpu */
2216 				if ((cpu = cpu->cpu_next_lpl) != cp_start) {
2217 					continue;
2218 				} else {
2219 					cpu = NULL;
2220 				}
2221 			}
2222 
2223 			ASSERT(j == lpl->lpl_ncpu);
2224 			if (j != lpl->lpl_ncpu) {
2225 				return (LPL_TOPO_LPL_BAD_NCPU);
2226 			}
2227 
2228 			/*
2229 			 * Also, check that leaf lpl is contained in all
2230 			 * intermediate lpls that name the leaf as a descendant
2231 			 */
2232 
2233 			for (j = 0; j <= lgrp_alloc_max; j++) {
2234 				klgrpset_t intersect;
2235 				lgrp_t *lgrp_cand;
2236 				lpl_t *lpl_cand;
2237 
2238 				lgrp_cand = lgrp_table[j];
2239 				intersect = klgrpset_intersects(
2240 				    lgrp_cand->lgrp_set[LGRP_RSRC_CPU],
2241 				    cpupart->cp_lgrpset);
2242 
2243 				if (!LGRP_EXISTS(lgrp_cand) ||
2244 				    !klgrpset_intersects(lgrp_cand->lgrp_leaves,
2245 				    cpupart->cp_lgrpset) ||
2246 				    (intersect == 0))
2247 					continue;
2248 
2249 				lpl_cand =
2250 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2251 
2252 				if (klgrpset_ismember(intersect,
2253 				    lgrp->lgrp_id)) {
2254 					ASSERT(lpl_rset_contains(lpl_cand,
2255 					    lpl));
2256 
2257 					if (!lpl_rset_contains(lpl_cand, lpl)) {
2258 						return (LPL_TOPO_RSET_MSSNG_LF);
2259 					}
2260 				}
2261 			}
2262 
2263 		} else { /* non-leaf specific checks */
2264 
2265 			/*
2266 			 * Non-leaf lpls should have lpl_cpus == NULL
2267 			 * verify that this is so
2268 			 */
2269 			ASSERT(lpl->lpl_cpus == NULL);
2270 			if (lpl->lpl_cpus != NULL) {
2271 				return (LPL_TOPO_NONLEAF_HAS_CPUS);
2272 			}
2273 
2274 			/*
2275 			 * verify that the sum of the cpus in the leaf resources
2276 			 * is equal to the total ncpu in the intermediate
2277 			 */
2278 			for (j = sum = 0; j < lpl->lpl_nrset; j++) {
2279 				sum += lpl->lpl_rset[j]->lpl_ncpu;
2280 			}
2281 
2282 			ASSERT(sum == lpl->lpl_ncpu);
2283 			if (sum != lpl->lpl_ncpu) {
2284 				return (LPL_TOPO_LPL_BAD_NCPU);
2285 			}
2286 		}
2287 
2288 		/*
2289 		 * check on lpl_hint. Don't check root, since it has no parent.
2290 		 */
2291 		if (lpl->lpl_parent != NULL) {
2292 			int hint;
2293 			lpl_t *hint_lpl;
2294 
2295 			/* make sure hint is within limits of nrset */
2296 			hint = lpl->lpl_hint;
2297 			ASSERT(lpl->lpl_parent->lpl_nrset >= hint);
2298 			if (lpl->lpl_parent->lpl_nrset < hint) {
2299 				return (LPL_TOPO_BOGUS_HINT);
2300 			}
2301 
2302 			/* make sure hint points to valid lpl */
2303 			hint_lpl = lpl->lpl_parent->lpl_rset[hint];
2304 			ASSERT(hint_lpl->lpl_ncpu > 0);
2305 			if (hint_lpl->lpl_ncpu <= 0) {
2306 				return (LPL_TOPO_BOGUS_HINT);
2307 			}
2308 		}
2309 
2310 		/*
2311 		 * Check the rset of the lpl in question.  Make sure that each
2312 		 * rset contains a subset of the resources in
2313 		 * lgrp_set[LGRP_RSRC_CPU] and in cp_lgrpset.  This also makes
2314 		 * sure that each rset doesn't include resources that are
2315 		 * outside of that set.  (Which would be resources somehow not
2316 		 * accounted for).
2317 		 */
2318 
2319 		klgrpset_clear(rset);
2320 		for (j = 0; j < lpl->lpl_nrset; j++) {
2321 			klgrpset_add(rset, lpl->lpl_rset[j]->lpl_lgrpid);
2322 		}
2323 		klgrpset_copy(cset, rset);
2324 		/* make sure lpl rset matches lgrp rset */
2325 		klgrpset_diff(rset, lgrp->lgrp_set[LGRP_RSRC_CPU]);
2326 		/* make sure rset is contained within the partition, too */
2327 		klgrpset_diff(cset, cpupart->cp_lgrpset);
2328 
2329 		ASSERT(klgrpset_isempty(rset) &&
2330 		    klgrpset_isempty(cset));
2331 		if (!klgrpset_isempty(rset) ||
2332 		    !klgrpset_isempty(cset)) {
2333 			return (LPL_TOPO_RSET_MISMATCH);
2334 		}
2335 
2336 		/*
2337 		 * check to make sure lpl_nrset matches the number of rsets
2338 		 * contained in the lpl
2339 		 */
2340 
2341 		for (j = 0; (j < LPL_RSET_MAX) && (lpl->lpl_rset[j] != NULL);
2342 		    j++);
2343 
2344 		ASSERT(j == lpl->lpl_nrset);
2345 		if (j != lpl->lpl_nrset) {
2346 			return (LPL_TOPO_BAD_RSETCNT);
2347 		}
2348 
2349 	}
2350 	return (LPL_TOPO_CORRECT);
2351 }
2352 
2353 /*
2354  * Flatten lpl topology to given number of levels.  This is presently only
2355  * implemented for flattening to 2 levels, which prunes out the intermediates
2356  * and home the leaf lpls to the root lpl.
2357  */
2358 int
2359 lpl_topo_flatten(int levels)
2360 {
2361 	int		i;
2362 	uint_t		sum;
2363 	lgrp_t		*lgrp_cur;
2364 	lpl_t		*lpl_cur;
2365 	lpl_t		*lpl_root;
2366 	cpupart_t	*cp;
2367 
2368 	if (levels != 2)
2369 		return (0);
2370 
2371 	/* called w/ cpus paused - grab no locks! */
2372 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
2373 	    !lgrp_initialized);
2374 
2375 	cp = cp_list_head;
2376 	do {
2377 		lpl_root = &cp->cp_lgrploads[lgrp_root->lgrp_id];
2378 		ASSERT(LGRP_EXISTS(lgrp_root) && (lpl_root->lpl_ncpu > 0));
2379 
2380 		for (i = 0; i <= lgrp_alloc_max; i++) {
2381 			lgrp_cur = lgrp_table[i];
2382 			lpl_cur = &cp->cp_lgrploads[i];
2383 
2384 			if ((lgrp_cur == lgrp_root) ||
2385 			    (!LGRP_EXISTS(lgrp_cur) &&
2386 			    (lpl_cur->lpl_ncpu == 0)))
2387 				continue;
2388 
2389 			if (!LGRP_EXISTS(lgrp_cur) && (lpl_cur->lpl_ncpu > 0)) {
2390 				/*
2391 				 * this should be a deleted intermediate, so
2392 				 * clear it
2393 				 */
2394 				lpl_clear(lpl_cur);
2395 			} else if ((lpl_cur->lpl_nrset == 1) &&
2396 			    (lpl_cur->lpl_rset[0] == lpl_cur) &&
2397 			    ((lpl_cur->lpl_parent->lpl_ncpu == 0) ||
2398 			    (!LGRP_EXISTS(lpl_cur->lpl_parent->lpl_lgrp)))) {
2399 				/*
2400 				 * this is a leaf whose parent was deleted, or
2401 				 * whose parent had its lgrp deleted.  (And
2402 				 * whose parent will soon be deleted).  Point
2403 				 * this guy back to the root lpl.
2404 				 */
2405 				lpl_cur->lpl_parent = lpl_root;
2406 				lpl_rset_add(lpl_root, lpl_cur);
2407 			}
2408 
2409 		}
2410 
2411 		/*
2412 		 * Now that we're done, make sure the count on the root lpl is
2413 		 * correct, and update the hints of the children for the sake of
2414 		 * thoroughness
2415 		 */
2416 		for (i = sum = 0; i < lpl_root->lpl_nrset; i++) {
2417 			sum += lpl_root->lpl_rset[i]->lpl_ncpu;
2418 		}
2419 		lpl_root->lpl_ncpu = sum;
2420 		lpl_child_update(lpl_root, cp);
2421 
2422 		cp = cp->cp_next;
2423 	} while (cp != cp_list_head);
2424 
2425 	return (levels);
2426 }
2427 
2428 /*
2429  * Insert a lpl into the resource hierarchy and create any additional lpls that
2430  * are necessary to represent the varying states of locality for the cpu
2431  * resources newly added to the partition.
2432  *
2433  * This routine is clever enough that it can correctly add resources from the
2434  * new leaf into both direct and indirect resource sets in the hierarchy.  (I.e.,
2435  * those for which the lpl is a leaf as opposed to simply a named equally local
2436  * resource).  The one special case that needs additional processing is when a
2437  * new intermediate lpl is introduced.  Since the main loop only traverses
2438  * looking to add the leaf resource where it does not yet exist, additional work
2439  * is necessary to add other leaf resources that may need to exist in the newly
2440  * created intermediate.  This is performed by the second inner loop, and is
2441  * only done when the check for more than one overlapping resource succeeds.
2442  */
2443 
2444 void
2445 lpl_leaf_insert(lpl_t *lpl_leaf, cpupart_t *cpupart)
2446 {
2447 	int		i;
2448 	int		j;
2449 	int		hint;
2450 	int		rset_num_intersect;
2451 	lgrp_t		*lgrp_cur;
2452 	lpl_t		*lpl_cur;
2453 	lpl_t		*lpl_parent;
2454 	lgrpid_t	parent_id;
2455 	klgrpset_t	rset_intersect; /* resources in cpupart and lgrp */
2456 
2457 	for (i = 0; i <= lgrp_alloc_max; i++) {
2458 		lgrp_cur = lgrp_table[i];
2459 
2460 		/*
2461 		 * Don't insert if the lgrp isn't there, if the leaf isn't
2462 		 * contained within the current lgrp, or if the current lgrp has
2463 		 * no leaves in this partition
2464 		 */
2465 
2466 		if (!LGRP_EXISTS(lgrp_cur) ||
2467 		    !klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2468 		    lpl_leaf->lpl_lgrpid) ||
2469 		    !klgrpset_intersects(lgrp_cur->lgrp_leaves,
2470 		    cpupart->cp_lgrpset))
2471 			continue;
2472 
2473 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2474 		if (lgrp_cur->lgrp_parent != NULL) {
2475 			/* if lgrp has a parent, assign it properly */
2476 			parent_id = lgrp_cur->lgrp_parent->lgrp_id;
2477 			lpl_parent = &cpupart->cp_lgrploads[parent_id];
2478 		} else {
2479 			/* if not, make sure parent ptr gets set to null */
2480 			lpl_parent = NULL;
2481 		}
2482 
2483 		if (lpl_cur == lpl_leaf) {
2484 			/*
2485 			 * Almost all leaf state was initialized elsewhere.  The
2486 			 * only thing left to do is to set the parent.
2487 			 */
2488 			lpl_cur->lpl_parent = lpl_parent;
2489 			continue;
2490 		}
2491 
2492 		/*
2493 		 * Initialize intermediate lpl
2494 		 * Save this lpl's hint though. Since we're changing this
2495 		 * lpl's resources, we need to update the hint in this lpl's
2496 		 * children, but the hint in this lpl is unaffected and
2497 		 * should be preserved.
2498 		 */
2499 		hint = lpl_cur->lpl_hint;
2500 
2501 		lpl_clear(lpl_cur);
2502 		lpl_init(lpl_cur, lpl_leaf, lgrp_cur);
2503 
2504 		lpl_cur->lpl_hint = hint;
2505 		lpl_cur->lpl_parent = lpl_parent;
2506 
2507 		/* does new lpl need to be populated with other resources? */
2508 		rset_intersect =
2509 		    klgrpset_intersects(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2510 			cpupart->cp_lgrpset);
2511 		klgrpset_nlgrps(rset_intersect, rset_num_intersect);
2512 
2513 		if (rset_num_intersect > 1) {
2514 			/*
2515 			 * If so, figure out what lpls have resources that
2516 			 * intersect this one, and add them.
2517 			 */
2518 			for (j = 0; j <= lgrp_alloc_max; j++) {
2519 				lgrp_t	*lgrp_cand;	/* candidate lgrp */
2520 				lpl_t	*lpl_cand;	/* candidate lpl */
2521 
2522 				lgrp_cand = lgrp_table[j];
2523 				if (!LGRP_EXISTS(lgrp_cand) ||
2524 				    !klgrpset_ismember(rset_intersect,
2525 					lgrp_cand->lgrp_id))
2526 					continue;
2527 				lpl_cand =
2528 				    &cpupart->cp_lgrploads[lgrp_cand->lgrp_id];
2529 				lpl_rset_add(lpl_cur, lpl_cand);
2530 			}
2531 		}
2532 		/*
2533 		 * This lpl's rset has changed.  Update the hint in its
2534 		 * children.
2535 		 */
2536 		lpl_child_update(lpl_cur, cpupart);
2537 	}
2538 }
2539 
2540 /*
2541  * remove a lpl from the hierarchy of resources, clearing its state when
2542  * finished.  If the lpls at the intermediate levels of the hierarchy have no
2543  * remaining resources, or no longer name a leaf resource in the cpu-partition,
2544  * delete them as well.
2545  */
2546 
2547 void
2548 lpl_leaf_remove(lpl_t *lpl_leaf, cpupart_t *cpupart)
2549 {
2550 	int		i;
2551 	lgrp_t		*lgrp_cur;
2552 	lpl_t		*lpl_cur;
2553 	klgrpset_t	leaf_intersect;	/* intersection of leaves */
2554 
2555 	for (i = 0; i <= lgrp_alloc_max; i++) {
2556 		lgrp_cur = lgrp_table[i];
2557 
2558 		/*
2559 		 * Don't attempt to remove from lgrps that aren't there, that
2560 		 * don't contain our leaf, or from the leaf itself. (We do that
2561 		 * later)
2562 		 */
2563 
2564 		if (!LGRP_EXISTS(lgrp_cur))
2565 			continue;
2566 
2567 		lpl_cur = &cpupart->cp_lgrploads[lgrp_cur->lgrp_id];
2568 
2569 		if (!klgrpset_ismember(lgrp_cur->lgrp_set[LGRP_RSRC_CPU],
2570 		    lpl_leaf->lpl_lgrpid) ||
2571 		    (lpl_cur == lpl_leaf)) {
2572 			continue;
2573 		}
2574 
2575 		/*
2576 		 * This is a slightly sleazy simplification in that we have
2577 		 * already marked the cp_lgrpset as no longer containing the
2578 		 * leaf we've deleted.  Any lpls that pass the above checks
2579 		 * based upon lgrp membership but not necessarily cpu-part
2580 		 * membership also get cleared by the checks below.  Currently
2581 		 * this is harmless, as the lpls should be empty anyway.
2582 		 *
2583 		 * In particular, we want to preserve lpls that have additional
2584 		 * leaf resources, even though we don't yet have a processor
2585 		 * architecture that represents resources this way.
2586 		 */
2587 
2588 		leaf_intersect = klgrpset_intersects(lgrp_cur->lgrp_leaves,
2589 		    cpupart->cp_lgrpset);
2590 
2591 		lpl_rset_del(lpl_cur, lpl_leaf);
2592 		if ((lpl_cur->lpl_nrset == 0) || (!leaf_intersect)) {
2593 			lpl_clear(lpl_cur);
2594 		} else {
2595 			/*
2596 			 * Update this lpl's children
2597 			 */
2598 			lpl_child_update(lpl_cur, cpupart);
2599 		}
2600 	}
2601 	lpl_clear(lpl_leaf);
2602 }
2603 
2604 /*
2605  * add a cpu to a partition in terms of lgrp load avg bookkeeping
2606  *
2607  * The lpl (cpu partition load average information) is now arranged in a
2608  * hierarchical fashion whereby resources that are closest, ie. most local, to
2609  * the cpu in question are considered to be leaves in a tree of resources.
2610  * There are two general cases for cpu addition:
2611  *
2612  * 1. A lpl structure that contains resources already in the hierarchy tree.
2613  * In this case, all of the associated lpl relationships have been defined, and
2614  * all that is necessary is that we link the new cpu into the per-lpl list of
2615  * cpus, and increment the ncpu count of all places where this cpu resource will
2616  * be accounted for.  lpl_cpu_adjcnt updates the cpu count, and the cpu pointer
2617  * pushing is accomplished by this routine.
2618  *
2619  * 2. The lpl to contain the resources in this cpu-partition for this lgrp does
2620  * not exist yet.  In this case, it is necessary to build the leaf lpl, and
2621  * construct the hierarchy of state necessary to name its more distant
2622  * resources, if they should exist.  The leaf structure is initialized by this
2623  * routine, as is the cpu-partition state for the lgrp membership.  This routine
2624  * also calls lpl_leaf_insert() which inserts the named lpl into the hierarchy
2625  * and builds all of the "ancestral" state necessary to identify resources at
2626  * differing levels of locality.
2627  */
2628 void
2629 lgrp_part_add_cpu(cpu_t *cp, lgrp_id_t lgrpid)
2630 {
2631 	cpupart_t	*cpupart;
2632 	lgrp_t		*lgrp_leaf;
2633 	lpl_t		*lpl_leaf;
2634 
2635 	/* called sometimes w/ cpus paused - grab no locks */
2636 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2637 
2638 	cpupart = cp->cpu_part;
2639 	lgrp_leaf = lgrp_table[lgrpid];
2640 
2641 	/* don't add non-existent lgrp */
2642 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2643 	lpl_leaf = &cpupart->cp_lgrploads[lgrpid];
2644 	cp->cpu_lpl = lpl_leaf;
2645 
2646 	/* only leaf lpls contain cpus */
2647 
2648 	if (lpl_leaf->lpl_ncpu++ == 0) {
2649 		lpl_init(lpl_leaf, lpl_leaf, lgrp_leaf);
2650 		klgrpset_add(cpupart->cp_lgrpset, lgrpid);
2651 		lpl_leaf_insert(lpl_leaf, cpupart);
2652 	} else {
2653 		/*
2654 		 * the lpl should already exist in the parent, so just update
2655 		 * the count of available CPUs
2656 		 */
2657 		lpl_cpu_adjcnt(LPL_INCREMENT, cp);
2658 	}
2659 
2660 	/* link cpu into list of cpus in lpl */
2661 
2662 	if (lpl_leaf->lpl_cpus) {
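		/*
		 * Insert cp just before the existing head of this lpl's
		 * circular, doubly-linked list of cpus.
		 */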
2663 		cp->cpu_next_lpl = lpl_leaf->lpl_cpus;
2664 		cp->cpu_prev_lpl = lpl_leaf->lpl_cpus->cpu_prev_lpl;
2665 		lpl_leaf->lpl_cpus->cpu_prev_lpl->cpu_next_lpl = cp;
2666 		lpl_leaf->lpl_cpus->cpu_prev_lpl = cp;
2667 	} else {
2668 		/*
2669 		 * We increment ncpu immediately after we create a new leaf
2670 		 * lpl, so assert that ncpu == 1 for the case where we don't
2671 		 * have any cpu pointers yet.
2672 		 */
2673 		ASSERT(lpl_leaf->lpl_ncpu == 1);
2674 		lpl_leaf->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = cp;
2675 	}
2676 
2677 }
2678 
2679 
2680 /*
2681  * remove a cpu from a partition in terms of lgrp load avg bookkeeping
2682  *
2683  * The lpl (cpu partition load average information) is now arranged in a
2684  * hierarchical fashion whereby resources that are closest, ie. most local, to
2685  * the cpu in question are considered to be leaves in a tree of resources.
2686  * There are two removal cases in question:
2687  *
2688  * 1. Removal of the resource in the leaf leaves other resources remaining in
2689  * that leaf.  (Another cpu still exists at this level of locality).  In this
2690  * case, the count of available cpus is decremented in all associated lpls by
2691  * calling lpl_cpu_adjcnt(), and the pointer to the removed cpu is pruned
2692  * from the per-lpl list of cpus.
2693  *
2694  * 2. Removal of the resource results in the lpl containing no resources.  (It's
2695  * empty)  In this case, all of what has occurred for the first step must take
2696  * place; however, additionally we must remove the lpl structure itself, prune
2697  * out any stranded lpls that do not directly name a leaf resource, and mark the
2698  * cpu partition in question as no longer containing resources from the lgrp of
2699  * the lpl that has been deleted.  Cpu-partition changes are handled by this
2700  * method, but the lpl_leaf_remove function deals with the details of pruning
2701  * out the empty lpl and any of its orphaned direct ancestors.
2702  */
2703 void
2704 lgrp_part_del_cpu(cpu_t *cp)
2705 {
2706 	lpl_t		*lpl;
2707 	lpl_t		*leaf_lpl;
2708 	lgrp_t		*lgrp_leaf;
2709 
2710 	/* called sometimes w/ cpus paused - grab no locks */
2711 
2712 	ASSERT(MUTEX_HELD(&cpu_lock) || !lgrp_initialized);
2713 
2714 	lpl = leaf_lpl = cp->cpu_lpl;
2715 	lgrp_leaf = leaf_lpl->lpl_lgrp;
2716 
2717 	/* don't delete a leaf that isn't there */
2718 	ASSERT(LGRP_EXISTS(lgrp_leaf));
2719 
2720 	/* no double-deletes */
2721 	ASSERT(lpl->lpl_ncpu);
2722 	if (--lpl->lpl_ncpu == 0) {
2723 		/*
2724 		 * This was the last cpu in this lgroup for this partition,
2725 		 * clear its bit in the partition's lgroup bitmask
2726 		 */
2727 		klgrpset_del(cp->cpu_part->cp_lgrpset, lpl->lpl_lgrpid);
2728 
2729 		/* eliminate remaining lpl link pointers in cpu, lpl */
2730 		lpl->lpl_cpus = cp->cpu_next_lpl = cp->cpu_prev_lpl = NULL;
2731 
2732 		lpl_leaf_remove(leaf_lpl, cp->cpu_part);
2733 	} else {
2734 
2735 		/* unlink cpu from lists of cpus in lpl */
2736 		cp->cpu_prev_lpl->cpu_next_lpl = cp->cpu_next_lpl;
2737 		cp->cpu_next_lpl->cpu_prev_lpl = cp->cpu_prev_lpl;
2738 		if (lpl->lpl_cpus == cp) {
2739 			lpl->lpl_cpus = cp->cpu_next_lpl;
2740 		}
2741 
2742 		/*
2743 		 * Update the cpu count in the lpls associated with parent
2744 		 * lgroups.
2745 		 */
2746 		lpl_cpu_adjcnt(LPL_DECREMENT, cp);
2747 
2748 	}
2749 	/* clear cpu's lpl ptr when we're all done */
2750 	cp->cpu_lpl = NULL;
2751 }
2752 
2753 /*
2754  * Recompute load average for the specified partition/lgrp fragment.
2755  *
2756  * We rely on the fact that this routine is called from the clock thread
2757  * at a point before the clock thread can block (i.e. before its first
2758  * lock request).  Since the clock thread can not be preempted (since it
2759  * runs at highest priority), we know that cpu partitions can not change
2760  * (since doing so would require either the repartition requester or the
2761  * cpu_pause thread to run on this cpu), so we can update the cpu's load
2762  * without grabbing cpu_lock.
2763  */
2764 void
2765 lgrp_loadavg(lpl_t *lpl, uint_t nrcpus, int ageflag)
2766 {
2767 	uint_t		ncpu;
2768 	int64_t		old, new, f;
2769 
2770 	/*
2771 	 * 1 - exp(-1/(20 * ncpu)) << 13 = 400 for 1 cpu...
2772 	 */
2773 	static short expval[] = {
2774 	    0, 3196, 1618, 1083,
2775 	    814, 652, 543, 466,
2776 	    408, 363, 326, 297,
2777 	    272, 251, 233, 218,
2778 	    204, 192, 181, 172,
2779 	    163, 155, 148, 142,
2780 	    136, 130, 125, 121,
2781 	    116, 112, 109, 105
2782 	};
2783 
2784 	/* ASSERT (called from clock level) */
2785 
2786 	if ((lpl == NULL) ||	/* we're booting - this is easiest for now */
2787 	    ((ncpu = lpl->lpl_ncpu) == 0)) {
2788 		return;
2789 	}
2790 
2791 	for (;;) {
2792 
2793 		if (ncpu >= sizeof (expval) / sizeof (expval[0]))
2794 			f = expval[1]/ncpu; /* good approx. for large ncpu */
2795 		else
2796 			f = expval[ncpu];
2797 
2798 		/*
2799 		 * Modify the load average atomically to avoid losing
2800 		 * anticipatory load updates (see lgrp_move_thread()).
2801 		 */
2802 		if (ageflag) {
2803 			/*
2804 			 * We're supposed to both update and age the load.
2805 			 * This happens 10 times/sec. per cpu.  We do a
2806 			 * little hoop-jumping to avoid integer overflow.
2807 			 */
2808 			int64_t		q, r;
2809 
2810 			do {
2811 				old = new = lpl->lpl_loadavg;
2812 				q = (old  >> 16) << 7;
2813 				r = (old  & 0xffff) << 7;
2814 				new += ((long long)(nrcpus - q) * f -
2815 				    ((r * f) >> 16)) >> 7;
2816 
2817 				/*
2818 				 * Check for overflow
2819 				 */
2820 				if (new > LGRP_LOADAVG_MAX)
2821 					new = LGRP_LOADAVG_MAX;
2822 				else if (new < 0)
2823 					new = 0;
2824 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2825 			    new) != old);
2826 		} else {
2827 			/*
2828 			 * We're supposed to update the load, but not age it.
2829 			 * This option is used to update the load (which either
2830 			 * has already been aged in this 1/10 sec. interval or
2831 			 * soon will be) to account for a remotely executing
2832 			 * thread.
2833 			 */
2834 			do {
2835 				old = new = lpl->lpl_loadavg;
2836 				new += f;
2837 				/*
2838 				 * Check for overflow
2839 				 * Underflow not possible here
2840 				 */
2841 				if (new < old)
2842 					new = LGRP_LOADAVG_MAX;
2843 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
2844 			    new) != old);
2845 		}
2846 
2847 		/*
2848 		 * Do the same for this lpl's parent
2849 		 */
2850 		if ((lpl = lpl->lpl_parent) == NULL)
2851 			break;
2852 		ncpu = lpl->lpl_ncpu;
2853 	}
2854 }
2855 
2856 /*
2857  * Initialize lpl topology in the target based on topology currently present in
2858  * lpl_bootstrap.
2859  *
2860  * lpl_topo_bootstrap is only called once from cpupart_initialize_default() to
2861  * initialize cp_default list of lpls. Up to this point all topology operations
2862  * were performed using lpl_bootstrap. Now cp_default has its own list of lpls
2863  * and all subsequent lpl operations should use it instead of lpl_bootstrap. The
2864  * `target' points to the list of lpls in cp_default and `size' is the size of
2865  * this list.
2866  *
2867  * This function walks the lpl topology in lpl_bootstrap and does four things:
2868  *
2869  * 1) Copies all fields from lpl_bootstrap to the target.
2870  *
2871  * 2) Sets CPU0 lpl pointer to the correct element of the target list.
2872  *
2873  * 3) Updates lpl_parent pointers to point to the lpls in the target list
2874  *    instead of lpl_bootstrap.
2875  *
2876  * 4) Updates pointers in the resource list of the target to point to the lpls
2877  *    in the target list instead of lpl_bootstrap.
2878  *
2879  * After lpl_topo_bootstrap() completes, target contains the same information
2880  * that would be present there if it were used during boot instead of
2881  * lpl_bootstrap.  The information in lpl_bootstrap is no longer needed after
2882  * this, so it is bzeroed.
2883  */
2884 void
2885 lpl_topo_bootstrap(lpl_t *target, int size)
2886 {
2887 	lpl_t	*lpl = lpl_bootstrap;
2888 	lpl_t	*target_lpl = target;
2889 	int	howmany;
2890 	int	id;
2891 	int	i;
2892 
2893 	/*
2894 	 * The only target that should be passed here is cp_default lpl list.
2895 	 */
2896 	ASSERT(target == cp_default.cp_lgrploads);
2897 	ASSERT(size == cp_default.cp_nlgrploads);
2898 	ASSERT(!lgrp_topo_initialized);
2899 	ASSERT(ncpus == 1);
2900 
2901 	howmany = MIN(LPL_BOOTSTRAP_SIZE, size);
2902 	for (i = 0; i < howmany; i++, lpl++, target_lpl++) {
2903 		/*
2904 		 * Copy all fields from lpl.
2905 		 */
2906 
2907 		*target_lpl = *lpl;
2908 
2909 		/*
2910 		 * Substitute CPU0 lpl pointer with one relative to target.
2911 		 */
2912 		if (lpl->lpl_cpus == CPU) {
2913 			ASSERT(CPU->cpu_lpl == lpl);
2914 			CPU->cpu_lpl = target_lpl;
2915 		}
2916 
2917 		/*
2918 		 * Substitute parent information with parent relative to target.
2919 		 */
2920 		if (lpl->lpl_parent != NULL)
2921 			target_lpl->lpl_parent = (lpl_t *)
2922 			    (((uintptr_t)lpl->lpl_parent -
2923 				(uintptr_t)lpl_bootstrap) +
2924 				(uintptr_t)target);
2925 
2926 		/*
2927 		 * Walk over resource set substituting pointers relative to
2928 		 * lpl_bootstrap to pointers relative to target.
2929 		 */
2930 		ASSERT(lpl->lpl_nrset <= 1);
2931 
2932 		for (id = 0; id < lpl->lpl_nrset; id++) {
2933 			if (lpl->lpl_rset[id] != NULL) {
2934 				target_lpl->lpl_rset[id] =
2935 				    (lpl_t *)
2936 				    (((uintptr_t)lpl->lpl_rset[id] -
2937 					(uintptr_t)lpl_bootstrap) +
2938 					(uintptr_t)target);
2939 			}
2940 		}
2941 	}
2942 
2943 	/*
2944 	 * Topology information in lpl_bootstrap is no longer needed.
2945 	 */
2946 	bzero(lpl_bootstrap_list, sizeof (lpl_bootstrap_list));
2947 }
2948 
2949 /* the maximum effect that a single thread can have on its lgroup's load */
2950 #define	LGRP_LOADAVG_MAX_EFFECT(ncpu) \
2951 	((lgrp_loadavg_max_effect) / (ncpu))
2952 uint32_t	lgrp_loadavg_max_effect = LGRP_LOADAVG_THREAD_MAX;
2953 
2954 /*
2955  * If the lowest load among the lgroups a process' threads are currently
2956  * spread across is greater than lgrp_expand_proc_thresh, we'll consider
2957  * expanding the process to a new lgroup.
2958  */
2959 #define	LGRP_EXPAND_PROC_THRESH_DEFAULT 62250
2960 lgrp_load_t	lgrp_expand_proc_thresh = LGRP_EXPAND_PROC_THRESH_DEFAULT;
2961 
2962 #define	LGRP_EXPAND_PROC_THRESH(ncpu) \
2963 	((lgrp_expand_proc_thresh) / (ncpu))
2964 
2965 /*
2966  * A process will be expanded to a new lgroup only if the difference between
2967  * the lowest load on the lgroups the process' threads are currently spread
2968  * across and the lowest load on the other lgroups in the process' partition
2969  * is greater than lgrp_expand_proc_diff.
2970  */
2971 #define	LGRP_EXPAND_PROC_DIFF_DEFAULT 60000
2972 lgrp_load_t	lgrp_expand_proc_diff = LGRP_EXPAND_PROC_DIFF_DEFAULT;
2973 
2974 #define	LGRP_EXPAND_PROC_DIFF(ncpu) \
2975 	((lgrp_expand_proc_diff) / (ncpu))
2976 
2977 /*
2978  * The loadavg tolerance accounts for "noise" inherent in the load, which may
2979  * be present due to impreciseness of the load average decay algorithm.
2980  *
2981  * The default tolerance is lgrp_loadavg_max_effect. Note that the tunable
2982  * tolerance is scaled by the number of cpus in the lgroup just like
2983  * lgrp_loadavg_max_effect. For example, if lgrp_loadavg_tolerance = 0x10000,
2984  * and ncpu = 4, then lgrp_choose will consider differences in lgroup loads
2985  * of: 0x10000 / 4 => 0x4000 or greater to be significant.
2986  */
2987 uint32_t	lgrp_loadavg_tolerance = LGRP_LOADAVG_THREAD_MAX;
2988 #define	LGRP_LOADAVG_TOLERANCE(ncpu)	\
2989 	((lgrp_loadavg_tolerance) / ncpu)
2990 
2991 /*
2992  * lgrp_choose() will choose root lgroup as home when lowest lgroup load
2993  * average is above this threshold
2994  */
2995 uint32_t	lgrp_load_thresh = UINT32_MAX;
2996 
2997 /*
2998  * lgrp_choose() will try to skip any lgroups with less memory
2999  * than this free when choosing a home lgroup
3000  */
3001 pgcnt_t	lgrp_mem_free_thresh = 0;
3002 
3003 /*
3004  * When choosing between similarly loaded lgroups, lgrp_choose() will pick
3005  * one based on one of the following policies:
3006  * - Random selection
3007  * - Pseudo round robin placement
3008  * - Longest time since a thread was last placed
3009  */
3010 #define	LGRP_CHOOSE_RANDOM	1
3011 #define	LGRP_CHOOSE_RR		2
3012 #define	LGRP_CHOOSE_TIME	3
3013 
3014 int	lgrp_choose_policy = LGRP_CHOOSE_TIME;
3015 
3016 /*
3017  * Choose a suitable leaf lgroup for a kthread.  The kthread is assumed not to
3018  * be bound to a CPU or processor set.
3019  *
3020  * Arguments:
3021  *	t		The thread
3022  *	cpupart		The partition the thread belongs to.
3023  *
3024  * NOTE: Should at least be called with the cpu_lock held, kernel preemption
3025  *	 disabled, or thread_lock held (at splhigh) to protect against the CPU
3026  *	 partitions changing out from under us and assumes the given thread is
3027  *	 protected.  Also, called sometimes w/ cpus paused or kernel preemption
3028  *	 disabled, so don't grab any locks because we should never block under
3029  *	 those conditions.
3030  */
3031 lpl_t *
3032 lgrp_choose(kthread_t *t, cpupart_t *cpupart)
3033 {
3034 	lgrp_load_t	bestload, bestrload;
3035 	int		lgrpid_offset, lgrp_count;
3036 	lgrp_id_t	lgrpid, lgrpid_start;
3037 	lpl_t		*lpl, *bestlpl, *bestrlpl;
3038 	klgrpset_t	lgrpset;
3039 	proc_t		*p;
3040 
3041 	ASSERT(t != NULL);
3042 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3043 	    THREAD_LOCK_HELD(t));
3044 	ASSERT(cpupart != NULL);
3045 
3046 	p = t->t_procp;
3047 
3048 	/* A process should always be in an active partition */
3049 	ASSERT(!klgrpset_isempty(cpupart->cp_lgrpset));
3050 
3051 	bestlpl = bestrlpl = NULL;
3052 	bestload = bestrload = LGRP_LOADAVG_MAX;
3053 	lgrpset = cpupart->cp_lgrpset;
3054 
3055 	switch (lgrp_choose_policy) {
3056 	case LGRP_CHOOSE_RR:
3057 		lgrpid = cpupart->cp_lgrp_hint;
3058 		do {
3059 			if (++lgrpid > lgrp_alloc_max)
3060 				lgrpid = 0;
3061 		} while (!klgrpset_ismember(lgrpset, lgrpid));
3062 
3063 		break;
3064 	default:
3065 	case LGRP_CHOOSE_TIME:
3066 	case LGRP_CHOOSE_RANDOM:
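		/*
		 * Pick a pseudo-random starting lgroup in the partition
		 * based on the current time.
		 */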
3067 		klgrpset_nlgrps(lgrpset, lgrp_count);
3068 		lgrpid_offset =
3069 		    (((ushort_t)(gethrtime() >> 4)) % lgrp_count) + 1;
3070 		for (lgrpid = 0; ; lgrpid++) {
3071 			if (klgrpset_ismember(lgrpset, lgrpid)) {
3072 				if (--lgrpid_offset == 0)
3073 					break;
3074 			}
3075 		}
3076 		break;
3077 	}
3078 
3079 	lgrpid_start = lgrpid;
3080 
3081 	DTRACE_PROBE2(lgrp_choose_start, lgrp_id_t, lgrpid_start,
3082 	    lgrp_id_t, cpupart->cp_lgrp_hint);
3083 
3084 	/*
3085 	 * Use lgroup affinities (if any) to choose best lgroup
3086 	 *
3087 	 * NOTE: Assumes that thread is protected from going away and its
3088 	 *	 lgroup affinities won't change (ie. p_lock, or
3089 	 *	 thread_lock() being held and/or CPUs paused)
3090 	 */
3091 	if (t->t_lgrp_affinity) {
3092 		lpl = lgrp_affinity_best(t, cpupart, lgrpid_start);
3093 		if (lpl != NULL)
3094 			return (lpl);
3095 	}
3096 
3097 	ASSERT(klgrpset_ismember(lgrpset, lgrpid_start));
3098 	bestlpl = &cpupart->cp_lgrploads[lgrpid_start];
3099 
3100 	do {
3101 		pgcnt_t	npgs;
3102 
3103 		/*
3104 		 * Skip any lgroups outside of thread's pset
3105 		 */
3106 		if (!klgrpset_ismember(lgrpset, lgrpid)) {
3107 			if (++lgrpid > lgrp_alloc_max)
3108 				lgrpid = 0;	/* wrap the search */
3109 			continue;
3110 		}
3111 
3112 		/*
3113 		 * Skip any non-leaf lgroups
3114 		 */
3115 		if (lgrp_table[lgrpid]->lgrp_childcnt != 0)
3116 			continue;
3117 
3118 		/*
3119 		 * Skip any lgroups without enough free memory
3120 		 * (when threshold set to nonzero positive value)
3121 		 */
3122 		if (lgrp_mem_free_thresh > 0) {
3123 			npgs = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
3124 			if (npgs < lgrp_mem_free_thresh) {
3125 				if (++lgrpid > lgrp_alloc_max)
3126 					lgrpid = 0;	/* wrap the search */
3127 				continue;
3128 			}
3129 		}
3130 
3131 		lpl = &cpupart->cp_lgrploads[lgrpid];
3132 		if (klgrpset_isempty(p->p_lgrpset) ||
3133 		    klgrpset_ismember(p->p_lgrpset, lgrpid)) {
3134 			/*
3135 			 * Either this is a new process or the process already
3136 			 * has threads on this lgrp, so this is a preferred
3137 			 * lgroup for the thread.
3138 			 */
3139 			if (lpl_pick(lpl, bestlpl)) {
3140 				bestload = lpl->lpl_loadavg;
3141 				bestlpl = lpl;
3142 			}
3143 		} else {
3144 			/*
3145 			 * The process doesn't have any threads on this lgrp,
3146 			 * but we're willing to consider this lgrp if the load
3147 			 * difference is big enough to justify splitting up
3148 			 * the process' threads.
3149 			 */
3150 			if (lpl_pick(lpl, bestrlpl)) {
3151 				bestrload = lpl->lpl_loadavg;
3152 				bestrlpl = lpl;
3153 			}
3154 		}
3155 		if (++lgrpid > lgrp_alloc_max)
3156 			lgrpid = 0;	/* wrap the search */
3157 	} while (lgrpid != lgrpid_start);
3158 
3159 	/*
3160 	 * Return root lgroup if threshold isn't set to maximum value and
3161 	 * lowest lgroup load average is more than a certain threshold
3162 	 */
3163 	if (lgrp_load_thresh != UINT32_MAX &&
3164 	    bestload >= lgrp_load_thresh && bestrload >= lgrp_load_thresh)
3165 		return (&cpupart->cp_lgrploads[lgrp_root->lgrp_id]);
3166 
3167 	/*
3168 	 * If all the lgroups over which the thread's process is spread are
3169 	 * heavily loaded, we'll consider placing the thread on one of the
3170 	 * other leaf lgroups in the thread's partition.
3171 	 */
3172 	if ((bestload > LGRP_EXPAND_PROC_THRESH(bestlpl->lpl_ncpu)) &&
3173 	    (bestrload < bestload) &&	/* paranoid about wraparound */
3174 	    (bestrload + LGRP_EXPAND_PROC_DIFF(bestrlpl->lpl_ncpu) <
3175 	    bestload)) {
3176 		bestlpl = bestrlpl;
3177 	}
3178 
3179 	cpupart->cp_lgrp_hint = bestlpl->lpl_lgrpid;
3180 	bestlpl->lpl_homed_time = gethrtime_unscaled();
3181 
3182 	ASSERT(bestlpl->lpl_ncpu > 0);
3183 	return (bestlpl);
3184 }
3185 
3186 /*
3187  * Return 1 if lpl1 is a better candidate than lpl2 for lgrp homing.
3188  */
3189 static int
3190 lpl_pick(lpl_t *lpl1, lpl_t *lpl2)
3191 {
3192 	lgrp_load_t	l1, l2;
3193 	lgrp_load_t	tolerance = LGRP_LOADAVG_TOLERANCE(lpl1->lpl_ncpu);
3194 
3195 
3196 	if (lpl2 == NULL)
3197 		return (1);
3198 
3199 	l1 = lpl1->lpl_loadavg;
3200 	l2 = lpl2->lpl_loadavg;
3201 
3202 	if ((l1 + tolerance < l2) && (l1 < l2)) {
3203 		/* lpl1 is significantly less loaded than lpl2 */
3204 		return (1);
3205 	}
3206 
3207 	if (lgrp_choose_policy == LGRP_CHOOSE_TIME &&
3208 	    l1 + tolerance >= l2 && l1 < l2 &&
3209 	    lpl1->lpl_homed_time < lpl2->lpl_homed_time) {
3210 		/*
3211 		 * lpl1's load is within the tolerance of lpl2. We're
3212 		 * willing to consider it to be better, however, if
3213 		 * it has been longer since we last homed a thread there
3214 		 */
3215 		return (1);
3216 	}
3217 
3218 	return (0);
3219 }
3220 
3221 /*
3222  * An LWP is expected to be assigned to an lgroup for at least this long
3223  * for its anticipatory load to be justified.  NOTE that this value should
3224  * not be set extremely huge (say, larger than 100 years), to avoid problems
3225  * with overflow in the calculation that uses it.
3226  */
3227 #define	LGRP_MIN_NSEC	(NANOSEC / 10)		/* 1/10 of a second */
3228 hrtime_t lgrp_min_nsec = LGRP_MIN_NSEC;
3229 
3230 /*
3231  * Routine to change a thread's lgroup affiliation.  This routine updates
3232  * the thread's kthread_t struct and its process' proc_t struct to note the
3233  * thread's new lgroup affiliation, and its lgroup affinities.
3234  *
3235  * Note that this is the only routine that modifies a thread's t_lpl field,
3236  * and that adds in or removes anticipatory load.
3237  *
3238  * If the thread is exiting, newlpl is NULL.
3239  *
3240  * Locking:
3241  * The following lock must be held on entry:
3242  *	cpu_lock, kpreempt_disable(), or thread_lock -- to assure t's new lgrp
3243  *		doesn't get removed from t's partition
3244  *
3245  * This routine is not allowed to grab any locks, since it may be called
3246  * with cpus paused (such as from cpu_offline).
3247  */
3248 void
3249 lgrp_move_thread(kthread_t *t, lpl_t *newlpl, int do_lgrpset_delete)
3250 {
3251 	proc_t		*p;
3252 	lpl_t		*lpl, *oldlpl;
3253 	lgrp_id_t	oldid;
3254 	kthread_t	*tp;
3255 	uint_t		ncpu;
3256 	lgrp_load_t	old, new;
3257 
3258 	ASSERT(t);
3259 	ASSERT(MUTEX_HELD(&cpu_lock) || curthread->t_preempt > 0 ||
3260 	    THREAD_LOCK_HELD(t));
3261 
3262 	/*
3263 	 * If not changing lpls, just return
3264 	 */
3265 	if ((oldlpl = t->t_lpl) == newlpl)
3266 		return;
3267 
3268 	/*
3269 	 * Make sure the thread's lwp hasn't exited (if so, this thread is now
3270 	 * associated with process 0 rather than with its original process).
3271 	 */
3272 	if (t->t_proc_flag & TP_LWPEXIT) {
3273 		if (newlpl != NULL) {
3274 			t->t_lpl = newlpl;
3275 		}
3276 		return;
3277 	}
3278 
3279 	p = ttoproc(t);
3280 
3281 	/*
3282 	 * If the thread had a previous lgroup, update its process' p_lgrpset
3283 	 * to account for it being moved from its old lgroup.
3284 	 */
3285 	if ((oldlpl != NULL) &&	/* thread had a previous lgroup */
3286 	    (p->p_tlist != NULL)) {
3287 		oldid = oldlpl->lpl_lgrpid;
3288 
3289 		if (newlpl != NULL)
3290 			lgrp_stat_add(oldid, LGRP_NUM_MIGR, 1);
3291 
3292 		if ((do_lgrpset_delete) &&
3293 		    (klgrpset_ismember(p->p_lgrpset, oldid))) {
3294 			for (tp = p->p_tlist->t_forw; ; tp = tp->t_forw) {
3295 				/*
3296 				 * Check if a thread other than the thread
3297 				 * that's moving is assigned to the same
3298 				 * lgroup as the thread that's moving.  Note
3299 				 * that we have to compare lgroup IDs, rather
3300 				 * than simply comparing t_lpl's, since the
3301 				 * threads may belong to different partitions
3302 				 * but be assigned to the same lgroup.
3303 				 */
3304 				ASSERT(tp->t_lpl != NULL);
3305 
3306 				if ((tp != t) &&
3307 				    (tp->t_lpl->lpl_lgrpid == oldid)) {
3308 					/*
3309 					 * Another thread is assigned to the
3310 					 * same lgroup as the thread that's
3311 					 * moving, p_lgrpset doesn't change.
3312 					 */
3313 					break;
3314 				} else if (tp == p->p_tlist) {
3315 					/*
3316 					 * No other thread is assigned to the
3317 					 * same lgroup as the exiting thread,
3318 					 * clear the lgroup's bit in p_lgrpset.
3319 					 */
3320 					klgrpset_del(p->p_lgrpset, oldid);
3321 					break;
3322 				}
3323 			}
3324 		}
3325 
3326 		/*
3327 		 * If this thread was assigned to its old lgroup for such a
3328 		 * short amount of time that the anticipatory load that was
3329 		 * added on its behalf has aged very little, remove that
3330 		 * anticipatory load.
3331 		 */
3332 		if ((t->t_anttime + lgrp_min_nsec > gethrtime()) &&
3333 		    ((ncpu = oldlpl->lpl_ncpu) > 0)) {
3334 			lpl = oldlpl;
3335 			for (;;) {
3336 				do {
3337 					old = new = lpl->lpl_loadavg;
3338 					new -= LGRP_LOADAVG_MAX_EFFECT(ncpu);
3339 					if (new > old) {
3340 						/*
3341 						 * this can happen if the load
3342 						 * average was aged since we
3343 						 * added in the anticipatory
3344 						 * load
3345 						 */
3346 						new = 0;
3347 					}
3348 				} while (cas32(
3349 					(lgrp_load_t *)&lpl->lpl_loadavg, old,
3350 					    new) != old);
3351 
3352 				lpl = lpl->lpl_parent;
3353 				if (lpl == NULL)
3354 					break;
3355 
3356 				ncpu = lpl->lpl_ncpu;
3357 				ASSERT(ncpu > 0);
3358 			}
3359 		}
3360 	}
3361 	/*
3362 	 * If the thread has a new lgroup (i.e. it's not exiting), update its
3363 	 * t_lpl and its process' p_lgrpset, and apply an anticipatory load
3364 	 * to its new lgroup to account for its move to its new lgroup.
3365 	 */
3366 	if (newlpl != NULL) {
3367 		/*
3368 		 * This thread is moving to a new lgroup
3369 		 */
3370 		t->t_lpl = newlpl;
3371 
3372 		/*
3373 		 * Reflect move in load average of new lgroup
3374 		 * unless it is root lgroup
3375 		 */
3376 		if (lgrp_table[newlpl->lpl_lgrpid] == lgrp_root)
3377 			return;
3378 
3379 		if (!klgrpset_ismember(p->p_lgrpset, newlpl->lpl_lgrpid)) {
3380 			klgrpset_add(p->p_lgrpset, newlpl->lpl_lgrpid);
3381 		}
3382 
3383 		/*
3384 		 * to reflect this thread's placement on it.  We'd
3385 		 * rather not, however, have all threads between now
3386 		 * and then also pile onto this lgroup.  To avoid
3387 		 * and then also piling on to this lgroup.  To avoid
3388 		 * this pileup, we anticipate the load this thread
3389 		 * will generate on its new lgroup.  The goal is to
3390 		 * make the lgroup's load appear as though the thread
3391 		 * had been there all along.  We're very conservative
3392 		 * in calculating this anticipatory load: we assume
3393 		 * the worst case (a 100% CPU-bound thread).  This
3394 		 * may be modified in the future to be more accurate.
3395 		 */
3396 		lpl = newlpl;
3397 		for (;;) {
3398 			ncpu = lpl->lpl_ncpu;
3399 			ASSERT(ncpu > 0);
3400 			do {
3401 				old = new = lpl->lpl_loadavg;
3402 				new += LGRP_LOADAVG_MAX_EFFECT(ncpu);
3403 				/*
3404 				 * Check for overflow
3405 				 * Underflow not possible here
3406 				 */
3407 				if (new < old)
3408 					new = UINT32_MAX;
3409 			} while (cas32((lgrp_load_t *)&lpl->lpl_loadavg, old,
3410 			    new) != old);
3411 
3412 			lpl = lpl->lpl_parent;
3413 			if (lpl == NULL)
3414 				break;
3415 		}
3416 		t->t_anttime = gethrtime();
3417 	}
3418 }
3419 
3420 /*
3421  * Return lgroup memory allocation policy given advice from madvise(3C)
3422  */
3423 lgrp_mem_policy_t
3424 lgrp_madv_to_policy(uchar_t advice, size_t size, int type)
3425 {
3426 	switch (advice) {
3427 	case MADV_ACCESS_LWP:
3428 		return (LGRP_MEM_POLICY_NEXT);
3429 	case MADV_ACCESS_MANY:
3430 		return (LGRP_MEM_POLICY_RANDOM);
3431 	default:
3432 		return (lgrp_mem_policy_default(size, type));
3433 	}
3434 }
3435 
3436 /*
3437  * Figure out default policy
3438  */
3439 lgrp_mem_policy_t
3440 lgrp_mem_policy_default(size_t size, int type)
3441 {
3442 	cpupart_t		*cp;
3443 	lgrp_mem_policy_t	policy;
3444 	size_t			pset_mem_size;
3445 
3446 	/*
3447 	 * Randomly allocate memory across lgroups for shared memory
3448 	 * beyond a certain threshold
3449 	 */
3450 	if ((type != MAP_SHARED && size > lgrp_privm_random_thresh) ||
3451 	    (type == MAP_SHARED && size > lgrp_shm_random_thresh)) {
3452 		/*
3453 		 * Get total memory size of current thread's pset
3454 		 */
3455 		kpreempt_disable();
3456 		cp = curthread->t_cpupart;
3457 		klgrpset_totalsize(cp->cp_lgrpset, pset_mem_size);
3458 		kpreempt_enable();
3459 
3460 		/*
3461 		 * Choose policy to randomly allocate memory across
3462 		 * lgroups in pset if it will fit and is not default
3463 		 * partition.  Otherwise, allocate memory randomly
3464 		 * across machine.
3465 		 */
3466 		if (lgrp_mem_pset_aware && size < pset_mem_size)
3467 			policy = LGRP_MEM_POLICY_RANDOM_PSET;
3468 		else
3469 			policy = LGRP_MEM_POLICY_RANDOM;
3470 	} else
3471 		/*
3472 		 * Apply default policy for private memory and
3473 		 * shared memory under the respective random
3474 		 * threshold.
3475 		 */
3476 		policy = lgrp_mem_default_policy;
3477 
3478 	return (policy);
3479 }
3480 
3481 /*
3482  * Get memory allocation policy for this segment
3483  */
3484 lgrp_mem_policy_info_t *
3485 lgrp_mem_policy_get(struct seg *seg, caddr_t vaddr)
3486 {
3487 	lgrp_mem_policy_info_t	*policy_info;
3488 	extern struct seg_ops	segspt_ops;
3489 	extern struct seg_ops	segspt_shmops;
3490 
3491 	/*
3492 	 * This is for binary compatibility to protect against third party
3493  * segment drivers which haven't been recompiled to allow for
3494 	 * SEGOP_GETPOLICY()
3495 	 */
3496 	if (seg->s_ops != &segvn_ops && seg->s_ops != &segspt_ops &&
3497 	    seg->s_ops != &segspt_shmops)
3498 		return (NULL);
3499 
3500 	policy_info = NULL;
3501 	if (seg->s_ops->getpolicy != NULL)
3502 		policy_info = SEGOP_GETPOLICY(seg, vaddr);
3503 
3504 	return (policy_info);
3505 }
3506 
3507 /*
3508  * Set policy for allocating private memory given desired policy, policy info,
3509  * size in bytes of memory that policy is being applied.
3510  * Return 0 if policy wasn't set already and 1 if policy was set already
3511  */
3512 int
3513 lgrp_privm_policy_set(lgrp_mem_policy_t policy,
3514     lgrp_mem_policy_info_t *policy_info, size_t size)
3515 {
3516 
3517 	ASSERT(policy_info != NULL);
3518 
3519 	if (policy == LGRP_MEM_POLICY_DEFAULT)
3520 		policy = lgrp_mem_policy_default(size, MAP_PRIVATE);
3521 
3522 	/*
3523 	 * Policy set already?
3524 	 */
3525 	if (policy == policy_info->mem_policy)
3526 		return (1);
3527 
3528 	/*
3529 	 * Set policy
3530 	 */
3531 	policy_info->mem_policy = policy;
3532 	policy_info->mem_reserved = 0;
3533 
3534 	return (0);
3535 }
3536 
3537 
3538 /*
3539  * Get shared memory allocation policy with given tree and offset
3540  */
3541 lgrp_mem_policy_info_t *
3542 lgrp_shm_policy_get(struct anon_map *amp, ulong_t anon_index, vnode_t *vp,
3543     u_offset_t vn_off)
3544 {
3545 	u_offset_t		off;
3546 	lgrp_mem_policy_info_t	*policy_info;
3547 	lgrp_shm_policy_seg_t	*policy_seg;
3548 	lgrp_shm_locality_t	*shm_locality;
3549 	avl_tree_t		*tree;
3550 	avl_index_t		where;
3551 
3552 	/*
3553 	 * Get policy segment tree from anon_map or vnode and use specified
3554 	 * anon index or vnode offset as offset
3555 	 *
3556 	 * Assume that no lock needs to be held on anon_map or vnode, since
3557 	 * they should be protected by their reference count which must be
3558 	 * nonzero for an existing segment
3559 	 */
3560 	if (amp) {
3561 		ASSERT(amp->refcnt != 0);
3562 		shm_locality = amp->locality;
3563 		if (shm_locality == NULL)
3564 			return (NULL);
3565 		tree = shm_locality->loc_tree;
3566 		off = ptob(anon_index);
3567 	} else if (vp) {
3568 		shm_locality = vp->v_locality;
3569 		if (shm_locality == NULL)
3570 			return (NULL);
3571 		ASSERT(shm_locality->loc_count != 0);
3572 		tree = shm_locality->loc_tree;
3573 		off = vn_off;
3574 	}
3575 
3576 	if (tree == NULL)
3577 		return (NULL);
3578 
3579 	/*
3580 	 * Lookup policy segment for offset into shared object and return
3581 	 * policy info
3582 	 */
3583 	rw_enter(&shm_locality->loc_lock, RW_READER);
3584 	policy_info = NULL;
3585 	policy_seg = avl_find(tree, &off, &where);
3586 	if (policy_seg)
3587 		policy_info = &policy_seg->shm_policy;
3588 	rw_exit(&shm_locality->loc_lock);
3589 
3590 	return (policy_info);
3591 }
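
/*
 * Example: a minimal sketch of looking up the policy covering a given anon
 * index in a shared anonymous object and falling back to the default when
 * no explicit policy has been set.  The "amp" and "anon_index" names refer
 * to some caller's context and are hypothetical here.
 *
 *	lgrp_mem_policy_info_t	*info;
 *	lgrp_mem_policy_t	policy;
 *
 *	info = lgrp_shm_policy_get(amp, anon_index, NULL, 0);
 *	policy = (info != NULL) ? info->mem_policy : lgrp_mem_default_policy;
 */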
3592 
3593 /*
3594  * Return lgroup to use for allocating memory
3595  * given the segment and address
3596  *
3597  * There is no mutual exclusion between calls
3598  * to this routine and DR, so this routine and whoever calls it
3599  * should be mindful of the possibility that the lgrp returned
3600  * may be deleted. If this happens, dereferences of the lgrp
3601  * pointer will still be safe, but the resources in the lgrp will
3602  * be gone, and LGRP_EXISTS() will no longer be true.
3603  */
3604 lgrp_t *
3605 lgrp_mem_choose(struct seg *seg, caddr_t vaddr, size_t pgsz)
3606 {
3607 	int			i;
3608 	lgrp_t			*lgrp;
3609 	klgrpset_t		lgrpset;
3610 	int			lgrps_spanned;
3611 	unsigned long		off;
3612 	lgrp_mem_policy_t	policy;
3613 	lgrp_mem_policy_info_t	*policy_info;
3614 	ushort_t		random;
3615 	int			stat = 0;
3616 
3617 	/*
3618 	 * Just return the root lgroup if the lgrp framework hasn't
3619 	 * finished initializing or if this is a UMA machine.
3620 	 */
3621 	if (nlgrps == 1 || !lgrp_initialized)
3622 		return (lgrp_root);
3623 
3624 	/*
3625 	 * Get memory allocation policy for this segment
3626 	 */
3627 	policy = lgrp_mem_default_policy;
3628 	if (seg != NULL) {
3629 		if (seg->s_as == &kas) {
3630 			if (policy == LGRP_MEM_POLICY_RANDOM_PROC ||
3631 			    policy == LGRP_MEM_POLICY_RANDOM_PSET)
3632 				policy = LGRP_MEM_POLICY_RANDOM;
3633 		} else {
3634 			policy_info = lgrp_mem_policy_get(seg, vaddr);
3635 			if (policy_info != NULL)
3636 				policy = policy_info->mem_policy;
3637 		}
3638 	}
3639 	lgrpset = 0;
3640 
3641 	/*
3642 	 * Initialize lgroup to home by default
3643 	 */
3644 	lgrp = lgrp_home_lgrp();
3645 
3646 	/*
3647 	 * When threads are homed on the root lgrp, override the default
3648 	 * memory allocation policies with the root lgroup's policy
3649 	 */
3650 	if (lgrp == lgrp_root)
3651 		policy = lgrp_mem_policy_root;
3652 
3653 	/*
3654 	 * Implement policy
3655 	 */
3656 	switch (policy) {
3657 	case LGRP_MEM_POLICY_NEXT_CPU:
3658 
3659 		/*
3660 		 * Return the lgroup of the current CPU, which faulted on memory.
3661 		 * If the CPU isn't currently in an lgrp, then opt to
3662 		 * allocate from the root.
3663 		 *
3664 		 * Kernel preemption needs to be disabled here to prevent
3665 		 * the current CPU from going away before lgrp is found.
3666 		 */
3667 		if (LGRP_CPU_HAS_NO_LGRP(CPU)) {
3668 			lgrp = lgrp_root;
3669 		} else {
3670 			kpreempt_disable();
3671 			lgrp = lgrp_cpu_to_lgrp(CPU);
3672 			kpreempt_enable();
3673 		}
3674 		break;
3675 
3676 	case LGRP_MEM_POLICY_NEXT:
3677 	case LGRP_MEM_POLICY_DEFAULT:
3678 	default:
3679 
3680 		/*
3681 		 * Just return the current thread's home lgroup
3682 		 * for the default policy (next touch).
3683 		 * If the thread is homed to the root,
3684 		 * then the default policy is random across lgroups;
3685 		 * fall through to the random case.
3686 		 */
3687 		if (lgrp != lgrp_root) {
3688 			if (policy == LGRP_MEM_POLICY_NEXT)
3689 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_NEXT, 1);
3690 			else
3691 				lgrp_stat_add(lgrp->lgrp_id,
3692 				    LGRP_NUM_DEFAULT, 1);
3693 			break;
3694 		}
3695 		/* LINTED fallthrough on case statement */
3696 	case LGRP_MEM_POLICY_RANDOM:
3697 
3698 		/*
3699 		 * Return a random leaf lgroup with memory
3700 		 */
3701 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3702 		/*
3703 		 * Count how many lgroups are spanned
3704 		 */
3705 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3706 
3707 		/*
3708 		 * There may be no memnodes in the root lgroup during DR copy
3709 		 * rename on a system with only two boards (memnodes)
3710 		 * configured. In this case just return the root lgrp.
3711 		 */
3712 		if (lgrps_spanned == 0) {
3713 			lgrp = lgrp_root;
3714 			break;
3715 		}
3716 
3717 		/*
3718 		 * Pick a random offset within lgroups spanned
3719 		 * and return lgroup at that offset
3720 		 */
3721 		random = (ushort_t)gethrtime() >> 4;
3722 		off = random % lgrps_spanned;
3723 		ASSERT(off <= lgrp_alloc_max);
3724 
3725 		for (i = 0; i <= lgrp_alloc_max; i++) {
3726 			if (!klgrpset_ismember(lgrpset, i))
3727 				continue;
3728 			if (off)
3729 				off--;
3730 			else {
3731 				lgrp = lgrp_table[i];
3732 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3733 				    1);
3734 				break;
3735 			}
3736 		}
3737 		break;
3738 
3739 	case LGRP_MEM_POLICY_RANDOM_PROC:
3740 
3741 		/*
3742 		 * Grab copy of bitmask of lgroups spanned by
3743 		 * this process
3744 		 */
3745 		klgrpset_copy(lgrpset, curproc->p_lgrpset);
3746 		stat = LGRP_NUM_RANDOM_PROC;
3747 
3748 		/* LINTED fallthrough on case statement */
3749 	case LGRP_MEM_POLICY_RANDOM_PSET:
3750 
3751 		if (!stat)
3752 			stat = LGRP_NUM_RANDOM_PSET;
3753 
3754 		if (klgrpset_isempty(lgrpset)) {
3755 			/*
3756 			 * Grab copy of bitmask of lgroups spanned by
3757 			 * this processor set
3758 			 */
3759 			kpreempt_disable();
3760 			klgrpset_copy(lgrpset,
3761 			    curthread->t_cpupart->cp_lgrpset);
3762 			kpreempt_enable();
3763 		}
3764 
3765 		/*
3766 		 * Count how many lgroups are spanned
3767 		 */
3768 		klgrpset_nlgrps(lgrpset, lgrps_spanned);
3769 		ASSERT(lgrps_spanned <= nlgrps);
3770 
3771 		/*
3772 		 * lgrps_spanned should always be non-zero, but to be on the
3773 		 * safe side we return lgrp_root if the set is empty.
3774 		 */
3775 		if (lgrps_spanned == 0) {
3776 			lgrp = lgrp_root;
3777 			break;
3778 		}
3779 
3780 		/*
3781 		 * Pick a random offset within lgroups spanned
3782 		 * and return lgroup at that offset
3783 		 */
3784 		random = (ushort_t)gethrtime() >> 4;
3785 		off = random % lgrps_spanned;
3786 		ASSERT(off <= lgrp_alloc_max);
3787 
3788 		for (i = 0; i <= lgrp_alloc_max; i++) {
3789 			if (!klgrpset_ismember(lgrpset, i))
3790 				continue;
3791 			if (off)
3792 				off--;
3793 			else {
3794 				lgrp = lgrp_table[i];
3795 				lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_RANDOM,
3796 				    1);
3797 				break;
3798 			}
3799 		}
3800 		break;
3801 
3802 	case LGRP_MEM_POLICY_ROUNDROBIN:
3803 
3804 		/*
3805 		 * Use the offset within the segment to determine how
3806 		 * far past the home lgroup to advance when choosing
3807 		 * the next lgroup to allocate memory from
3808 		 */
3809 		off = ((unsigned long)(vaddr - seg->s_base) / pgsz) %
3810 		    (lgrp_alloc_max + 1);
3811 
3812 		kpreempt_disable();
3813 		lgrpset = lgrp_root->lgrp_set[LGRP_RSRC_MEM];
3814 		i = lgrp->lgrp_id;
3815 		kpreempt_enable();
3816 
3817 		while (off > 0) {
3818 			i = (i + 1) % (lgrp_alloc_max + 1);
3819 			lgrp = lgrp_table[i];
3820 			if (klgrpset_ismember(lgrpset, i))
3821 				off--;
3822 		}
3823 		lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ROUNDROBIN, 1);
3824 
3825 		break;
3826 	}
3827 
3828 	ASSERT(lgrp != NULL);
3829 	return (lgrp);
3830 }
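
/*
 * Example: a minimal sketch of the typical call pattern from a page
 * allocation path.  As noted above, DR may delete the returned lgroup, so
 * callers check LGRP_EXISTS() before relying on its resources.  The "seg"
 * and "addr" names refer to some caller's context and are hypothetical here.
 *
 *	lgrp_t	*lgrp;
 *
 *	lgrp = lgrp_mem_choose(seg, addr, PAGESIZE);
 *	if (!LGRP_EXISTS(lgrp))
 *		lgrp = lgrp_root;
 */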
3831 
3832 /*
3833  * Return the number of pages in an lgroup
3834  *
3835  * NOTE: NUMA test (numat) driver uses this, so changing arguments or semantics
3836  *	 could cause tests that rely on the numat driver to fail....
3837  */
3838 pgcnt_t
3839 lgrp_mem_size(lgrp_id_t lgrpid, lgrp_mem_query_t query)
3840 {
3841 	lgrp_t *lgrp;
3842 
3843 	lgrp = lgrp_table[lgrpid];
3844 	if (!LGRP_EXISTS(lgrp) ||
3845 	    klgrpset_isempty(lgrp->lgrp_set[LGRP_RSRC_MEM]) ||
3846 	    !klgrpset_ismember(lgrp->lgrp_set[LGRP_RSRC_MEM], lgrpid))
3847 		return (0);
3848 
3849 	return (lgrp_plat_mem_size(lgrp->lgrp_plathand, query));
3850 }
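
/*
 * Example: a minimal sketch of querying lgroup memory sizes, assuming
 * LGRP_MEM_SIZE_FREE and LGRP_MEM_SIZE_INSTALL are among the
 * lgrp_mem_query_t values defined in <sys/lgrp.h>.  A nonexistent lgroup
 * simply reports zero pages.
 *
 *	pgcnt_t	free_pages, installed_pages;
 *
 *	free_pages = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_FREE);
 *	installed_pages = lgrp_mem_size(lgrpid, LGRP_MEM_SIZE_INSTALL);
 */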
3851 
3852 /*
3853  * Initialize lgroup shared memory allocation policy support
3854  */
3855 void
3856 lgrp_shm_policy_init(struct anon_map *amp, vnode_t *vp)
3857 {
3858 	lgrp_shm_locality_t	*shm_locality;
3859 
3860 	/*
3861 	 * Initialize locality field in anon_map
3862 	 * Don't need any locks because this is called when anon_map is
3863 	 * allocated, but not used anywhere yet.
3864 	 */
3865 	if (amp) {
3866 		ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3867 		if (amp->locality == NULL) {
3868 			/*
3869 			 * Allocate and initialize shared memory locality info
3870 			 * and set anon_map locality pointer to it
3871 			 * Drop lock across kmem_alloc(KM_SLEEP)
3872 			 */
3873 			ANON_LOCK_EXIT(&amp->a_rwlock);
3874 			shm_locality = kmem_alloc(sizeof (*shm_locality),
3875 			    KM_SLEEP);
3876 			rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT,
3877 			    NULL);
3878 			shm_locality->loc_count = 1;	/* not used for amp */
3879 			shm_locality->loc_tree = NULL;
3880 
3881 			/*
3882 			 * Reacquire lock and check to see whether anyone beat
3883 			 * us to initializing the locality info
3884 			 */
3885 			ANON_LOCK_ENTER(&amp->a_rwlock, RW_WRITER);
3886 			if (amp->locality != NULL) {
3887 				rw_destroy(&shm_locality->loc_lock);
3888 				kmem_free(shm_locality,
3889 				    sizeof (*shm_locality));
3890 			} else
3891 				amp->locality = shm_locality;
3892 		}
3893 		ANON_LOCK_EXIT(&amp->a_rwlock);
3894 		return;
3895 	}
3896 
3897 	/*
3898 	 * Allocate shared vnode policy info if vnode is not locality aware yet
3899 	 */
3900 	mutex_enter(&vp->v_lock);
3901 	if ((vp->v_flag & V_LOCALITY) == 0) {
3902 		/*
3903 		 * Allocate and initialize shared memory locality info
3904 		 */
3905 		mutex_exit(&vp->v_lock);
3906 		shm_locality = kmem_alloc(sizeof (*shm_locality), KM_SLEEP);
3907 		rw_init(&shm_locality->loc_lock, NULL, RW_DEFAULT, NULL);
3908 		shm_locality->loc_count = 1;
3909 		shm_locality->loc_tree = NULL;
3910 
3911 		/*
3912 		 * Point vnode locality field at shared vnode policy info
3913 		 * and set locality aware flag in vnode
3914 		 */
3915 		mutex_enter(&vp->v_lock);
3916 		if ((vp->v_flag & V_LOCALITY) == 0) {
3917 			vp->v_locality = shm_locality;
3918 			vp->v_flag |= V_LOCALITY;
3919 		} else {
3920 			/*
3921 			 * Lost race so free locality info and increment count.
3922 			 */
3923 			rw_destroy(&shm_locality->loc_lock);
3924 			kmem_free(shm_locality, sizeof (*shm_locality));
3925 			shm_locality = vp->v_locality;
3926 			shm_locality->loc_count++;
3927 		}
3928 		mutex_exit(&vp->v_lock);
3929 
3930 		return;
3931 	}
3932 
3933 	/*
3934 	 * Increment the count of segments mapping this vnode
3935 	 * shared
3936 	 */
3937 	shm_locality = vp->v_locality;
3938 	shm_locality->loc_count++;
3939 	mutex_exit(&vp->v_lock);
3940 }
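
/*
 * Example: a minimal sketch of the vnode reference counting done above and
 * undone in lgrp_shm_policy_fini() below; the locality info persists until
 * the last shared mapping of the vnode goes away.
 *
 *	lgrp_shm_policy_init(NULL, vp);		loc_count becomes 1
 *	lgrp_shm_policy_init(NULL, vp);		loc_count becomes 2
 *	lgrp_shm_policy_fini(NULL, vp);		loc_count drops to 1
 *	lgrp_shm_policy_fini(NULL, vp);		loc_count reaches 0, info freed
 */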
3941 
3942 /*
3943  * Destroy the given shared memory policy segment tree
3944  */
3945 void
3946 lgrp_shm_policy_tree_destroy(avl_tree_t *tree)
3947 {
3948 	lgrp_shm_policy_seg_t	*cur;
3949 	lgrp_shm_policy_seg_t	*next;
3950 
3951 	if (tree == NULL)
3952 		return;
3953 
3954 	cur = (lgrp_shm_policy_seg_t *)avl_first(tree);
3955 	while (cur != NULL) {
3956 		next = AVL_NEXT(tree, cur);
3957 		avl_remove(tree, cur);
3958 		kmem_free(cur, sizeof (*cur));
3959 		cur = next;
3960 	}
3961 	kmem_free(tree, sizeof (avl_tree_t));
3962 }
3963 
3964 /*
3965  * Uninitialize lgroup shared memory allocation policy support
3966  */
3967 void
3968 lgrp_shm_policy_fini(struct anon_map *amp, vnode_t *vp)
3969 {
3970 	lgrp_shm_locality_t	*shm_locality;
3971 
3972 	/*
3973 	 * For anon_map, deallocate shared memory policy tree and
3974 	 * zero locality field
3975 	 * Don't need any locks because anon_map is being freed
3976 	 */
3977 	if (amp) {
3978 		if (amp->locality == NULL)
3979 			return;
3980 		shm_locality = amp->locality;
3981 		shm_locality->loc_count = 0;	/* not really used for amp */
3982 		rw_destroy(&shm_locality->loc_lock);
3983 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
3984 		kmem_free(shm_locality, sizeof (*shm_locality));
3985 		amp->locality = 0;
3986 		return;
3987 	}
3988 
3989 	/*
3990 	 * For vnode, decrement reference count of segments mapping this vnode
3991 	 * shared and delete locality info if reference count drops to 0
3992 	 */
3993 	mutex_enter(&vp->v_lock);
3994 	shm_locality = vp->v_locality;
3995 	shm_locality->loc_count--;
3996 
3997 	if (shm_locality->loc_count == 0) {
3998 		rw_destroy(&shm_locality->loc_lock);
3999 		lgrp_shm_policy_tree_destroy(shm_locality->loc_tree);
4000 		kmem_free(shm_locality, sizeof (*shm_locality));
4001 		vp->v_locality = 0;
4002 		vp->v_flag &= ~V_LOCALITY;
4003 	}
4004 	mutex_exit(&vp->v_lock);
4005 }
4006 
4007 /*
4008  * Compare two shared memory policy segments
4009  * Used by AVL tree code for searching
4010  */
4011 int
4012 lgrp_shm_policy_compar(const void *x, const void *y)
4013 {
4014 	lgrp_shm_policy_seg_t *a = (lgrp_shm_policy_seg_t *)x;
4015 	lgrp_shm_policy_seg_t *b = (lgrp_shm_policy_seg_t *)y;
4016 
4017 	if (a->shm_off < b->shm_off)
4018 		return (-1);
4019 	if (a->shm_off >= b->shm_off + b->shm_size)
4020 		return (1);
4021 	return (0);
4022 }
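
/*
 * Example: a worked sketch of the comparator's range semantics.  A segment
 * compares equal to any probe offset that falls inside it, which is what
 * lets avl_find() locate the segment covering a given offset.
 *
 *	lgrp_shm_policy_seg_t	key, seg;
 *
 *	seg.shm_off = 0x2000;
 *	seg.shm_size = 0x2000;			covers [0x2000, 0x4000)
 *	key.shm_off = 0x1000;
 *	lgrp_shm_policy_compar(&key, &seg)	returns -1 (before seg)
 *	key.shm_off = 0x3000;
 *	lgrp_shm_policy_compar(&key, &seg)	returns 0 (inside seg)
 *	key.shm_off = 0x4000;
 *	lgrp_shm_policy_compar(&key, &seg)	returns 1 (past the end)
 */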
4023 
4024 /*
4025  * Concatenate seg1 with seg2 and remove seg2
4026  */
4027 static int
4028 lgrp_shm_policy_concat(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg1,
4029     lgrp_shm_policy_seg_t *seg2)
4030 {
4031 	if (!seg1 || !seg2 ||
4032 	    seg1->shm_off + seg1->shm_size != seg2->shm_off ||
4033 	    seg1->shm_policy.mem_policy != seg2->shm_policy.mem_policy)
4034 		return (-1);
4035 
4036 	seg1->shm_size += seg2->shm_size;
4037 	avl_remove(tree, seg2);
4038 	kmem_free(seg2, sizeof (*seg2));
4039 	return (0);
4040 }
4041 
4042 /*
4043  * Split segment at given offset and return rightmost (uppermost) segment
4044  * Assumes that there are no overlapping segments
4045  */
4046 static lgrp_shm_policy_seg_t *
4047 lgrp_shm_policy_split(avl_tree_t *tree, lgrp_shm_policy_seg_t *seg,
4048     u_offset_t off)
4049 {
4050 	lgrp_shm_policy_seg_t	*newseg;
4051 	avl_index_t		where;
4052 
4053 	ASSERT(seg != NULL);
4054 	ASSERT(off >= seg->shm_off && off <= seg->shm_off + seg->shm_size);
4055 
4056 	if (!seg || off < seg->shm_off || off > seg->shm_off +
4057 	    seg->shm_size)
4058 		return (NULL);
4059 
4060 	if (off == seg->shm_off || off == seg->shm_off + seg->shm_size)
4061 		return (seg);
4062 
4063 	/*
4064 	 * Adjust size of left segment and allocate new (right) segment
4065 	 */
4066 	newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t), KM_SLEEP);
4067 	newseg->shm_policy = seg->shm_policy;
4068 	newseg->shm_off = off;
4069 	newseg->shm_size = seg->shm_size - (off - seg->shm_off);
4070 	seg->shm_size = off - seg->shm_off;
4071 
4072 	/*
4073 	 * Find where to insert new segment in AVL tree and insert it
4074 	 */
4075 	(void) avl_find(tree, &off, &where);
4076 	avl_insert(tree, newseg, where);
4077 
4078 	return (newseg);
4079 }
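
/*
 * Example: a minimal sketch of the split semantics.  Splitting a segment
 * covering [0x0, 0x4000) at offset 0x1000 shrinks the original to
 * [0x0, 0x1000) and returns a newly inserted segment covering
 * [0x1000, 0x4000) with the same policy; splitting at either end is a
 * no-op that returns the original segment.
 *
 *	newseg = lgrp_shm_policy_split(tree, seg, 0x1000);
 *	ASSERT(seg->shm_off == 0x0 && seg->shm_size == 0x1000);
 *	ASSERT(newseg->shm_off == 0x1000 && newseg->shm_size == 0x3000);
 */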
4080 
4081 /*
4082  * Set shared memory allocation policy on specified shared object at given
4083  * offset and length
4084  *
4085  * Return 0 if policy wasn't set already, 1 if policy was set already, and
4086  * -1 if can't set policy.
4087  */
4088 int
4089 lgrp_shm_policy_set(lgrp_mem_policy_t policy, struct anon_map *amp,
4090     ulong_t anon_index, vnode_t *vp, u_offset_t vn_off, size_t len)
4091 {
4092 	u_offset_t		eoff;
4093 	lgrp_shm_policy_seg_t	*next;
4094 	lgrp_shm_policy_seg_t	*newseg;
4095 	u_offset_t		off;
4096 	u_offset_t		oldeoff;
4097 	lgrp_shm_policy_seg_t	*prev;
4098 	int			retval;
4099 	lgrp_shm_policy_seg_t	*seg;
4100 	lgrp_shm_locality_t	*shm_locality;
4101 	avl_tree_t		*tree;
4102 	avl_index_t		where;
4103 
4104 	ASSERT(amp || vp);
4105 	ASSERT((len & PAGEOFFSET) == 0);
4106 
4107 	if (len == 0)
4108 		return (-1);
4109 
4110 	retval = 0;
4111 
4112 	/*
4113 	 * Get locality info and starting offset into shared object
4114 	 * Try anon map first and then vnode
4115 	 * Assume that no locks need to be held on anon_map or vnode, since
4116 	 * it should be protected by its reference count which must be nonzero
4117 	 * for an existing segment.
4118 	 */
4119 	if (amp) {
4120 		/*
4121 		 * Get policy info from anon_map
4122 		 */
4124 		ASSERT(amp->refcnt != 0);
4125 		if (amp->locality == NULL)
4126 			lgrp_shm_policy_init(amp, NULL);
4127 		shm_locality = amp->locality;
4128 		off = ptob(anon_index);
4129 	} else if (vp) {
4130 		/*
4131 		 * Get policy info from vnode
4132 		 */
4133 		if ((vp->v_flag & V_LOCALITY) == 0 || vp->v_locality == NULL)
4134 			lgrp_shm_policy_init(NULL, vp);
4135 		shm_locality = vp->v_locality;
4136 		ASSERT(shm_locality->loc_count != 0);
4137 		off = vn_off;
4138 	} else
4139 		return (-1);
4140 
4141 	ASSERT((off & PAGEOFFSET) == 0);
4142 
4143 	/*
4144 	 * Figure out default policy
4145 	 */
4146 	if (policy == LGRP_MEM_POLICY_DEFAULT)
4147 		policy = lgrp_mem_policy_default(len, MAP_SHARED);
4148 
4149 	/*
4150 	 * Create AVL tree if there isn't one yet
4151 	 * and set locality field to point at it
4152 	 */
4153 	rw_enter(&shm_locality->loc_lock, RW_WRITER);
4154 	tree = shm_locality->loc_tree;
4155 	if (!tree) {
4156 		rw_exit(&shm_locality->loc_lock);
4157 
4158 		tree = kmem_alloc(sizeof (avl_tree_t), KM_SLEEP);
4159 
4160 		rw_enter(&shm_locality->loc_lock, RW_WRITER);
4161 		if (shm_locality->loc_tree == NULL) {
4162 			avl_create(tree, lgrp_shm_policy_compar,
4163 			    sizeof (lgrp_shm_policy_seg_t),
4164 			    offsetof(lgrp_shm_policy_seg_t, shm_tree));
4165 			shm_locality->loc_tree = tree;
4166 		} else {
4167 			/*
4168 			 * Another thread managed to set up the tree
4169 			 * before we could. Free the tree we allocated
4170 			 * and use the one that's already there.
4171 			 */
4172 			kmem_free(tree, sizeof (*tree));
4173 			tree = shm_locality->loc_tree;
4174 		}
4175 	}
4176 
4177 	/*
4178 	 * Set policy
4179 	 *
4180 	 * Need to maintain hold on writer's lock to keep tree from
4181 	 * changing out from under us
4182 	 */
4183 	while (len != 0) {
4184 		/*
4185 		 * Find policy segment for specified offset into shared object
4186 		 */
4187 		seg = avl_find(tree, &off, &where);
4188 
4189 		/*
4190 		 * Didn't find any existing segment that contains specified
4191 		 * offset, so allocate new segment, insert it, and concatenate
4192 		 * with adjacent segments if possible
4193 		 */
4194 		if (seg == NULL) {
4195 			newseg = kmem_alloc(sizeof (lgrp_shm_policy_seg_t),
4196 			    KM_SLEEP);
4197 			newseg->shm_policy.mem_policy = policy;
4198 			newseg->shm_policy.mem_reserved = 0;
4199 			newseg->shm_off = off;
4200 			avl_insert(tree, newseg, where);
4201 
4202 			/*
4203 			 * Check to see whether new segment overlaps with next
4204 			 * one, set length of new segment accordingly, and
4205 			 * calculate remaining length and next offset
4206 			 */
4207 			seg = AVL_NEXT(tree, newseg);
4208 			if (seg == NULL || off + len <= seg->shm_off) {
4209 				newseg->shm_size = len;
4210 				len = 0;
4211 			} else {
4212 				newseg->shm_size = seg->shm_off - off;
4213 				off = seg->shm_off;
4214 				len -= newseg->shm_size;
4215 			}
4216 
4217 			/*
4218 			 * Try to concatenate new segment with next and
4219 			 * previous ones, since they might have the same policy
4220 			 * now.  Grab previous and next segments first because
4221 			 * they will change on concatenation.
4222 			 */
4223 			prev =  AVL_PREV(tree, newseg);
4224 			next = AVL_NEXT(tree, newseg);
4225 			(void) lgrp_shm_policy_concat(tree, newseg, next);
4226 			(void) lgrp_shm_policy_concat(tree, prev, newseg);
4227 
4228 			continue;
4229 		}
4230 
4231 		eoff = off + len;
4232 		oldeoff = seg->shm_off + seg->shm_size;
4233 
4234 		/*
4235 		 * Policy set already?
4236 		 */
4237 		if (policy == seg->shm_policy.mem_policy) {
4238 			/*
4239 			 * Nothing left to do if offset and length
4240 			 * fall within this segment
4241 			 */
4242 			if (eoff <= oldeoff) {
4243 				retval = 1;
4244 				break;
4245 			} else {
4246 				len = eoff - oldeoff;
4247 				off = oldeoff;
4248 				continue;
4249 			}
4250 		}
4251 
4252 		/*
4253 		 * Specified offset and length match existing segment exactly
4254 		 */
4255 		if (off == seg->shm_off && len == seg->shm_size) {
4256 			/*
4257 			 * Set policy and update current length
4258 			 */
4259 			seg->shm_policy.mem_policy = policy;
4260 			seg->shm_policy.mem_reserved = 0;
4261 			len = 0;
4262 
4263 			/*
4264 			 * Try concatenating new segment with previous and next
4265 			 * segments, since they might have the same policy now.
4266 			 * Grab previous and next segments first because they
4267 			 * will change on concatenation.
4268 			 */
4269 			prev =  AVL_PREV(tree, seg);
4270 			next = AVL_NEXT(tree, seg);
4271 			(void) lgrp_shm_policy_concat(tree, seg, next);
4272 			(void) lgrp_shm_policy_concat(tree, prev, seg);
4273 		} else {
4274 			/*
4275 			 * Specified offset and length only apply to part of
4276 			 * existing segment
4277 			 */
4278 
4279 			/*
4280 			 * New segment starts in middle of old one, so split
4281 			 * new one off near beginning of old one
4282 			 */
4283 			newseg = NULL;
4284 			if (off > seg->shm_off) {
4285 				newseg = lgrp_shm_policy_split(tree, seg, off);
4286 
4287 				/*
4288 				 * New segment ends where old one did, so try
4289 				 * to concatenate with next segment
4290 				 */
4291 				if (eoff == oldeoff) {
4292 					newseg->shm_policy.mem_policy = policy;
4293 					newseg->shm_policy.mem_reserved = 0;
4294 					(void) lgrp_shm_policy_concat(tree,
4295 					    newseg, AVL_NEXT(tree, newseg));
4296 					break;
4297 				}
4298 			}
4299 
4300 			/*
4301 			 * New segment ends before old one, so split off end of
4302 			 * old one
4303 			 */
4304 			if (eoff < oldeoff) {
4305 				if (newseg) {
4306 					(void) lgrp_shm_policy_split(tree,
4307 					    newseg, eoff);
4308 					newseg->shm_policy.mem_policy = policy;
4309 					newseg->shm_policy.mem_reserved = 0;
4310 				} else {
4311 					(void) lgrp_shm_policy_split(tree, seg,
4312 					    eoff);
4313 					seg->shm_policy.mem_policy = policy;
4314 					seg->shm_policy.mem_reserved = 0;
4315 				}
4316 
4317 				if (off == seg->shm_off)
4318 					(void) lgrp_shm_policy_concat(tree,
4319 					    AVL_PREV(tree, seg), seg);
4320 				break;
4321 			}
4322 
4323 			/*
4324 			 * Calculate remaining length and next offset
4325 			 */
4326 			len = eoff - oldeoff;
4327 			off = oldeoff;
4328 		}
4329 	}
4330 
4331 	rw_exit(&shm_locality->loc_lock);
4332 	return (retval);
4333 }
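
/*
 * Example: a minimal sketch of setting a random allocation policy on the
 * first 16 pages of a shared anonymous object; a second identical call
 * returns 1 because the policy is already in place.  The "amp" name refers
 * to some caller's anon_map and is hypothetical here.
 *
 *	(void) lgrp_shm_policy_set(LGRP_MEM_POLICY_RANDOM, amp, 0, NULL, 0,
 *	    16 * PAGESIZE);
 *	ASSERT(lgrp_shm_policy_set(LGRP_MEM_POLICY_RANDOM, amp, 0, NULL, 0,
 *	    16 * PAGESIZE) == 1);
 */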
4334 
4335 /*
4336  * Return the best memnode from which to allocate memory given
4337  * an lgroup.
4338  *
4339  * "c" is for cookie, which is good enough for me.
4340  * It references a cookie struct that should be zeroed before first use.
4341  * The cookie should live on the caller's stack.
4342  *
4343  * The routine returns -1 when:
4344  *	- traverse is 0, and all the memnodes in "lgrp" have been returned.
4345  *	- traverse is 1, and all the memnodes in the system have been
4346  *	  returned.
4347  */
4348 int
4349 lgrp_memnode_choose(lgrp_mnode_cookie_t *c)
4350 {
4351 	lgrp_t		*lp = c->lmc_lgrp;
4352 	mnodeset_t	nodes = c->lmc_nodes;
4353 	int		cnt = c->lmc_cnt;
4354 	int		offset, mnode;
4355 
4356 	extern int	max_mem_nodes;
4357 
4358 	/*
4359 	 * If the set is empty, and the caller is willing, traverse
4360 	 * up the hierarchy until we find a non-empty set.
4361 	 */
4362 	while (nodes == (mnodeset_t)0 || cnt <= 0) {
4363 		if (c->lmc_scope == LGRP_SRCH_LOCAL ||
4364 		    ((lp = lp->lgrp_parent) == NULL))
4365 			return (-1);
4366 
4367 		nodes = lp->lgrp_mnodes & ~(c->lmc_tried);
4368 		cnt = lp->lgrp_nmnodes - c->lmc_ntried;
4369 	}
4370 
4371 	/*
4372 	 * Select a memnode by picking one at a "random" offset.
4373 	 * Because of DR, memnodes can come and go at any time.
4374 	 * This code must be able to cope with the possibility
4375 	 * that the nodes count "cnt" is inconsistent with respect
4376 	 * to the number of elements actually in "nodes", and
4377 	 * therefore that the offset chosen could be greater than
4378 	 * the number of elements in the set (some memnodes may
4379 	 * have disappeared just before cnt was read).
4380 	 * If this happens, the search simply wraps back to the
4381 	 * beginning of the set.
4382 	 */
4383 	ASSERT(nodes != (mnodeset_t)0 && cnt > 0);
4384 	offset = c->lmc_rand % cnt;
4385 	do {
4386 		for (mnode = 0; mnode < max_mem_nodes; mnode++)
4387 			if (nodes & ((mnodeset_t)1 << mnode))
4388 				if (!offset--)
4389 					break;
4390 	} while (mnode >= max_mem_nodes);
4391 
4392 	/* Found a node. Store state before returning. */
4393 	c->lmc_lgrp = lp;
4394 	c->lmc_nodes = (nodes & ~((mnodeset_t)1 << mnode));
4395 	c->lmc_cnt = cnt - 1;
4396 	c->lmc_tried = (c->lmc_tried | ((mnodeset_t)1 << mnode));
4397 	c->lmc_ntried++;
4398 
4399 	return (mnode);
4400 }
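
/*
 * Example: a minimal sketch of the intended call pattern.  The cookie is
 * seeded with an lgroup's memnodes and lgrp_memnode_choose() is then called
 * until it returns -1.  LGRP_MNODE_COOKIE_INIT() and LGRP_SRCH_HIER are
 * assumed to come from <sys/lgrp.h>; if they differ, the lmc_* fields read
 * by the function above are what must be initialized.
 *
 *	lgrp_mnode_cookie_t	c;
 *	int			mnode;
 *
 *	LGRP_MNODE_COOKIE_INIT(c, lgrp, LGRP_SRCH_HIER);
 *	while ((mnode = lgrp_memnode_choose(&c)) != -1) {
 *		(attempt the allocation from "mnode"; stop on success)
 *	}
 */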
4401