xref: /illumos-gate/usr/src/uts/common/vm/vm_pagelist.c (revision d362b749)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 /* Copyright (c) 1984, 1986, 1987, 1988, 1989 AT&T */
27 /*	All Rights Reserved   */
28 
29 /*
30  * Portions of this source code were derived from Berkeley 4.3 BSD
31  * under license from the Regents of the University of California.
32  */
33 
34 #pragma ident	"%Z%%M%	%I%	%E% SMI"
35 
36 /*
37  * This file contains common functions to access and manage the page lists.
38  * Many of these routines originated from platform dependent modules
39  * (sun4/vm/vm_dep.c, i86pc/vm/vm_machdep.c) and modified to function in
40  * a platform independent manner.
41  *
42  * vm/vm_dep.h provides for platform specific support.
43  */
44 
45 #include <sys/types.h>
46 #include <sys/debug.h>
47 #include <sys/cmn_err.h>
48 #include <sys/systm.h>
49 #include <sys/atomic.h>
50 #include <sys/sysmacros.h>
51 #include <vm/as.h>
52 #include <vm/page.h>
53 #include <vm/seg_kmem.h>
54 #include <vm/seg_vn.h>
55 #include <sys/memnode.h>
56 #include <vm/vm_dep.h>
57 #include <sys/lgrp.h>
58 #include <sys/mem_config.h>
59 #include <sys/callb.h>
60 #include <sys/mem_cage.h>
61 #include <sys/sdt.h>
62 
63 extern uint_t	vac_colors;
64 
65 #define	MAX_PRAGMA_ALIGN	128
66 
67 /* vm_cpu_data0 for the boot cpu before kmem is initialized */
68 
69 #if L2CACHE_ALIGN_MAX <= MAX_PRAGMA_ALIGN
70 #pragma align	L2CACHE_ALIGN_MAX(vm_cpu_data0)
71 #else
72 #pragma align	MAX_PRAGMA_ALIGN(vm_cpu_data0)
73 #endif
74 char		vm_cpu_data0[VM_CPU_DATA_PADSIZE];
75 
76 /*
77  * number of page colors equivalent to reqested color in page_get routines.
78  * If set, keeps large pages intact longer and keeps MPO allocation
79  * from the local mnode in favor of acquiring the 'correct' page color from
80  * a demoted large page or from a remote mnode.
81  */
82 uint_t	colorequiv;
83 
84 /*
85  * color equivalency mask for each page size.
86  * Mask is computed based on cpu L2$ way sizes and colorequiv global.
87  * High 4 bits determine the number of high order bits of the color to ignore.
88  * Low 4 bits determines number of low order bits of color to ignore (it's only
89  * relevant for hashed index based page coloring).
90  */
91 uchar_t colorequivszc[MMU_PAGE_SIZES];
92 
93 /*
94  * if set, specifies the percentage of large pages that are free from within
95  * a large page region before attempting to lock those pages for
96  * page_get_contig_pages processing.
97  *
98  * Should be turned on when kpr is available when page_trylock_contig_pages
99  * can be more selective.
100  */
101 
102 int	ptcpthreshold;
103 
104 /*
105  * Limit page get contig page search based on failure cnts in pgcpfailcnt[].
106  * Enabled by default via pgcplimitsearch.
107  *
108  * pgcpfailcnt[] is bounded by PGCPFAILMAX (>= 1/2 of installed
109  * memory). When reached, pgcpfailcnt[] is reset to 1/2 of this upper
110  * bound. This upper bound range guarantees:
111  *    - all large page 'slots' will be searched over time
112  *    - the minimum (1) large page candidates considered on each pgcp call
113  *    - count doesn't wrap around to 0
114  */
115 pgcnt_t	pgcpfailcnt[MMU_PAGE_SIZES];
116 int	pgcplimitsearch = 1;
117 
118 #define	PGCPFAILMAX		(1 << (highbit(physinstalled) - 1))
119 #define	SETPGCPFAILCNT(szc)						\
120 	if (++pgcpfailcnt[szc] >= PGCPFAILMAX)				\
121 		pgcpfailcnt[szc] = PGCPFAILMAX / 2;
122 
123 #ifdef VM_STATS
124 struct vmm_vmstats_str  vmm_vmstats;
125 
126 #endif /* VM_STATS */
127 
128 #if defined(__sparc)
129 #define	LPGCREATE	0
130 #else
131 /* enable page_get_contig_pages */
132 #define	LPGCREATE	1
133 #endif
134 
135 int pg_contig_disable;
136 int pg_lpgcreate_nocage = LPGCREATE;
137 
138 /*
139  * page_freelist_split pfn flag to signify no hi pfn requirement.
140  */
141 #define	PFNNULL		0
142 
143 /* Flags involved in promotion and demotion routines */
144 #define	PC_FREE		0x1	/* put page on freelist */
145 #define	PC_ALLOC	0x2	/* return page for allocation */
146 
147 /*
148  * Flag for page_demote to be used with PC_FREE to denote that we don't care
149  * what the color is as the color parameter to the function is ignored.
150  */
151 #define	PC_NO_COLOR	(-1)
152 
153 /* mtype value for page_promote to use when mtype does not matter */
154 #define	PC_MTYPE_ANY	(-1)
155 
156 /*
157  * page counters candidates info
158  * See page_ctrs_cands comment below for more details.
159  * fields are as follows:
160  *	pcc_pages_free:		# pages which freelist coalesce can create
161  *	pcc_color_free:		pointer to page free counts per color
162  */
163 typedef struct pcc_info {
164 	pgcnt_t	pcc_pages_free;
165 	pgcnt_t	*pcc_color_free;
166 } pcc_info_t;
167 
168 /*
169  * On big machines it can take a long time to check page_counters
170  * arrays. page_ctrs_cands is a summary array whose elements are a dynamically
171  * updated sum of all elements of the corresponding page_counters arrays.
172  * page_freelist_coalesce() searches page_counters only if an appropriate
173  * element of page_ctrs_cands array is greater than 0.
174  *
175  * page_ctrs_cands is indexed by mutex (i), region (r), mnode (m), mrange (g)
176  */
177 pcc_info_t **page_ctrs_cands[NPC_MUTEX][MMU_PAGE_SIZES];
178 
179 /*
180  * Return in val the total number of free pages which can be created
181  * for the given mnode (m), mrange (g), and region size (r)
182  */
183 #define	PGCTRS_CANDS_GETVALUE(m, g, r, val) {				\
184 	int i;								\
185 	val = 0;							\
186 	for (i = 0; i < NPC_MUTEX; i++) {				\
187 	    val += page_ctrs_cands[i][(r)][(m)][(g)].pcc_pages_free;	\
188 	}								\
189 }
190 
191 /*
192  * Return in val the total number of free pages which can be created
193  * for the given mnode (m), mrange (g), region size (r), and color (c)
194  */
195 #define	PGCTRS_CANDS_GETVALUECOLOR(m, g, r, c, val) {			\
196 	int i;								\
197 	val = 0;							\
198 	ASSERT((c) < PAGE_GET_PAGECOLORS(r));				\
199 	for (i = 0; i < NPC_MUTEX; i++) {				\
200 	    val +=							\
201 		page_ctrs_cands[i][(r)][(m)][(g)].pcc_color_free[(c)];	\
202 	}								\
203 }
204 
205 /*
206  * We can only allow a single thread to update a counter within the physical
207  * range of the largest supported page size. That is the finest granularity
208  * possible since the counter values are dependent on each other
209  * as you move accross region sizes. PP_CTR_LOCK_INDX is used to determine the
210  * ctr_mutex lock index for a particular physical range.
211  */
212 static kmutex_t	*ctr_mutex[NPC_MUTEX];
213 
214 #define	PP_CTR_LOCK_INDX(pp)						\
215 	(((pp)->p_pagenum >>						\
216 	    (PAGE_BSZS_SHIFT(mmu_page_sizes - 1))) & (NPC_MUTEX - 1))
217 
218 #define	INVALID_COLOR 0xffffffff
219 #define	INVALID_MASK  0xffffffff
220 
221 /*
222  * Local functions prototypes.
223  */
224 
225 void page_ctr_add(int, int, page_t *, int);
226 void page_ctr_add_internal(int, int, page_t *, int);
227 void page_ctr_sub(int, int, page_t *, int);
228 void page_ctr_sub_internal(int, int, page_t *, int);
229 void page_freelist_lock(int);
230 void page_freelist_unlock(int);
231 page_t *page_promote(int, pfn_t, uchar_t, int, int);
232 page_t *page_demote(int, pfn_t, uchar_t, uchar_t, int, int);
233 page_t *page_freelist_split(uchar_t,
234     uint_t, int, int, pfn_t, page_list_walker_t *);
235 page_t *page_get_mnode_cachelist(uint_t, uint_t, int, int);
236 static int page_trylock_cons(page_t *pp, se_t se);
237 
238 /*
239  * The page_counters array below is used to keep track of free contiguous
240  * physical memory.  A hw_page_map_t will be allocated per mnode per szc.
241  * This contains an array of counters, the size of the array, a shift value
242  * used to convert a pagenum into a counter array index or vice versa, as
243  * well as a cache of the last successful index to be promoted to a larger
244  * page size.  As an optimization, we keep track of the last successful index
245  * to be promoted per page color for the given size region, and this is
246  * allocated dynamically based upon the number of colors for a given
247  * region size.
248  *
249  * Conceptually, the page counters are represented as:
250  *
251  *	page_counters[region_size][mnode]
252  *
253  *	region_size:	size code of a candidate larger page made up
254  *			of contiguous free smaller pages.
255  *
256  *	page_counters[region_size][mnode].hpm_counters[index]:
257  *		represents how many (region_size - 1) pages either
258  *		exist or can be created within the given index range.
259  *
260  * Let's look at a sparc example:
261  *	If we want to create a free 512k page, we look at region_size 2
262  *	for the mnode we want.  We calculate the index and look at a specific
263  *	hpm_counters location.  If we see 8 (FULL_REGION_CNT on sparc) at
264  *	this location, it means that 8 64k pages either exist or can be created
265  *	from 8K pages in order to make a single free 512k page at the given
266  *	index.  Note that when a region is full, it will contribute to the
267  *	counts in the region above it.  Thus we will not know what page
268  *	size the free pages will be which can be promoted to this new free
269  *	page unless we look at all regions below the current region.
270  */
271 
272 /*
273  * Note: hpmctr_t is defined in platform vm_dep.h
274  * hw_page_map_t contains all the information needed for the page_counters
275  * logic. The fields are as follows:
276  *
277  *	hpm_counters:	dynamically allocated array to hold counter data
278  *	hpm_entries:	entries in hpm_counters
279  *	hpm_shift:	shift for pnum/array index conv
280  *	hpm_base:	PFN mapped to counter index 0
281  *	hpm_color_current:	last index in counter array for this color at
282  *				which we successfully created a large page
283  */
284 typedef struct hw_page_map {
285 	hpmctr_t	*hpm_counters;
286 	size_t		hpm_entries;
287 	int		hpm_shift;
288 	pfn_t		hpm_base;
289 	size_t		*hpm_color_current[MAX_MNODE_MRANGES];
290 } hw_page_map_t;
291 
292 /*
293  * Element zero is not used, but is allocated for convenience.
294  */
295 static hw_page_map_t *page_counters[MMU_PAGE_SIZES];
296 
297 /*
298  * Cached value of MNODE_RANGE_CNT(mnode).
299  * This is a function call in x86.
300  */
301 static int mnode_nranges[MAX_MEM_NODES];
302 static int mnode_maxmrange[MAX_MEM_NODES];
303 
304 /*
305  * The following macros are convenient ways to get access to the individual
306  * elements of the page_counters arrays.  They can be used on both
307  * the left side and right side of equations.
308  */
309 #define	PAGE_COUNTERS(mnode, rg_szc, idx)			\
310 	(page_counters[(rg_szc)][(mnode)].hpm_counters[(idx)])
311 
312 #define	PAGE_COUNTERS_COUNTERS(mnode, rg_szc) 			\
313 	(page_counters[(rg_szc)][(mnode)].hpm_counters)
314 
315 #define	PAGE_COUNTERS_SHIFT(mnode, rg_szc) 			\
316 	(page_counters[(rg_szc)][(mnode)].hpm_shift)
317 
318 #define	PAGE_COUNTERS_ENTRIES(mnode, rg_szc) 			\
319 	(page_counters[(rg_szc)][(mnode)].hpm_entries)
320 
321 #define	PAGE_COUNTERS_BASE(mnode, rg_szc) 			\
322 	(page_counters[(rg_szc)][(mnode)].hpm_base)
323 
324 #define	PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, rg_szc, g)		\
325 	(page_counters[(rg_szc)][(mnode)].hpm_color_current[(g)])
326 
327 #define	PAGE_COUNTERS_CURRENT_COLOR(mnode, rg_szc, color, mrange)	\
328 	(page_counters[(rg_szc)][(mnode)].				\
329 	hpm_color_current[(mrange)][(color)])
330 
331 #define	PNUM_TO_IDX(mnode, rg_szc, pnum)			\
332 	(((pnum) - PAGE_COUNTERS_BASE((mnode), (rg_szc))) >>	\
333 		PAGE_COUNTERS_SHIFT((mnode), (rg_szc)))
334 
335 #define	IDX_TO_PNUM(mnode, rg_szc, index) 			\
336 	(PAGE_COUNTERS_BASE((mnode), (rg_szc)) +		\
337 		((index) << PAGE_COUNTERS_SHIFT((mnode), (rg_szc))))
338 
339 /*
340  * Protects the hpm_counters and hpm_color_current memory from changing while
341  * looking at page counters information.
342  * Grab the write lock to modify what these fields point at.
343  * Grab the read lock to prevent any pointers from changing.
344  * The write lock can not be held during memory allocation due to a possible
345  * recursion deadlock with trying to grab the read lock while the
346  * write lock is already held.
347  */
348 krwlock_t page_ctrs_rwlock[MAX_MEM_NODES];
349 
350 
351 /*
352  * initialize cpu_vm_data to point at cache aligned vm_cpu_data_t.
353  */
354 void
355 cpu_vm_data_init(struct cpu *cp)
356 {
357 	if (cp == CPU0) {
358 		cp->cpu_vm_data = (void *)&vm_cpu_data0;
359 	} else {
360 		void	*kmptr;
361 		int	align;
362 		size_t	sz;
363 
364 		align = (L2CACHE_ALIGN) ? L2CACHE_ALIGN : L2CACHE_ALIGN_MAX;
365 		sz = P2ROUNDUP(sizeof (vm_cpu_data_t), align) + align;
366 		kmptr = kmem_zalloc(sz, KM_SLEEP);
367 		cp->cpu_vm_data = (void *) P2ROUNDUP((uintptr_t)kmptr, align);
368 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr = kmptr;
369 		((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize = sz;
370 	}
371 }
372 
373 /*
374  * free cpu_vm_data
375  */
376 void
377 cpu_vm_data_destroy(struct cpu *cp)
378 {
379 	if (cp->cpu_seqid && cp->cpu_vm_data) {
380 		ASSERT(cp != CPU0);
381 		kmem_free(((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmptr,
382 		    ((vm_cpu_data_t *)cp->cpu_vm_data)->vc_kmsize);
383 	}
384 	cp->cpu_vm_data = NULL;
385 }
386 
387 
388 /*
389  * page size to page size code
390  */
391 int
392 page_szc(size_t pagesize)
393 {
394 	int	i = 0;
395 
396 	while (hw_page_array[i].hp_size) {
397 		if (pagesize == hw_page_array[i].hp_size)
398 			return (i);
399 		i++;
400 	}
401 	return (-1);
402 }
403 
404 /*
405  * page size to page size code with the restriction that it be a supported
406  * user page size.  If it's not a supported user page size, -1 will be returned.
407  */
408 int
409 page_szc_user_filtered(size_t pagesize)
410 {
411 	int szc = page_szc(pagesize);
412 	if ((szc != -1) && (SZC_2_USERSZC(szc) != -1)) {
413 		return (szc);
414 	}
415 	return (-1);
416 }
417 
418 /*
419  * Return how many page sizes are available for the user to use.  This is
420  * what the hardware supports and not based upon how the OS implements the
421  * support of different page sizes.
422  */
423 uint_t
424 page_num_user_pagesizes(void)
425 {
426 	return (mmu_exported_page_sizes);
427 }
428 
429 uint_t
430 page_num_pagesizes(void)
431 {
432 	return (mmu_page_sizes);
433 }
434 
435 /*
436  * returns the count of the number of base pagesize pages associated with szc
437  */
438 pgcnt_t
439 page_get_pagecnt(uint_t szc)
440 {
441 	if (szc >= mmu_page_sizes)
442 		panic("page_get_pagecnt: out of range %d", szc);
443 	return (hw_page_array[szc].hp_pgcnt);
444 }
445 
446 size_t
447 page_get_pagesize(uint_t szc)
448 {
449 	if (szc >= mmu_page_sizes)
450 		panic("page_get_pagesize: out of range %d", szc);
451 	return (hw_page_array[szc].hp_size);
452 }
453 
454 /*
455  * Return the size of a page based upon the index passed in.  An index of
456  * zero refers to the smallest page size in the system, and as index increases
457  * it refers to the next larger supported page size in the system.
458  * Note that szc and userszc may not be the same due to unsupported szc's on
459  * some systems.
460  */
461 size_t
462 page_get_user_pagesize(uint_t userszc)
463 {
464 	uint_t szc = USERSZC_2_SZC(userszc);
465 
466 	if (szc >= mmu_page_sizes)
467 		panic("page_get_user_pagesize: out of range %d", szc);
468 	return (hw_page_array[szc].hp_size);
469 }
470 
471 uint_t
472 page_get_shift(uint_t szc)
473 {
474 	if (szc >= mmu_page_sizes)
475 		panic("page_get_shift: out of range %d", szc);
476 	return (PAGE_GET_SHIFT(szc));
477 }
478 
479 uint_t
480 page_get_pagecolors(uint_t szc)
481 {
482 	if (szc >= mmu_page_sizes)
483 		panic("page_get_pagecolors: out of range %d", szc);
484 	return (PAGE_GET_PAGECOLORS(szc));
485 }
486 
487 /*
488  * this assigns the desired equivalent color after a split
489  */
490 uint_t
491 page_correct_color(uchar_t szc, uchar_t nszc, uint_t color,
492     uint_t ncolor, uint_t ceq_mask)
493 {
494 	ASSERT(nszc > szc);
495 	ASSERT(szc < mmu_page_sizes);
496 	ASSERT(color < PAGE_GET_PAGECOLORS(szc));
497 	ASSERT(ncolor < PAGE_GET_PAGECOLORS(nszc));
498 
499 	color &= ceq_mask;
500 	ncolor = PAGE_CONVERT_COLOR(ncolor, szc, nszc);
501 	return (color | (ncolor & ~ceq_mask));
502 }
503 
504 /*
505  * The interleaved_mnodes flag is set when mnodes overlap in
506  * the physbase..physmax range, but have disjoint slices.
507  * In this case hpm_counters is shared by all mnodes.
508  * This flag is set dynamically by the platform.
509  */
510 int interleaved_mnodes = 0;
511 
512 /*
513  * Called by startup().
514  * Size up the per page size free list counters based on physmax
515  * of each node and max_mem_nodes.
516  *
517  * If interleaved_mnodes is set we need to find the first mnode that
518  * exists. hpm_counters for the first mnode will then be shared by
519  * all other mnodes. If interleaved_mnodes is not set, just set
520  * first=mnode each time. That means there will be no sharing.
521  */
522 size_t
523 page_ctrs_sz(void)
524 {
525 	int	r;		/* region size */
526 	int	mnode;
527 	int	firstmn;	/* first mnode that exists */
528 	int	nranges;
529 	pfn_t	physbase;
530 	pfn_t	physmax;
531 	uint_t	ctrs_sz = 0;
532 	int 	i;
533 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
534 
535 	/*
536 	 * We need to determine how many page colors there are for each
537 	 * page size in order to allocate memory for any color specific
538 	 * arrays.
539 	 */
540 	for (i = 0; i < mmu_page_sizes; i++) {
541 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
542 	}
543 
544 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
545 
546 		pgcnt_t r_pgcnt;
547 		pfn_t   r_base;
548 		pgcnt_t r_align;
549 
550 		if (mem_node_config[mnode].exists == 0)
551 			continue;
552 
553 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
554 		nranges = MNODE_RANGE_CNT(mnode);
555 		mnode_nranges[mnode] = nranges;
556 		mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
557 
558 		/*
559 		 * determine size needed for page counter arrays with
560 		 * base aligned to large page size.
561 		 */
562 		for (r = 1; r < mmu_page_sizes; r++) {
563 			/* add in space for hpm_color_current */
564 			ctrs_sz += sizeof (size_t) *
565 			    colors_per_szc[r] * nranges;
566 
567 			if (firstmn != mnode)
568 				continue;
569 
570 			/* add in space for hpm_counters */
571 			r_align = page_get_pagecnt(r);
572 			r_base = physbase;
573 			r_base &= ~(r_align - 1);
574 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
575 
576 			/*
577 			 * Round up to always allocate on pointer sized
578 			 * boundaries.
579 			 */
580 			ctrs_sz += P2ROUNDUP((r_pgcnt * sizeof (hpmctr_t)),
581 			    sizeof (hpmctr_t *));
582 		}
583 	}
584 
585 	for (r = 1; r < mmu_page_sizes; r++) {
586 		ctrs_sz += (max_mem_nodes * sizeof (hw_page_map_t));
587 	}
588 
589 	/* add in space for page_ctrs_cands and pcc_color_free */
590 	ctrs_sz += sizeof (pcc_info_t *) * max_mem_nodes *
591 	    mmu_page_sizes * NPC_MUTEX;
592 
593 	for (mnode = 0; mnode < max_mem_nodes; mnode++) {
594 
595 		if (mem_node_config[mnode].exists == 0)
596 			continue;
597 
598 		nranges = mnode_nranges[mnode];
599 		ctrs_sz += sizeof (pcc_info_t) * nranges *
600 		    mmu_page_sizes * NPC_MUTEX;
601 		for (r = 1; r < mmu_page_sizes; r++) {
602 			ctrs_sz += sizeof (pgcnt_t) * nranges *
603 			    colors_per_szc[r] * NPC_MUTEX;
604 		}
605 	}
606 
607 	/* ctr_mutex */
608 	ctrs_sz += (max_mem_nodes * NPC_MUTEX * sizeof (kmutex_t));
609 
610 	/* size for page list counts */
611 	PLCNT_SZ(ctrs_sz);
612 
613 	/*
614 	 * add some slop for roundups. page_ctrs_alloc will roundup the start
615 	 * address of the counters to ecache_alignsize boundary for every
616 	 * memory node.
617 	 */
618 	return (ctrs_sz + max_mem_nodes * L2CACHE_ALIGN);
619 }
620 
621 caddr_t
622 page_ctrs_alloc(caddr_t alloc_base)
623 {
624 	int	mnode;
625 	int	mrange, nranges;
626 	int	r;		/* region size */
627 	int	i;
628 	int	firstmn;	/* first mnode that exists */
629 	pfn_t	physbase;
630 	pfn_t	physmax;
631 	pgcnt_t colors_per_szc[MMU_PAGE_SIZES];
632 
633 	/*
634 	 * We need to determine how many page colors there are for each
635 	 * page size in order to allocate memory for any color specific
636 	 * arrays.
637 	 */
638 	for (i = 0; i < mmu_page_sizes; i++) {
639 		colors_per_szc[i] = PAGE_GET_PAGECOLORS(i);
640 	}
641 
642 	for (r = 1; r < mmu_page_sizes; r++) {
643 		page_counters[r] = (hw_page_map_t *)alloc_base;
644 		alloc_base += (max_mem_nodes * sizeof (hw_page_map_t));
645 	}
646 
647 	/* page_ctrs_cands and pcc_color_free array */
648 	for (i = 0; i < NPC_MUTEX; i++) {
649 		for (r = 1; r < mmu_page_sizes; r++) {
650 
651 			page_ctrs_cands[i][r] = (pcc_info_t **)alloc_base;
652 			alloc_base += sizeof (pcc_info_t *) * max_mem_nodes;
653 
654 			for (mnode = 0; mnode < max_mem_nodes; mnode++) {
655 				pcc_info_t *pi;
656 
657 				if (mem_node_config[mnode].exists == 0)
658 					continue;
659 
660 				nranges = mnode_nranges[mnode];
661 
662 				pi = (pcc_info_t *)alloc_base;
663 				alloc_base += sizeof (pcc_info_t) * nranges;
664 				page_ctrs_cands[i][r][mnode] = pi;
665 
666 				for (mrange = 0; mrange < nranges; mrange++) {
667 					pi->pcc_color_free =
668 					    (pgcnt_t *)alloc_base;
669 					alloc_base += sizeof (pgcnt_t) *
670 					    colors_per_szc[r];
671 					pi++;
672 				}
673 			}
674 		}
675 	}
676 
677 	/* ctr_mutex */
678 	for (i = 0; i < NPC_MUTEX; i++) {
679 		ctr_mutex[i] = (kmutex_t *)alloc_base;
680 		alloc_base += (max_mem_nodes * sizeof (kmutex_t));
681 	}
682 
683 	/* initialize page list counts */
684 	PLCNT_INIT(alloc_base);
685 
686 	for (firstmn = -1, mnode = 0; mnode < max_mem_nodes; mnode++) {
687 
688 		pgcnt_t r_pgcnt;
689 		pfn_t	r_base;
690 		pgcnt_t r_align;
691 		int	r_shift;
692 		int	nranges = mnode_nranges[mnode];
693 
694 		if (mem_node_config[mnode].exists == 0)
695 			continue;
696 
697 		HPM_COUNTERS_LIMITS(mnode, physbase, physmax, firstmn);
698 
699 		for (r = 1; r < mmu_page_sizes; r++) {
700 			/*
701 			 * the page_counters base has to be aligned to the
702 			 * page count of page size code r otherwise the counts
703 			 * will cross large page boundaries.
704 			 */
705 			r_align = page_get_pagecnt(r);
706 			r_base = physbase;
707 			/* base needs to be aligned - lower to aligned value */
708 			r_base &= ~(r_align - 1);
709 			r_pgcnt = howmany(physmax - r_base + 1, r_align);
710 			r_shift = PAGE_BSZS_SHIFT(r);
711 
712 			PAGE_COUNTERS_SHIFT(mnode, r) = r_shift;
713 			PAGE_COUNTERS_ENTRIES(mnode, r) = r_pgcnt;
714 			PAGE_COUNTERS_BASE(mnode, r) = r_base;
715 			for (mrange = 0; mrange < nranges; mrange++) {
716 				PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
717 				    r, mrange) = (size_t *)alloc_base;
718 				alloc_base += sizeof (size_t) *
719 				    colors_per_szc[r];
720 			}
721 			for (i = 0; i < colors_per_szc[r]; i++) {
722 				uint_t color_mask = colors_per_szc[r] - 1;
723 				pfn_t  pfnum = r_base;
724 				size_t idx;
725 				int mrange;
726 				MEM_NODE_ITERATOR_DECL(it);
727 
728 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
729 				ASSERT(pfnum != (pfn_t)-1);
730 				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i,
731 				    color_mask, color_mask, &it);
732 				idx = PNUM_TO_IDX(mnode, r, pfnum);
733 				idx = (idx >= r_pgcnt) ? 0 : idx;
734 				for (mrange = 0; mrange < nranges; mrange++) {
735 					PAGE_COUNTERS_CURRENT_COLOR(mnode,
736 					    r, i, mrange) = idx;
737 				}
738 			}
739 
740 			/* hpm_counters may be shared by all mnodes */
741 			if (firstmn == mnode) {
742 				PAGE_COUNTERS_COUNTERS(mnode, r) =
743 				    (hpmctr_t *)alloc_base;
744 				alloc_base +=
745 				    P2ROUNDUP((sizeof (hpmctr_t) * r_pgcnt),
746 				    sizeof (hpmctr_t *));
747 			} else {
748 				PAGE_COUNTERS_COUNTERS(mnode, r) =
749 				    PAGE_COUNTERS_COUNTERS(firstmn, r);
750 			}
751 
752 			/*
753 			 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
754 			 * satisfy the identity requirement.
755 			 * We should be able to go from one to the other
756 			 * and get consistent values.
757 			 */
758 			ASSERT(PNUM_TO_IDX(mnode, r,
759 			    (IDX_TO_PNUM(mnode, r, 0))) == 0);
760 			ASSERT(IDX_TO_PNUM(mnode, r,
761 			    (PNUM_TO_IDX(mnode, r, r_base))) == r_base);
762 		}
763 		/*
764 		 * Roundup the start address of the page_counters to
765 		 * cache aligned boundary for every memory node.
766 		 * page_ctrs_sz() has added some slop for these roundups.
767 		 */
768 		alloc_base = (caddr_t)P2ROUNDUP((uintptr_t)alloc_base,
769 		    L2CACHE_ALIGN);
770 	}
771 
772 	/* Initialize other page counter specific data structures. */
773 	for (mnode = 0; mnode < MAX_MEM_NODES; mnode++) {
774 		rw_init(&page_ctrs_rwlock[mnode], NULL, RW_DEFAULT, NULL);
775 	}
776 
777 	return (alloc_base);
778 }
779 
780 /*
781  * Functions to adjust region counters for each size free list.
782  * Caller is responsible to acquire the ctr_mutex lock if necessary and
783  * thus can be called during startup without locks.
784  */
785 /* ARGSUSED */
786 void
787 page_ctr_add_internal(int mnode, int mtype, page_t *pp, int flags)
788 {
789 	ssize_t		r;	/* region size */
790 	ssize_t		idx;
791 	pfn_t		pfnum;
792 	int		lckidx;
793 
794 	ASSERT(mnode == PP_2_MEM_NODE(pp));
795 	ASSERT(mtype == PP_2_MTYPE(pp));
796 
797 	ASSERT(pp->p_szc < mmu_page_sizes);
798 
799 	PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
800 
801 	/* no counter update needed for largest page size */
802 	if (pp->p_szc >= mmu_page_sizes - 1) {
803 		return;
804 	}
805 
806 	r = pp->p_szc + 1;
807 	pfnum = pp->p_pagenum;
808 	lckidx = PP_CTR_LOCK_INDX(pp);
809 
810 	/*
811 	 * Increment the count of free pages for the current
812 	 * region. Continue looping up in region size incrementing
813 	 * count if the preceeding region is full.
814 	 */
815 	while (r < mmu_page_sizes) {
816 		idx = PNUM_TO_IDX(mnode, r, pfnum);
817 
818 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
819 		ASSERT(PAGE_COUNTERS(mnode, r, idx) < FULL_REGION_CNT(r));
820 
821 		if (++PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r)) {
822 			break;
823 		} else {
824 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
825 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
826 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
827 
828 			cand->pcc_pages_free++;
829 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]++;
830 		}
831 		r++;
832 	}
833 }
834 
835 void
836 page_ctr_add(int mnode, int mtype, page_t *pp, int flags)
837 {
838 	int		lckidx = PP_CTR_LOCK_INDX(pp);
839 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
840 
841 	mutex_enter(lock);
842 	page_ctr_add_internal(mnode, mtype, pp, flags);
843 	mutex_exit(lock);
844 }
845 
846 void
847 page_ctr_sub_internal(int mnode, int mtype, page_t *pp, int flags)
848 {
849 	int		lckidx;
850 	ssize_t		r;	/* region size */
851 	ssize_t		idx;
852 	pfn_t		pfnum;
853 
854 	ASSERT(mnode == PP_2_MEM_NODE(pp));
855 	ASSERT(mtype == PP_2_MTYPE(pp));
856 
857 	ASSERT(pp->p_szc < mmu_page_sizes);
858 
859 	PLCNT_DECR(pp, mnode, mtype, pp->p_szc, flags);
860 
861 	/* no counter update needed for largest page size */
862 	if (pp->p_szc >= mmu_page_sizes - 1) {
863 		return;
864 	}
865 
866 	r = pp->p_szc + 1;
867 	pfnum = pp->p_pagenum;
868 	lckidx = PP_CTR_LOCK_INDX(pp);
869 
870 	/*
871 	 * Decrement the count of free pages for the current
872 	 * region. Continue looping up in region size decrementing
873 	 * count if the preceeding region was full.
874 	 */
875 	while (r < mmu_page_sizes) {
876 		idx = PNUM_TO_IDX(mnode, r, pfnum);
877 
878 		ASSERT(idx < PAGE_COUNTERS_ENTRIES(mnode, r));
879 		ASSERT(PAGE_COUNTERS(mnode, r, idx) > 0);
880 
881 		if (--PAGE_COUNTERS(mnode, r, idx) != FULL_REGION_CNT(r) - 1) {
882 			break;
883 		} else {
884 			int root_mtype = PP_2_MTYPE(PP_GROUPLEADER(pp, r));
885 			pcc_info_t *cand = &page_ctrs_cands[lckidx][r][mnode]
886 			    [MTYPE_2_MRANGE(mnode, root_mtype)];
887 
888 			ASSERT(cand->pcc_pages_free != 0);
889 			ASSERT(cand->pcc_color_free[PP_2_BIN_SZC(pp, r)] != 0);
890 
891 			cand->pcc_pages_free--;
892 			cand->pcc_color_free[PP_2_BIN_SZC(pp, r)]--;
893 		}
894 		r++;
895 	}
896 }
897 
898 void
899 page_ctr_sub(int mnode, int mtype, page_t *pp, int flags)
900 {
901 	int		lckidx = PP_CTR_LOCK_INDX(pp);
902 	kmutex_t	*lock = &ctr_mutex[lckidx][mnode];
903 
904 	mutex_enter(lock);
905 	page_ctr_sub_internal(mnode, mtype, pp, flags);
906 	mutex_exit(lock);
907 }
908 
909 /*
910  * Adjust page counters following a memory attach, since typically the
911  * size of the array needs to change, and the PFN to counter index
912  * mapping needs to change.
913  *
914  * It is possible this mnode did not exist at startup. In that case
915  * allocate pcc_info_t and pcc_color_free arrays. Also, allow for nranges
916  * to change (a theoretical possibility on x86), which means pcc_color_free
917  * arrays must be extended.
918  */
919 uint_t
920 page_ctrs_adjust(int mnode)
921 {
922 	pgcnt_t npgs;
923 	int	r;		/* region size */
924 	int	i;
925 	size_t	pcsz, old_csz;
926 	hpmctr_t *new_ctr, *old_ctr;
927 	pfn_t	oldbase, newbase;
928 	pfn_t	physbase, physmax;
929 	size_t	old_npgs;
930 	hpmctr_t *ctr_cache[MMU_PAGE_SIZES];
931 	size_t	size_cache[MMU_PAGE_SIZES];
932 	size_t	*color_cache[MMU_PAGE_SIZES][MAX_MNODE_MRANGES];
933 	size_t	*old_color_array[MAX_MNODE_MRANGES];
934 	pgcnt_t	colors_per_szc[MMU_PAGE_SIZES];
935 	pcc_info_t **cands_cache;
936 	pcc_info_t *old_pi, *pi;
937 	pgcnt_t *pgcntp;
938 	int nr, old_nranges, mrange, nranges = MNODE_RANGE_CNT(mnode);
939 	int cands_cache_nranges;
940 	int old_maxmrange, new_maxmrange;
941 	int rc = 0;
942 
943 	cands_cache = kmem_zalloc(sizeof (pcc_info_t *) * NPC_MUTEX *
944 	    MMU_PAGE_SIZES, KM_NOSLEEP);
945 	if (cands_cache == NULL)
946 		return (ENOMEM);
947 
948 	i = -1;
949 	HPM_COUNTERS_LIMITS(mnode, physbase, physmax, i);
950 
951 	newbase = physbase & ~PC_BASE_ALIGN_MASK;
952 	npgs = roundup(physmax, PC_BASE_ALIGN) - newbase;
953 
954 	/* prepare to free non-null pointers on the way out */
955 	cands_cache_nranges = nranges;
956 	bzero(ctr_cache, sizeof (ctr_cache));
957 	bzero(color_cache, sizeof (color_cache));
958 
959 	/*
960 	 * We need to determine how many page colors there are for each
961 	 * page size in order to allocate memory for any color specific
962 	 * arrays.
963 	 */
964 	for (r = 0; r < mmu_page_sizes; r++) {
965 		colors_per_szc[r] = PAGE_GET_PAGECOLORS(r);
966 	}
967 
968 	/*
969 	 * Preallocate all of the new hpm_counters arrays as we can't
970 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
971 	 * If we can't allocate all of the arrays, undo our work so far
972 	 * and return failure.
973 	 */
974 	for (r = 1; r < mmu_page_sizes; r++) {
975 		pcsz = npgs >> PAGE_BSZS_SHIFT(r);
976 		size_cache[r] = pcsz;
977 		ctr_cache[r] = kmem_zalloc(pcsz *
978 		    sizeof (hpmctr_t), KM_NOSLEEP);
979 		if (ctr_cache[r] == NULL) {
980 			rc = ENOMEM;
981 			goto cleanup;
982 		}
983 	}
984 
985 	/*
986 	 * Preallocate all of the new color current arrays as we can't
987 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
988 	 * If we can't allocate all of the arrays, undo our work so far
989 	 * and return failure.
990 	 */
991 	for (r = 1; r < mmu_page_sizes; r++) {
992 		for (mrange = 0; mrange < nranges; mrange++) {
993 			color_cache[r][mrange] = kmem_zalloc(sizeof (size_t) *
994 			    colors_per_szc[r], KM_NOSLEEP);
995 			if (color_cache[r][mrange] == NULL) {
996 				rc = ENOMEM;
997 				goto cleanup;
998 			}
999 		}
1000 	}
1001 
1002 	/*
1003 	 * Preallocate all of the new pcc_info_t arrays as we can't
1004 	 * hold the page_ctrs_rwlock as a writer and allocate memory.
1005 	 * If we can't allocate all of the arrays, undo our work so far
1006 	 * and return failure.
1007 	 */
1008 	for (r = 1; r < mmu_page_sizes; r++) {
1009 		for (i = 0; i < NPC_MUTEX; i++) {
1010 			pi = kmem_zalloc(nranges * sizeof (pcc_info_t),
1011 			    KM_NOSLEEP);
1012 			if (pi == NULL) {
1013 				rc = ENOMEM;
1014 				goto cleanup;
1015 			}
1016 			cands_cache[i * MMU_PAGE_SIZES + r] = pi;
1017 
1018 			for (mrange = 0; mrange < nranges; mrange++, pi++) {
1019 				pgcntp = kmem_zalloc(colors_per_szc[r] *
1020 				    sizeof (pgcnt_t), KM_NOSLEEP);
1021 				if (pgcntp == NULL) {
1022 					rc = ENOMEM;
1023 					goto cleanup;
1024 				}
1025 				pi->pcc_color_free = pgcntp;
1026 			}
1027 		}
1028 	}
1029 
1030 	/*
1031 	 * Grab the write lock to prevent others from walking these arrays
1032 	 * while we are modifying them.
1033 	 */
1034 	PAGE_CTRS_WRITE_LOCK(mnode);
1035 
1036 	old_nranges = mnode_nranges[mnode];
1037 	cands_cache_nranges = old_nranges;
1038 	mnode_nranges[mnode] = nranges;
1039 	old_maxmrange = mnode_maxmrange[mnode];
1040 	mnode_maxmrange[mnode] = MNODE_MAX_MRANGE(mnode);
1041 	new_maxmrange = mnode_maxmrange[mnode];
1042 
1043 	for (r = 1; r < mmu_page_sizes; r++) {
1044 		PAGE_COUNTERS_SHIFT(mnode, r) = PAGE_BSZS_SHIFT(r);
1045 		old_ctr = PAGE_COUNTERS_COUNTERS(mnode, r);
1046 		old_csz = PAGE_COUNTERS_ENTRIES(mnode, r);
1047 		oldbase = PAGE_COUNTERS_BASE(mnode, r);
1048 		old_npgs = old_csz << PAGE_COUNTERS_SHIFT(mnode, r);
1049 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1050 			old_color_array[mrange] =
1051 			    PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode,
1052 			    r, mrange);
1053 		}
1054 
1055 		pcsz = npgs >> PAGE_COUNTERS_SHIFT(mnode, r);
1056 		new_ctr = ctr_cache[r];
1057 		ctr_cache[r] = NULL;
1058 		if (old_ctr != NULL &&
1059 		    (oldbase + old_npgs > newbase) &&
1060 		    (newbase + npgs > oldbase)) {
1061 			/*
1062 			 * Map the intersection of the old and new
1063 			 * counters into the new array.
1064 			 */
1065 			size_t offset;
1066 			if (newbase > oldbase) {
1067 				offset = (newbase - oldbase) >>
1068 				    PAGE_COUNTERS_SHIFT(mnode, r);
1069 				bcopy(old_ctr + offset, new_ctr,
1070 				    MIN(pcsz, (old_csz - offset)) *
1071 				    sizeof (hpmctr_t));
1072 			} else {
1073 				offset = (oldbase - newbase) >>
1074 				    PAGE_COUNTERS_SHIFT(mnode, r);
1075 				bcopy(old_ctr, new_ctr + offset,
1076 				    MIN(pcsz - offset, old_csz) *
1077 				    sizeof (hpmctr_t));
1078 			}
1079 		}
1080 
1081 		PAGE_COUNTERS_COUNTERS(mnode, r) = new_ctr;
1082 		PAGE_COUNTERS_ENTRIES(mnode, r) = pcsz;
1083 		PAGE_COUNTERS_BASE(mnode, r) = newbase;
1084 
1085 		/* update shared hpm_counters in other mnodes */
1086 		if (interleaved_mnodes) {
1087 			for (i = 0; i < max_mem_nodes; i++) {
1088 				if (i == mnode)
1089 					continue;
1090 				if (mem_node_config[i].exists == 0)
1091 					continue;
1092 				ASSERT(PAGE_COUNTERS_COUNTERS(i, r) == old_ctr);
1093 				PAGE_COUNTERS_COUNTERS(i, r) = new_ctr;
1094 				PAGE_COUNTERS_ENTRIES(i, r) = pcsz;
1095 				PAGE_COUNTERS_BASE(i, r) = newbase;
1096 			}
1097 		}
1098 
1099 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1100 			PAGE_COUNTERS_CURRENT_COLOR_ARRAY(mnode, r, mrange) =
1101 			    color_cache[r][mrange];
1102 			color_cache[r][mrange] = NULL;
1103 		}
1104 		/*
1105 		 * for now, just reset on these events as it's probably
1106 		 * not worthwhile to try and optimize this.
1107 		 */
1108 		for (i = 0; i < colors_per_szc[r]; i++) {
1109 			uint_t color_mask = colors_per_szc[r] - 1;
1110 			int mlo = interleaved_mnodes ? 0 : mnode;
1111 			int mhi = interleaved_mnodes ? max_mem_nodes :
1112 			    (mnode + 1);
1113 			int m;
1114 			pfn_t  pfnum = newbase;
1115 			size_t idx;
1116 			MEM_NODE_ITERATOR_DECL(it);
1117 
1118 			for (m = mlo; m < mhi; m++) {
1119 				if (mem_node_config[m].exists == 0)
1120 					continue;
1121 				MEM_NODE_ITERATOR_INIT(pfnum, m, &it);
1122 				ASSERT(pfnum != (pfn_t)-1);
1123 				PAGE_NEXT_PFN_FOR_COLOR(pfnum, r, i, color_mask,
1124 				    color_mask, &it);
1125 				idx = PNUM_TO_IDX(m, r, pfnum);
1126 				idx = (idx < pcsz) ? idx : 0;
1127 				for (mrange = 0; mrange < nranges; mrange++) {
1128 					PAGE_COUNTERS_CURRENT_COLOR(m,
1129 					    r, i, mrange) = idx;
1130 				}
1131 			}
1132 		}
1133 
1134 		/* cache info for freeing out of the critical path */
1135 		if ((caddr_t)old_ctr >= kernelheap &&
1136 		    (caddr_t)old_ctr < ekernelheap) {
1137 			ctr_cache[r] = old_ctr;
1138 			size_cache[r] = old_csz;
1139 		}
1140 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1141 			size_t *tmp = old_color_array[mrange];
1142 			if ((caddr_t)tmp >= kernelheap &&
1143 			    (caddr_t)tmp < ekernelheap) {
1144 				color_cache[r][mrange] = tmp;
1145 			}
1146 		}
1147 		/*
1148 		 * Verify that PNUM_TO_IDX and IDX_TO_PNUM
1149 		 * satisfy the identity requirement.
1150 		 * We should be able to go from one to the other
1151 		 * and get consistent values.
1152 		 */
1153 		ASSERT(PNUM_TO_IDX(mnode, r,
1154 		    (IDX_TO_PNUM(mnode, r, 0))) == 0);
1155 		ASSERT(IDX_TO_PNUM(mnode, r,
1156 		    (PNUM_TO_IDX(mnode, r, newbase))) == newbase);
1157 
1158 		/* pcc_info_t and pcc_color_free */
1159 		for (i = 0; i < NPC_MUTEX; i++) {
1160 			pcc_info_t *epi;
1161 			pcc_info_t *eold_pi;
1162 
1163 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1164 			old_pi = page_ctrs_cands[i][r][mnode];
1165 			page_ctrs_cands[i][r][mnode] = pi;
1166 			cands_cache[i * MMU_PAGE_SIZES + r] = old_pi;
1167 
1168 			/* preserve old pcc_color_free values, if any */
1169 			if (old_pi == NULL)
1170 				continue;
1171 
1172 			/*
1173 			 * when/if x86 does DR, must account for
1174 			 * possible change in range index when
1175 			 * preserving pcc_info
1176 			 */
1177 			epi = &pi[nranges];
1178 			eold_pi = &old_pi[old_nranges];
1179 			if (new_maxmrange > old_maxmrange) {
1180 				pi += new_maxmrange - old_maxmrange;
1181 			} else if (new_maxmrange < old_maxmrange) {
1182 				old_pi += old_maxmrange - new_maxmrange;
1183 			}
1184 			for (; pi < epi && old_pi < eold_pi; pi++, old_pi++) {
1185 				pcc_info_t tmp = *pi;
1186 				*pi = *old_pi;
1187 				*old_pi = tmp;
1188 			}
1189 		}
1190 	}
1191 	PAGE_CTRS_WRITE_UNLOCK(mnode);
1192 
1193 	/*
1194 	 * Now that we have dropped the write lock, it is safe to free all
1195 	 * of the memory we have cached above.
1196 	 * We come thru here to free memory when pre-alloc fails, and also to
1197 	 * free old pointers which were recorded while locked.
1198 	 */
1199 cleanup:
1200 	for (r = 1; r < mmu_page_sizes; r++) {
1201 		if (ctr_cache[r] != NULL) {
1202 			kmem_free(ctr_cache[r],
1203 			    size_cache[r] * sizeof (hpmctr_t));
1204 		}
1205 		for (mrange = 0; mrange < MAX_MNODE_MRANGES; mrange++) {
1206 			if (color_cache[r][mrange] != NULL) {
1207 				kmem_free(color_cache[r][mrange],
1208 				    colors_per_szc[r] * sizeof (size_t));
1209 			}
1210 		}
1211 		for (i = 0; i < NPC_MUTEX; i++) {
1212 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1213 			if (pi == NULL)
1214 				continue;
1215 			nr = cands_cache_nranges;
1216 			for (mrange = 0; mrange < nr; mrange++, pi++) {
1217 				pgcntp = pi->pcc_color_free;
1218 				if (pgcntp == NULL)
1219 					continue;
1220 				if ((caddr_t)pgcntp >= kernelheap &&
1221 				    (caddr_t)pgcntp < ekernelheap) {
1222 					kmem_free(pgcntp,
1223 					    colors_per_szc[r] *
1224 					    sizeof (pgcnt_t));
1225 				}
1226 			}
1227 			pi = cands_cache[i * MMU_PAGE_SIZES + r];
1228 			if ((caddr_t)pi >= kernelheap &&
1229 			    (caddr_t)pi < ekernelheap) {
1230 				kmem_free(pi, nr * sizeof (pcc_info_t));
1231 			}
1232 		}
1233 	}
1234 
1235 	kmem_free(cands_cache,
1236 	    sizeof (pcc_info_t *) * NPC_MUTEX * MMU_PAGE_SIZES);
1237 	return (rc);
1238 }
1239 
1240 
1241 #ifdef DEBUG
1242 
1243 /*
1244  * confirm pp is a large page corresponding to szc
1245  */
1246 void
1247 chk_lpg(page_t *pp, uchar_t szc)
1248 {
1249 	spgcnt_t npgs = page_get_pagecnt(pp->p_szc);
1250 	uint_t noreloc;
1251 
1252 	if (npgs == 1) {
1253 		ASSERT(pp->p_szc == 0);
1254 		ASSERT(pp->p_next == pp);
1255 		ASSERT(pp->p_prev == pp);
1256 		return;
1257 	}
1258 
1259 	ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1260 	ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1261 
1262 	ASSERT(IS_P2ALIGNED(pp->p_pagenum, npgs));
1263 	ASSERT(pp->p_pagenum == (pp->p_next->p_pagenum - 1));
1264 	ASSERT(pp->p_prev->p_pagenum == (pp->p_pagenum + (npgs - 1)));
1265 	ASSERT(pp->p_prev == (pp + (npgs - 1)));
1266 
1267 	/*
1268 	 * Check list of pages.
1269 	 */
1270 	noreloc = PP_ISNORELOC(pp);
1271 	while (npgs--) {
1272 		if (npgs != 0) {
1273 			ASSERT(pp->p_pagenum == pp->p_next->p_pagenum - 1);
1274 			ASSERT(pp->p_next == (pp + 1));
1275 		}
1276 		ASSERT(pp->p_szc == szc);
1277 		ASSERT(PP_ISFREE(pp));
1278 		ASSERT(PP_ISAGED(pp));
1279 		ASSERT(pp->p_vpnext == pp || pp->p_vpnext == NULL);
1280 		ASSERT(pp->p_vpprev == pp || pp->p_vpprev == NULL);
1281 		ASSERT(pp->p_vnode  == NULL);
1282 		ASSERT(PP_ISNORELOC(pp) == noreloc);
1283 
1284 		pp = pp->p_next;
1285 	}
1286 }
1287 #endif /* DEBUG */
1288 
1289 void
1290 page_freelist_lock(int mnode)
1291 {
1292 	int i;
1293 	for (i = 0; i < NPC_MUTEX; i++) {
1294 		mutex_enter(FPC_MUTEX(mnode, i));
1295 		mutex_enter(CPC_MUTEX(mnode, i));
1296 	}
1297 }
1298 
1299 void
1300 page_freelist_unlock(int mnode)
1301 {
1302 	int i;
1303 	for (i = 0; i < NPC_MUTEX; i++) {
1304 		mutex_exit(FPC_MUTEX(mnode, i));
1305 		mutex_exit(CPC_MUTEX(mnode, i));
1306 	}
1307 }
1308 
1309 /*
1310  * add pp to the specified page list. Defaults to head of the page list
1311  * unless PG_LIST_TAIL is specified.
1312  */
1313 void
1314 page_list_add(page_t *pp, int flags)
1315 {
1316 	page_t		**ppp;
1317 	kmutex_t	*pcm;
1318 	uint_t		bin, mtype;
1319 	int		mnode;
1320 
1321 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1322 	ASSERT(PP_ISFREE(pp));
1323 	ASSERT(!hat_page_is_mapped(pp));
1324 	ASSERT(hat_page_getshare(pp) == 0);
1325 
1326 	/*
1327 	 * Large pages should be freed via page_list_add_pages().
1328 	 */
1329 	ASSERT(pp->p_szc == 0);
1330 
1331 	/*
1332 	 * Don't need to lock the freelist first here
1333 	 * because the page isn't on the freelist yet.
1334 	 * This means p_szc can't change on us.
1335 	 */
1336 
1337 	bin = PP_2_BIN(pp);
1338 	mnode = PP_2_MEM_NODE(pp);
1339 	mtype = PP_2_MTYPE(pp);
1340 
1341 	if (flags & PG_LIST_ISINIT) {
1342 		/*
1343 		 * PG_LIST_ISINIT is set during system startup (ie. single
1344 		 * threaded), add a page to the free list and add to the
1345 		 * the free region counters w/o any locking
1346 		 */
1347 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1348 
1349 		/* inline version of page_add() */
1350 		if (*ppp != NULL) {
1351 			pp->p_next = *ppp;
1352 			pp->p_prev = (*ppp)->p_prev;
1353 			(*ppp)->p_prev = pp;
1354 			pp->p_prev->p_next = pp;
1355 		} else
1356 			*ppp = pp;
1357 
1358 		page_ctr_add_internal(mnode, mtype, pp, flags);
1359 		VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1360 	} else {
1361 		pcm = PC_BIN_MUTEX(mnode, bin, flags);
1362 
1363 		if (flags & PG_FREE_LIST) {
1364 			VM_STAT_ADD(vmm_vmstats.pladd_free[0]);
1365 			ASSERT(PP_ISAGED(pp));
1366 			ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1367 
1368 		} else {
1369 			VM_STAT_ADD(vmm_vmstats.pladd_cache);
1370 			ASSERT(pp->p_vnode);
1371 			ASSERT((pp->p_offset & PAGEOFFSET) == 0);
1372 			ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1373 		}
1374 		mutex_enter(pcm);
1375 		page_add(ppp, pp);
1376 
1377 		if (flags & PG_LIST_TAIL)
1378 			*ppp = (*ppp)->p_next;
1379 		/*
1380 		 * Add counters before releasing pcm mutex to avoid a race with
1381 		 * page_freelist_coalesce and page_freelist_split.
1382 		 */
1383 		page_ctr_add(mnode, mtype, pp, flags);
1384 		mutex_exit(pcm);
1385 	}
1386 
1387 
1388 #if defined(__sparc)
1389 	if (PP_ISNORELOC(pp)) {
1390 		kcage_freemem_add(1);
1391 	}
1392 #endif
1393 	/*
1394 	 * It is up to the caller to unlock the page!
1395 	 */
1396 	ASSERT(PAGE_EXCL(pp) || (flags & PG_LIST_ISINIT));
1397 }
1398 
1399 
1400 #ifdef __sparc
1401 /*
1402  * This routine is only used by kcage_init during system startup.
1403  * It performs the function of page_list_sub/PP_SETNORELOC/page_list_add
1404  * without the overhead of taking locks and updating counters.
1405  */
1406 void
1407 page_list_noreloc_startup(page_t *pp)
1408 {
1409 	page_t		**ppp;
1410 	uint_t		bin;
1411 	int		mnode;
1412 	int		mtype;
1413 	int		flags = 0;
1414 
1415 	/*
1416 	 * If this is a large page on the freelist then
1417 	 * break it up into smaller pages.
1418 	 */
1419 	if (pp->p_szc != 0)
1420 		page_boot_demote(pp);
1421 
1422 	/*
1423 	 * Get list page is currently on.
1424 	 */
1425 	bin = PP_2_BIN(pp);
1426 	mnode = PP_2_MEM_NODE(pp);
1427 	mtype = PP_2_MTYPE(pp);
1428 	ASSERT(mtype == MTYPE_RELOC);
1429 	ASSERT(pp->p_szc == 0);
1430 
1431 	if (PP_ISAGED(pp)) {
1432 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1433 		flags |= PG_FREE_LIST;
1434 	} else {
1435 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1436 		flags |= PG_CACHE_LIST;
1437 	}
1438 
1439 	ASSERT(*ppp != NULL);
1440 
1441 	/*
1442 	 * Delete page from current list.
1443 	 */
1444 	if (*ppp == pp)
1445 		*ppp = pp->p_next;		/* go to next page */
1446 	if (*ppp == pp) {
1447 		*ppp = NULL;			/* page list is gone */
1448 	} else {
1449 		pp->p_prev->p_next = pp->p_next;
1450 		pp->p_next->p_prev = pp->p_prev;
1451 	}
1452 
1453 	/*
1454 	 * Decrement page counters
1455 	 */
1456 	page_ctr_sub_internal(mnode, mtype, pp, flags);
1457 
1458 	/*
1459 	 * Set no reloc for cage initted pages.
1460 	 */
1461 	PP_SETNORELOC(pp);
1462 
1463 	mtype = PP_2_MTYPE(pp);
1464 	ASSERT(mtype == MTYPE_NORELOC);
1465 
1466 	/*
1467 	 * Get new list for page.
1468 	 */
1469 	if (PP_ISAGED(pp)) {
1470 		ppp = &PAGE_FREELISTS(mnode, 0, bin, mtype);
1471 	} else {
1472 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1473 	}
1474 
1475 	/*
1476 	 * Insert page on new list.
1477 	 */
1478 	if (*ppp == NULL) {
1479 		*ppp = pp;
1480 		pp->p_next = pp->p_prev = pp;
1481 	} else {
1482 		pp->p_next = *ppp;
1483 		pp->p_prev = (*ppp)->p_prev;
1484 		(*ppp)->p_prev = pp;
1485 		pp->p_prev->p_next = pp;
1486 	}
1487 
1488 	/*
1489 	 * Increment page counters
1490 	 */
1491 	page_ctr_add_internal(mnode, mtype, pp, flags);
1492 
1493 	/*
1494 	 * Update cage freemem counter
1495 	 */
1496 	atomic_add_long(&kcage_freemem, 1);
1497 }
1498 #else	/* __sparc */
1499 
1500 /* ARGSUSED */
1501 void
1502 page_list_noreloc_startup(page_t *pp)
1503 {
1504 	panic("page_list_noreloc_startup: should be here only for sparc");
1505 }
1506 #endif
1507 
1508 void
1509 page_list_add_pages(page_t *pp, int flags)
1510 {
1511 	kmutex_t *pcm;
1512 	pgcnt_t	pgcnt;
1513 	uint_t	bin, mtype, i;
1514 	int	mnode;
1515 
1516 	/* default to freelist/head */
1517 	ASSERT((flags & (PG_CACHE_LIST | PG_LIST_TAIL)) == 0);
1518 
1519 	CHK_LPG(pp, pp->p_szc);
1520 	VM_STAT_ADD(vmm_vmstats.pladd_free[pp->p_szc]);
1521 
1522 	bin = PP_2_BIN(pp);
1523 	mnode = PP_2_MEM_NODE(pp);
1524 	mtype = PP_2_MTYPE(pp);
1525 
1526 	if (flags & PG_LIST_ISINIT) {
1527 		ASSERT(pp->p_szc == mmu_page_sizes - 1);
1528 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1529 		ASSERT(!PP_ISNORELOC(pp));
1530 		PLCNT_INCR(pp, mnode, mtype, pp->p_szc, flags);
1531 	} else {
1532 
1533 		ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
1534 
1535 		pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1536 
1537 		mutex_enter(pcm);
1538 		page_vpadd(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1539 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
1540 		mutex_exit(pcm);
1541 
1542 		pgcnt = page_get_pagecnt(pp->p_szc);
1543 #if defined(__sparc)
1544 		if (PP_ISNORELOC(pp))
1545 			kcage_freemem_add(pgcnt);
1546 #endif
1547 		for (i = 0; i < pgcnt; i++, pp++)
1548 			page_unlock_nocapture(pp);
1549 	}
1550 }
1551 
1552 /*
1553  * During boot, need to demote a large page to base
1554  * pagesize pages for seg_kmem for use in boot_alloc()
1555  */
1556 void
1557 page_boot_demote(page_t *pp)
1558 {
1559 	ASSERT(pp->p_szc != 0);
1560 	ASSERT(PP_ISFREE(pp));
1561 	ASSERT(PP_ISAGED(pp));
1562 
1563 	(void) page_demote(PP_2_MEM_NODE(pp),
1564 	    PFN_BASE(pp->p_pagenum, pp->p_szc), pp->p_szc, 0, PC_NO_COLOR,
1565 	    PC_FREE);
1566 
1567 	ASSERT(PP_ISFREE(pp));
1568 	ASSERT(PP_ISAGED(pp));
1569 	ASSERT(pp->p_szc == 0);
1570 }
1571 
1572 /*
1573  * Take a particular page off of whatever freelist the page
1574  * is claimed to be on.
1575  *
1576  * NOTE: Only used for PAGESIZE pages.
1577  */
1578 void
1579 page_list_sub(page_t *pp, int flags)
1580 {
1581 	int		bin;
1582 	uint_t		mtype;
1583 	int		mnode;
1584 	kmutex_t	*pcm;
1585 	page_t		**ppp;
1586 
1587 	ASSERT(PAGE_EXCL(pp));
1588 	ASSERT(PP_ISFREE(pp));
1589 
1590 	/*
1591 	 * The p_szc field can only be changed by page_promote()
1592 	 * and page_demote(). Only free pages can be promoted and
1593 	 * demoted and the free list MUST be locked during these
1594 	 * operations. So to prevent a race in page_list_sub()
1595 	 * between computing which bin of the freelist lock to
1596 	 * grab and actually grabing the lock we check again that
1597 	 * the bin we locked is still the correct one. Notice that
1598 	 * the p_szc field could have actually changed on us but
1599 	 * if the bin happens to still be the same we are safe.
1600 	 */
1601 try_again:
1602 	bin = PP_2_BIN(pp);
1603 	mnode = PP_2_MEM_NODE(pp);
1604 	pcm = PC_BIN_MUTEX(mnode, bin, flags);
1605 	mutex_enter(pcm);
1606 	if (PP_2_BIN(pp) != bin) {
1607 		mutex_exit(pcm);
1608 		goto try_again;
1609 	}
1610 	mtype = PP_2_MTYPE(pp);
1611 
1612 	if (flags & PG_FREE_LIST) {
1613 		VM_STAT_ADD(vmm_vmstats.plsub_free[0]);
1614 		ASSERT(PP_ISAGED(pp));
1615 		ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1616 	} else {
1617 		VM_STAT_ADD(vmm_vmstats.plsub_cache);
1618 		ASSERT(!PP_ISAGED(pp));
1619 		ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
1620 	}
1621 
1622 	/*
1623 	 * Common PAGESIZE case.
1624 	 *
1625 	 * Note that we locked the freelist. This prevents
1626 	 * any page promotion/demotion operations. Therefore
1627 	 * the p_szc will not change until we drop pcm mutex.
1628 	 */
1629 	if (pp->p_szc == 0) {
1630 		page_sub(ppp, pp);
1631 		/*
1632 		 * Subtract counters before releasing pcm mutex
1633 		 * to avoid race with page_freelist_coalesce.
1634 		 */
1635 		page_ctr_sub(mnode, mtype, pp, flags);
1636 		mutex_exit(pcm);
1637 
1638 #if defined(__sparc)
1639 		if (PP_ISNORELOC(pp)) {
1640 			kcage_freemem_sub(1);
1641 		}
1642 #endif
1643 		return;
1644 	}
1645 
1646 	/*
1647 	 * Large pages on the cache list are not supported.
1648 	 */
1649 	if (flags & PG_CACHE_LIST)
1650 		panic("page_list_sub: large page on cachelist");
1651 
1652 	/*
1653 	 * Slow but rare.
1654 	 *
1655 	 * Somebody wants this particular page which is part
1656 	 * of a large page. In this case we just demote the page
1657 	 * if it's on the freelist.
1658 	 *
1659 	 * We have to drop pcm before locking the entire freelist.
1660 	 * Once we have re-locked the freelist check to make sure
1661 	 * the page hasn't already been demoted or completely
1662 	 * freed.
1663 	 */
1664 	mutex_exit(pcm);
1665 	page_freelist_lock(mnode);
1666 	if (pp->p_szc != 0) {
1667 		/*
1668 		 * Large page is on freelist.
1669 		 */
1670 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum, pp->p_szc),
1671 		    pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
1672 	}
1673 	ASSERT(PP_ISFREE(pp));
1674 	ASSERT(PP_ISAGED(pp));
1675 	ASSERT(pp->p_szc == 0);
1676 
1677 	/*
1678 	 * Subtract counters before releasing pcm mutex
1679 	 * to avoid race with page_freelist_coalesce.
1680 	 */
1681 	bin = PP_2_BIN(pp);
1682 	mtype = PP_2_MTYPE(pp);
1683 	ppp = &PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype);
1684 
1685 	page_sub(ppp, pp);
1686 	page_ctr_sub(mnode, mtype, pp, flags);
1687 	page_freelist_unlock(mnode);
1688 
1689 #if defined(__sparc)
1690 	if (PP_ISNORELOC(pp)) {
1691 		kcage_freemem_sub(1);
1692 	}
1693 #endif
1694 }
1695 
1696 void
1697 page_list_sub_pages(page_t *pp, uint_t szc)
1698 {
1699 	kmutex_t *pcm;
1700 	uint_t	bin, mtype;
1701 	int	mnode;
1702 
1703 	ASSERT(PAGE_EXCL(pp));
1704 	ASSERT(PP_ISFREE(pp));
1705 	ASSERT(PP_ISAGED(pp));
1706 
1707 	/*
1708 	 * See comment in page_list_sub().
1709 	 */
1710 try_again:
1711 	bin = PP_2_BIN(pp);
1712 	mnode = PP_2_MEM_NODE(pp);
1713 	pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
1714 	mutex_enter(pcm);
1715 	if (PP_2_BIN(pp) != bin) {
1716 		mutex_exit(pcm);
1717 		goto	try_again;
1718 	}
1719 
1720 	/*
1721 	 * If we're called with a page larger than szc or it got
1722 	 * promoted above szc before we locked the freelist then
1723 	 * drop pcm and re-lock entire freelist. If page still larger
1724 	 * than szc then demote it.
1725 	 */
1726 	if (pp->p_szc > szc) {
1727 		mutex_exit(pcm);
1728 		pcm = NULL;
1729 		page_freelist_lock(mnode);
1730 		if (pp->p_szc > szc) {
1731 			VM_STAT_ADD(vmm_vmstats.plsubpages_szcbig);
1732 			(void) page_demote(mnode,
1733 			    PFN_BASE(pp->p_pagenum, pp->p_szc),
1734 			    pp->p_szc, szc, PC_NO_COLOR, PC_FREE);
1735 		}
1736 		bin = PP_2_BIN(pp);
1737 	}
1738 	ASSERT(PP_ISFREE(pp));
1739 	ASSERT(PP_ISAGED(pp));
1740 	ASSERT(pp->p_szc <= szc);
1741 	ASSERT(pp == PP_PAGEROOT(pp));
1742 
1743 	VM_STAT_ADD(vmm_vmstats.plsub_free[pp->p_szc]);
1744 
1745 	mtype = PP_2_MTYPE(pp);
1746 	if (pp->p_szc != 0) {
1747 		page_vpsub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1748 		CHK_LPG(pp, pp->p_szc);
1749 	} else {
1750 		VM_STAT_ADD(vmm_vmstats.plsubpages_szc0);
1751 		page_sub(&PAGE_FREELISTS(mnode, pp->p_szc, bin, mtype), pp);
1752 	}
1753 	page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
1754 
1755 	if (pcm != NULL) {
1756 		mutex_exit(pcm);
1757 	} else {
1758 		page_freelist_unlock(mnode);
1759 	}
1760 
1761 #if defined(__sparc)
1762 	if (PP_ISNORELOC(pp)) {
1763 		pgcnt_t	pgcnt;
1764 
1765 		pgcnt = page_get_pagecnt(pp->p_szc);
1766 		kcage_freemem_sub(pgcnt);
1767 	}
1768 #endif
1769 }
1770 
1771 /*
1772  * Add the page to the front of a linked list of pages
1773  * using the p_next & p_prev pointers for the list.
1774  * The caller is responsible for protecting the list pointers.
1775  */
1776 void
1777 mach_page_add(page_t **ppp, page_t *pp)
1778 {
1779 	if (*ppp == NULL) {
1780 		pp->p_next = pp->p_prev = pp;
1781 	} else {
1782 		pp->p_next = *ppp;
1783 		pp->p_prev = (*ppp)->p_prev;
1784 		(*ppp)->p_prev = pp;
1785 		pp->p_prev->p_next = pp;
1786 	}
1787 	*ppp = pp;
1788 }
1789 
1790 /*
1791  * Remove this page from a linked list of pages
1792  * using the p_next & p_prev pointers for the list.
1793  *
1794  * The caller is responsible for protecting the list pointers.
1795  */
1796 void
1797 mach_page_sub(page_t **ppp, page_t *pp)
1798 {
1799 	ASSERT(PP_ISFREE(pp));
1800 
1801 	if (*ppp == NULL || pp == NULL)
1802 		panic("mach_page_sub");
1803 
1804 	if (*ppp == pp)
1805 		*ppp = pp->p_next;		/* go to next page */
1806 
1807 	if (*ppp == pp)
1808 		*ppp = NULL;			/* page list is gone */
1809 	else {
1810 		pp->p_prev->p_next = pp->p_next;
1811 		pp->p_next->p_prev = pp->p_prev;
1812 	}
1813 	pp->p_prev = pp->p_next = pp;		/* make pp a list of one */
1814 }
1815 
1816 /*
1817  * Routine fsflush uses to gradually coalesce the free list into larger pages.
1818  */
1819 void
1820 page_promote_size(page_t *pp, uint_t cur_szc)
1821 {
1822 	pfn_t pfn;
1823 	int mnode;
1824 	int idx;
1825 	int new_szc = cur_szc + 1;
1826 	int full = FULL_REGION_CNT(new_szc);
1827 
1828 	pfn = page_pptonum(pp);
1829 	mnode = PFN_2_MEM_NODE(pfn);
1830 
1831 	page_freelist_lock(mnode);
1832 
1833 	idx = PNUM_TO_IDX(mnode, new_szc, pfn);
1834 	if (PAGE_COUNTERS(mnode, new_szc, idx) == full)
1835 		(void) page_promote(mnode, pfn, new_szc, PC_FREE, PC_MTYPE_ANY);
1836 
1837 	page_freelist_unlock(mnode);
1838 }
1839 
1840 static uint_t page_promote_err;
1841 static uint_t page_promote_noreloc_err;
1842 
1843 /*
1844  * Create a single larger page (of szc new_szc) from smaller contiguous pages
1845  * for the given mnode starting at pfnum. Pages involved are on the freelist
1846  * before the call and may be returned to the caller if requested, otherwise
1847  * they will be placed back on the freelist.
1848  * If flags is PC_ALLOC, then the large page will be returned to the user in
1849  * a state which is consistent with a page being taken off the freelist.  If
1850  * we failed to lock the new large page, then we will return NULL to the
1851  * caller and put the large page on the freelist instead.
1852  * If flags is PC_FREE, then the large page will be placed on the freelist,
1853  * and NULL will be returned.
1854  * The caller is responsible for locking the freelist as well as any other
1855  * accounting which needs to be done for a returned page.
1856  *
1857  * RFE: For performance pass in pp instead of pfnum so
1858  * 	we can avoid excessive calls to page_numtopp_nolock().
1859  *	This would depend on an assumption that all contiguous
1860  *	pages are in the same memseg so we can just add/dec
1861  *	our pp.
1862  *
1863  * Lock ordering:
1864  *
1865  *	There is a potential but rare deadlock situation
1866  *	for page promotion and demotion operations. The problem
1867  *	is there are two paths into the freelist manager and
1868  *	they have different lock orders:
1869  *
1870  *	page_create()
1871  *		lock freelist
1872  *		page_lock(EXCL)
1873  *		unlock freelist
1874  *		return
1875  *		caller drops page_lock
1876  *
1877  *	page_free() and page_reclaim()
1878  *		caller grabs page_lock(EXCL)
1879  *
1880  *		lock freelist
1881  *		unlock freelist
1882  *		drop page_lock
1883  *
1884  *	What prevents a thread in page_create() from deadlocking
1885  *	with a thread freeing or reclaiming the same page is the
1886  *	page_trylock() in page_get_freelist(). If the trylock fails
1887  *	it skips the page.
1888  *
1889  *	The lock ordering for promotion and demotion is the same as
1890  *	for page_create(). Since the same deadlock could occur during
1891  *	page promotion and freeing or reclaiming of a page on the
1892  *	cache list we might have to fail the operation and undo what
1893  *	have done so far. Again this is rare.
1894  */
1895 page_t *
1896 page_promote(int mnode, pfn_t pfnum, uchar_t new_szc, int flags, int mtype)
1897 {
1898 	page_t		*pp, *pplist, *tpp, *start_pp;
1899 	pgcnt_t		new_npgs, npgs;
1900 	uint_t		bin;
1901 	pgcnt_t		tmpnpgs, pages_left;
1902 	uint_t		noreloc;
1903 	int 		which_list;
1904 	ulong_t		index;
1905 	kmutex_t	*phm;
1906 
1907 	/*
1908 	 * General algorithm:
1909 	 * Find the starting page
1910 	 * Walk each page struct removing it from the freelist,
1911 	 * and linking it to all the other pages removed.
1912 	 * Once all pages are off the freelist,
1913 	 * walk the list, modifying p_szc to new_szc and what
1914 	 * ever other info needs to be done to create a large free page.
1915 	 * According to the flags, either return the page or put it
1916 	 * on the freelist.
1917 	 */
1918 
1919 	start_pp = page_numtopp_nolock(pfnum);
1920 	ASSERT(start_pp && (start_pp->p_pagenum == pfnum));
1921 	new_npgs = page_get_pagecnt(new_szc);
1922 	ASSERT(IS_P2ALIGNED(pfnum, new_npgs));
1923 
1924 	/* don't return page of the wrong mtype */
1925 	if (mtype != PC_MTYPE_ANY && mtype != PP_2_MTYPE(start_pp))
1926 			return (NULL);
1927 
1928 	/*
1929 	 * Loop through smaller pages to confirm that all pages
1930 	 * give the same result for PP_ISNORELOC().
1931 	 * We can check this reliably here as the protocol for setting
1932 	 * P_NORELOC requires pages to be taken off the free list first.
1933 	 */
1934 	noreloc = PP_ISNORELOC(start_pp);
1935 	for (pp = start_pp + new_npgs; --pp > start_pp; ) {
1936 		if (noreloc != PP_ISNORELOC(pp)) {
1937 			page_promote_noreloc_err++;
1938 			page_promote_err++;
1939 			return (NULL);
1940 		}
1941 	}
1942 
1943 	pages_left = new_npgs;
1944 	pplist = NULL;
1945 	pp = start_pp;
1946 
1947 	/* Loop around coalescing the smaller pages into a big page. */
1948 	while (pages_left) {
1949 		/*
1950 		 * Remove from the freelist.
1951 		 */
1952 		ASSERT(PP_ISFREE(pp));
1953 		bin = PP_2_BIN(pp);
1954 		ASSERT(mnode == PP_2_MEM_NODE(pp));
1955 		mtype = PP_2_MTYPE(pp);
1956 		if (PP_ISAGED(pp)) {
1957 
1958 			/*
1959 			 * PG_FREE_LIST
1960 			 */
1961 			if (pp->p_szc) {
1962 				page_vpsub(&PAGE_FREELISTS(mnode,
1963 				    pp->p_szc, bin, mtype), pp);
1964 			} else {
1965 				mach_page_sub(&PAGE_FREELISTS(mnode, 0,
1966 				    bin, mtype), pp);
1967 			}
1968 			which_list = PG_FREE_LIST;
1969 		} else {
1970 			ASSERT(pp->p_szc == 0);
1971 
1972 			/*
1973 			 * PG_CACHE_LIST
1974 			 *
1975 			 * Since this page comes from the
1976 			 * cachelist, we must destroy the
1977 			 * vnode association.
1978 			 */
1979 			if (!page_trylock(pp, SE_EXCL)) {
1980 				goto fail_promote;
1981 			}
1982 
1983 			/*
1984 			 * We need to be careful not to deadlock
1985 			 * with another thread in page_lookup().
1986 			 * The page_lookup() thread could be holding
1987 			 * the same phm that we need if the two
1988 			 * pages happen to hash to the same phm lock.
1989 			 * At this point we have locked the entire
1990 			 * freelist and page_lookup() could be trying
1991 			 * to grab a freelist lock.
1992 			 */
1993 			index = PAGE_HASH_FUNC(pp->p_vnode, pp->p_offset);
1994 			phm = PAGE_HASH_MUTEX(index);
1995 			if (!mutex_tryenter(phm)) {
1996 				page_unlock_nocapture(pp);
1997 				goto fail_promote;
1998 			}
1999 
2000 			mach_page_sub(&PAGE_CACHELISTS(mnode, bin, mtype), pp);
2001 			page_hashout(pp, phm);
2002 			mutex_exit(phm);
2003 			PP_SETAGED(pp);
2004 			page_unlock_nocapture(pp);
2005 			which_list = PG_CACHE_LIST;
2006 		}
2007 		page_ctr_sub(mnode, mtype, pp, which_list);
2008 
2009 		/*
2010 		 * Concatenate the smaller page(s) onto
2011 		 * the large page list.
2012 		 */
2013 		tmpnpgs = npgs = page_get_pagecnt(pp->p_szc);
2014 		pages_left -= npgs;
2015 		tpp = pp;
2016 		while (npgs--) {
2017 			tpp->p_szc = new_szc;
2018 			tpp = tpp->p_next;
2019 		}
2020 		page_list_concat(&pplist, &pp);
2021 		pp += tmpnpgs;
2022 	}
2023 	CHK_LPG(pplist, new_szc);
2024 
2025 	/*
2026 	 * return the page to the user if requested
2027 	 * in the properly locked state.
2028 	 */
2029 	if (flags == PC_ALLOC && (page_trylock_cons(pplist, SE_EXCL))) {
2030 		return (pplist);
2031 	}
2032 
2033 	/*
2034 	 * Otherwise place the new large page on the freelist
2035 	 */
2036 	bin = PP_2_BIN(pplist);
2037 	mnode = PP_2_MEM_NODE(pplist);
2038 	mtype = PP_2_MTYPE(pplist);
2039 	page_vpadd(&PAGE_FREELISTS(mnode, new_szc, bin, mtype), pplist);
2040 
2041 	page_ctr_add(mnode, mtype, pplist, PG_FREE_LIST);
2042 	return (NULL);
2043 
2044 fail_promote:
2045 	/*
2046 	 * A thread must have still been freeing or
2047 	 * reclaiming the page on the cachelist.
2048 	 * To prevent a deadlock undo what we have
2049 	 * done sofar and return failure. This
2050 	 * situation can only happen while promoting
2051 	 * PAGESIZE pages.
2052 	 */
2053 	page_promote_err++;
2054 	while (pplist) {
2055 		pp = pplist;
2056 		mach_page_sub(&pplist, pp);
2057 		pp->p_szc = 0;
2058 		bin = PP_2_BIN(pp);
2059 		mtype = PP_2_MTYPE(pp);
2060 		mach_page_add(&PAGE_FREELISTS(mnode, 0, bin, mtype), pp);
2061 		page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2062 	}
2063 	return (NULL);
2064 
2065 }
2066 
2067 /*
2068  * Break up a large page into smaller size pages.
2069  * Pages involved are on the freelist before the call and may
2070  * be returned to the caller if requested, otherwise they will
2071  * be placed back on the freelist.
2072  * The caller is responsible for locking the freelist as well as any other
2073  * accounting which needs to be done for a returned page.
2074  * If flags is not PC_ALLOC, the color argument is ignored, and thus
2075  * technically, any value may be passed in but PC_NO_COLOR is the standard
2076  * which should be followed for clarity's sake.
2077  */
2078 page_t *
2079 page_demote(int mnode, pfn_t pfnum, uchar_t cur_szc, uchar_t new_szc,
2080     int color, int flags)
2081 {
2082 	page_t	*pp, *pplist, *npplist;
2083 	pgcnt_t	npgs, n;
2084 	uint_t	bin;
2085 	uint_t	mtype;
2086 	page_t	*ret_pp = NULL;
2087 
2088 	ASSERT(cur_szc != 0);
2089 	ASSERT(new_szc < cur_szc);
2090 
2091 	pplist = page_numtopp_nolock(pfnum);
2092 	ASSERT(pplist != NULL);
2093 
2094 	ASSERT(pplist->p_szc == cur_szc);
2095 
2096 	bin = PP_2_BIN(pplist);
2097 	ASSERT(mnode == PP_2_MEM_NODE(pplist));
2098 	mtype = PP_2_MTYPE(pplist);
2099 	page_vpsub(&PAGE_FREELISTS(mnode, cur_szc, bin, mtype), pplist);
2100 
2101 	CHK_LPG(pplist, cur_szc);
2102 	page_ctr_sub(mnode, mtype, pplist, PG_FREE_LIST);
2103 
2104 	/*
2105 	 * Number of PAGESIZE pages for smaller new_szc
2106 	 * page.
2107 	 */
2108 	npgs = page_get_pagecnt(new_szc);
2109 
2110 	while (pplist) {
2111 		pp = pplist;
2112 
2113 		ASSERT(pp->p_szc == cur_szc);
2114 
2115 		/*
2116 		 * We either break it up into PAGESIZE pages or larger.
2117 		 */
2118 		if (npgs == 1) {	/* PAGESIZE case */
2119 			mach_page_sub(&pplist, pp);
2120 			ASSERT(pp->p_szc == cur_szc);
2121 			ASSERT(new_szc == 0);
2122 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2123 			pp->p_szc = new_szc;
2124 			bin = PP_2_BIN(pp);
2125 			if ((bin == color) && (flags == PC_ALLOC) &&
2126 			    (ret_pp == NULL) &&
2127 			    page_trylock_cons(pp, SE_EXCL)) {
2128 				ret_pp = pp;
2129 			} else {
2130 				mtype = PP_2_MTYPE(pp);
2131 				mach_page_add(&PAGE_FREELISTS(mnode, 0, bin,
2132 				    mtype), pp);
2133 				page_ctr_add(mnode, mtype, pp, PG_FREE_LIST);
2134 			}
2135 		} else {
2136 
2137 			/*
2138 			 * Break down into smaller lists of pages.
2139 			 */
2140 			page_list_break(&pplist, &npplist, npgs);
2141 
2142 			pp = pplist;
2143 			n = npgs;
2144 			while (n--) {
2145 				ASSERT(pp->p_szc == cur_szc);
2146 				pp->p_szc = new_szc;
2147 				pp = pp->p_next;
2148 			}
2149 
2150 			CHK_LPG(pplist, new_szc);
2151 
2152 			bin = PP_2_BIN(pplist);
2153 			ASSERT(mnode == PP_2_MEM_NODE(pp));
2154 			if ((bin == color) && (flags == PC_ALLOC) &&
2155 			    (ret_pp == NULL) &&
2156 			    page_trylock_cons(pp, SE_EXCL)) {
2157 				ret_pp = pp;
2158 			} else {
2159 				mtype = PP_2_MTYPE(pp);
2160 				page_vpadd(&PAGE_FREELISTS(mnode, new_szc,
2161 				    bin, mtype), pplist);
2162 
2163 				page_ctr_add(mnode, mtype, pplist,
2164 				    PG_FREE_LIST);
2165 			}
2166 			pplist = npplist;
2167 		}
2168 	}
2169 	return (ret_pp);
2170 }
2171 
2172 int mpss_coalesce_disable = 0;
2173 
2174 /*
2175  * Coalesce free pages into a page of the given szc and color if possible.
2176  * Return the pointer to the page created, otherwise, return NULL.
2177  *
2178  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2179  */
2180 page_t *
2181 page_freelist_coalesce(int mnode, uchar_t szc, uint_t color, uint_t ceq_mask,
2182     int mtype, pfn_t pfnhi)
2183 {
2184 	int 	r = szc;		/* region size */
2185 	int	mrange;
2186 	uint_t 	full, bin, color_mask, wrap = 0;
2187 	pfn_t	pfnum, lo, hi;
2188 	size_t	len, idx, idx0;
2189 	pgcnt_t	cands = 0, szcpgcnt = page_get_pagecnt(szc);
2190 	page_t	*ret_pp;
2191 	MEM_NODE_ITERATOR_DECL(it);
2192 #if defined(__sparc)
2193 	pfn_t pfnum0, nlo, nhi;
2194 #endif
2195 
2196 	if (mpss_coalesce_disable) {
2197 		ASSERT(szc < MMU_PAGE_SIZES);
2198 		VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[szc][0]);
2199 		return (NULL);
2200 	}
2201 
2202 	ASSERT(szc < mmu_page_sizes);
2203 	color_mask = PAGE_GET_PAGECOLORS(szc) - 1;
2204 	ASSERT(ceq_mask <= color_mask);
2205 	ASSERT(color <= color_mask);
2206 	color &= ceq_mask;
2207 
2208 	/* Prevent page_counters dynamic memory from being freed */
2209 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2210 
2211 	mrange = MTYPE_2_MRANGE(mnode, mtype);
2212 	ASSERT(mrange < mnode_nranges[mnode]);
2213 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce[r][mrange]);
2214 
2215 	/* get pfn range for mtype */
2216 	len = PAGE_COUNTERS_ENTRIES(mnode, r);
2217 #if defined(__sparc)
2218 	lo = PAGE_COUNTERS_BASE(mnode, r);
2219 	hi = IDX_TO_PNUM(mnode, r, len);
2220 #else
2221 	MNODETYPE_2_PFN(mnode, mtype, lo, hi);
2222 	hi++;
2223 #endif
2224 
2225 	/* use lower limit if given */
2226 	if (pfnhi != PFNNULL && pfnhi < hi)
2227 		hi = pfnhi;
2228 
2229 	/* round to szcpgcnt boundaries */
2230 	lo = P2ROUNDUP(lo, szcpgcnt);
2231 	MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
2232 	ASSERT(lo != (pfn_t)-1);
2233 	hi = hi & ~(szcpgcnt - 1);
2234 
2235 	/* set lo to the closest pfn of the right color */
2236 	if (((PFN_2_COLOR(lo, szc, &it) ^ color) & ceq_mask) ||
2237 	    (interleaved_mnodes && PFN_2_MEM_NODE(lo) != mnode)) {
2238 		PAGE_NEXT_PFN_FOR_COLOR(lo, szc, color, ceq_mask, color_mask,
2239 		    &it);
2240 	}
2241 
2242 	if (hi <= lo) {
2243 		rw_exit(&page_ctrs_rwlock[mnode]);
2244 		return (NULL);
2245 	}
2246 
2247 	full = FULL_REGION_CNT(r);
2248 
2249 	/* calculate the number of page candidates and initial search index */
2250 	bin = color;
2251 	idx0 = (size_t)(-1);
2252 	do {
2253 		pgcnt_t acand;
2254 
2255 		PGCTRS_CANDS_GETVALUECOLOR(mnode, mrange, r, bin, acand);
2256 		if (acand) {
2257 			idx = PAGE_COUNTERS_CURRENT_COLOR(mnode,
2258 			    r, bin, mrange);
2259 			idx0 = MIN(idx0, idx);
2260 			cands += acand;
2261 		}
2262 		bin = ADD_MASKED(bin, 1, ceq_mask, color_mask);
2263 	} while (bin != color);
2264 
2265 	if (cands == 0) {
2266 		VM_STAT_ADD(vmm_vmstats.page_ctrs_cands_skip[r][mrange]);
2267 		rw_exit(&page_ctrs_rwlock[mnode]);
2268 		return (NULL);
2269 	}
2270 
2271 	pfnum = IDX_TO_PNUM(mnode, r, idx0);
2272 	if (pfnum < lo || pfnum >= hi) {
2273 		pfnum = lo;
2274 	} else {
2275 		MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2276 		if (pfnum == (pfn_t)-1) {
2277 			pfnum = lo;
2278 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2279 			ASSERT(pfnum != (pfn_t)-1);
2280 		} else if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) & ceq_mask ||
2281 		    (interleaved_mnodes && PFN_2_MEM_NODE(pfnum) != mnode)) {
2282 			/* invalid color, get the closest correct pfn */
2283 			PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2284 			    color_mask, &it);
2285 			if (pfnum >= hi) {
2286 				pfnum = lo;
2287 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2288 			}
2289 		}
2290 	}
2291 
2292 	/* set starting index */
2293 	idx0 = PNUM_TO_IDX(mnode, r, pfnum);
2294 	ASSERT(idx0 < len);
2295 
2296 #if defined(__sparc)
2297 	pfnum0 = pfnum;		/* page corresponding to idx0 */
2298 	nhi = 0;		/* search kcage ranges */
2299 #endif
2300 
2301 	for (idx = idx0; wrap == 0 || (idx < idx0 && wrap < 2); ) {
2302 
2303 #if defined(__sparc)
2304 		/*
2305 		 * Find lowest intersection of kcage ranges and mnode.
2306 		 * MTYPE_NORELOC means look in the cage, otherwise outside.
2307 		 */
2308 		if (nhi <= pfnum) {
2309 			if (kcage_next_range(mtype == MTYPE_NORELOC, pfnum,
2310 			    (wrap == 0 ? hi : pfnum0), &nlo, &nhi))
2311 				goto wrapit;
2312 
2313 			/* jump to the next page in the range */
2314 			if (pfnum < nlo) {
2315 				pfnum = P2ROUNDUP(nlo, szcpgcnt);
2316 				MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2317 				idx = PNUM_TO_IDX(mnode, r, pfnum);
2318 				if (idx >= len || pfnum >= hi)
2319 					goto wrapit;
2320 				if ((PFN_2_COLOR(pfnum, szc, &it) ^ color) &
2321 				    ceq_mask)
2322 					goto next;
2323 				if (interleaved_mnodes &&
2324 				    PFN_2_MEM_NODE(pfnum) != mnode)
2325 					goto next;
2326 			}
2327 		}
2328 #endif
2329 
2330 		if (PAGE_COUNTERS(mnode, r, idx) != full)
2331 			goto next;
2332 
2333 		/*
2334 		 * RFE: For performance maybe we can do something less
2335 		 *	brutal than locking the entire freelist. So far
2336 		 * 	this doesn't seem to be a performance problem?
2337 		 */
2338 		page_freelist_lock(mnode);
2339 		if (PAGE_COUNTERS(mnode, r, idx) == full) {
2340 			ret_pp =
2341 			    page_promote(mnode, pfnum, r, PC_ALLOC, mtype);
2342 			if (ret_pp != NULL) {
2343 				VM_STAT_ADD(vmm_vmstats.pfc_coalok[r][mrange]);
2344 				PAGE_COUNTERS_CURRENT_COLOR(mnode, r,
2345 				    PFN_2_COLOR(pfnum, szc, &it), mrange) = idx;
2346 				page_freelist_unlock(mnode);
2347 				rw_exit(&page_ctrs_rwlock[mnode]);
2348 #if defined(__sparc)
2349 				if (PP_ISNORELOC(ret_pp)) {
2350 					pgcnt_t npgs;
2351 
2352 					npgs = page_get_pagecnt(ret_pp->p_szc);
2353 					kcage_freemem_sub(npgs);
2354 				}
2355 #endif
2356 				return (ret_pp);
2357 			}
2358 		} else {
2359 			VM_STAT_ADD(vmm_vmstats.page_ctrs_changed[r][mrange]);
2360 		}
2361 
2362 		page_freelist_unlock(mnode);
2363 		/*
2364 		 * No point looking for another page if we've
2365 		 * already tried all of the ones that
2366 		 * page_ctr_cands indicated.  Stash off where we left
2367 		 * off.
2368 		 * Note: this is not exact since we don't hold the
2369 		 * page_freelist_locks before we initially get the
2370 		 * value of cands for performance reasons, but should
2371 		 * be a decent approximation.
2372 		 */
2373 		if (--cands == 0) {
2374 			PAGE_COUNTERS_CURRENT_COLOR(mnode, r, color, mrange) =
2375 			    idx;
2376 			break;
2377 		}
2378 next:
2379 		PAGE_NEXT_PFN_FOR_COLOR(pfnum, szc, color, ceq_mask,
2380 		    color_mask, &it);
2381 		idx = PNUM_TO_IDX(mnode, r, pfnum);
2382 		if (idx >= len || pfnum >= hi) {
2383 wrapit:
2384 			pfnum = lo;
2385 			MEM_NODE_ITERATOR_INIT(pfnum, mnode, &it);
2386 			idx = PNUM_TO_IDX(mnode, r, pfnum);
2387 			wrap++;
2388 #if defined(__sparc)
2389 			nhi = 0;	/* search kcage ranges */
2390 #endif
2391 		}
2392 	}
2393 
2394 	rw_exit(&page_ctrs_rwlock[mnode]);
2395 	VM_STAT_ADD(vmm_vmstats.page_ctrs_failed[r][mrange]);
2396 	return (NULL);
2397 }
2398 
2399 /*
2400  * For the given mnode, promote as many small pages to large pages as possible.
2401  * mnode can be -1, which means do them all
2402  */
2403 void
2404 page_freelist_coalesce_all(int mnode)
2405 {
2406 	int 	r;		/* region size */
2407 	int 	idx, full;
2408 	size_t	len;
2409 	int doall = interleaved_mnodes || mnode < 0;
2410 	int mlo = doall ? 0 : mnode;
2411 	int mhi = doall ? max_mem_nodes : (mnode + 1);
2412 
2413 	VM_STAT_ADD(vmm_vmstats.page_ctrs_coalesce_all);
2414 
2415 	if (mpss_coalesce_disable) {
2416 		return;
2417 	}
2418 
2419 	/*
2420 	 * Lock the entire freelist and coalesce what we can.
2421 	 *
2422 	 * Always promote to the largest page possible
2423 	 * first to reduce the number of page promotions.
2424 	 */
2425 	for (mnode = mlo; mnode < mhi; mnode++) {
2426 		rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
2427 		page_freelist_lock(mnode);
2428 	}
2429 	for (r = mmu_page_sizes - 1; r > 0; r--) {
2430 		for (mnode = mlo; mnode < mhi; mnode++) {
2431 			pgcnt_t cands = 0;
2432 			int mrange, nranges = mnode_nranges[mnode];
2433 
2434 			for (mrange = 0; mrange < nranges; mrange++) {
2435 				PGCTRS_CANDS_GETVALUE(mnode, mrange, r, cands);
2436 				if (cands != 0)
2437 					break;
2438 			}
2439 			if (cands == 0) {
2440 				VM_STAT_ADD(vmm_vmstats.
2441 				    page_ctrs_cands_skip_all);
2442 				continue;
2443 			}
2444 
2445 			full = FULL_REGION_CNT(r);
2446 			len  = PAGE_COUNTERS_ENTRIES(mnode, r);
2447 
2448 			for (idx = 0; idx < len; idx++) {
2449 				if (PAGE_COUNTERS(mnode, r, idx) == full) {
2450 					pfn_t pfnum =
2451 					    IDX_TO_PNUM(mnode, r, idx);
2452 					int tmnode = interleaved_mnodes ?
2453 					    PFN_2_MEM_NODE(pfnum) : mnode;
2454 
2455 					ASSERT(pfnum >=
2456 					    mem_node_config[tmnode].physbase &&
2457 					    pfnum <
2458 					    mem_node_config[tmnode].physmax);
2459 
2460 					(void) page_promote(tmnode,
2461 					    pfnum, r, PC_FREE, PC_MTYPE_ANY);
2462 				}
2463 			}
2464 			/* shared hpm_counters covers all mnodes, so we quit */
2465 			if (interleaved_mnodes)
2466 				break;
2467 		}
2468 	}
2469 	for (mnode = mlo; mnode < mhi; mnode++) {
2470 		page_freelist_unlock(mnode);
2471 		rw_exit(&page_ctrs_rwlock[mnode]);
2472 	}
2473 }
2474 
2475 /*
2476  * This is where all polices for moving pages around
2477  * to different page size free lists is implemented.
2478  * Returns 1 on success, 0 on failure.
2479  *
2480  * So far these are the priorities for this algorithm in descending
2481  * order:
2482  *
2483  *	1) When servicing a request try to do so with a free page
2484  *	   from next size up. Helps defer fragmentation as long
2485  *	   as possible.
2486  *
2487  *	2) Page coalesce on demand. Only when a freelist
2488  *	   larger than PAGESIZE is empty and step 1
2489  *	   will not work since all larger size lists are
2490  *	   also empty.
2491  *
2492  * If pfnhi is non-zero, search for large page with pfn range less than pfnhi.
2493  */
2494 
2495 page_t *
2496 page_freelist_split(uchar_t szc, uint_t color, int mnode, int mtype,
2497     pfn_t pfnhi, page_list_walker_t *plw)
2498 {
2499 	uchar_t nszc = szc + 1;
2500 	uint_t 	bin, sbin, bin_prev;
2501 	page_t	*pp, *firstpp;
2502 	page_t	*ret_pp = NULL;
2503 	uint_t  color_mask;
2504 
2505 	if (nszc == mmu_page_sizes)
2506 		return (NULL);
2507 
2508 	ASSERT(nszc < mmu_page_sizes);
2509 	color_mask = PAGE_GET_PAGECOLORS(nszc) - 1;
2510 	bin = sbin = PAGE_GET_NSZ_COLOR(szc, color);
2511 	bin_prev = (plw->plw_bin_split_prev == color) ? INVALID_COLOR :
2512 	    PAGE_GET_NSZ_COLOR(szc, plw->plw_bin_split_prev);
2513 
2514 	VM_STAT_ADD(vmm_vmstats.pfs_req[szc]);
2515 	/*
2516 	 * First try to break up a larger page to fill current size freelist.
2517 	 */
2518 	while (plw->plw_bins[nszc] != 0) {
2519 
2520 		ASSERT(nszc < mmu_page_sizes);
2521 
2522 		/*
2523 		 * If page found then demote it.
2524 		 */
2525 		if (PAGE_FREELISTS(mnode, nszc, bin, mtype)) {
2526 			page_freelist_lock(mnode);
2527 			firstpp = pp = PAGE_FREELISTS(mnode, nszc, bin, mtype);
2528 
2529 			/*
2530 			 * If pfnhi is not PFNNULL, look for large page below
2531 			 * pfnhi. PFNNULL signifies no pfn requirement.
2532 			 */
2533 			if (pfnhi != PFNNULL && pp->p_pagenum >= pfnhi) {
2534 				do {
2535 					pp = pp->p_vpnext;
2536 					if (pp == firstpp) {
2537 						pp = NULL;
2538 						break;
2539 					}
2540 				} while (pp->p_pagenum >= pfnhi);
2541 			}
2542 			if (pp) {
2543 				uint_t ccolor = page_correct_color(szc, nszc,
2544 				    color, bin, plw->plw_ceq_mask[szc]);
2545 
2546 				ASSERT(pp->p_szc == nszc);
2547 				VM_STAT_ADD(vmm_vmstats.pfs_demote[nszc]);
2548 				ret_pp = page_demote(mnode, pp->p_pagenum,
2549 				    pp->p_szc, szc, ccolor, PC_ALLOC);
2550 				if (ret_pp) {
2551 					page_freelist_unlock(mnode);
2552 #if defined(__sparc)
2553 					if (PP_ISNORELOC(ret_pp)) {
2554 						pgcnt_t npgs;
2555 
2556 						npgs = page_get_pagecnt(
2557 						    ret_pp->p_szc);
2558 						kcage_freemem_sub(npgs);
2559 					}
2560 #endif
2561 					return (ret_pp);
2562 				}
2563 			}
2564 			page_freelist_unlock(mnode);
2565 		}
2566 
2567 		/* loop through next size bins */
2568 		bin = ADD_MASKED(bin, 1, plw->plw_ceq_mask[nszc], color_mask);
2569 		plw->plw_bins[nszc]--;
2570 
2571 		if (bin == sbin) {
2572 			uchar_t nnszc = nszc + 1;
2573 
2574 			/* we are done with this page size - check next */
2575 			if (plw->plw_bins[nnszc] == 0)
2576 				/* we have already checked next size bins */
2577 				break;
2578 
2579 			bin = sbin = PAGE_GET_NSZ_COLOR(nszc, bin);
2580 			if (bin_prev != INVALID_COLOR) {
2581 				bin_prev = PAGE_GET_NSZ_COLOR(nszc, bin_prev);
2582 				if (!((bin ^ bin_prev) &
2583 				    plw->plw_ceq_mask[nnszc]))
2584 					break;
2585 			}
2586 			ASSERT(nnszc < mmu_page_sizes);
2587 			color_mask = PAGE_GET_PAGECOLORS(nnszc) - 1;
2588 			nszc = nnszc;
2589 			ASSERT(nszc < mmu_page_sizes);
2590 		}
2591 	}
2592 
2593 	return (ret_pp);
2594 }
2595 
2596 /*
2597  * Helper routine used only by the freelist code to lock
2598  * a page. If the page is a large page then it succeeds in
2599  * locking all the constituent pages or none at all.
2600  * Returns 1 on sucess, 0 on failure.
2601  */
2602 static int
2603 page_trylock_cons(page_t *pp, se_t se)
2604 {
2605 	page_t	*tpp, *first_pp = pp;
2606 
2607 	/*
2608 	 * Fail if can't lock first or only page.
2609 	 */
2610 	if (!page_trylock(pp, se)) {
2611 		return (0);
2612 	}
2613 
2614 	/*
2615 	 * PAGESIZE: common case.
2616 	 */
2617 	if (pp->p_szc == 0) {
2618 		return (1);
2619 	}
2620 
2621 	/*
2622 	 * Large page case.
2623 	 */
2624 	tpp = pp->p_next;
2625 	while (tpp != pp) {
2626 		if (!page_trylock(tpp, se)) {
2627 			/*
2628 			 * On failure unlock what we have locked so far.
2629 			 * We want to avoid attempting to capture these
2630 			 * pages as the pcm mutex may be held which could
2631 			 * lead to a recursive mutex panic.
2632 			 */
2633 			while (first_pp != tpp) {
2634 				page_unlock_nocapture(first_pp);
2635 				first_pp = first_pp->p_next;
2636 			}
2637 			return (0);
2638 		}
2639 		tpp = tpp->p_next;
2640 	}
2641 	return (1);
2642 }
2643 
2644 /*
2645  * init context for walking page lists
2646  * Called when a page of the given szc in unavailable. Sets markers
2647  * for the beginning of the search to detect when search has
2648  * completed a full cycle. Sets flags for splitting larger pages
2649  * and coalescing smaller pages. Page walking procedes until a page
2650  * of the desired equivalent color is found.
2651  */
2652 void
2653 page_list_walk_init(uchar_t szc, uint_t flags, uint_t bin, int can_split,
2654     int use_ceq, page_list_walker_t *plw)
2655 {
2656 	uint_t  nszc, ceq_mask, colors;
2657 	uchar_t ceq = use_ceq ? colorequivszc[szc] : 0;
2658 
2659 	ASSERT(szc < mmu_page_sizes);
2660 	colors = PAGE_GET_PAGECOLORS(szc);
2661 
2662 	plw->plw_colors = colors;
2663 	plw->plw_color_mask = colors - 1;
2664 	plw->plw_bin_marker = plw->plw_bin0 = bin;
2665 	plw->plw_bin_split_prev = bin;
2666 	plw->plw_bin_step = (szc == 0) ? vac_colors : 1;
2667 
2668 	/*
2669 	 * if vac aliasing is possible make sure lower order color
2670 	 * bits are never ignored
2671 	 */
2672 	if (vac_colors > 1)
2673 		ceq &= 0xf0;
2674 
2675 	/*
2676 	 * calculate the number of non-equivalent colors and
2677 	 * color equivalency mask
2678 	 */
2679 	plw->plw_ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
2680 	ASSERT(szc > 0 || plw->plw_ceq_dif >= vac_colors);
2681 	ASSERT(plw->plw_ceq_dif > 0);
2682 	plw->plw_ceq_mask[szc] = (plw->plw_ceq_dif - 1) << (ceq & 0xf);
2683 
2684 	if (flags & PG_MATCH_COLOR) {
2685 		if (cpu_page_colors <  0) {
2686 			/*
2687 			 * this is a heterogeneous machine with different CPUs
2688 			 * having different size e$ (not supported for ni2/rock
2689 			 */
2690 			uint_t cpucolors = CPUSETSIZE() >> PAGE_GET_SHIFT(szc);
2691 			cpucolors = MAX(cpucolors, 1);
2692 			ceq_mask = plw->plw_color_mask & (cpucolors - 1);
2693 			plw->plw_ceq_mask[szc] =
2694 			    MIN(ceq_mask, plw->plw_ceq_mask[szc]);
2695 		}
2696 		plw->plw_ceq_dif = 1;
2697 	}
2698 
2699 	/* we can split pages in the freelist, but not the cachelist */
2700 	if (can_split) {
2701 		plw->plw_do_split = (szc + 1 < mmu_page_sizes) ? 1 : 0;
2702 
2703 		/* set next szc color masks and number of free list bins */
2704 		for (nszc = szc + 1; nszc < mmu_page_sizes; nszc++, szc++) {
2705 			plw->plw_ceq_mask[nszc] = PAGE_GET_NSZ_MASK(szc,
2706 			    plw->plw_ceq_mask[szc]);
2707 			plw->plw_bins[nszc] = PAGE_GET_PAGECOLORS(nszc);
2708 		}
2709 		plw->plw_ceq_mask[nszc] = INVALID_MASK;
2710 		plw->plw_bins[nszc] = 0;
2711 
2712 	} else {
2713 		ASSERT(szc == 0);
2714 		plw->plw_do_split = 0;
2715 		plw->plw_bins[1] = 0;
2716 		plw->plw_ceq_mask[1] = INVALID_MASK;
2717 	}
2718 }
2719 
2720 /*
2721  * set mark to flag where next split should occur
2722  */
2723 #define	PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw) {		     \
2724 	uint_t bin_nsz = PAGE_GET_NSZ_COLOR(szc, bin);			     \
2725 	uint_t bin0_nsz = PAGE_GET_NSZ_COLOR(szc, plw->plw_bin0);	     \
2726 	uint_t neq_mask = ~plw->plw_ceq_mask[nszc] & plw->plw_color_mask;    \
2727 	plw->plw_split_next =						     \
2728 		INC_MASKED(bin_nsz, neq_mask, plw->plw_color_mask);	     \
2729 	if (!((plw->plw_split_next ^ bin0_nsz) & plw->plw_ceq_mask[nszc])) { \
2730 		plw->plw_split_next =					     \
2731 		INC_MASKED(plw->plw_split_next,				     \
2732 		    neq_mask, plw->plw_color_mask);			     \
2733 	}								     \
2734 }
2735 
2736 uint_t
2737 page_list_walk_next_bin(uchar_t szc, uint_t bin, page_list_walker_t *plw)
2738 {
2739 	uint_t  neq_mask = ~plw->plw_ceq_mask[szc] & plw->plw_color_mask;
2740 	uint_t  bin0_nsz, nbin_nsz, nbin0, nbin;
2741 	uchar_t nszc = szc + 1;
2742 
2743 	nbin = ADD_MASKED(bin,
2744 	    plw->plw_bin_step, neq_mask, plw->plw_color_mask);
2745 
2746 	if (plw->plw_do_split) {
2747 		plw->plw_bin_split_prev = bin;
2748 		PAGE_SET_NEXT_SPLIT_MARKER(szc, nszc, bin, plw);
2749 		plw->plw_do_split = 0;
2750 	}
2751 
2752 	if (szc == 0) {
2753 		if (plw->plw_count != 0 || plw->plw_ceq_dif == vac_colors) {
2754 			if (nbin == plw->plw_bin0 &&
2755 			    (vac_colors == 1 || nbin != plw->plw_bin_marker)) {
2756 				nbin = ADD_MASKED(nbin, plw->plw_bin_step,
2757 				    neq_mask, plw->plw_color_mask);
2758 				plw->plw_bin_split_prev = plw->plw_bin0;
2759 			}
2760 
2761 			if (vac_colors > 1 && nbin == plw->plw_bin_marker) {
2762 				plw->plw_bin_marker =
2763 				    nbin = INC_MASKED(nbin, neq_mask,
2764 				    plw->plw_color_mask);
2765 				plw->plw_bin_split_prev = plw->plw_bin0;
2766 				/*
2767 				 * large pages all have the same vac color
2768 				 * so by now we should be done with next
2769 				 * size page splitting process
2770 				 */
2771 				ASSERT(plw->plw_bins[1] == 0);
2772 				plw->plw_do_split = 0;
2773 				return (nbin);
2774 			}
2775 
2776 		} else {
2777 			uint_t bin_jump = (vac_colors == 1) ?
2778 			    (BIN_STEP & ~3) - (plw->plw_bin0 & 3) : BIN_STEP;
2779 
2780 			bin_jump &= ~(vac_colors - 1);
2781 
2782 			nbin0 = ADD_MASKED(plw->plw_bin0, bin_jump, neq_mask,
2783 			    plw->plw_color_mask);
2784 
2785 			if ((nbin0 ^ plw->plw_bin0) & plw->plw_ceq_mask[szc]) {
2786 
2787 				plw->plw_bin_marker = nbin = nbin0;
2788 
2789 				if (plw->plw_bins[nszc] != 0) {
2790 					/*
2791 					 * check if next page size bin is the
2792 					 * same as the next page size bin for
2793 					 * bin0
2794 					 */
2795 					nbin_nsz = PAGE_GET_NSZ_COLOR(szc,
2796 					    nbin);
2797 					bin0_nsz = PAGE_GET_NSZ_COLOR(szc,
2798 					    plw->plw_bin0);
2799 
2800 					if ((bin0_nsz ^ nbin_nsz) &
2801 					    plw->plw_ceq_mask[nszc])
2802 						plw->plw_do_split = 1;
2803 				}
2804 				return (nbin);
2805 			}
2806 		}
2807 	}
2808 
2809 	if (plw->plw_bins[nszc] != 0) {
2810 		nbin_nsz = PAGE_GET_NSZ_COLOR(szc, nbin);
2811 		if (!((plw->plw_split_next ^ nbin_nsz) &
2812 		    plw->plw_ceq_mask[nszc]))
2813 			plw->plw_do_split = 1;
2814 	}
2815 
2816 	return (nbin);
2817 }
2818 
2819 page_t *
2820 page_get_mnode_freelist(int mnode, uint_t bin, int mtype, uchar_t szc,
2821     uint_t flags)
2822 {
2823 	kmutex_t		*pcm;
2824 	page_t			*pp, *first_pp;
2825 	uint_t			sbin;
2826 	int			plw_initialized;
2827 	page_list_walker_t	plw;
2828 
2829 	ASSERT(szc < mmu_page_sizes);
2830 
2831 	VM_STAT_ADD(vmm_vmstats.pgmf_alloc[szc]);
2832 
2833 	MTYPE_START(mnode, mtype, flags);
2834 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
2835 		VM_STAT_ADD(vmm_vmstats.pgmf_allocempty[szc]);
2836 		return (NULL);
2837 	}
2838 try_again:
2839 
2840 	plw_initialized = 0;
2841 	plw.plw_ceq_dif = 1;
2842 
2843 	/*
2844 	 * Only hold one freelist lock at a time, that way we
2845 	 * can start anywhere and not have to worry about lock
2846 	 * ordering.
2847 	 */
2848 	for (plw.plw_count = 0;
2849 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
2850 		sbin = bin;
2851 		do {
2852 			if (!PAGE_FREELISTS(mnode, szc, bin, mtype))
2853 				goto bin_empty_1;
2854 
2855 			pcm = PC_BIN_MUTEX(mnode, bin, PG_FREE_LIST);
2856 			mutex_enter(pcm);
2857 			pp = PAGE_FREELISTS(mnode, szc, bin, mtype);
2858 			if (pp == NULL)
2859 				goto bin_empty_0;
2860 
2861 			/*
2862 			 * These were set before the page
2863 			 * was put on the free list,
2864 			 * they must still be set.
2865 			 */
2866 			ASSERT(PP_ISFREE(pp));
2867 			ASSERT(PP_ISAGED(pp));
2868 			ASSERT(pp->p_vnode == NULL);
2869 			ASSERT(pp->p_hash == NULL);
2870 			ASSERT(pp->p_offset == (u_offset_t)-1);
2871 			ASSERT(pp->p_szc == szc);
2872 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2873 
2874 			/*
2875 			 * Walk down the hash chain.
2876 			 * 8k pages are linked on p_next
2877 			 * and p_prev fields. Large pages
2878 			 * are a contiguous group of
2879 			 * constituent pages linked together
2880 			 * on their p_next and p_prev fields.
2881 			 * The large pages are linked together
2882 			 * on the hash chain using p_vpnext
2883 			 * p_vpprev of the base constituent
2884 			 * page of each large page.
2885 			 */
2886 			first_pp = pp;
2887 			while (!page_trylock_cons(pp, SE_EXCL)) {
2888 				if (szc == 0) {
2889 					pp = pp->p_next;
2890 				} else {
2891 					pp = pp->p_vpnext;
2892 				}
2893 
2894 				ASSERT(PP_ISFREE(pp));
2895 				ASSERT(PP_ISAGED(pp));
2896 				ASSERT(pp->p_vnode == NULL);
2897 				ASSERT(pp->p_hash == NULL);
2898 				ASSERT(pp->p_offset == (u_offset_t)-1);
2899 				ASSERT(pp->p_szc == szc);
2900 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
2901 
2902 				if (pp == first_pp)
2903 					goto bin_empty_0;
2904 			}
2905 
2906 			ASSERT(pp != NULL);
2907 			ASSERT(mtype == PP_2_MTYPE(pp));
2908 			ASSERT(pp->p_szc == szc);
2909 			if (szc == 0) {
2910 				page_sub(&PAGE_FREELISTS(mnode,
2911 				    szc, bin, mtype), pp);
2912 			} else {
2913 				page_vpsub(&PAGE_FREELISTS(mnode,
2914 				    szc, bin, mtype), pp);
2915 				CHK_LPG(pp, szc);
2916 			}
2917 			page_ctr_sub(mnode, mtype, pp, PG_FREE_LIST);
2918 
2919 			if ((PP_ISFREE(pp) == 0) || (PP_ISAGED(pp) == 0))
2920 				panic("free page is not. pp %p", (void *)pp);
2921 			mutex_exit(pcm);
2922 
2923 #if defined(__sparc)
2924 			ASSERT(!kcage_on || PP_ISNORELOC(pp) ||
2925 			    (flags & PG_NORELOC) == 0);
2926 
2927 			if (PP_ISNORELOC(pp))
2928 				kcage_freemem_sub(page_get_pagecnt(szc));
2929 #endif
2930 			VM_STAT_ADD(vmm_vmstats.pgmf_allocok[szc]);
2931 			return (pp);
2932 
2933 bin_empty_0:
2934 			mutex_exit(pcm);
2935 bin_empty_1:
2936 			if (plw_initialized == 0) {
2937 				page_list_walk_init(szc, flags, bin, 1, 1,
2938 				    &plw);
2939 				plw_initialized = 1;
2940 				ASSERT(plw.plw_colors <=
2941 				    PAGE_GET_PAGECOLORS(szc));
2942 				ASSERT(plw.plw_colors > 0);
2943 				ASSERT((plw.plw_colors &
2944 				    (plw.plw_colors - 1)) == 0);
2945 				ASSERT(bin < plw.plw_colors);
2946 				ASSERT(plw.plw_ceq_mask[szc] < plw.plw_colors);
2947 			}
2948 			/* calculate the next bin with equivalent color */
2949 			bin = ADD_MASKED(bin, plw.plw_bin_step,
2950 			    plw.plw_ceq_mask[szc], plw.plw_color_mask);
2951 		} while (sbin != bin);
2952 
2953 		/*
2954 		 * color bins are all empty if color match. Try and
2955 		 * satisfy the request by breaking up or coalescing
2956 		 * pages from a different size freelist of the correct
2957 		 * color that satisfies the ORIGINAL color requested.
2958 		 * If that fails then try pages of the same size but
2959 		 * different colors assuming we are not called with
2960 		 * PG_MATCH_COLOR.
2961 		 */
2962 		if (plw.plw_do_split &&
2963 		    (pp = page_freelist_split(szc, bin, mnode,
2964 		    mtype, PFNNULL, &plw)) != NULL)
2965 			return (pp);
2966 
2967 		if (szc > 0 && (pp = page_freelist_coalesce(mnode, szc,
2968 		    bin, plw.plw_ceq_mask[szc], mtype, PFNNULL)) !=  NULL)
2969 			return (pp);
2970 
2971 		if (plw.plw_ceq_dif > 1)
2972 			bin = page_list_walk_next_bin(szc, bin, &plw);
2973 	}
2974 
2975 	/* if allowed, cycle through additional mtypes */
2976 	MTYPE_NEXT(mnode, mtype, flags);
2977 	if (mtype >= 0)
2978 		goto try_again;
2979 
2980 	VM_STAT_ADD(vmm_vmstats.pgmf_allocfailed[szc]);
2981 
2982 	return (NULL);
2983 }
2984 
2985 /*
2986  * Returns the count of free pages for 'pp' with size code 'szc'.
2987  * Note: This function does not return an exact value as the page freelist
2988  * locks are not held and thus the values in the page_counters may be
2989  * changing as we walk through the data.
2990  */
2991 static int
2992 page_freecnt(int mnode, page_t *pp, uchar_t szc)
2993 {
2994 	pgcnt_t	pgfree;
2995 	pgcnt_t cnt;
2996 	ssize_t	r = szc;	/* region size */
2997 	ssize_t	idx;
2998 	int	i;
2999 	int	full, range;
3000 
3001 	/* Make sure pagenum passed in is aligned properly */
3002 	ASSERT((pp->p_pagenum & (PNUM_SIZE(szc) - 1)) == 0);
3003 	ASSERT(szc > 0);
3004 
3005 	/* Prevent page_counters dynamic memory from being freed */
3006 	rw_enter(&page_ctrs_rwlock[mnode], RW_READER);
3007 	idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3008 	cnt = PAGE_COUNTERS(mnode, r, idx);
3009 	pgfree = cnt << PNUM_SHIFT(r - 1);
3010 	range = FULL_REGION_CNT(szc);
3011 
3012 	/* Check for completely full region */
3013 	if (cnt == range) {
3014 		rw_exit(&page_ctrs_rwlock[mnode]);
3015 		return (pgfree);
3016 	}
3017 
3018 	while (--r > 0) {
3019 		idx = PNUM_TO_IDX(mnode, r, pp->p_pagenum);
3020 		full = FULL_REGION_CNT(r);
3021 		for (i = 0; i < range; i++, idx++) {
3022 			cnt = PAGE_COUNTERS(mnode, r, idx);
3023 			/*
3024 			 * If cnt here is full, that means we have already
3025 			 * accounted for these pages earlier.
3026 			 */
3027 			if (cnt != full) {
3028 				pgfree += (cnt << PNUM_SHIFT(r - 1));
3029 			}
3030 		}
3031 		range *= full;
3032 	}
3033 	rw_exit(&page_ctrs_rwlock[mnode]);
3034 	return (pgfree);
3035 }
3036 
3037 /*
3038  * Called from page_geti_contig_pages to exclusively lock constituent pages
3039  * starting from 'spp' for page size code 'szc'.
3040  *
3041  * If 'ptcpthreshold' is set, the number of free pages needed in the 'szc'
3042  * region needs to be greater than or equal to the threshold.
3043  */
3044 static int
3045 page_trylock_contig_pages(int mnode, page_t *spp, uchar_t szc, int flags)
3046 {
3047 	pgcnt_t	pgcnt = PNUM_SIZE(szc);
3048 	pgcnt_t pgfree, i;
3049 	page_t *pp;
3050 
3051 	VM_STAT_ADD(vmm_vmstats.ptcp[szc]);
3052 
3053 
3054 	if ((ptcpthreshold == 0) || (flags & PGI_PGCPHIPRI))
3055 		goto skipptcpcheck;
3056 	/*
3057 	 * check if there are sufficient free pages available before attempting
3058 	 * to trylock. Count is approximate as page counters can change.
3059 	 */
3060 	pgfree = page_freecnt(mnode, spp, szc);
3061 
3062 	/* attempt to trylock if there are sufficient already free pages */
3063 	if (pgfree < pgcnt/ptcpthreshold) {
3064 		VM_STAT_ADD(vmm_vmstats.ptcpfreethresh[szc]);
3065 		return (0);
3066 	}
3067 
3068 skipptcpcheck:
3069 
3070 	for (i = 0; i < pgcnt; i++) {
3071 		pp = &spp[i];
3072 		if (!page_trylock(pp, SE_EXCL)) {
3073 			VM_STAT_ADD(vmm_vmstats.ptcpfailexcl[szc]);
3074 			while (--i != (pgcnt_t)-1) {
3075 				pp = &spp[i];
3076 				ASSERT(PAGE_EXCL(pp));
3077 				page_unlock_nocapture(pp);
3078 			}
3079 			return (0);
3080 		}
3081 		ASSERT(spp[i].p_pagenum == spp->p_pagenum + i);
3082 		if ((pp->p_szc > szc || (szc && pp->p_szc == szc)) &&
3083 		    !PP_ISFREE(pp)) {
3084 			VM_STAT_ADD(vmm_vmstats.ptcpfailszc[szc]);
3085 			ASSERT(i == 0);
3086 			page_unlock_nocapture(pp);
3087 			return (0);
3088 		}
3089 		if (PP_ISNORELOC(pp)) {
3090 			VM_STAT_ADD(vmm_vmstats.ptcpfailcage[szc]);
3091 			while (i != (pgcnt_t)-1) {
3092 				pp = &spp[i];
3093 				ASSERT(PAGE_EXCL(pp));
3094 				page_unlock_nocapture(pp);
3095 				i--;
3096 			}
3097 			return (0);
3098 		}
3099 	}
3100 	VM_STAT_ADD(vmm_vmstats.ptcpok[szc]);
3101 	return (1);
3102 }
3103 
3104 /*
3105  * Claim large page pointed to by 'pp'. 'pp' is the starting set
3106  * of 'szc' constituent pages that had been locked exclusively previously.
3107  * Will attempt to relocate constituent pages in use.
3108  */
3109 static page_t *
3110 page_claim_contig_pages(page_t *pp, uchar_t szc, int flags)
3111 {
3112 	spgcnt_t pgcnt, npgs, i;
3113 	page_t *targpp, *rpp, *hpp;
3114 	page_t *replpp = NULL;
3115 	page_t *pplist = NULL;
3116 
3117 	ASSERT(pp != NULL);
3118 
3119 	pgcnt = page_get_pagecnt(szc);
3120 	while (pgcnt) {
3121 		ASSERT(PAGE_EXCL(pp));
3122 		ASSERT(!PP_ISNORELOC(pp));
3123 		if (PP_ISFREE(pp)) {
3124 			/*
3125 			 * If this is a PG_FREE_LIST page then its
3126 			 * size code can change underneath us due to
3127 			 * page promotion or demotion. As an optimzation
3128 			 * use page_list_sub_pages() instead of
3129 			 * page_list_sub().
3130 			 */
3131 			if (PP_ISAGED(pp)) {
3132 				page_list_sub_pages(pp, szc);
3133 				if (pp->p_szc == szc) {
3134 					return (pp);
3135 				}
3136 				ASSERT(pp->p_szc < szc);
3137 				npgs = page_get_pagecnt(pp->p_szc);
3138 				hpp = pp;
3139 				for (i = 0; i < npgs; i++, pp++) {
3140 					pp->p_szc = szc;
3141 				}
3142 				page_list_concat(&pplist, &hpp);
3143 				pgcnt -= npgs;
3144 				continue;
3145 			}
3146 			ASSERT(!PP_ISAGED(pp));
3147 			ASSERT(pp->p_szc == 0);
3148 			page_list_sub(pp, PG_CACHE_LIST);
3149 			page_hashout(pp, NULL);
3150 			PP_SETAGED(pp);
3151 			pp->p_szc = szc;
3152 			page_list_concat(&pplist, &pp);
3153 			pp++;
3154 			pgcnt--;
3155 			continue;
3156 		}
3157 		npgs = page_get_pagecnt(pp->p_szc);
3158 
3159 		/*
3160 		 * page_create_wait freemem accounting done by caller of
3161 		 * page_get_freelist and not necessary to call it prior to
3162 		 * calling page_get_replacement_page.
3163 		 *
3164 		 * page_get_replacement_page can call page_get_contig_pages
3165 		 * to acquire a large page (szc > 0); the replacement must be
3166 		 * smaller than the contig page size to avoid looping or
3167 		 * szc == 0 and PGI_PGCPSZC0 is set.
3168 		 */
3169 		if (pp->p_szc < szc || (szc == 0 && (flags & PGI_PGCPSZC0))) {
3170 			replpp = page_get_replacement_page(pp, NULL, 0);
3171 			if (replpp) {
3172 				npgs = page_get_pagecnt(pp->p_szc);
3173 				ASSERT(npgs <= pgcnt);
3174 				targpp = pp;
3175 			}
3176 		}
3177 
3178 		/*
3179 		 * If replacement is NULL or do_page_relocate fails, fail
3180 		 * coalescing of pages.
3181 		 */
3182 		if (replpp == NULL || (do_page_relocate(&targpp, &replpp, 0,
3183 		    &npgs, NULL) != 0)) {
3184 			/*
3185 			 * Unlock un-processed target list
3186 			 */
3187 			while (pgcnt--) {
3188 				ASSERT(PAGE_EXCL(pp));
3189 				page_unlock_nocapture(pp);
3190 				pp++;
3191 			}
3192 			/*
3193 			 * Free the processed target list.
3194 			 */
3195 			while (pplist) {
3196 				pp = pplist;
3197 				page_sub(&pplist, pp);
3198 				ASSERT(PAGE_EXCL(pp));
3199 				ASSERT(pp->p_szc == szc);
3200 				ASSERT(PP_ISFREE(pp));
3201 				ASSERT(PP_ISAGED(pp));
3202 				pp->p_szc = 0;
3203 				page_list_add(pp, PG_FREE_LIST | PG_LIST_TAIL);
3204 				page_unlock_nocapture(pp);
3205 			}
3206 
3207 			if (replpp != NULL)
3208 				page_free_replacement_page(replpp);
3209 
3210 			return (NULL);
3211 		}
3212 		ASSERT(pp == targpp);
3213 
3214 		/* LINTED */
3215 		ASSERT(hpp = pp); /* That's right, it's an assignment */
3216 
3217 		pp += npgs;
3218 		pgcnt -= npgs;
3219 
3220 		while (npgs--) {
3221 			ASSERT(PAGE_EXCL(targpp));
3222 			ASSERT(!PP_ISFREE(targpp));
3223 			ASSERT(!PP_ISNORELOC(targpp));
3224 			PP_SETFREE(targpp);
3225 			ASSERT(PP_ISAGED(targpp));
3226 			ASSERT(targpp->p_szc < szc || (szc == 0 &&
3227 			    (flags & PGI_PGCPSZC0)));
3228 			targpp->p_szc = szc;
3229 			targpp = targpp->p_next;
3230 
3231 			rpp = replpp;
3232 			ASSERT(rpp != NULL);
3233 			page_sub(&replpp, rpp);
3234 			ASSERT(PAGE_EXCL(rpp));
3235 			ASSERT(!PP_ISFREE(rpp));
3236 			page_unlock_nocapture(rpp);
3237 		}
3238 		ASSERT(targpp == hpp);
3239 		ASSERT(replpp == NULL);
3240 		page_list_concat(&pplist, &targpp);
3241 	}
3242 	CHK_LPG(pplist, szc);
3243 	return (pplist);
3244 }
3245 
3246 /*
3247  * Trim kernel cage from pfnlo-pfnhi and store result in lo-hi. Return code
3248  * of 0 means nothing left after trim.
3249  */
3250 int
3251 trimkcage(struct memseg *mseg, pfn_t *lo, pfn_t *hi, pfn_t pfnlo, pfn_t pfnhi)
3252 {
3253 	pfn_t	kcagepfn;
3254 	int	decr;
3255 	int	rc = 0;
3256 
3257 	if (PP_ISNORELOC(mseg->pages)) {
3258 		if (PP_ISNORELOC(mseg->epages - 1) == 0) {
3259 
3260 			/* lower part of this mseg inside kernel cage */
3261 			decr = kcage_current_pfn(&kcagepfn);
3262 
3263 			/* kernel cage may have transitioned past mseg */
3264 			if (kcagepfn >= mseg->pages_base &&
3265 			    kcagepfn < mseg->pages_end) {
3266 				ASSERT(decr == 0);
3267 				*lo = kcagepfn;
3268 				*hi = MIN(pfnhi,
3269 				    (mseg->pages_end - 1));
3270 				rc = 1;
3271 			}
3272 		}
3273 		/* else entire mseg in the cage */
3274 	} else {
3275 		if (PP_ISNORELOC(mseg->epages - 1)) {
3276 
3277 			/* upper part of this mseg inside kernel cage */
3278 			decr = kcage_current_pfn(&kcagepfn);
3279 
3280 			/* kernel cage may have transitioned past mseg */
3281 			if (kcagepfn >= mseg->pages_base &&
3282 			    kcagepfn < mseg->pages_end) {
3283 				ASSERT(decr);
3284 				*hi = kcagepfn;
3285 				*lo = MAX(pfnlo, mseg->pages_base);
3286 				rc = 1;
3287 			}
3288 		} else {
3289 			/* entire mseg outside of kernel cage */
3290 			*lo = MAX(pfnlo, mseg->pages_base);
3291 			*hi = MIN(pfnhi, (mseg->pages_end - 1));
3292 			rc = 1;
3293 		}
3294 	}
3295 	return (rc);
3296 }
3297 
3298 /*
3299  * called from page_get_contig_pages to search 'pfnlo' thru 'pfnhi' to claim a
3300  * page with size code 'szc'. Claiming such a page requires acquiring
3301  * exclusive locks on all constituent pages (page_trylock_contig_pages),
3302  * relocating pages in use and concatenating these constituent pages into a
3303  * large page.
3304  *
3305  * The page lists do not have such a large page and page_freelist_split has
3306  * already failed to demote larger pages and/or coalesce smaller free pages.
3307  *
3308  * 'flags' may specify PG_COLOR_MATCH which would limit the search of large
3309  * pages with the same color as 'bin'.
3310  *
3311  * 'pfnflag' specifies the subset of the pfn range to search.
3312  */
3313 
3314 
3315 static page_t *
3316 page_geti_contig_pages(int mnode, uint_t bin, uchar_t szc, int flags,
3317     pfn_t pfnlo, pfn_t pfnhi, pgcnt_t pfnflag)
3318 {
3319 	struct memseg *mseg;
3320 	pgcnt_t	szcpgcnt = page_get_pagecnt(szc);
3321 	pgcnt_t szcpgmask = szcpgcnt - 1;
3322 	pfn_t	randpfn;
3323 	page_t *pp, *randpp, *endpp;
3324 	uint_t colors, ceq_mask;
3325 	/* LINTED : set but not used in function */
3326 	uint_t color_mask;
3327 	pfn_t hi, lo;
3328 	uint_t skip;
3329 	MEM_NODE_ITERATOR_DECL(it);
3330 
3331 	ASSERT(szc != 0 || (flags & PGI_PGCPSZC0));
3332 
3333 	if ((pfnhi - pfnlo) + 1 < szcpgcnt)
3334 		return (NULL);
3335 
3336 	ASSERT(szc < mmu_page_sizes);
3337 
3338 	colors = PAGE_GET_PAGECOLORS(szc);
3339 	color_mask = colors - 1;
3340 	if ((colors > 1) && (flags & PG_MATCH_COLOR)) {
3341 		uchar_t ceq = colorequivszc[szc];
3342 		uint_t  ceq_dif = colors >> ((ceq >> 4) + (ceq & 0xf));
3343 
3344 		ASSERT(ceq_dif > 0);
3345 		ceq_mask = (ceq_dif - 1) << (ceq & 0xf);
3346 	} else {
3347 		ceq_mask = 0;
3348 	}
3349 
3350 	ASSERT(bin < colors);
3351 
3352 	/* clear "non-significant" color bits */
3353 	bin &= ceq_mask;
3354 
3355 	/*
3356 	 * trim the pfn range to search based on pfnflag. pfnflag is set
3357 	 * when there have been previous page_get_contig_page failures to
3358 	 * limit the search.
3359 	 *
3360 	 * The high bit in pfnflag specifies the number of 'slots' in the
3361 	 * pfn range and the remainder of pfnflag specifies which slot.
3362 	 * For example, a value of 1010b would mean the second slot of
3363 	 * the pfn range that has been divided into 8 slots.
3364 	 */
3365 	if (pfnflag > 1) {
3366 		int	slots = 1 << (highbit(pfnflag) - 1);
3367 		int	slotid = pfnflag & (slots - 1);
3368 		pgcnt_t	szcpages;
3369 		int	slotlen;
3370 
3371 		pfnlo = P2ROUNDUP(pfnlo, szcpgcnt);
3372 		pfnhi = pfnhi & ~(szcpgcnt - 1);
3373 
3374 		szcpages = ((pfnhi - pfnlo) + 1) / szcpgcnt;
3375 		slotlen = howmany(szcpages, slots);
3376 		pfnlo = pfnlo + (((slotid * slotlen) % szcpages) * szcpgcnt);
3377 		ASSERT(pfnlo < pfnhi);
3378 		if (pfnhi > pfnlo + (slotlen * szcpgcnt))
3379 			pfnhi = pfnlo + (slotlen * szcpgcnt);
3380 	}
3381 
3382 	memsegs_lock(0);
3383 
3384 	/*
3385 	 * loop through memsegs to look for contig page candidates
3386 	 */
3387 
3388 	for (mseg = memsegs; mseg != NULL; mseg = mseg->next) {
3389 		if (pfnhi < mseg->pages_base || pfnlo >= mseg->pages_end) {
3390 			/* no overlap */
3391 			continue;
3392 		}
3393 
3394 		if (mseg->pages_end - mseg->pages_base < szcpgcnt)
3395 			/* mseg too small */
3396 			continue;
3397 
3398 		/* trim off kernel cage pages from pfn range */
3399 		if (kcage_on) {
3400 			if (trimkcage(mseg, &lo, &hi, pfnlo, pfnhi) == 0)
3401 				continue;
3402 		} else {
3403 			lo = MAX(pfnlo, mseg->pages_base);
3404 			hi = MIN(pfnhi, (mseg->pages_end - 1));
3405 		}
3406 
3407 		/* round to szcpgcnt boundaries */
3408 		lo = P2ROUNDUP(lo, szcpgcnt);
3409 		MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
3410 		hi = hi & ~(szcpgcnt - 1);
3411 
3412 		if (hi <= lo)
3413 			continue;
3414 
3415 		/*
3416 		 * set lo to point to the pfn for the desired bin. Large
3417 		 * page sizes may only have a single page color
3418 		 */
3419 		skip = szcpgcnt;
3420 		if (ceq_mask > 0 || interleaved_mnodes) {
3421 			/* set lo to point at appropriate color */
3422 			if (((PFN_2_COLOR(lo, szc, &it) ^ bin) & ceq_mask) ||
3423 			    (interleaved_mnodes &&
3424 			    PFN_2_MEM_NODE(lo) != mnode)) {
3425 				PAGE_NEXT_PFN_FOR_COLOR(lo, szc, bin, ceq_mask,
3426 				    color_mask, &it);
3427 			}
3428 			if (hi <= lo)
3429 				/* mseg cannot satisfy color request */
3430 				continue;
3431 		}
3432 
3433 		/* randomly choose a point between lo and hi to begin search */
3434 
3435 		randpfn = (pfn_t)GETTICK();
3436 		randpfn = ((randpfn % (hi - lo)) + lo) & ~(skip - 1);
3437 		MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
3438 		if (ceq_mask || interleaved_mnodes) {
3439 			if (randpfn != (pfn_t)-1)
3440 				PAGE_NEXT_PFN_FOR_COLOR(randpfn, szc, bin,
3441 				    ceq_mask, color_mask, &it);
3442 			if (randpfn >= hi) {
3443 				randpfn = lo;
3444 				MEM_NODE_ITERATOR_INIT(randpfn, mnode, &it);
3445 			}
3446 		}
3447 		randpp = mseg->pages + (randpfn - mseg->pages_base);
3448 
3449 		ASSERT(randpp->p_pagenum == randpfn);
3450 
3451 		pp = randpp;
3452 		endpp =  mseg->pages + (hi - mseg->pages_base);
3453 
3454 		ASSERT(randpp + szcpgcnt <= endpp);
3455 
3456 		do {
3457 			ASSERT(!(pp->p_pagenum & szcpgmask));
3458 			ASSERT(((PP_2_BIN(pp) ^ bin) & ceq_mask) == 0);
3459 
3460 			if (page_trylock_contig_pages(mnode, pp, szc, flags)) {
3461 				/* pages unlocked by page_claim on failure */
3462 				if (page_claim_contig_pages(pp, szc, flags)) {
3463 					memsegs_unlock(0);
3464 					return (pp);
3465 				}
3466 			}
3467 
3468 			if (ceq_mask == 0 && !interleaved_mnodes) {
3469 				pp += skip;
3470 			} else {
3471 				pfn_t pfn = pp->p_pagenum;
3472 
3473 				PAGE_NEXT_PFN_FOR_COLOR(pfn, szc, bin,
3474 				    ceq_mask, color_mask, &it);
3475 				if (pfn == (pfn_t)-1) {
3476 					pp = endpp;
3477 				} else {
3478 					pp = mseg->pages +
3479 					    (pfn - mseg->pages_base);
3480 				}
3481 			}
3482 			if (pp >= endpp) {
3483 				/* start from the beginning */
3484 				MEM_NODE_ITERATOR_INIT(lo, mnode, &it);
3485 				pp = mseg->pages + (lo - mseg->pages_base);
3486 				ASSERT(pp->p_pagenum == lo);
3487 				ASSERT(pp + szcpgcnt <= endpp);
3488 			}
3489 		} while (pp != randpp);
3490 	}
3491 	memsegs_unlock(0);
3492 	return (NULL);
3493 }
3494 
3495 
3496 /*
3497  * controlling routine that searches through physical memory in an attempt to
3498  * claim a large page based on the input parameters.
3499  * on the page free lists.
3500  *
3501  * calls page_geti_contig_pages with an initial pfn range from the mnode
3502  * and mtype. page_geti_contig_pages will trim off the parts of the pfn range
3503  * that overlaps with the kernel cage or does not match the requested page
3504  * color if PG_MATCH_COLOR is set.  Since this search is very expensive,
3505  * page_geti_contig_pages may further limit the search range based on
3506  * previous failure counts (pgcpfailcnt[]).
3507  *
3508  * for PGI_PGCPSZC0 requests, page_get_contig_pages will relocate a base
3509  * pagesize page that satisfies mtype.
3510  */
3511 page_t *
3512 page_get_contig_pages(int mnode, uint_t bin, int mtype, uchar_t szc,
3513     uint_t flags)
3514 {
3515 	pfn_t		pfnlo, pfnhi;	/* contig pages pfn range */
3516 	page_t		*pp;
3517 	pgcnt_t		pfnflag = 0;	/* no limit on search if 0 */
3518 
3519 	VM_STAT_ADD(vmm_vmstats.pgcp_alloc[szc]);
3520 
3521 	/* no allocations from cage */
3522 	flags |= PGI_NOCAGE;
3523 
3524 	/* LINTED */
3525 	MTYPE_START(mnode, mtype, flags);
3526 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3527 		VM_STAT_ADD(vmm_vmstats.pgcp_allocempty[szc]);
3528 		return (NULL);
3529 	}
3530 
3531 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3532 
3533 	/* do not limit search and ignore color if hi pri */
3534 
3535 	if (pgcplimitsearch && ((flags & PGI_PGCPHIPRI) == 0))
3536 		pfnflag = pgcpfailcnt[szc];
3537 
3538 	/* remove color match to improve chances */
3539 
3540 	if (flags & PGI_PGCPHIPRI || pfnflag)
3541 		flags &= ~PG_MATCH_COLOR;
3542 
3543 	do {
3544 		/* get pfn range based on mnode and mtype */
3545 		MNODETYPE_2_PFN(mnode, mtype, pfnlo, pfnhi);
3546 
3547 		ASSERT(pfnhi >= pfnlo);
3548 
3549 		pp = page_geti_contig_pages(mnode, bin, szc, flags,
3550 		    pfnlo, pfnhi, pfnflag);
3551 
3552 		if (pp != NULL) {
3553 			pfnflag = pgcpfailcnt[szc];
3554 			if (pfnflag) {
3555 				/* double the search size */
3556 				pgcpfailcnt[szc] = pfnflag >> 1;
3557 			}
3558 			VM_STAT_ADD(vmm_vmstats.pgcp_allocok[szc]);
3559 			return (pp);
3560 		}
3561 		MTYPE_NEXT(mnode, mtype, flags);
3562 	} while (mtype >= 0);
3563 
3564 	VM_STAT_ADD(vmm_vmstats.pgcp_allocfailed[szc]);
3565 	return (NULL);
3566 }
3567 
3568 
3569 /*
3570  * Find the `best' page on the freelist for this (vp,off) (as,vaddr) pair.
3571  *
3572  * Does its own locking and accounting.
3573  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3574  * pages of the proper color even if there are pages of a different color.
3575  *
3576  * Finds a page, removes it, THEN locks it.
3577  */
3578 
3579 /*ARGSUSED*/
3580 page_t *
3581 page_get_freelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3582 	caddr_t vaddr, size_t size, uint_t flags, struct lgrp *lgrp)
3583 {
3584 	struct as	*as = seg->s_as;
3585 	page_t		*pp = NULL;
3586 	ulong_t		bin;
3587 	uchar_t		szc;
3588 	int		mnode;
3589 	int		mtype;
3590 	page_t		*(*page_get_func)(int, uint_t, int, uchar_t, uint_t);
3591 	lgrp_mnode_cookie_t	lgrp_cookie;
3592 
3593 	page_get_func = page_get_mnode_freelist;
3594 
3595 	/*
3596 	 * If we aren't passed a specific lgroup, or passed a freed lgrp
3597 	 * assume we wish to allocate near to the current thread's home.
3598 	 */
3599 	if (!LGRP_EXISTS(lgrp))
3600 		lgrp = lgrp_home_lgrp();
3601 
3602 	if (kcage_on) {
3603 		if ((flags & (PG_NORELOC | PG_PANIC)) == PG_NORELOC &&
3604 		    kcage_freemem < kcage_throttlefree + btop(size) &&
3605 		    curthread != kcage_cageout_thread) {
3606 			/*
3607 			 * Set a "reserve" of kcage_throttlefree pages for
3608 			 * PG_PANIC and cageout thread allocations.
3609 			 *
3610 			 * Everybody else has to serialize in
3611 			 * page_create_get_something() to get a cage page, so
3612 			 * that we don't deadlock cageout!
3613 			 */
3614 			return (NULL);
3615 		}
3616 	} else {
3617 		flags &= ~PG_NORELOC;
3618 		flags |= PGI_NOCAGE;
3619 	}
3620 
3621 	/* LINTED */
3622 	MTYPE_INIT(mtype, vp, vaddr, flags, size);
3623 
3624 	/*
3625 	 * Convert size to page size code.
3626 	 */
3627 	if ((szc = page_szc(size)) == (uchar_t)-1)
3628 		panic("page_get_freelist: illegal page size request");
3629 	ASSERT(szc < mmu_page_sizes);
3630 
3631 	VM_STAT_ADD(vmm_vmstats.pgf_alloc[szc]);
3632 
3633 	/* LINTED */
3634 	AS_2_BIN(as, seg, vp, vaddr, bin, szc);
3635 
3636 	ASSERT(bin < PAGE_GET_PAGECOLORS(szc));
3637 
3638 	/*
3639 	 * Try to get a local page first, but try remote if we can't
3640 	 * get a page of the right color.
3641 	 */
3642 pgretry:
3643 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3644 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3645 		pp = page_get_func(mnode, bin, mtype, szc, flags);
3646 		if (pp != NULL) {
3647 			VM_STAT_ADD(vmm_vmstats.pgf_allocok[szc]);
3648 			DTRACE_PROBE4(page__get,
3649 			    lgrp_t *, lgrp,
3650 			    int, mnode,
3651 			    ulong_t, bin,
3652 			    uint_t, flags);
3653 			return (pp);
3654 		}
3655 	}
3656 	ASSERT(pp == NULL);
3657 
3658 	/*
3659 	 * for non-SZC0 PAGESIZE requests, check cachelist before checking
3660 	 * remote free lists.  Caller expected to call page_get_cachelist which
3661 	 * will check local cache lists and remote free lists.
3662 	 */
3663 	if (szc == 0 && ((flags & PGI_PGCPSZC0) == 0)) {
3664 		VM_STAT_ADD(vmm_vmstats.pgf_allocdeferred);
3665 		return (NULL);
3666 	}
3667 
3668 	ASSERT(szc > 0 || (flags & PGI_PGCPSZC0));
3669 
3670 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3671 
3672 	if (!(flags & PG_LOCAL)) {
3673 		/*
3674 		 * Try to get a non-local freelist page.
3675 		 */
3676 		LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3677 		while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3678 			pp = page_get_func(mnode, bin, mtype, szc, flags);
3679 			if (pp != NULL) {
3680 				DTRACE_PROBE4(page__get,
3681 				    lgrp_t *, lgrp,
3682 				    int, mnode,
3683 				    ulong_t, bin,
3684 				    uint_t, flags);
3685 				VM_STAT_ADD(vmm_vmstats.pgf_allocokrem[szc]);
3686 				return (pp);
3687 			}
3688 		}
3689 		ASSERT(pp == NULL);
3690 	}
3691 
3692 	/*
3693 	 * when the cage is off chances are page_get_contig_pages() will fail
3694 	 * to lock a large page chunk therefore when the cage is off it's not
3695 	 * called by default.  this can be changed via /etc/system.
3696 	 *
3697 	 * page_get_contig_pages() also called to acquire a base pagesize page
3698 	 * for page_create_get_something().
3699 	 */
3700 	if (!(flags & PG_NORELOC) && (pg_contig_disable == 0) &&
3701 	    (kcage_on || pg_lpgcreate_nocage || szc == 0) &&
3702 	    (page_get_func != page_get_contig_pages)) {
3703 
3704 		VM_STAT_ADD(vmm_vmstats.pgf_allocretry[szc]);
3705 		page_get_func = page_get_contig_pages;
3706 		goto pgretry;
3707 	}
3708 
3709 	if (!(flags & PG_LOCAL) && pgcplimitsearch &&
3710 	    page_get_func == page_get_contig_pages)
3711 		SETPGCPFAILCNT(szc);
3712 
3713 	VM_STAT_ADD(vmm_vmstats.pgf_allocfailed[szc]);
3714 	return (NULL);
3715 }
3716 
3717 /*
3718  * Find the `best' page on the cachelist for this (vp,off) (as,vaddr) pair.
3719  *
3720  * Does its own locking.
3721  * If PG_MATCH_COLOR is set, then NULL will be returned if there are no
3722  * pages of the proper color even if there are pages of a different color.
3723  * Otherwise, scan the bins for ones with pages.  For each bin with pages,
3724  * try to lock one of them.  If no page can be locked, try the
3725  * next bin.  Return NULL if a page can not be found and locked.
3726  *
3727  * Finds a pages, trys to lock it, then removes it.
3728  */
3729 
3730 /*ARGSUSED*/
3731 page_t *
3732 page_get_cachelist(struct vnode *vp, u_offset_t off, struct seg *seg,
3733     caddr_t vaddr, uint_t flags, struct lgrp *lgrp)
3734 {
3735 	page_t		*pp;
3736 	struct as	*as = seg->s_as;
3737 	ulong_t		bin;
3738 	/*LINTED*/
3739 	int		mnode;
3740 	int		mtype;
3741 	lgrp_mnode_cookie_t	lgrp_cookie;
3742 
3743 	/*
3744 	 * If we aren't passed a specific lgroup, or pasased a freed lgrp
3745 	 * assume we wish to allocate near to the current thread's home.
3746 	 */
3747 	if (!LGRP_EXISTS(lgrp))
3748 		lgrp = lgrp_home_lgrp();
3749 
3750 	if (!kcage_on) {
3751 		flags &= ~PG_NORELOC;
3752 		flags |= PGI_NOCAGE;
3753 	}
3754 
3755 	if ((flags & (PG_NORELOC | PG_PANIC | PG_PUSHPAGE)) == PG_NORELOC &&
3756 	    kcage_freemem <= kcage_throttlefree) {
3757 		/*
3758 		 * Reserve kcage_throttlefree pages for critical kernel
3759 		 * threads.
3760 		 *
3761 		 * Everybody else has to go to page_create_get_something()
3762 		 * to get a cage page, so we don't deadlock cageout.
3763 		 */
3764 		return (NULL);
3765 	}
3766 
3767 	/* LINTED */
3768 	AS_2_BIN(as, seg, vp, vaddr, bin, 0);
3769 
3770 	ASSERT(bin < PAGE_GET_PAGECOLORS(0));
3771 
3772 	/* LINTED */
3773 	MTYPE_INIT(mtype, vp, vaddr, flags, MMU_PAGESIZE);
3774 
3775 	VM_STAT_ADD(vmm_vmstats.pgc_alloc);
3776 
3777 	/*
3778 	 * Try local cachelists first
3779 	 */
3780 	LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp, LGRP_SRCH_LOCAL);
3781 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3782 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3783 		if (pp != NULL) {
3784 			VM_STAT_ADD(vmm_vmstats.pgc_allocok);
3785 			DTRACE_PROBE4(page__get,
3786 			    lgrp_t *, lgrp,
3787 			    int, mnode,
3788 			    ulong_t, bin,
3789 			    uint_t, flags);
3790 			return (pp);
3791 		}
3792 	}
3793 
3794 	lgrp_stat_add(lgrp->lgrp_id, LGRP_NUM_ALLOC_FAIL, 1);
3795 
3796 	/*
3797 	 * Try freelists/cachelists that are farther away
3798 	 * This is our only chance to allocate remote pages for PAGESIZE
3799 	 * requests.
3800 	 */
3801 	LGRP_MNODE_COOKIE_UPGRADE(lgrp_cookie);
3802 	while ((mnode = lgrp_memnode_choose(&lgrp_cookie)) >= 0) {
3803 		pp = page_get_mnode_freelist(mnode, bin, mtype,
3804 		    0, flags);
3805 		if (pp != NULL) {
3806 			VM_STAT_ADD(vmm_vmstats.pgc_allocokdeferred);
3807 			DTRACE_PROBE4(page__get,
3808 			    lgrp_t *, lgrp,
3809 			    int, mnode,
3810 			    ulong_t, bin,
3811 			    uint_t, flags);
3812 			return (pp);
3813 		}
3814 		pp = page_get_mnode_cachelist(bin, flags, mnode, mtype);
3815 		if (pp != NULL) {
3816 			VM_STAT_ADD(vmm_vmstats.pgc_allocokrem);
3817 			DTRACE_PROBE4(page__get,
3818 			    lgrp_t *, lgrp,
3819 			    int, mnode,
3820 			    ulong_t, bin,
3821 			    uint_t, flags);
3822 			return (pp);
3823 		}
3824 	}
3825 
3826 	VM_STAT_ADD(vmm_vmstats.pgc_allocfailed);
3827 	return (NULL);
3828 }
3829 
3830 page_t *
3831 page_get_mnode_cachelist(uint_t bin, uint_t flags, int mnode, int mtype)
3832 {
3833 	kmutex_t		*pcm;
3834 	page_t			*pp, *first_pp;
3835 	uint_t			sbin;
3836 	int			plw_initialized;
3837 	page_list_walker_t	plw;
3838 
3839 	VM_STAT_ADD(vmm_vmstats.pgmc_alloc);
3840 
3841 	/* LINTED */
3842 	MTYPE_START(mnode, mtype, flags);
3843 	if (mtype < 0) {	/* mnode does not have memory in mtype range */
3844 		VM_STAT_ADD(vmm_vmstats.pgmc_allocempty);
3845 		return (NULL);
3846 	}
3847 
3848 try_again:
3849 
3850 	plw_initialized = 0;
3851 	plw.plw_ceq_dif = 1;
3852 
3853 	/*
3854 	 * Only hold one cachelist lock at a time, that way we
3855 	 * can start anywhere and not have to worry about lock
3856 	 * ordering.
3857 	 */
3858 
3859 	for (plw.plw_count = 0;
3860 	    plw.plw_count < plw.plw_ceq_dif; plw.plw_count++) {
3861 		sbin = bin;
3862 		do {
3863 
3864 			if (!PAGE_CACHELISTS(mnode, bin, mtype))
3865 				goto bin_empty_1;
3866 			pcm = PC_BIN_MUTEX(mnode, bin, PG_CACHE_LIST);
3867 			mutex_enter(pcm);
3868 			pp = PAGE_CACHELISTS(mnode, bin, mtype);
3869 			if (pp == NULL)
3870 				goto bin_empty_0;
3871 
3872 			first_pp = pp;
3873 			ASSERT(pp->p_vnode);
3874 			ASSERT(PP_ISAGED(pp) == 0);
3875 			ASSERT(pp->p_szc == 0);
3876 			ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) == mnode);
3877 			while (!page_trylock(pp, SE_EXCL)) {
3878 				pp = pp->p_next;
3879 				ASSERT(pp->p_szc == 0);
3880 				if (pp == first_pp) {
3881 					/*
3882 					 * We have searched the complete list!
3883 					 * And all of them (might only be one)
3884 					 * are locked. This can happen since
3885 					 * these pages can also be found via
3886 					 * the hash list. When found via the
3887 					 * hash list, they are locked first,
3888 					 * then removed. We give up to let the
3889 					 * other thread run.
3890 					 */
3891 					pp = NULL;
3892 					break;
3893 				}
3894 				ASSERT(pp->p_vnode);
3895 				ASSERT(PP_ISFREE(pp));
3896 				ASSERT(PP_ISAGED(pp) == 0);
3897 				ASSERT(PFN_2_MEM_NODE(pp->p_pagenum) ==
3898 				    mnode);
3899 			}
3900 
3901 			if (pp) {
3902 				page_t	**ppp;
3903 				/*
3904 				 * Found and locked a page.
3905 				 * Pull it off the list.
3906 				 */
3907 				ASSERT(mtype == PP_2_MTYPE(pp));
3908 				ppp = &PAGE_CACHELISTS(mnode, bin, mtype);
3909 				page_sub(ppp, pp);
3910 				/*
3911 				 * Subtract counters before releasing pcm mutex
3912 				 * to avoid a race with page_freelist_coalesce
3913 				 * and page_freelist_split.
3914 				 */
3915 				page_ctr_sub(mnode, mtype, pp, PG_CACHE_LIST);
3916 				mutex_exit(pcm);
3917 				ASSERT(pp->p_vnode);
3918 				ASSERT(PP_ISAGED(pp) == 0);
3919 #if defined(__sparc)
3920 				ASSERT(!kcage_on ||
3921 				    (flags & PG_NORELOC) == 0 ||
3922 				    PP_ISNORELOC(pp));
3923 				if (PP_ISNORELOC(pp)) {
3924 					kcage_freemem_sub(1);
3925 				}
3926 #endif
3927 				VM_STAT_ADD(vmm_vmstats. pgmc_allocok);
3928 				return (pp);
3929 			}
3930 bin_empty_0:
3931 			mutex_exit(pcm);
3932 bin_empty_1:
3933 			if (plw_initialized == 0) {
3934 				page_list_walk_init(0, flags, bin, 0, 1, &plw);
3935 				plw_initialized = 1;
3936 			}
3937 			/* calculate the next bin with equivalent color */
3938 			bin = ADD_MASKED(bin, plw.plw_bin_step,
3939 			    plw.plw_ceq_mask[0], plw.plw_color_mask);
3940 		} while (sbin != bin);
3941 
3942 		if (plw.plw_ceq_dif > 1)
3943 			bin = page_list_walk_next_bin(0, bin, &plw);
3944 	}
3945 
3946 	MTYPE_NEXT(mnode, mtype, flags);
3947 	if (mtype >= 0)
3948 		goto try_again;
3949 
3950 	VM_STAT_ADD(vmm_vmstats.pgmc_allocfailed);
3951 	return (NULL);
3952 }
3953 
3954 #ifdef DEBUG
3955 #define	REPL_PAGE_STATS
3956 #endif /* DEBUG */
3957 
3958 #ifdef REPL_PAGE_STATS
3959 struct repl_page_stats {
3960 	uint_t	ngets;
3961 	uint_t	ngets_noreloc;
3962 	uint_t	npgr_noreloc;
3963 	uint_t	nnopage_first;
3964 	uint_t	nnopage;
3965 	uint_t	nhashout;
3966 	uint_t	nnofree;
3967 	uint_t	nnext_pp;
3968 } repl_page_stats;
3969 #define	REPL_STAT_INCR(v)	atomic_add_32(&repl_page_stats.v, 1)
3970 #else /* REPL_PAGE_STATS */
3971 #define	REPL_STAT_INCR(v)
3972 #endif /* REPL_PAGE_STATS */
3973 
3974 int	pgrppgcp;
3975 
3976 /*
3977  * The freemem accounting must be done by the caller.
3978  * First we try to get a replacement page of the same size as like_pp,
3979  * if that is not possible, then we just get a set of discontiguous
3980  * PAGESIZE pages.
3981  */
3982 page_t *
3983 page_get_replacement_page(page_t *orig_like_pp, struct lgrp *lgrp_target,
3984     uint_t pgrflags)
3985 {
3986 	page_t		*like_pp;
3987 	page_t		*pp, *pplist;
3988 	page_t		*pl = NULL;
3989 	ulong_t		bin;
3990 	int		mnode, page_mnode;
3991 	int		szc;
3992 	spgcnt_t	npgs, pg_cnt;
3993 	pfn_t		pfnum;
3994 	int		mtype;
3995 	int		flags = 0;
3996 	lgrp_mnode_cookie_t	lgrp_cookie;
3997 	lgrp_t		*lgrp;
3998 
3999 	REPL_STAT_INCR(ngets);
4000 	like_pp = orig_like_pp;
4001 	ASSERT(PAGE_EXCL(like_pp));
4002 
4003 	szc = like_pp->p_szc;
4004 	npgs = page_get_pagecnt(szc);
4005 	/*
4006 	 * Now we reset like_pp to the base page_t.
4007 	 * That way, we won't walk past the end of this 'szc' page.
4008 	 */
4009 	pfnum = PFN_BASE(like_pp->p_pagenum, szc);
4010 	like_pp = page_numtopp_nolock(pfnum);
4011 	ASSERT(like_pp->p_szc == szc);
4012 
4013 	if (PP_ISNORELOC(like_pp)) {
4014 		ASSERT(kcage_on);
4015 		REPL_STAT_INCR(ngets_noreloc);
4016 		flags = PGI_RELOCONLY;
4017 	} else if (pgrflags & PGR_NORELOC) {
4018 		ASSERT(kcage_on);
4019 		REPL_STAT_INCR(npgr_noreloc);
4020 		flags = PG_NORELOC;
4021 	}
4022 
4023 	/*
4024 	 * Kernel pages must always be replaced with the same size
4025 	 * pages, since we cannot properly handle demotion of kernel
4026 	 * pages.
4027 	 */
4028 	if (PP_ISKAS(like_pp))
4029 		pgrflags |= PGR_SAMESZC;
4030 
4031 	/* LINTED */
4032 	MTYPE_PGR_INIT(mtype, flags, like_pp, page_mnode, npgs);
4033 
4034 	while (npgs) {
4035 		pplist = NULL;
4036 		for (;;) {
4037 			pg_cnt = page_get_pagecnt(szc);
4038 			bin = PP_2_BIN(like_pp);
4039 			ASSERT(like_pp->p_szc == orig_like_pp->p_szc);
4040 			ASSERT(pg_cnt <= npgs);
4041 
4042 			/*
4043 			 * If an lgroup was specified, try to get the
4044 			 * page from that lgroup.
4045 			 * NOTE: Must be careful with code below because
4046 			 *	 lgroup may disappear and reappear since there
4047 			 *	 is no locking for lgroup here.
4048 			 */
4049 			if (LGRP_EXISTS(lgrp_target)) {
4050 				/*
4051 				 * Keep local variable for lgroup separate
4052 				 * from lgroup argument since this code should
4053 				 * only be exercised when lgroup argument
4054 				 * exists....
4055 				 */
4056 				lgrp = lgrp_target;
4057 
4058 				/* Try the lgroup's freelists first */
4059 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4060 				    LGRP_SRCH_LOCAL);
4061 				while ((pplist == NULL) &&
4062 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4063 				    != -1) {
4064 					pplist =
4065 					    page_get_mnode_freelist(mnode, bin,
4066 					    mtype, szc, flags);
4067 				}
4068 
4069 				/*
4070 				 * Now try it's cachelists if this is a
4071 				 * small page. Don't need to do it for
4072 				 * larger ones since page_freelist_coalesce()
4073 				 * already failed.
4074 				 */
4075 				if (pplist != NULL || szc != 0)
4076 					break;
4077 
4078 				/* Now try it's cachelists */
4079 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4080 				    LGRP_SRCH_LOCAL);
4081 
4082 				while ((pplist == NULL) &&
4083 				    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4084 				    != -1) {
4085 					pplist =
4086 					    page_get_mnode_cachelist(bin, flags,
4087 					    mnode, mtype);
4088 				}
4089 				if (pplist != NULL) {
4090 					page_hashout(pplist, NULL);
4091 					PP_SETAGED(pplist);
4092 					REPL_STAT_INCR(nhashout);
4093 					break;
4094 				}
4095 				/* Done looking in this lgroup. Bail out. */
4096 				break;
4097 			}
4098 
4099 			/*
4100 			 * No lgroup was specified (or lgroup was removed by
4101 			 * DR, so just try to get the page as close to
4102 			 * like_pp's mnode as possible.
4103 			 * First try the local freelist...
4104 			 */
4105 			mnode = PP_2_MEM_NODE(like_pp);
4106 			pplist = page_get_mnode_freelist(mnode, bin,
4107 			    mtype, szc, flags);
4108 			if (pplist != NULL)
4109 				break;
4110 
4111 			REPL_STAT_INCR(nnofree);
4112 
4113 			/*
4114 			 * ...then the local cachelist. Don't need to do it for
4115 			 * larger pages cause page_freelist_coalesce() already
4116 			 * failed there anyway.
4117 			 */
4118 			if (szc == 0) {
4119 				pplist = page_get_mnode_cachelist(bin, flags,
4120 				    mnode, mtype);
4121 				if (pplist != NULL) {
4122 					page_hashout(pplist, NULL);
4123 					PP_SETAGED(pplist);
4124 					REPL_STAT_INCR(nhashout);
4125 					break;
4126 				}
4127 			}
4128 
4129 			/* Now try remote freelists */
4130 			page_mnode = mnode;
4131 			lgrp =
4132 			    lgrp_hand_to_lgrp(MEM_NODE_2_LGRPHAND(page_mnode));
4133 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4134 			    LGRP_SRCH_HIER);
4135 			while (pplist == NULL &&
4136 			    (mnode = lgrp_memnode_choose(&lgrp_cookie))
4137 			    != -1) {
4138 				/*
4139 				 * Skip local mnode.
4140 				 */
4141 				if ((mnode == page_mnode) ||
4142 				    (mem_node_config[mnode].exists == 0))
4143 					continue;
4144 
4145 				pplist = page_get_mnode_freelist(mnode,
4146 				    bin, mtype, szc, flags);
4147 			}
4148 
4149 			if (pplist != NULL)
4150 				break;
4151 
4152 
4153 			/* Now try remote cachelists */
4154 			LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4155 			    LGRP_SRCH_HIER);
4156 			while (pplist == NULL && szc == 0) {
4157 				mnode = lgrp_memnode_choose(&lgrp_cookie);
4158 				if (mnode == -1)
4159 					break;
4160 				/*
4161 				 * Skip local mnode.
4162 				 */
4163 				if ((mnode == page_mnode) ||
4164 				    (mem_node_config[mnode].exists == 0))
4165 					continue;
4166 
4167 				pplist = page_get_mnode_cachelist(bin,
4168 				    flags, mnode, mtype);
4169 
4170 				if (pplist != NULL) {
4171 					page_hashout(pplist, NULL);
4172 					PP_SETAGED(pplist);
4173 					REPL_STAT_INCR(nhashout);
4174 					break;
4175 				}
4176 			}
4177 
4178 			/*
4179 			 * Break out of while loop under the following cases:
4180 			 * - If we successfully got a page.
4181 			 * - If pgrflags specified only returning a specific
4182 			 *   page size and we could not find that page size.
4183 			 * - If we could not satisfy the request with PAGESIZE
4184 			 *   or larger pages.
4185 			 */
4186 			if (pplist != NULL || szc == 0)
4187 				break;
4188 
4189 			if ((pgrflags & PGR_SAMESZC) || pgrppgcp) {
4190 				/* try to find contig page */
4191 
4192 				LGRP_MNODE_COOKIE_INIT(lgrp_cookie, lgrp,
4193 				    LGRP_SRCH_HIER);
4194 
4195 				while ((pplist == NULL) &&
4196 				    (mnode =
4197 				    lgrp_memnode_choose(&lgrp_cookie))
4198 				    != -1) {
4199 					pplist = page_get_contig_pages(
4200 					    mnode, bin, mtype, szc,
4201 					    flags | PGI_PGCPHIPRI);
4202 				}
4203 				break;
4204 			}
4205 
4206 			/*
4207 			 * The correct thing to do here is try the next
4208 			 * page size down using szc--. Due to a bug
4209 			 * with the processing of HAT_RELOAD_SHARE
4210 			 * where the sfmmu_ttecnt arrays of all
4211 			 * hats sharing an ISM segment don't get updated,
4212 			 * using intermediate size pages for relocation
4213 			 * can lead to continuous page faults.
4214 			 */
4215 			szc = 0;
4216 		}
4217 
4218 		if (pplist != NULL) {
4219 			DTRACE_PROBE4(page__get,
4220 			    lgrp_t *, lgrp,
4221 			    int, mnode,
4222 			    ulong_t, bin,
4223 			    uint_t, flags);
4224 
4225 			while (pplist != NULL && pg_cnt--) {
4226 				ASSERT(pplist != NULL);
4227 				pp = pplist;
4228 				page_sub(&pplist, pp);
4229 				PP_CLRFREE(pp);
4230 				PP_CLRAGED(pp);
4231 				page_list_concat(&pl, &pp);
4232 				npgs--;
4233 				like_pp = like_pp + 1;
4234 				REPL_STAT_INCR(nnext_pp);
4235 			}
4236 			ASSERT(pg_cnt == 0);
4237 		} else {
4238 			break;
4239 		}
4240 	}
4241 
4242 	if (npgs) {
4243 		/*
4244 		 * We were unable to allocate the necessary number
4245 		 * of pages.
4246 		 * We need to free up any pl.
4247 		 */
4248 		REPL_STAT_INCR(nnopage);
4249 		page_free_replacement_page(pl);
4250 		return (NULL);
4251 	} else {
4252 		return (pl);
4253 	}
4254 }
4255 
4256 /*
4257  * demote a free large page to it's constituent pages
4258  */
4259 void
4260 page_demote_free_pages(page_t *pp)
4261 {
4262 
4263 	int mnode;
4264 
4265 	ASSERT(pp != NULL);
4266 	ASSERT(PAGE_LOCKED(pp));
4267 	ASSERT(PP_ISFREE(pp));
4268 	ASSERT(pp->p_szc != 0 && pp->p_szc < mmu_page_sizes);
4269 
4270 	mnode = PP_2_MEM_NODE(pp);
4271 	page_freelist_lock(mnode);
4272 	if (pp->p_szc != 0) {
4273 		(void) page_demote(mnode, PFN_BASE(pp->p_pagenum,
4274 		    pp->p_szc), pp->p_szc, 0, PC_NO_COLOR, PC_FREE);
4275 	}
4276 	page_freelist_unlock(mnode);
4277 	ASSERT(pp->p_szc == 0);
4278 }
4279 
4280 /*
4281  * Factor in colorequiv to check additional 'equivalent' bins.
4282  * colorequiv may be set in /etc/system
4283  */
4284 void
4285 page_set_colorequiv_arr(void)
4286 {
4287 	if (colorequiv > 1) {
4288 		int i;
4289 		uint_t sv_a = lowbit(colorequiv) - 1;
4290 
4291 		if (sv_a > 15)
4292 			sv_a = 15;
4293 
4294 		for (i = 0; i < MMU_PAGE_SIZES; i++) {
4295 			uint_t colors;
4296 			uint_t a = sv_a;
4297 
4298 			if ((colors = hw_page_array[i].hp_colors) <= 1) {
4299 				continue;
4300 			}
4301 			while ((colors >> a) == 0)
4302 				a--;
4303 			if ((a << 4) > colorequivszc[i]) {
4304 				colorequivszc[i] = (a << 4);
4305 			}
4306 		}
4307 	}
4308 }
4309