xref: /illumos-gate/usr/src/uts/sun4/vm/vm_dep.c (revision 3db86aab)
1 /*
2  * CDDL HEADER START
3  *
4  * The contents of this file are subject to the terms of the
5  * Common Development and Distribution License (the "License").
6  * You may not use this file except in compliance with the License.
7  *
8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
9  * or http://www.opensolaris.org/os/licensing.
10  * See the License for the specific language governing permissions
11  * and limitations under the License.
12  *
13  * When distributing Covered Code, include this CDDL HEADER in each
14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
15  * If applicable, add the following below this CDDL HEADER, with the
16  * fields enclosed by brackets "[]" replaced with your own identifying
17  * information: Portions Copyright [yyyy] [name of copyright owner]
18  *
19  * CDDL HEADER END
20  */
21 /*
22  * Copyright 2006 Sun Microsystems, Inc.  All rights reserved.
23  * Use is subject to license terms.
24  */
25 
26 #pragma ident	"%Z%%M%	%I%	%E% SMI"
27 
28 /*
29  * UNIX machine dependent virtual memory support.
30  */
31 
32 #include <sys/vm.h>
33 #include <sys/exec.h>
34 
35 #include <sys/exechdr.h>
36 #include <vm/seg_kmem.h>
37 #include <sys/atomic.h>
38 #include <sys/archsystm.h>
39 #include <sys/machsystm.h>
40 #include <sys/kdi.h>
41 #include <sys/cpu_module.h>
42 
43 #include <vm/hat_sfmmu.h>
44 
45 #include <sys/memnode.h>
46 
47 #include <sys/mem_config.h>
48 #include <sys/mem_cage.h>
49 #include <vm/vm_dep.h>
50 #include <sys/platform_module.h>
51 
52 /*
53  * These variables are set by module specific config routines.
54  * They are only set by modules which will use physical cache page coloring
55  * and/or virtual cache page coloring.
56  */
57 int do_pg_coloring = 0;
58 int do_virtual_coloring = 0;
59 
60 /*
61  * These variables can be conveniently patched at kernel load time to
62  * prevent do_pg_coloring or do_virtual_coloring from being enabled by
63  * module specific config routines.
64  */
65 
66 int use_page_coloring = 1;
67 int use_virtual_coloring = 1;
68 
69 /*
70  * initialized by page_coloring_init()
71  */
72 extern uint_t page_colors;
73 extern uint_t page_colors_mask;
74 extern uint_t page_coloring_shift;
75 int cpu_page_colors;
76 uint_t vac_colors = 0;
77 uint_t vac_colors_mask = 0;
78 
79 /*
80  * get the ecache setsize for the current cpu.
81  */
82 #define	CPUSETSIZE()	(cpunodes[CPU->cpu_id].ecache_setsize)
83 
84 plcnt_t		plcnt;		/* page list count */
85 
86 /*
87  * This variable is set by the cpu module to contain the lowest
88  * address not affected by the SF_ERRATA_57 workaround.  It should
89  * remain 0 if the workaround is not needed.
90  */
91 #if defined(SF_ERRATA_57)
92 caddr_t errata57_limit;
93 #endif
94 
95 extern int disable_auto_large_pages;	/* used by map_pgsz*() routines */
96 
97 extern void page_relocate_hash(page_t *, page_t *);
98 
99 /*
100  * these must be defined in platform specific areas
101  */
102 extern void map_addr_proc(caddr_t *, size_t, offset_t, int, caddr_t,
103 	struct proc *, uint_t);
104 extern page_t *page_get_freelist(struct vnode *, u_offset_t, struct seg *,
105 	caddr_t, size_t, uint_t, struct lgrp *);
106 /*
107  * Convert page frame number to an OBMEM page frame number
108  * (i.e. put in the type bits -- zero for this implementation)
109  */
110 pfn_t
111 impl_obmem_pfnum(pfn_t pf)
112 {
113 	return (pf);
114 }
115 
116 /*
117  * Use physmax to determine the highest physical page of DRAM memory
118  * It is assumed that any physical addresses above physmax is in IO space.
119  * We don't bother checking the low end because we assume that memory space
120  * begins at physical page frame 0.
121  *
122  * Return 1 if the page frame is onboard DRAM memory, else 0.
123  * Returns 0 for nvram so it won't be cached.
124  */
125 int
126 pf_is_memory(pfn_t pf)
127 {
128 	/* We must be IO space */
129 	if (pf > physmax)
130 		return (0);
131 
132 	/* We must be memory space */
133 	return (1);
134 }
135 
136 /*
137  * Handle a pagefault.
138  */
139 faultcode_t
140 pagefault(caddr_t addr, enum fault_type type, enum seg_rw rw, int iskernel)
141 {
142 	struct as *as;
143 	struct proc *p;
144 	faultcode_t res;
145 	caddr_t base;
146 	size_t len;
147 	int err;
148 
149 	if (INVALID_VADDR(addr))
150 		return (FC_NOMAP);
151 
152 	if (iskernel) {
153 		as = &kas;
154 	} else {
155 		p = curproc;
156 		as = p->p_as;
157 #if defined(SF_ERRATA_57)
158 		/*
159 		 * Prevent infinite loops due to a segment driver
160 		 * setting the execute permissions and the sfmmu hat
161 		 * silently ignoring them.
162 		 */
163 		if (rw == S_EXEC && AS_TYPE_64BIT(as) &&
164 		    addr < errata57_limit) {
165 			res = FC_NOMAP;
166 			goto out;
167 		}
168 #endif
169 	}
170 
171 	/*
172 	 * Dispatch pagefault.
173 	 */
174 	res = as_fault(as->a_hat, as, addr, 1, type, rw);
175 
176 	/*
177 	 * If this isn't a potential unmapped hole in the user's
178 	 * UNIX data or stack segments, just return status info.
179 	 */
180 	if (!(res == FC_NOMAP && iskernel == 0))
181 		goto out;
182 
183 	/*
184 	 * Check to see if we happened to faulted on a currently unmapped
185 	 * part of the UNIX data or stack segments.  If so, create a zfod
186 	 * mapping there and then try calling the fault routine again.
187 	 */
188 	base = p->p_brkbase;
189 	len = p->p_brksize;
190 
191 	if (addr < base || addr >= base + len) {		/* data seg? */
192 		base = (caddr_t)(p->p_usrstack - p->p_stksize);
193 		len = p->p_stksize;
194 		if (addr < base || addr >= p->p_usrstack) {	/* stack seg? */
195 			/* not in either UNIX data or stack segments */
196 			res = FC_NOMAP;
197 			goto out;
198 		}
199 	}
200 
201 	/* the rest of this function implements a 3.X 4.X 5.X compatibility */
202 	/* This code is probably not needed anymore */
203 
204 	/* expand the gap to the page boundaries on each side */
205 	len = (((uintptr_t)base + len + PAGEOFFSET) & PAGEMASK) -
206 	    ((uintptr_t)base & PAGEMASK);
207 	base = (caddr_t)((uintptr_t)base & PAGEMASK);
208 
209 	as_rangelock(as);
210 	as_purge(as);
211 	if (as_gap(as, PAGESIZE, &base, &len, AH_CONTAIN, addr) == 0) {
212 		err = as_map(as, base, len, segvn_create, zfod_argsp);
213 		as_rangeunlock(as);
214 		if (err) {
215 			res = FC_MAKE_ERR(err);
216 			goto out;
217 		}
218 	} else {
219 		/*
220 		 * This page is already mapped by another thread after we
221 		 * returned from as_fault() above.  We just fallthrough
222 		 * as_fault() below.
223 		 */
224 		as_rangeunlock(as);
225 	}
226 
227 	res = as_fault(as->a_hat, as, addr, 1, F_INVAL, rw);
228 
229 out:
230 
231 	return (res);
232 }
233 
234 /*
235  * This is the routine which defines the address limit implied
236  * by the flag '_MAP_LOW32'.  USERLIMIT32 matches the highest
237  * mappable address in a 32-bit process on this platform (though
238  * perhaps we should make it be UINT32_MAX here?)
239  */
240 void
241 map_addr(caddr_t *addrp, size_t len, offset_t off, int vacalign, uint_t flags)
242 {
243 	struct proc *p = curproc;
244 	caddr_t userlimit = flags & _MAP_LOW32 ?
245 		(caddr_t)USERLIMIT32 : p->p_as->a_userlimit;
246 	map_addr_proc(addrp, len, off, vacalign, userlimit, p, flags);
247 }
248 
249 /*
250  * Some V9 CPUs have holes in the middle of the 64-bit virtual address range.
251  */
252 caddr_t	hole_start, hole_end;
253 
254 /*
255  * kpm mapping window
256  */
257 caddr_t kpm_vbase;
258 size_t  kpm_size;
259 uchar_t kpm_size_shift;
260 
261 /*
262  * Determine whether [base, base+len] contains a mapable range of
263  * addresses at least minlen long. base and len are adjusted if
264  * required to provide a mapable range.
265  */
266 /* ARGSUSED */
267 int
268 valid_va_range(caddr_t *basep, size_t *lenp, size_t minlen, int dir)
269 {
270 	caddr_t hi, lo;
271 
272 	lo = *basep;
273 	hi = lo + *lenp;
274 
275 	/*
276 	 * If hi rolled over the top, try cutting back.
277 	 */
278 	if (hi < lo) {
279 		size_t newlen = 0 - (uintptr_t)lo - 1l;
280 
281 		if (newlen + (uintptr_t)hi < minlen)
282 			return (0);
283 		if (newlen < minlen)
284 			return (0);
285 		*lenp = newlen;
286 	} else if (hi - lo < minlen)
287 		return (0);
288 
289 	/*
290 	 * Deal with a possible hole in the address range between
291 	 * hole_start and hole_end that should never be mapped by the MMU.
292 	 */
293 	hi = lo + *lenp;
294 
295 	if (lo < hole_start) {
296 		if (hi > hole_start)
297 			if (hi < hole_end)
298 				hi = hole_start;
299 			else
300 				/* lo < hole_start && hi >= hole_end */
301 				if (dir == AH_LO) {
302 					/*
303 					 * prefer lowest range
304 					 */
305 					if (hole_start - lo >= minlen)
306 						hi = hole_start;
307 					else if (hi - hole_end >= minlen)
308 						lo = hole_end;
309 					else
310 						return (0);
311 				} else {
312 					/*
313 					 * prefer highest range
314 					 */
315 					if (hi - hole_end >= minlen)
316 						lo = hole_end;
317 					else if (hole_start - lo >= minlen)
318 						hi = hole_start;
319 					else
320 						return (0);
321 				}
322 	} else {
323 		/* lo >= hole_start */
324 		if (hi < hole_end)
325 			return (0);
326 		if (lo < hole_end)
327 			lo = hole_end;
328 	}
329 
330 	if (hi - lo < minlen)
331 		return (0);
332 
333 	*basep = lo;
334 	*lenp = hi - lo;
335 
336 	return (1);
337 }
338 
339 /*
340  * Determine whether [addr, addr+len] with protections `prot' are valid
341  * for a user address space.
342  */
343 /*ARGSUSED*/
344 int
345 valid_usr_range(caddr_t addr, size_t len, uint_t prot, struct as *as,
346     caddr_t userlimit)
347 {
348 	caddr_t eaddr = addr + len;
349 
350 	if (eaddr <= addr || addr >= userlimit || eaddr > userlimit)
351 		return (RANGE_BADADDR);
352 
353 	/*
354 	 * Determine if the address range falls within an illegal
355 	 * range of the MMU.
356 	 */
357 	if (eaddr > hole_start && addr < hole_end)
358 		return (RANGE_BADADDR);
359 
360 #if defined(SF_ERRATA_57)
361 	/*
362 	 * Make sure USERLIMIT isn't raised too high
363 	 */
364 	ASSERT64(addr <= (caddr_t)0xffffffff80000000ul ||
365 	    errata57_limit == 0);
366 
367 	if (AS_TYPE_64BIT(as) &&
368 	    (addr < errata57_limit) &&
369 	    (prot & PROT_EXEC))
370 		return (RANGE_BADPROT);
371 #endif /* SF_ERRATA57 */
372 	return (RANGE_OKAY);
373 }
374 
375 /*
376  * Routine used to check to see if an a.out can be executed
377  * by the current machine/architecture.
378  */
379 int
380 chkaout(struct exdata *exp)
381 {
382 	if (exp->ux_mach == M_SPARC)
383 		return (0);
384 	else
385 		return (ENOEXEC);
386 }
387 
388 /*
389  * The following functions return information about an a.out
390  * which is used when a program is executed.
391  */
392 
393 /*
394  * Return the load memory address for the data segment.
395  */
396 caddr_t
397 getdmem(struct exec *exp)
398 {
399 	/*
400 	 * XXX - Sparc Reference Hack approaching
401 	 * Remember that we are loading
402 	 * 8k executables into a 4k machine
403 	 * DATA_ALIGN == 2 * PAGESIZE
404 	 */
405 	if (exp->a_text)
406 		return ((caddr_t)(roundup(USRTEXT + exp->a_text, DATA_ALIGN)));
407 	else
408 		return ((caddr_t)USRTEXT);
409 }
410 
411 /*
412  * Return the starting disk address for the data segment.
413  */
414 ulong_t
415 getdfile(struct exec *exp)
416 {
417 	if (exp->a_magic == ZMAGIC)
418 		return (exp->a_text);
419 	else
420 		return (sizeof (struct exec) + exp->a_text);
421 }
422 
423 /*
424  * Return the load memory address for the text segment.
425  */
426 
427 /*ARGSUSED*/
428 caddr_t
429 gettmem(struct exec *exp)
430 {
431 	return ((caddr_t)USRTEXT);
432 }
433 
434 /*
435  * Return the file byte offset for the text segment.
436  */
437 uint_t
438 gettfile(struct exec *exp)
439 {
440 	if (exp->a_magic == ZMAGIC)
441 		return (0);
442 	else
443 		return (sizeof (struct exec));
444 }
445 
446 void
447 getexinfo(
448 	struct exdata *edp_in,
449 	struct exdata *edp_out,
450 	int *pagetext,
451 	int *pagedata)
452 {
453 	*edp_out = *edp_in;	/* structure copy */
454 
455 	if ((edp_in->ux_mag == ZMAGIC) &&
456 	    ((edp_in->vp->v_flag & VNOMAP) == 0)) {
457 		*pagetext = 1;
458 		*pagedata = 1;
459 	} else {
460 		*pagetext = 0;
461 		*pagedata = 0;
462 	}
463 }
464 
/*
 * Walk the size codes from (upper) down to, but not including, (lower)
 * and store in (pgsz) the size of the first hw_page_array[] entry that is
 * not masked out in disable_auto_large_pages and whose size is no larger
 * than (len).  (pgsz) is left untouched when no size code qualifies;
 * (n) is a caller-supplied scratch loop variable.
 *
 * The body is wrapped in do { } while (0) so that the expansion followed
 * by the caller's semicolon forms exactly one statement, which keeps the
 * macro safe inside an unbraced if/else.
 */
#define	MAP_PGSZ_COMMON(pgsz, n, upper, lower, len)			\
	do {								\
		for ((n) = (upper); (n) > (lower); (n)--) {		\
			if (disable_auto_large_pages & (1 << (n)))	\
				continue;				\
			if (hw_page_array[(n)].hp_size <= (len)) {	\
				(pgsz) = hw_page_array[(n)].hp_size;	\
				break;					\
			}						\
		}							\
	} while (0)
474 
475 
476 /*ARGSUSED*/
477 size_t
478 map_pgszva(struct proc *p, caddr_t addr, size_t len)
479 {
480 	size_t		pgsz = MMU_PAGESIZE;
481 	int		n, upper;
482 
483 	/*
484 	 * Select the best fit page size within the constraints of
485 	 * auto_lpg_{min,max}szc.
486 	 *
487 	 * Note that we also take the heap size into account when
488 	 * deciding if we've crossed the threshold at which we should
489 	 * increase the page size.  This isn't perfect since the heap
490 	 * may not have reached its full size yet, but it's better than
491 	 * not considering it at all.
492 	 */
493 	len += p->p_brksize;
494 	if (ptob(auto_lpg_tlb_threshold) <= len) {
495 
496 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
497 
498 		/*
499 		 * Use auto_lpg_minszc - 1 as the limit so we never drop
500 		 * below auto_lpg_minszc.  We don't have a size code to refer
501 		 * to like we have for bss and stack, so we assume 0.
502 		 * auto_lpg_minszc should always be >= 0.  Using
503 		 * auto_lpg_minszc cuts off the loop.
504 		 */
505 		MAP_PGSZ_COMMON(pgsz, n, upper, auto_lpg_minszc - 1, len);
506 	}
507 
508 	return (pgsz);
509 }
510 
511 size_t
512 map_pgszheap(struct proc *p, caddr_t addr, size_t len)
513 {
514 	size_t		pgsz;
515 	int		n, upper, lower;
516 
517 	/*
518 	 * If len is zero, retrieve from proc and don't demote the page size.
519 	 */
520 	if (len == 0) {
521 		len = p->p_brksize;
522 	}
523 
524 	/*
525 	 * Still zero?  Then we don't have a heap yet, so pick the default
526 	 * heap size.
527 	 */
528 	if (len == 0) {
529 		pgsz = auto_lpg_heap_default;
530 	} else {
531 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
532 	}
533 
534 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
535 		/*
536 		 * We're past the threshold, so select the best fit
537 		 * page size within the constraints of
538 		 * auto_lpg_{min,max}szc and the minimum required
539 		 * alignment.
540 		 */
541 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
542 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
543 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
544 	}
545 
546 	/*
547 	 * If addr == 0 we were called by memcntl() or exec_args() when the
548 	 * size code is 0.  Don't set pgsz less than current size.
549 	 */
550 	if (addr == 0 && (pgsz < hw_page_array[p->p_brkpageszc].hp_size)) {
551 		pgsz = hw_page_array[p->p_brkpageszc].hp_size;
552 	}
553 
554 	return (pgsz);
555 }
556 
557 size_t
558 map_pgszstk(struct proc *p, caddr_t addr, size_t len)
559 {
560 	size_t		pgsz;
561 	int		n, upper, lower;
562 
563 	/*
564 	 * If len is zero, retrieve from proc and don't demote the page size.
565 	 */
566 	if (len == 0) {
567 		len = p->p_stksize;
568 	}
569 
570 	/*
571 	 * Still zero?  Then we don't have a heap yet, so pick the default
572 	 * stack size.
573 	 */
574 	if (len == 0) {
575 		pgsz = auto_lpg_stack_default;
576 	} else {
577 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
578 	}
579 
580 	if ((pgsz * auto_lpg_tlb_threshold) <= len) {
581 		/*
582 		 * We're past the threshold, so select the best fit
583 		 * page size within the constraints of
584 		 * auto_lpg_{min,max}szc and the minimum required
585 		 * alignment.
586 		 */
587 		upper = MIN(mmu_page_sizes - 1, auto_lpg_maxszc);
588 		lower = MAX(auto_lpg_minszc - 1, p->p_brkpageszc);
589 		MAP_PGSZ_COMMON(pgsz, n, upper, lower, len);
590 	}
591 
592 	/*
593 	 * If addr == 0 we were called by memcntl() or exec_args() when the
594 	 * size code is 0.  Don't set pgsz less than current size.
595 	 */
596 	if (addr == 0 && (pgsz < hw_page_array[p->p_stkpageszc].hp_size)) {
597 		pgsz = hw_page_array[p->p_stkpageszc].hp_size;
598 	}
599 
600 	return (pgsz);
601 }
602 
603 
604 /*
605  * Return non 0 value if the address may cause a VAC alias with KPM mappings.
606  * KPM selects an address such that it's equal offset modulo shm_alignment and
607  * assumes it can't be in VAC conflict with any larger than PAGESIZE mapping.
608  */
609 int
610 map_addr_vacalign_check(caddr_t addr, u_offset_t off)
611 {
612 	if (vac) {
613 		return (((uintptr_t)addr ^ off) & shm_alignment - 1);
614 	} else {
615 		return (0);
616 	}
617 }
618 
619 /*
620  * use_text_pgsz64k, use_initdata_pgsz64k and use_text_pgsz4m
621  * can be set in platform or CPU specific code but user can change the
622  * default values via /etc/system.
623  *
624  * Initial values are defined in architecture specific mach_vm_dep.c file.
625  */
626 extern int use_text_pgsz64k;
627 extern int use_text_pgsz4m;
628 extern int use_initdata_pgsz64k;
629 
630 /*
631  * disable_text_largepages and disable_initdata_largepages bitmaks are set in
632  * platform or CPU specific code to disable page sizes that should not be
633  * used. These variables normally shouldn't be changed via /etc/system. A
634  * particular page size for text or inititialized data will be used by default
635  * if both one of use_* variables is set to 1 AND this page size is not
636  * disabled in the corresponding disable_* bitmask variable.
637  *
638  * Initial values are defined in architecture specific mach_vm_dep.c file.
639  */
640 extern int disable_text_largepages;
641 extern int disable_initdata_largepages;
642 
643 /*
644  * Minimum segment size tunables before 64K or 4M large pages
645  * should be used to map it.
646  *
647  * Initial values are defined in architecture specific mach_vm_dep.c file.
648  */
649 extern size_t text_pgsz64k_minsize;
650 extern size_t text_pgsz4m_minsize;
651 extern size_t initdata_pgsz64k_minsize;
652 
653 /*
654  * Sanity control. Don't use large pages regardless of user
655  * settings if there's less than execseg_lpg_min_physmem memory installed.
656  * The units for this variable is 8K pages.
657  */
658 pgcnt_t execseg_lpg_min_physmem = 131072;		/* 1GB */
659 
660 
661 /* assumes TTE8K...TTE4M == szc */
662 
663 static uint_t
664 map_text_pgsz4m(caddr_t addr, size_t len)
665 {
666 	caddr_t a;
667 
668 	if (len < text_pgsz4m_minsize) {
669 		return (0);
670 	}
671 
672 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
673 	if (a < addr || a >= addr + len) {
674 		return (0);
675 	}
676 	len -= (a - addr);
677 	if (len < MMU_PAGESIZE4M) {
678 		return (0);
679 	}
680 
681 	return (1 << TTE4M);
682 }
683 
684 static uint_t
685 map_text_pgsz64k(caddr_t addr, size_t len)
686 {
687 	caddr_t a;
688 	size_t svlen = len;
689 
690 	if (len < text_pgsz64k_minsize) {
691 		return (0);
692 	}
693 
694 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
695 	if (a < addr || a >= addr + len) {
696 		return (0);
697 	}
698 	len -= (a - addr);
699 	if (len < MMU_PAGESIZE64K) {
700 		return (0);
701 	}
702 	if (!use_text_pgsz4m ||
703 	    disable_text_largepages & (1 << TTE4M)) {
704 		return (1 << TTE64K);
705 	}
706 	if (svlen < text_pgsz4m_minsize) {
707 		return (1 << TTE64K);
708 	}
709 	addr = a;
710 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE4M, uintptr_t);
711 	if (a < addr || a >= addr + len) {
712 		return (1 << TTE64K);
713 	}
714 	len -= (a - addr);
715 	if (len < MMU_PAGESIZE4M) {
716 		return (1 << TTE64K);
717 	}
718 	return ((1 << TTE4M) | (1 << TTE64K));
719 }
720 
721 static uint_t
722 map_initdata_pgsz64k(caddr_t addr, size_t len)
723 {
724 	caddr_t a;
725 
726 	if (len < initdata_pgsz64k_minsize) {
727 		return (0);
728 	}
729 
730 	a = (caddr_t)P2ROUNDUP_TYPED(addr, MMU_PAGESIZE64K, uintptr_t);
731 	if (a < addr || a >= addr + len) {
732 		return (0);
733 	}
734 	len -= (a - addr);
735 	if (len < MMU_PAGESIZE64K) {
736 		return (0);
737 	}
738 	return (1 << TTE64K);
739 }
740 
741 /*
742  * Return a bit vector of large page size codes that
743  * can be used to map [addr, addr + len) region.
744  */
745 uint_t
746 map_execseg_pgszcvec(int text, caddr_t addr, size_t len)
747 {
748 	uint_t ret = 0;
749 
750 	if (physmem < execseg_lpg_min_physmem) {
751 		return (0);
752 	}
753 
754 	if (text) {
755 		if (use_text_pgsz64k &&
756 		    !(disable_text_largepages & (1 << TTE64K))) {
757 			ret = map_text_pgsz64k(addr, len);
758 		} else if (use_text_pgsz4m &&
759 		    !(disable_text_largepages & (1 << TTE4M))) {
760 			ret = map_text_pgsz4m(addr, len);
761 		}
762 	} else if (use_initdata_pgsz64k &&
763 	    !(disable_initdata_largepages & (1 << TTE64K))) {
764 		ret = map_initdata_pgsz64k(addr, len);
765 	}
766 
767 	return (ret);
768 }
769 
/*
 * Size of a page with the given size code, expressed in units of the
 * size-code-0 page (hp_size shifted down by hw_page_array[0].hp_shift).
 * The argument is parenthesized so that any expression can be passed.
 */
#define	PNUM_SIZE(size_code)						\
	(hw_page_array[(size_code)].hp_size >> hw_page_array[0].hp_shift)
772 
773 /*
774  * Anchored in the table below are counters used to keep track
775  * of free contiguous physical memory. Each element of the table contains
776  * the array of counters, the size of the array (which is allocated during
777  * startup based on physmax) and a shift value used to convert a pagenum
778  * into a counter array index or vice versa. The table has page size
779  * for rows and region size for columns:
780  *
781  *	page_counters[page_size][region_size]
782  *
783  *	page_size: 	TTE size code of pages on page_size freelist.
784  *
785  *	region_size:	TTE size code of a candidate larger page
786  *			made up of contiguous free page_size pages.
787  *
788  * As you go across a page_size row increasing region_size, each
789  * element keeps track of how many (region_size - 1) size groups
790  * made up of page_size free pages can be coalesced into a
791  * region_size page. Yuck! Let's try an example:
792  *
793  * 	page_counters[1][3] is the table element used for identifying
794  *	candidate 4M pages from contiguous pages off the 64K free list.
795  *	Each index in the page_counters[1][3].array spans 4M. It's the
796  *	number of free 512K size (region_size - 1) groups of contiguous
797  *	64K free pages.	So when page_counters[1][3].counters[n] == 8
798  *	we know we have a candidate 4M page made up of 512K size groups
799  *	of 64K free pages.
800  */
801 
802 /*
803  * Per page size free lists. 3rd (max_mem_nodes) and 4th (page coloring bins)
804  * dimensions are allocated dynamically.
805  */
806 page_t ***page_freelists[MMU_PAGE_SIZES][MAX_MEM_TYPES];
807 
808 /*
809  * For now there is only a single size cache list.
810  * Allocated dynamically.
811  */
812 page_t ***page_cachelists[MAX_MEM_TYPES];
813 
814 kmutex_t *fpc_mutex[NPC_MUTEX];
815 kmutex_t *cpc_mutex[NPC_MUTEX];
816 
817 caddr_t
818 alloc_page_freelists(int mnode, caddr_t alloc_base, int alloc_align)
819 {
820 	int	mtype;
821 	uint_t	szc;
822 
823 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
824 
825 	/*
826 	 * We only support small pages in the cachelist.
827 	 */
828 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
829 		page_cachelists[mtype][mnode] = (page_t **)alloc_base;
830 		alloc_base += (sizeof (page_t *) * page_colors);
831 		/*
832 		 * Allocate freelists bins for all
833 		 * supported page sizes.
834 		 */
835 		for (szc = 0; szc < mmu_page_sizes; szc++) {
836 			page_freelists[szc][mtype][mnode] =
837 			    (page_t **)alloc_base;
838 			alloc_base += ((sizeof (page_t *) *
839 			    page_get_pagecolors(szc)));
840 		}
841 	}
842 
843 	alloc_base = (caddr_t)roundup((uintptr_t)alloc_base, alloc_align);
844 
845 	return (alloc_base);
846 }
847 
848 /*
849  * Allocate page_freelists bin headers for a memnode from the
850  * nucleus data area. This is the first time that mmu_page_sizes is
851  * used during sun4u bootup, so check mmu_page_sizes initialization.
852  */
853 int
854 ndata_alloc_page_freelists(struct memlist *ndata, int mnode)
855 {
856 	size_t alloc_sz;
857 	caddr_t alloc_base;
858 	caddr_t end;
859 	int	mtype;
860 	uint_t	szc;
861 	int32_t allp = 0;
862 
863 	if (&mmu_init_mmu_page_sizes) {
864 		if (!mmu_init_mmu_page_sizes(allp)) {
865 			cmn_err(CE_PANIC, "mmu_page_sizes %d not initialized",
866 			    mmu_page_sizes);
867 		}
868 	}
869 	ASSERT(mmu_page_sizes >= DEFAULT_MMU_PAGE_SIZES);
870 
871 	/* first time called - allocate max_mem_nodes dimension */
872 	if (mnode == 0) {
873 		int	i;
874 
875 		/* page_cachelists */
876 		alloc_sz = MAX_MEM_TYPES * max_mem_nodes *
877 		    sizeof (page_t **);
878 
879 		/* page_freelists */
880 		alloc_sz += MAX_MEM_TYPES * mmu_page_sizes * max_mem_nodes *
881 		    sizeof (page_t **);
882 
883 		/* fpc_mutex and cpc_mutex */
884 		alloc_sz += 2 * NPC_MUTEX * max_mem_nodes * sizeof (kmutex_t);
885 
886 		alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
887 		if (alloc_base == NULL)
888 			return (-1);
889 
890 		ASSERT(((uintptr_t)alloc_base & (ecache_alignsize - 1)) == 0);
891 
892 		for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
893 			page_cachelists[mtype] = (page_t ***)alloc_base;
894 			alloc_base += (max_mem_nodes * sizeof (page_t **));
895 			for (szc = 0; szc < mmu_page_sizes; szc++) {
896 				page_freelists[szc][mtype] =
897 				    (page_t ***)alloc_base;
898 				alloc_base += (max_mem_nodes *
899 				    sizeof (page_t **));
900 			}
901 		}
902 		for (i = 0; i < NPC_MUTEX; i++) {
903 			fpc_mutex[i] = (kmutex_t *)alloc_base;
904 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
905 			cpc_mutex[i] = (kmutex_t *)alloc_base;
906 			alloc_base += (sizeof (kmutex_t) * max_mem_nodes);
907 		}
908 		alloc_sz = 0;
909 	}
910 
911 	/*
912 	 * Calculate the size needed by alloc_page_freelists().
913 	 */
914 	for (mtype = 0; mtype < MAX_MEM_TYPES; mtype++) {
915 		alloc_sz += sizeof (page_t *) * page_colors;
916 
917 		for (szc = 0; szc < mmu_page_sizes; szc++)
918 			alloc_sz += sizeof (page_t *) *
919 			    page_get_pagecolors(szc);
920 	}
921 
922 	alloc_base = ndata_alloc(ndata, alloc_sz, ecache_alignsize);
923 	if (alloc_base == NULL)
924 		return (-1);
925 
926 	end = alloc_page_freelists(mnode, alloc_base, ecache_alignsize);
927 	ASSERT((uintptr_t)end == roundup((uintptr_t)alloc_base + alloc_sz,
928 	    ecache_alignsize));
929 
930 	return (0);
931 }
932 
933 /*
934  * To select our starting bin, we stride through the bins with a stride
935  * of 337.  Why 337?  It's prime, it's largeish, and it performs well both
936  * in simulation and practice for different workloads on varying cache sizes.
937  */
938 uint32_t color_start_current = 0;
939 uint32_t color_start_stride = 337;
940 int color_start_random = 0;
941 
942 /* ARGSUSED */
943 uint_t
944 get_color_start(struct as *as)
945 {
946 	uint32_t old, new;
947 
948 	if (consistent_coloring == 2 || color_start_random) {
949 		return ((uint_t)(((gettick()) << (vac_shift - MMU_PAGESHIFT)) &
950 		    page_colors_mask));
951 	}
952 
953 	do {
954 		old = color_start_current;
955 		new = old + (color_start_stride << (vac_shift - MMU_PAGESHIFT));
956 	} while (cas32(&color_start_current, old, new) != old);
957 
958 	return ((uint_t)(new));
959 }
960 
961 /*
962  * Called once at startup from kphysm_init() -- before memialloc()
963  * is invoked to do the 1st page_free()/page_freelist_add().
964  *
965  * initializes page_colors and page_colors_mask based on ecache_setsize.
966  *
967  * Also initializes the counter locks.
968  */
969 void
970 page_coloring_init()
971 {
972 	int	a;
973 
974 	if (do_pg_coloring == 0) {
975 		page_colors = 1;
976 		return;
977 	}
978 
979 	/*
980 	 * Calculate page_colors from ecache_setsize. ecache_setsize contains
981 	 * the max ecache setsize of all cpus configured in the system or, for
982 	 * cheetah+ systems, the max possible ecache setsize for all possible
983 	 * cheetah+ cpus.
984 	 */
985 	page_colors = ecache_setsize / MMU_PAGESIZE;
986 	page_colors_mask = page_colors - 1;
987 
988 	/*
989 	 * initialize cpu_page_colors if ecache setsizes are homogenous.
990 	 * cpu_page_colors set to -1 during DR operation or during startup
991 	 * if setsizes are heterogenous.
992 	 *
993 	 * The value of cpu_page_colors determines if additional color bins
994 	 * need to be checked for a particular color in the page_get routines.
995 	 */
996 	if ((cpu_page_colors == 0) && (cpu_setsize < ecache_setsize))
997 		cpu_page_colors = cpu_setsize / MMU_PAGESIZE;
998 
999 	vac_colors = vac_size / MMU_PAGESIZE;
1000 	vac_colors_mask = vac_colors -1;
1001 
1002 	page_coloring_shift = 0;
1003 	a = ecache_setsize;
1004 	while (a >>= 1) {
1005 		page_coloring_shift++;
1006 	}
1007 }
1008 
1009 int
1010 bp_color(struct buf *bp)
1011 {
1012 	int color = -1;
1013 
1014 	if (vac) {
1015 		if ((bp->b_flags & B_PAGEIO) != 0) {
1016 			color = sfmmu_get_ppvcolor(bp->b_pages);
1017 		} else if (bp->b_un.b_addr != NULL) {
1018 			color = sfmmu_get_addrvcolor(bp->b_un.b_addr);
1019 		}
1020 	}
1021 	return (color < 0 ? 0 : ptob(color));
1022 }
1023 
1024 /*
1025  * Create & Initialise pageout scanner thread. The thread has to
1026  * start at procedure with process pp and priority pri.
1027  */
1028 void
1029 pageout_init(void (*procedure)(), proc_t *pp, pri_t pri)
1030 {
1031 	(void) thread_create(NULL, 0, procedure, NULL, 0, pp, TS_RUN, pri);
1032 }
1033 
/*
 * Flush the D-cache when performing module relocations to an alternate
 * mapping.  Stubbed out on all platforms except sun4u, at least for now.
 * Here it simply forwards to sfmmu_cache_flushall().
 */
void
dcache_flushall()
{
	sfmmu_cache_flushall();
}
1044 
/*
 * Report whether the ranges [va1, va1 + sz1) and [va2, va2 + sz2)
 * overlap.  Returns 0 when one range ends at or before the start of the
 * other, 1 otherwise (including when both ranges start at the same
 * address).
 *
 * The distance between the two start addresses is compared against the
 * size of the lower range rather than computing va + sz, so a range that
 * reaches the top of the address space cannot wrap around and be
 * mistaken for a non-overlapping one.
 */
static int
kdi_range_overlap(uintptr_t va1, size_t sz1, uintptr_t va2, size_t sz2)
{
	/* range 1 lies entirely below range 2 */
	if (va1 < va2 && va2 - va1 >= sz1)
		return (0);

	/* range 2 lies entirely below range 1 */
	if (va2 < va1 && va1 - va2 >= sz2)
		return (0);

	return (1);
}
1056 
1057 /*
1058  * Return the number of bytes, relative to the beginning of a given range, that
1059  * are non-toxic (can be read from and written to with relative impunity).
1060  */
1061 size_t
1062 kdi_range_is_nontoxic(uintptr_t va, size_t sz, int write)
1063 {
1064 	/* OBP reads are harmless, but we don't want people writing there */
1065 	if (write && kdi_range_overlap(va, sz, OFW_START_ADDR, OFW_END_ADDR -
1066 	    OFW_START_ADDR + 1))
1067 		return (va < OFW_START_ADDR ? OFW_START_ADDR - va : 0);
1068 
1069 	if (kdi_range_overlap(va, sz, PIOMAPBASE, PIOMAPSIZE))
1070 		return (va < PIOMAPBASE ? PIOMAPBASE - va : 0);
1071 
1072 	return (sz); /* no overlap */
1073 }
1074 
1075 /*
1076  * Minimum physmem required for enabling large pages for kernel heap
1077  * Currently we do not enable lp for kmem on systems with less
1078  * than 1GB of memory. This value can be changed via /etc/system
1079  */
1080 size_t segkmem_lpminphysmem = 0x40000000;	/* 1GB */
1081 
1082 /*
1083  * this function chooses large page size for kernel heap
1084  */
1085 size_t
1086 get_segkmem_lpsize(size_t lpsize)
1087 {
1088 	size_t memtotal = physmem * PAGESIZE;
1089 	size_t mmusz;
1090 	uint_t szc;
1091 	extern int disable_large_pages;
1092 
1093 	if (memtotal < segkmem_lpminphysmem)
1094 		return (PAGESIZE);
1095 
1096 	if (plat_lpkmem_is_supported != NULL &&
1097 	    plat_lpkmem_is_supported() == 0)
1098 		return (PAGESIZE);
1099 
1100 	mmusz = mmu_get_kernel_lpsize(lpsize);
1101 	szc = page_szc(mmusz);
1102 
1103 	while (szc) {
1104 		if (!(disable_large_pages & (1 << szc)))
1105 			return (page_get_pagesize(szc));
1106 		szc--;
1107 	}
1108 	return (PAGESIZE);
1109 }
1110