xref: /freebsd/sys/riscv/riscv/pmap.c (revision 53b70c86)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1991 Regents of the University of California.
5  * All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  * Copyright (c) 2003 Peter Wemm
11  * All rights reserved.
12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13  * All rights reserved.
14  * Copyright (c) 2014 Andrew Turner
15  * All rights reserved.
16  * Copyright (c) 2014 The FreeBSD Foundation
17  * All rights reserved.
18  * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
19  * All rights reserved.
20  *
21  * This code is derived from software contributed to Berkeley by
22  * the Systems Programming Group of the University of Utah Computer
23  * Science Department and William Jolitz of UUNET Technologies Inc.
24  *
25  * Portions of this software were developed by Andrew Turner under
26  * sponsorship from The FreeBSD Foundation.
27  *
28  * Portions of this software were developed by SRI International and the
29  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
30  * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
31  *
32  * Portions of this software were developed by the University of Cambridge
33  * Computer Laboratory as part of the CTSRD Project, with support from the
34  * UK Higher Education Innovation Fund (HEIF).
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgement:
46  *	This product includes software developed by the University of
47  *	California, Berkeley and its contributors.
48  * 4. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  *
64  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
65  */
66 /*-
67  * Copyright (c) 2003 Networks Associates Technology, Inc.
68  * All rights reserved.
69  *
70  * This software was developed for the FreeBSD Project by Jake Burkholder,
71  * Safeport Network Services, and Network Associates Laboratories, the
72  * Security Research Division of Network Associates, Inc. under
73  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
74  * CHATS research program.
75  *
76  * Redistribution and use in source and binary forms, with or without
77  * modification, are permitted provided that the following conditions
78  * are met:
79  * 1. Redistributions of source code must retain the above copyright
80  *    notice, this list of conditions and the following disclaimer.
81  * 2. Redistributions in binary form must reproduce the above copyright
82  *    notice, this list of conditions and the following disclaimer in the
83  *    documentation and/or other materials provided with the distribution.
84  *
85  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
86  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
87  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
88  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
89  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
90  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
91  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
92  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
93  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
94  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
95  * SUCH DAMAGE.
96  */
97 
98 #include <sys/cdefs.h>
99 __FBSDID("$FreeBSD$");
100 
101 /*
102  *	Manages physical address maps.
103  *
104  *	Since the information managed by this module is
105  *	also stored by the logical address mapping module,
106  *	this module may throw away valid virtual-to-physical
107  *	mappings at almost any time.  However, invalidations
108  *	of virtual-to-physical mappings must be done as
109  *	requested.
110  *
111  *	In order to cope with hardware architectures which
112  *	make virtual-to-physical map invalidates expensive,
113  *	this module may delay invalidate or reduced protection
114  *	operations until such time as they are actually
115  *	necessary.  This module is given full information as
116  *	to which processors are currently using which maps,
117  *	and to when physical maps must be made correct.
118  */
119 
120 #include <sys/param.h>
121 #include <sys/systm.h>
122 #include <sys/bitstring.h>
123 #include <sys/bus.h>
124 #include <sys/cpuset.h>
125 #include <sys/kernel.h>
126 #include <sys/ktr.h>
127 #include <sys/lock.h>
128 #include <sys/malloc.h>
129 #include <sys/mman.h>
130 #include <sys/msgbuf.h>
131 #include <sys/mutex.h>
132 #include <sys/physmem.h>
133 #include <sys/proc.h>
134 #include <sys/rwlock.h>
135 #include <sys/sbuf.h>
136 #include <sys/sx.h>
137 #include <sys/vmem.h>
138 #include <sys/vmmeter.h>
139 #include <sys/sched.h>
140 #include <sys/sysctl.h>
141 #include <sys/smp.h>
142 
143 #include <vm/vm.h>
144 #include <vm/vm_param.h>
145 #include <vm/vm_kern.h>
146 #include <vm/vm_page.h>
147 #include <vm/vm_map.h>
148 #include <vm/vm_object.h>
149 #include <vm/vm_extern.h>
150 #include <vm/vm_pageout.h>
151 #include <vm/vm_pager.h>
152 #include <vm/vm_phys.h>
153 #include <vm/vm_radix.h>
154 #include <vm/vm_reserv.h>
155 #include <vm/vm_dumpset.h>
156 #include <vm/uma.h>
157 
158 #include <machine/machdep.h>
159 #include <machine/md_var.h>
160 #include <machine/pcb.h>
161 #include <machine/sbi.h>
162 
163 #define	NUL1E		(Ln_ENTRIES * Ln_ENTRIES)
164 #define	NUL2E		(Ln_ENTRIES * NUL1E)
165 
166 #if !defined(DIAGNOSTIC)
167 #ifdef __GNUC_GNU_INLINE__
168 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
169 #else
170 #define PMAP_INLINE	extern inline
171 #endif
172 #else
173 #define PMAP_INLINE
174 #endif
175 
176 #ifdef PV_STATS
177 #define PV_STAT(x)	do { x ; } while (0)
178 #else
179 #define PV_STAT(x)	do { } while (0)
180 #endif
181 
182 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
183 #define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
184 
185 #define	NPV_LIST_LOCKS	MAXCPU
186 
187 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
188 			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])
189 
190 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
191 	struct rwlock **_lockp = (lockp);		\
192 	struct rwlock *_new_lock;			\
193 							\
194 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
195 	if (_new_lock != *_lockp) {			\
196 		if (*_lockp != NULL)			\
197 			rw_wunlock(*_lockp);		\
198 		*_lockp = _new_lock;			\
199 		rw_wlock(*_lockp);			\
200 	}						\
201 } while (0)
202 
203 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
204 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
205 
206 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
207 	struct rwlock **_lockp = (lockp);		\
208 							\
209 	if (*_lockp != NULL) {				\
210 		rw_wunlock(*_lockp);			\
211 		*_lockp = NULL;				\
212 	}						\
213 } while (0)
214 
215 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
216 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
217 
218 /* The list of all the user pmaps */
219 LIST_HEAD(pmaplist, pmap);
220 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();
221 
222 struct pmap kernel_pmap_store;
223 
224 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
225 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
226 vm_offset_t kernel_vm_end = 0;
227 
228 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
229 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
230 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
231 
232 /* This code assumes all L1 DMAP entries will be used */
233 CTASSERT((DMAP_MIN_ADDRESS  & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
234 CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_ADDRESS);
235 
236 static struct rwlock_padalign pvh_global_lock;
237 static struct mtx_padalign allpmaps_lock;
238 
239 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
240     "VM/pmap parameters");
241 
242 static int superpages_enabled = 1;
243 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
244     CTLFLAG_RDTUN, &superpages_enabled, 0,
245     "Enable support for transparent superpages");
246 
247 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
248     "2MB page mapping counters");
249 
250 static u_long pmap_l2_demotions;
251 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
252     &pmap_l2_demotions, 0,
253     "2MB page demotions");
254 
255 static u_long pmap_l2_mappings;
256 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
257     &pmap_l2_mappings, 0,
258     "2MB page mappings");
259 
260 static u_long pmap_l2_p_failures;
261 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
262     &pmap_l2_p_failures, 0,
263     "2MB page promotion failures");
264 
265 static u_long pmap_l2_promotions;
266 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
267     &pmap_l2_promotions, 0,
268     "2MB page promotions");
269 
270 /*
271  * Data for the pv entry allocation mechanism
272  */
273 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
274 static struct mtx pv_chunks_mutex;
275 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
276 static struct md_page *pv_table;
277 static struct md_page pv_dummy;
278 
279 extern cpuset_t all_harts;
280 
281 /*
282  * Internal flags for pmap_enter()'s helper functions.
283  */
284 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
285 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
286 
287 static void	free_pv_chunk(struct pv_chunk *pc);
288 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
289 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
290 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
291 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
292 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
293 		    vm_offset_t va);
294 static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
295 static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
296 		    vm_offset_t va, struct rwlock **lockp);
297 static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
298 		    u_int flags, vm_page_t m, struct rwlock **lockp);
299 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
300     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
301 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
302     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
303 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
304     vm_page_t m, struct rwlock **lockp);
305 
306 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
307 		struct rwlock **lockp);
308 
309 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
310     struct spglist *free);
311 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
312 
313 #define	pmap_clear(pte)			pmap_store(pte, 0)
314 #define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
315 #define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
316 #define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
317 #define	pmap_load(pte)			atomic_load_64(pte)
318 #define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
319 #define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)
320 
321 /********************/
322 /* Inline functions */
323 /********************/
324 
325 static __inline void
326 pagecopy(void *s, void *d)
327 {
328 
329 	memcpy(d, s, PAGE_SIZE);
330 }
331 
332 static __inline void
333 pagezero(void *p)
334 {
335 
336 	bzero(p, PAGE_SIZE);
337 }
338 
339 #define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
340 #define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
341 #define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)
342 
343 #define	PTE_TO_PHYS(pte) \
344     ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
345 #define	L2PTE_TO_PHYS(l2) \
346     ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
347 
348 static __inline pd_entry_t *
349 pmap_l1(pmap_t pmap, vm_offset_t va)
350 {
351 
352 	return (&pmap->pm_l1[pmap_l1_index(va)]);
353 }
354 
355 static __inline pd_entry_t *
356 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
357 {
358 	vm_paddr_t phys;
359 	pd_entry_t *l2;
360 
361 	phys = PTE_TO_PHYS(pmap_load(l1));
362 	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
363 
364 	return (&l2[pmap_l2_index(va)]);
365 }
366 
367 static __inline pd_entry_t *
368 pmap_l2(pmap_t pmap, vm_offset_t va)
369 {
370 	pd_entry_t *l1;
371 
372 	l1 = pmap_l1(pmap, va);
373 	if ((pmap_load(l1) & PTE_V) == 0)
374 		return (NULL);
375 	if ((pmap_load(l1) & PTE_RX) != 0)
376 		return (NULL);
377 
378 	return (pmap_l1_to_l2(l1, va));
379 }
380 
381 static __inline pt_entry_t *
382 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
383 {
384 	vm_paddr_t phys;
385 	pt_entry_t *l3;
386 
387 	phys = PTE_TO_PHYS(pmap_load(l2));
388 	l3 = (pd_entry_t *)PHYS_TO_DMAP(phys);
389 
390 	return (&l3[pmap_l3_index(va)]);
391 }
392 
393 static __inline pt_entry_t *
394 pmap_l3(pmap_t pmap, vm_offset_t va)
395 {
396 	pd_entry_t *l2;
397 
398 	l2 = pmap_l2(pmap, va);
399 	if (l2 == NULL)
400 		return (NULL);
401 	if ((pmap_load(l2) & PTE_V) == 0)
402 		return (NULL);
403 	if ((pmap_load(l2) & PTE_RX) != 0)
404 		return (NULL);
405 
406 	return (pmap_l2_to_l3(l2, va));
407 }
408 
409 static __inline void
410 pmap_resident_count_inc(pmap_t pmap, int count)
411 {
412 
413 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
414 	pmap->pm_stats.resident_count += count;
415 }
416 
417 static __inline void
418 pmap_resident_count_dec(pmap_t pmap, int count)
419 {
420 
421 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
422 	KASSERT(pmap->pm_stats.resident_count >= count,
423 	    ("pmap %p resident count underflow %ld %d", pmap,
424 	    pmap->pm_stats.resident_count, count));
425 	pmap->pm_stats.resident_count -= count;
426 }
427 
428 static void
429 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
430     pt_entry_t entry)
431 {
432 	struct pmap *user_pmap;
433 	pd_entry_t *l1;
434 
435 	/* Distribute new kernel L1 entry to all the user pmaps */
436 	if (pmap != kernel_pmap)
437 		return;
438 
439 	mtx_lock(&allpmaps_lock);
440 	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
441 		l1 = &user_pmap->pm_l1[l1index];
442 		pmap_store(l1, entry);
443 	}
444 	mtx_unlock(&allpmaps_lock);
445 }
446 
447 static pt_entry_t *
448 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
449     u_int *l2_slot)
450 {
451 	pt_entry_t *l2;
452 	pd_entry_t *l1;
453 
454 	l1 = (pd_entry_t *)l1pt;
455 	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
456 
457 	/* Check locore has used a table L1 map */
458 	KASSERT((l1[*l1_slot] & PTE_RX) == 0,
459 		("Invalid bootstrap L1 table"));
460 
461 	/* Find the address of the L2 table */
462 	l2 = (pt_entry_t *)init_pt_va;
463 	*l2_slot = pmap_l2_index(va);
464 
465 	return (l2);
466 }
467 
468 static vm_paddr_t
469 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
470 {
471 	u_int l1_slot, l2_slot;
472 	pt_entry_t *l2;
473 	vm_paddr_t ret;
474 
475 	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
476 
477 	/* Check locore has used L2 superpages */
478 	KASSERT((l2[l2_slot] & PTE_RX) != 0,
479 		("Invalid bootstrap L2 table"));
480 
481 	/* L2 is superpages */
482 	ret = L2PTE_TO_PHYS(l2[l2_slot]);
483 	ret += (va & L2_OFFSET);
484 
485 	return (ret);
486 }
487 
488 static void
489 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
490 {
491 	vm_offset_t va;
492 	vm_paddr_t pa;
493 	pd_entry_t *l1;
494 	u_int l1_slot;
495 	pt_entry_t entry;
496 	pn_t pn;
497 
498 	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
499 	va = DMAP_MIN_ADDRESS;
500 	l1 = (pd_entry_t *)kern_l1;
501 	l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS);
502 
503 	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
504 	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
505 		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
506 
507 		/* superpages */
508 		pn = (pa / PAGE_SIZE);
509 		entry = PTE_KERN;
510 		entry |= (pn << PTE_PPN0_S);
511 		pmap_store(&l1[l1_slot], entry);
512 	}
513 
514 	/* Set the upper limit of the DMAP region */
515 	dmap_phys_max = pa;
516 	dmap_max_addr = va;
517 
518 	sfence_vma();
519 }
520 
521 static vm_offset_t
522 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
523 {
524 	vm_offset_t l3pt;
525 	pt_entry_t entry;
526 	pd_entry_t *l2;
527 	vm_paddr_t pa;
528 	u_int l2_slot;
529 	pn_t pn;
530 
531 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
532 
533 	l2 = pmap_l2(kernel_pmap, va);
534 	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
535 	l2_slot = pmap_l2_index(va);
536 	l3pt = l3_start;
537 
538 	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
539 		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
540 
541 		pa = pmap_early_vtophys(l1pt, l3pt);
542 		pn = (pa / PAGE_SIZE);
543 		entry = (PTE_V);
544 		entry |= (pn << PTE_PPN0_S);
545 		pmap_store(&l2[l2_slot], entry);
546 		l3pt += PAGE_SIZE;
547 	}
548 
549 	/* Clean the L2 page table */
550 	memset((void *)l3_start, 0, l3pt - l3_start);
551 
552 	return (l3pt);
553 }
554 
555 /*
556  *	Bootstrap the system enough to run with virtual memory.
557  */
558 void
559 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
560 {
561 	u_int l1_slot, l2_slot;
562 	vm_offset_t freemempos;
563 	vm_offset_t dpcpu, msgbufpv;
564 	vm_paddr_t max_pa, min_pa, pa;
565 	pt_entry_t *l2p;
566 	int i;
567 
568 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
569 
570 	/* Set this early so we can use the pagetable walking functions */
571 	kernel_pmap_store.pm_l1 = (pd_entry_t *)l1pt;
572 	PMAP_LOCK_INIT(kernel_pmap);
573 
574 	rw_init(&pvh_global_lock, "pmap pv global");
575 
576 	/*
577 	 * Set the current CPU as active in the kernel pmap. Secondary cores
578 	 * will add themselves later in init_secondary(). The SBI firmware
579 	 * may rely on this mask being precise, so CPU_FILL() is not used.
580 	 */
581 	CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);
582 
583 	/* Assume the address we were loaded to is a valid physical address. */
584 	min_pa = max_pa = kernstart;
585 
586 	physmap_idx = physmem_avail(physmap, nitems(physmap));
587 	physmap_idx /= 2;
588 
589 	/*
590 	 * Find the minimum physical address. physmap is sorted,
591 	 * but may contain empty ranges.
592 	 */
593 	for (i = 0; i < physmap_idx * 2; i += 2) {
594 		if (physmap[i] == physmap[i + 1])
595 			continue;
596 		if (physmap[i] <= min_pa)
597 			min_pa = physmap[i];
598 		if (physmap[i + 1] > max_pa)
599 			max_pa = physmap[i + 1];
600 	}
601 	printf("physmap_idx %u\n", physmap_idx);
602 	printf("min_pa %lx\n", min_pa);
603 	printf("max_pa %lx\n", max_pa);
604 
605 	/* Create a direct map region early so we can use it for pa -> va */
606 	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);
607 
608 	/*
609 	 * Read the page table to find out what is already mapped.
610 	 * This assumes we have mapped a block of memory from KERNBASE
611 	 * using a single L1 entry.
612 	 */
613 	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
614 
615 	/* Sanity check the index, KERNBASE should be the first VA */
616 	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
617 
618 	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);
619 
620 	/* Create the l3 tables for the early devmap */
621 	freemempos = pmap_bootstrap_l3(l1pt,
622 	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
623 
624 	/*
625 	 * Invalidate the mapping we created for the DTB. At this point a copy
626 	 * has been created, and we no longer need it. We want to avoid the
627 	 * possibility of an aliased mapping in the future.
628 	 */
629 	l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS);
630 	if ((pmap_load(l2p) & PTE_V) != 0)
631 		pmap_clear(l2p);
632 
633 	sfence_vma();
634 
635 #define alloc_pages(var, np)						\
636 	(var) = freemempos;						\
637 	freemempos += (np * PAGE_SIZE);					\
638 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
639 
640 	/* Allocate dynamic per-cpu area. */
641 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
642 	dpcpu_init((void *)dpcpu, 0);
643 
644 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
645 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
646 	msgbufp = (void *)msgbufpv;
647 
648 	virtual_avail = roundup2(freemempos, L2_SIZE);
649 	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
650 	kernel_vm_end = virtual_avail;
651 
652 	pa = pmap_early_vtophys(l1pt, freemempos);
653 
654 	physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC);
655 }
656 
657 /*
658  *	Initialize a vm_page's machine-dependent fields.
659  */
660 void
661 pmap_page_init(vm_page_t m)
662 {
663 
664 	TAILQ_INIT(&m->md.pv_list);
665 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
666 }
667 
668 /*
669  *	Initialize the pmap module.
670  *	Called by vm_init, to initialize any structures that the pmap
671  *	system needs to map virtual memory.
672  */
673 void
674 pmap_init(void)
675 {
676 	vm_size_t s;
677 	int i, pv_npg;
678 
679 	/*
680 	 * Initialize the pv chunk and pmap list mutexes.
681 	 */
682 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
683 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);
684 
685 	/*
686 	 * Initialize the pool of pv list locks.
687 	 */
688 	for (i = 0; i < NPV_LIST_LOCKS; i++)
689 		rw_init(&pv_list_locks[i], "pmap pv list");
690 
691 	/*
692 	 * Calculate the size of the pv head table for superpages.
693 	 */
694 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
695 
696 	/*
697 	 * Allocate memory for the pv head table for superpages.
698 	 */
699 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
700 	s = round_page(s);
701 	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
702 	for (i = 0; i < pv_npg; i++)
703 		TAILQ_INIT(&pv_table[i].pv_list);
704 	TAILQ_INIT(&pv_dummy.pv_list);
705 
706 	if (superpages_enabled)
707 		pagesizes[1] = L2_SIZE;
708 }
709 
710 #ifdef SMP
711 /*
712  * For SMP, these functions have to use IPIs for coherence.
713  *
714  * In general, the calling thread uses a plain fence to order the
715  * writes to the page tables before invoking an SBI callback to invoke
716  * sfence_vma() on remote CPUs.
717  */
718 static void
719 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
720 {
721 	cpuset_t mask;
722 
723 	sched_pin();
724 	mask = pmap->pm_active;
725 	CPU_CLR(PCPU_GET(hart), &mask);
726 	fence();
727 	if (!CPU_EMPTY(&mask) && smp_started)
728 		sbi_remote_sfence_vma(mask.__bits, va, 1);
729 	sfence_vma_page(va);
730 	sched_unpin();
731 }
732 
733 static void
734 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
735 {
736 	cpuset_t mask;
737 
738 	sched_pin();
739 	mask = pmap->pm_active;
740 	CPU_CLR(PCPU_GET(hart), &mask);
741 	fence();
742 	if (!CPU_EMPTY(&mask) && smp_started)
743 		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);
744 
745 	/*
746 	 * Might consider a loop of sfence_vma_page() for a small
747 	 * number of pages in the future.
748 	 */
749 	sfence_vma();
750 	sched_unpin();
751 }
752 
753 static void
754 pmap_invalidate_all(pmap_t pmap)
755 {
756 	cpuset_t mask;
757 
758 	sched_pin();
759 	mask = pmap->pm_active;
760 	CPU_CLR(PCPU_GET(hart), &mask);
761 
762 	/*
763 	 * XXX: The SBI doc doesn't detail how to specify x0 as the
764 	 * address to perform a global fence.  BBL currently treats
765 	 * all sfence_vma requests as global however.
766 	 */
767 	fence();
768 	if (!CPU_EMPTY(&mask) && smp_started)
769 		sbi_remote_sfence_vma(mask.__bits, 0, 0);
770 	sfence_vma();
771 	sched_unpin();
772 }
773 #else
774 /*
775  * Normal, non-SMP, invalidation functions.
776  * We inline these within pmap.c for speed.
777  */
778 static __inline void
779 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
780 {
781 
782 	sfence_vma_page(va);
783 }
784 
785 static __inline void
786 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
787 {
788 
789 	/*
790 	 * Might consider a loop of sfence_vma_page() for a small
791 	 * number of pages in the future.
792 	 */
793 	sfence_vma();
794 }
795 
796 static __inline void
797 pmap_invalidate_all(pmap_t pmap)
798 {
799 
800 	sfence_vma();
801 }
802 #endif
803 
804 /*
805  *	Routine:	pmap_extract
806  *	Function:
807  *		Extract the physical page address associated
808  *		with the given map/virtual_address pair.
809  */
810 vm_paddr_t
811 pmap_extract(pmap_t pmap, vm_offset_t va)
812 {
813 	pd_entry_t *l2p, l2;
814 	pt_entry_t *l3p, l3;
815 	vm_paddr_t pa;
816 
817 	pa = 0;
818 	PMAP_LOCK(pmap);
819 	/*
820 	 * Start with the l2 tabel. We are unable to allocate
821 	 * pages in the l1 table.
822 	 */
823 	l2p = pmap_l2(pmap, va);
824 	if (l2p != NULL) {
825 		l2 = pmap_load(l2p);
826 		if ((l2 & PTE_RX) == 0) {
827 			l3p = pmap_l2_to_l3(l2p, va);
828 			if (l3p != NULL) {
829 				l3 = pmap_load(l3p);
830 				pa = PTE_TO_PHYS(l3);
831 				pa |= (va & L3_OFFSET);
832 			}
833 		} else {
834 			/* L2 is superpages */
835 			pa = L2PTE_TO_PHYS(l2);
836 			pa |= (va & L2_OFFSET);
837 		}
838 	}
839 	PMAP_UNLOCK(pmap);
840 	return (pa);
841 }
842 
843 /*
844  *	Routine:	pmap_extract_and_hold
845  *	Function:
846  *		Atomically extract and hold the physical page
847  *		with the given pmap and virtual address pair
848  *		if that mapping permits the given protection.
849  */
850 vm_page_t
851 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
852 {
853 	pt_entry_t *l3p, l3;
854 	vm_paddr_t phys;
855 	vm_page_t m;
856 
857 	m = NULL;
858 	PMAP_LOCK(pmap);
859 	l3p = pmap_l3(pmap, va);
860 	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
861 		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
862 			phys = PTE_TO_PHYS(l3);
863 			m = PHYS_TO_VM_PAGE(phys);
864 			if (!vm_page_wire_mapped(m))
865 				m = NULL;
866 		}
867 	}
868 	PMAP_UNLOCK(pmap);
869 	return (m);
870 }
871 
872 vm_paddr_t
873 pmap_kextract(vm_offset_t va)
874 {
875 	pd_entry_t *l2, l2e;
876 	pt_entry_t *l3;
877 	vm_paddr_t pa;
878 
879 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
880 		pa = DMAP_TO_PHYS(va);
881 	} else {
882 		l2 = pmap_l2(kernel_pmap, va);
883 		if (l2 == NULL)
884 			panic("pmap_kextract: No l2");
885 		l2e = pmap_load(l2);
886 		/*
887 		 * Beware of concurrent promotion and demotion! We must
888 		 * use l2e rather than loading from l2 multiple times to
889 		 * ensure we see a consistent state, including the
890 		 * implicit load in pmap_l2_to_l3.  It is, however, safe
891 		 * to use an old l2e because the L3 page is preserved by
892 		 * promotion.
893 		 */
894 		if ((l2e & PTE_RX) != 0) {
895 			/* superpages */
896 			pa = L2PTE_TO_PHYS(l2e);
897 			pa |= (va & L2_OFFSET);
898 			return (pa);
899 		}
900 
901 		l3 = pmap_l2_to_l3(&l2e, va);
902 		if (l3 == NULL)
903 			panic("pmap_kextract: No l3...");
904 		pa = PTE_TO_PHYS(pmap_load(l3));
905 		pa |= (va & PAGE_MASK);
906 	}
907 	return (pa);
908 }
909 
910 /***************************************************
911  * Low level mapping routines.....
912  ***************************************************/
913 
914 void
915 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
916 {
917 	pt_entry_t entry;
918 	pt_entry_t *l3;
919 	vm_offset_t va;
920 	pn_t pn;
921 
922 	KASSERT((pa & L3_OFFSET) == 0,
923 	   ("pmap_kenter_device: Invalid physical address"));
924 	KASSERT((sva & L3_OFFSET) == 0,
925 	   ("pmap_kenter_device: Invalid virtual address"));
926 	KASSERT((size & PAGE_MASK) == 0,
927 	    ("pmap_kenter_device: Mapping is not page-sized"));
928 
929 	va = sva;
930 	while (size != 0) {
931 		l3 = pmap_l3(kernel_pmap, va);
932 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
933 
934 		pn = (pa / PAGE_SIZE);
935 		entry = PTE_KERN;
936 		entry |= (pn << PTE_PPN0_S);
937 		pmap_store(l3, entry);
938 
939 		va += PAGE_SIZE;
940 		pa += PAGE_SIZE;
941 		size -= PAGE_SIZE;
942 	}
943 	pmap_invalidate_range(kernel_pmap, sva, va);
944 }
945 
946 /*
947  * Remove a page from the kernel pagetables.
948  * Note: not SMP coherent.
949  */
950 PMAP_INLINE void
951 pmap_kremove(vm_offset_t va)
952 {
953 	pt_entry_t *l3;
954 
955 	l3 = pmap_l3(kernel_pmap, va);
956 	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
957 
958 	pmap_clear(l3);
959 	sfence_vma();
960 }
961 
962 void
963 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
964 {
965 	pt_entry_t *l3;
966 	vm_offset_t va;
967 
968 	KASSERT((sva & L3_OFFSET) == 0,
969 	   ("pmap_kremove_device: Invalid virtual address"));
970 	KASSERT((size & PAGE_MASK) == 0,
971 	    ("pmap_kremove_device: Mapping is not page-sized"));
972 
973 	va = sva;
974 	while (size != 0) {
975 		l3 = pmap_l3(kernel_pmap, va);
976 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
977 		pmap_clear(l3);
978 
979 		va += PAGE_SIZE;
980 		size -= PAGE_SIZE;
981 	}
982 
983 	pmap_invalidate_range(kernel_pmap, sva, va);
984 }
985 
986 /*
987  *	Used to map a range of physical addresses into kernel
988  *	virtual address space.
989  *
990  *	The value passed in '*virt' is a suggested virtual address for
991  *	the mapping. Architectures which can support a direct-mapped
992  *	physical to virtual region can return the appropriate address
993  *	within that region, leaving '*virt' unchanged. Other
994  *	architectures should map the pages starting at '*virt' and
995  *	update '*virt' with the first usable address after the mapped
996  *	region.
997  */
998 vm_offset_t
999 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1000 {
1001 
1002 	return PHYS_TO_DMAP(start);
1003 }
1004 
1005 /*
1006  * Add a list of wired pages to the kva
1007  * this routine is only used for temporary
1008  * kernel mappings that do not need to have
1009  * page modification or references recorded.
1010  * Note that old mappings are simply written
1011  * over.  The page *must* be wired.
1012  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1013  */
1014 void
1015 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1016 {
1017 	pt_entry_t *l3, pa;
1018 	vm_offset_t va;
1019 	vm_page_t m;
1020 	pt_entry_t entry;
1021 	pn_t pn;
1022 	int i;
1023 
1024 	va = sva;
1025 	for (i = 0; i < count; i++) {
1026 		m = ma[i];
1027 		pa = VM_PAGE_TO_PHYS(m);
1028 		pn = (pa / PAGE_SIZE);
1029 		l3 = pmap_l3(kernel_pmap, va);
1030 
1031 		entry = PTE_KERN;
1032 		entry |= (pn << PTE_PPN0_S);
1033 		pmap_store(l3, entry);
1034 
1035 		va += L3_SIZE;
1036 	}
1037 	pmap_invalidate_range(kernel_pmap, sva, va);
1038 }
1039 
1040 /*
1041  * This routine tears out page mappings from the
1042  * kernel -- it is meant only for temporary mappings.
1043  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1044  */
1045 void
1046 pmap_qremove(vm_offset_t sva, int count)
1047 {
1048 	pt_entry_t *l3;
1049 	vm_offset_t va;
1050 
1051 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1052 
1053 	for (va = sva; count-- > 0; va += PAGE_SIZE) {
1054 		l3 = pmap_l3(kernel_pmap, va);
1055 		KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
1056 		pmap_clear(l3);
1057 	}
1058 	pmap_invalidate_range(kernel_pmap, sva, va);
1059 }
1060 
1061 bool
1062 pmap_ps_enabled(pmap_t pmap __unused)
1063 {
1064 
1065 	return (superpages_enabled);
1066 }
1067 
1068 /***************************************************
1069  * Page table page management routines.....
1070  ***************************************************/
1071 /*
1072  * Schedule the specified unused page table page to be freed.  Specifically,
1073  * add the page to the specified list of pages that will be released to the
1074  * physical memory manager after the TLB has been updated.
1075  */
1076 static __inline void
1077 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1078     boolean_t set_PG_ZERO)
1079 {
1080 
1081 	if (set_PG_ZERO)
1082 		m->flags |= PG_ZERO;
1083 	else
1084 		m->flags &= ~PG_ZERO;
1085 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1086 }
1087 
1088 /*
1089  * Inserts the specified page table page into the specified pmap's collection
1090  * of idle page table pages.  Each of a pmap's page table pages is responsible
1091  * for mapping a distinct range of virtual addresses.  The pmap's collection is
1092  * ordered by this virtual address range.
1093  *
1094  * If "promoted" is false, then the page table page "ml3" must be zero filled.
1095  */
1096 static __inline int
1097 pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted)
1098 {
1099 
1100 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1101 	ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0;
1102 	return (vm_radix_insert(&pmap->pm_root, ml3));
1103 }
1104 
1105 /*
1106  * Removes the page table page mapping the specified virtual address from the
1107  * specified pmap's collection of idle page table pages, and returns it.
1108  * Otherwise, returns NULL if there is no page table page corresponding to the
1109  * specified virtual address.
1110  */
1111 static __inline vm_page_t
1112 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1113 {
1114 
1115 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1116 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
1117 }
1118 
1119 /*
1120  * Decrements a page table page's reference count, which is used to record the
1121  * number of valid page table entries within the page.  If the reference count
1122  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1123  * page table page was unmapped and FALSE otherwise.
1124  */
1125 static inline boolean_t
1126 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1127 {
1128 
1129 	--m->ref_count;
1130 	if (m->ref_count == 0) {
1131 		_pmap_unwire_ptp(pmap, va, m, free);
1132 		return (TRUE);
1133 	} else {
1134 		return (FALSE);
1135 	}
1136 }
1137 
1138 static void
1139 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1140 {
1141 	vm_paddr_t phys;
1142 
1143 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1144 	if (m->pindex >= NUL2E) {
1145 		pd_entry_t *l1;
1146 		l1 = pmap_l1(pmap, va);
1147 		pmap_clear(l1);
1148 		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
1149 	} else {
1150 		pd_entry_t *l2;
1151 		l2 = pmap_l2(pmap, va);
1152 		pmap_clear(l2);
1153 	}
1154 	pmap_resident_count_dec(pmap, 1);
1155 	if (m->pindex < NUL2E) {
1156 		pd_entry_t *l1;
1157 		vm_page_t pdpg;
1158 
1159 		l1 = pmap_l1(pmap, va);
1160 		phys = PTE_TO_PHYS(pmap_load(l1));
1161 		pdpg = PHYS_TO_VM_PAGE(phys);
1162 		pmap_unwire_ptp(pmap, va, pdpg, free);
1163 	}
1164 	pmap_invalidate_page(pmap, va);
1165 
1166 	vm_wire_sub(1);
1167 
1168 	/*
1169 	 * Put page on a list so that it is released after
1170 	 * *ALL* TLB shootdown is done
1171 	 */
1172 	pmap_add_delayed_free_list(m, free, TRUE);
1173 }
1174 
1175 /*
1176  * After removing a page table entry, this routine is used to
1177  * conditionally free the page, and manage the reference count.
1178  */
1179 static int
1180 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1181     struct spglist *free)
1182 {
1183 	vm_page_t mpte;
1184 
1185 	if (va >= VM_MAXUSER_ADDRESS)
1186 		return (0);
1187 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1188 	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
1189 	return (pmap_unwire_ptp(pmap, va, mpte, free));
1190 }
1191 
1192 void
1193 pmap_pinit0(pmap_t pmap)
1194 {
1195 
1196 	PMAP_LOCK_INIT(pmap);
1197 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1198 	pmap->pm_l1 = kernel_pmap->pm_l1;
1199 	pmap->pm_satp = SATP_MODE_SV39 | (vtophys(pmap->pm_l1) >> PAGE_SHIFT);
1200 	CPU_ZERO(&pmap->pm_active);
1201 	pmap_activate_boot(pmap);
1202 }
1203 
1204 int
1205 pmap_pinit(pmap_t pmap)
1206 {
1207 	vm_paddr_t l1phys;
1208 	vm_page_t l1pt;
1209 
1210 	/*
1211 	 * allocate the l1 page
1212 	 */
1213 	while ((l1pt = vm_page_alloc(NULL, 0xdeadbeef, VM_ALLOC_NORMAL |
1214 	    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL)
1215 		vm_wait(NULL);
1216 
1217 	l1phys = VM_PAGE_TO_PHYS(l1pt);
1218 	pmap->pm_l1 = (pd_entry_t *)PHYS_TO_DMAP(l1phys);
1219 	pmap->pm_satp = SATP_MODE_SV39 | (l1phys >> PAGE_SHIFT);
1220 
1221 	if ((l1pt->flags & PG_ZERO) == 0)
1222 		pagezero(pmap->pm_l1);
1223 
1224 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1225 
1226 	CPU_ZERO(&pmap->pm_active);
1227 
1228 	/* Install kernel pagetables */
1229 	memcpy(pmap->pm_l1, kernel_pmap->pm_l1, PAGE_SIZE);
1230 
1231 	/* Add to the list of all user pmaps */
1232 	mtx_lock(&allpmaps_lock);
1233 	LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1234 	mtx_unlock(&allpmaps_lock);
1235 
1236 	vm_radix_init(&pmap->pm_root);
1237 
1238 	return (1);
1239 }
1240 
1241 /*
1242  * This routine is called if the desired page table page does not exist.
1243  *
1244  * If page table page allocation fails, this routine may sleep before
1245  * returning NULL.  It sleeps only if a lock pointer was given.
1246  *
1247  * Note: If a page allocation fails at page table level two or three,
1248  * one or two pages may be held during the wait, only to be released
1249  * afterwards.  This conservative approach is easily argued to avoid
1250  * race conditions.
1251  */
1252 static vm_page_t
1253 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1254 {
1255 	vm_page_t m, /*pdppg, */pdpg;
1256 	pt_entry_t entry;
1257 	vm_paddr_t phys;
1258 	pn_t pn;
1259 
1260 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1261 
1262 	/*
1263 	 * Allocate a page table page.
1264 	 */
1265 	if ((m = vm_page_alloc(NULL, ptepindex, VM_ALLOC_NOOBJ |
1266 	    VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1267 		if (lockp != NULL) {
1268 			RELEASE_PV_LIST_LOCK(lockp);
1269 			PMAP_UNLOCK(pmap);
1270 			rw_runlock(&pvh_global_lock);
1271 			vm_wait(NULL);
1272 			rw_rlock(&pvh_global_lock);
1273 			PMAP_LOCK(pmap);
1274 		}
1275 
1276 		/*
1277 		 * Indicate the need to retry.  While waiting, the page table
1278 		 * page may have been allocated.
1279 		 */
1280 		return (NULL);
1281 	}
1282 
1283 	if ((m->flags & PG_ZERO) == 0)
1284 		pmap_zero_page(m);
1285 
1286 	/*
1287 	 * Map the pagetable page into the process address space, if
1288 	 * it isn't already there.
1289 	 */
1290 
1291 	if (ptepindex >= NUL2E) {
1292 		pd_entry_t *l1;
1293 		vm_pindex_t l1index;
1294 
1295 		l1index = ptepindex - NUL2E;
1296 		l1 = &pmap->pm_l1[l1index];
1297 		KASSERT((pmap_load(l1) & PTE_V) == 0,
1298 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
1299 
1300 		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
1301 		entry = (PTE_V);
1302 		entry |= (pn << PTE_PPN0_S);
1303 		pmap_store(l1, entry);
1304 		pmap_distribute_l1(pmap, l1index, entry);
1305 	} else {
1306 		vm_pindex_t l1index;
1307 		pd_entry_t *l1, *l2;
1308 
1309 		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
1310 		l1 = &pmap->pm_l1[l1index];
1311 		if (pmap_load(l1) == 0) {
1312 			/* recurse for allocating page dir */
1313 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1314 			    lockp) == NULL) {
1315 				vm_page_unwire_noq(m);
1316 				vm_page_free_zero(m);
1317 				return (NULL);
1318 			}
1319 		} else {
1320 			phys = PTE_TO_PHYS(pmap_load(l1));
1321 			pdpg = PHYS_TO_VM_PAGE(phys);
1322 			pdpg->ref_count++;
1323 		}
1324 
1325 		phys = PTE_TO_PHYS(pmap_load(l1));
1326 		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1327 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
1328 		KASSERT((pmap_load(l2) & PTE_V) == 0,
1329 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
1330 
1331 		pn = (VM_PAGE_TO_PHYS(m) / PAGE_SIZE);
1332 		entry = (PTE_V);
1333 		entry |= (pn << PTE_PPN0_S);
1334 		pmap_store(l2, entry);
1335 	}
1336 
1337 	pmap_resident_count_inc(pmap, 1);
1338 
1339 	return (m);
1340 }
1341 
1342 static vm_page_t
1343 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1344 {
1345 	pd_entry_t *l1;
1346 	vm_page_t l2pg;
1347 	vm_pindex_t l2pindex;
1348 
1349 retry:
1350 	l1 = pmap_l1(pmap, va);
1351 	if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
1352 		KASSERT((pmap_load(l1) & PTE_RWX) == 0,
1353 		    ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
1354 		    pmap_load(l1), va));
1355 		/* Add a reference to the L2 page. */
1356 		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
1357 		l2pg->ref_count++;
1358 	} else {
1359 		/* Allocate a L2 page. */
1360 		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
1361 		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
1362 		if (l2pg == NULL && lockp != NULL)
1363 			goto retry;
1364 	}
1365 	return (l2pg);
1366 }
1367 
1368 static vm_page_t
1369 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1370 {
1371 	vm_pindex_t ptepindex;
1372 	pd_entry_t *l2;
1373 	vm_paddr_t phys;
1374 	vm_page_t m;
1375 
1376 	/*
1377 	 * Calculate pagetable page index
1378 	 */
1379 	ptepindex = pmap_l2_pindex(va);
1380 retry:
1381 	/*
1382 	 * Get the page directory entry
1383 	 */
1384 	l2 = pmap_l2(pmap, va);
1385 
1386 	/*
1387 	 * If the page table page is mapped, we just increment the
1388 	 * hold count, and activate it.
1389 	 */
1390 	if (l2 != NULL && pmap_load(l2) != 0) {
1391 		phys = PTE_TO_PHYS(pmap_load(l2));
1392 		m = PHYS_TO_VM_PAGE(phys);
1393 		m->ref_count++;
1394 	} else {
1395 		/*
1396 		 * Here if the pte page isn't mapped, or if it has been
1397 		 * deallocated.
1398 		 */
1399 		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1400 		if (m == NULL && lockp != NULL)
1401 			goto retry;
1402 	}
1403 	return (m);
1404 }
1405 
1406 /***************************************************
1407  * Pmap allocation/deallocation routines.
1408  ***************************************************/
1409 
1410 /*
1411  * Release any resources held by the given physical map.
1412  * Called when a pmap initialized by pmap_pinit is being released.
1413  * Should only be called if the map contains no valid mappings.
1414  */
1415 void
1416 pmap_release(pmap_t pmap)
1417 {
1418 	vm_page_t m;
1419 
1420 	KASSERT(pmap->pm_stats.resident_count == 0,
1421 	    ("pmap_release: pmap resident count %ld != 0",
1422 	    pmap->pm_stats.resident_count));
1423 	KASSERT(CPU_EMPTY(&pmap->pm_active),
1424 	    ("releasing active pmap %p", pmap));
1425 
1426 	mtx_lock(&allpmaps_lock);
1427 	LIST_REMOVE(pmap, pm_list);
1428 	mtx_unlock(&allpmaps_lock);
1429 
1430 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_l1));
1431 	vm_page_unwire_noq(m);
1432 	vm_page_free(m);
1433 }
1434 
1435 static int
1436 kvm_size(SYSCTL_HANDLER_ARGS)
1437 {
1438 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1439 
1440 	return sysctl_handle_long(oidp, &ksize, 0, req);
1441 }
1442 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1443     0, 0, kvm_size, "LU",
1444     "Size of KVM");
1445 
1446 static int
1447 kvm_free(SYSCTL_HANDLER_ARGS)
1448 {
1449 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1450 
1451 	return sysctl_handle_long(oidp, &kfree, 0, req);
1452 }
1453 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1454     0, 0, kvm_free, "LU",
1455     "Amount of KVM free");
1456 
1457 /*
1458  * grow the number of kernel page table entries, if needed
1459  */
1460 void
1461 pmap_growkernel(vm_offset_t addr)
1462 {
1463 	vm_paddr_t paddr;
1464 	vm_page_t nkpg;
1465 	pd_entry_t *l1, *l2;
1466 	pt_entry_t entry;
1467 	pn_t pn;
1468 
1469 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1470 
1471 	addr = roundup2(addr, L2_SIZE);
1472 	if (addr - 1 >= vm_map_max(kernel_map))
1473 		addr = vm_map_max(kernel_map);
1474 	while (kernel_vm_end < addr) {
1475 		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
1476 		if (pmap_load(l1) == 0) {
1477 			/* We need a new PDP entry */
1478 			nkpg = vm_page_alloc(NULL, kernel_vm_end >> L1_SHIFT,
1479 			    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ |
1480 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1481 			if (nkpg == NULL)
1482 				panic("pmap_growkernel: no memory to grow kernel");
1483 			if ((nkpg->flags & PG_ZERO) == 0)
1484 				pmap_zero_page(nkpg);
1485 			paddr = VM_PAGE_TO_PHYS(nkpg);
1486 
1487 			pn = (paddr / PAGE_SIZE);
1488 			entry = (PTE_V);
1489 			entry |= (pn << PTE_PPN0_S);
1490 			pmap_store(l1, entry);
1491 			pmap_distribute_l1(kernel_pmap,
1492 			    pmap_l1_index(kernel_vm_end), entry);
1493 			continue; /* try again */
1494 		}
1495 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1496 		if ((pmap_load(l2) & PTE_V) != 0 &&
1497 		    (pmap_load(l2) & PTE_RWX) == 0) {
1498 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1499 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1500 				kernel_vm_end = vm_map_max(kernel_map);
1501 				break;
1502 			}
1503 			continue;
1504 		}
1505 
1506 		nkpg = vm_page_alloc(NULL, kernel_vm_end >> L2_SHIFT,
1507 		    VM_ALLOC_INTERRUPT | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
1508 		    VM_ALLOC_ZERO);
1509 		if (nkpg == NULL)
1510 			panic("pmap_growkernel: no memory to grow kernel");
1511 		if ((nkpg->flags & PG_ZERO) == 0) {
1512 			pmap_zero_page(nkpg);
1513 		}
1514 		paddr = VM_PAGE_TO_PHYS(nkpg);
1515 
1516 		pn = (paddr / PAGE_SIZE);
1517 		entry = (PTE_V);
1518 		entry |= (pn << PTE_PPN0_S);
1519 		pmap_store(l2, entry);
1520 
1521 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
1522 
1523 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1524 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1525 			kernel_vm_end = vm_map_max(kernel_map);
1526 			break;
1527 		}
1528 	}
1529 }
1530 
1531 /***************************************************
1532  * page management routines.
1533  ***************************************************/
1534 
1535 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1536 CTASSERT(_NPCM == 3);
1537 CTASSERT(_NPCPV == 168);
1538 
1539 static __inline struct pv_chunk *
1540 pv_to_chunk(pv_entry_t pv)
1541 {
1542 
1543 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1544 }
1545 
1546 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1547 
1548 #define	PC_FREE0	0xfffffffffffffffful
1549 #define	PC_FREE1	0xfffffffffffffffful
1550 #define	PC_FREE2	0x000000fffffffffful
1551 
1552 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1553 
1554 #if 0
1555 #ifdef PV_STATS
1556 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1557 
1558 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1559 	"Current number of pv entry chunks");
1560 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1561 	"Current number of pv entry chunks allocated");
1562 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1563 	"Current number of pv entry chunks frees");
1564 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1565 	"Number of times tried to get a chunk page but failed.");
1566 
1567 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
1568 static int pv_entry_spare;
1569 
1570 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1571 	"Current number of pv entry frees");
1572 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1573 	"Current number of pv entry allocs");
1574 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1575 	"Current number of pv entries");
1576 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1577 	"Current number of spare pv entries");
1578 #endif
1579 #endif /* 0 */
1580 
1581 /*
1582  * We are in a serious low memory condition.  Resort to
1583  * drastic measures to free some pages so we can allocate
1584  * another pv entry chunk.
1585  *
1586  * Returns NULL if PV entries were reclaimed from the specified pmap.
1587  *
1588  * We do not, however, unmap 2mpages because subsequent accesses will
1589  * allocate per-page pv entries until repromotion occurs, thereby
1590  * exacerbating the shortage of free pv entries.
1591  */
1592 static vm_page_t
1593 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1594 {
1595 
1596 	panic("RISCVTODO: reclaim_pv_chunk");
1597 }
1598 
1599 /*
1600  * free the pv_entry back to the free list
1601  */
1602 static void
1603 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1604 {
1605 	struct pv_chunk *pc;
1606 	int idx, field, bit;
1607 
1608 	rw_assert(&pvh_global_lock, RA_LOCKED);
1609 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1610 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1611 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1612 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1613 	pc = pv_to_chunk(pv);
1614 	idx = pv - &pc->pc_pventry[0];
1615 	field = idx / 64;
1616 	bit = idx % 64;
1617 	pc->pc_map[field] |= 1ul << bit;
1618 	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
1619 	    pc->pc_map[2] != PC_FREE2) {
1620 		/* 98% of the time, pc is already at the head of the list. */
1621 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1622 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1623 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1624 		}
1625 		return;
1626 	}
1627 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1628 	free_pv_chunk(pc);
1629 }
1630 
1631 static void
1632 free_pv_chunk(struct pv_chunk *pc)
1633 {
1634 	vm_page_t m;
1635 
1636 	mtx_lock(&pv_chunks_mutex);
1637  	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1638 	mtx_unlock(&pv_chunks_mutex);
1639 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1640 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1641 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1642 	/* entire chunk is free, return it */
1643 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1644 	dump_drop_page(m->phys_addr);
1645 	vm_page_unwire_noq(m);
1646 	vm_page_free(m);
1647 }
1648 
1649 /*
1650  * Returns a new PV entry, allocating a new PV chunk from the system when
1651  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
1652  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
1653  * returned.
1654  *
1655  * The given PV list lock may be released.
1656  */
1657 static pv_entry_t
1658 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
1659 {
1660 	int bit, field;
1661 	pv_entry_t pv;
1662 	struct pv_chunk *pc;
1663 	vm_page_t m;
1664 
1665 	rw_assert(&pvh_global_lock, RA_LOCKED);
1666 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1667 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
1668 retry:
1669 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1670 	if (pc != NULL) {
1671 		for (field = 0; field < _NPCM; field++) {
1672 			if (pc->pc_map[field]) {
1673 				bit = ffsl(pc->pc_map[field]) - 1;
1674 				break;
1675 			}
1676 		}
1677 		if (field < _NPCM) {
1678 			pv = &pc->pc_pventry[field * 64 + bit];
1679 			pc->pc_map[field] &= ~(1ul << bit);
1680 			/* If this was the last item, move it to tail */
1681 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
1682 			    pc->pc_map[2] == 0) {
1683 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1684 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
1685 				    pc_list);
1686 			}
1687 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
1688 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
1689 			return (pv);
1690 		}
1691 	}
1692 	/* No free items, allocate another chunk */
1693 	m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1694 	    VM_ALLOC_WIRED);
1695 	if (m == NULL) {
1696 		if (lockp == NULL) {
1697 			PV_STAT(pc_chunk_tryfail++);
1698 			return (NULL);
1699 		}
1700 		m = reclaim_pv_chunk(pmap, lockp);
1701 		if (m == NULL)
1702 			goto retry;
1703 	}
1704 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1705 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1706 	dump_add_page(m->phys_addr);
1707 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1708 	pc->pc_pmap = pmap;
1709 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
1710 	pc->pc_map[1] = PC_FREE1;
1711 	pc->pc_map[2] = PC_FREE2;
1712 	mtx_lock(&pv_chunks_mutex);
1713 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1714 	mtx_unlock(&pv_chunks_mutex);
1715 	pv = &pc->pc_pventry[0];
1716 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1717 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
1718 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
1719 	return (pv);
1720 }
1721 
1722 /*
1723  * Ensure that the number of spare PV entries in the specified pmap meets or
1724  * exceeds the given count, "needed".
1725  *
1726  * The given PV list lock may be released.
1727  */
1728 static void
1729 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
1730 {
1731 	struct pch new_tail;
1732 	struct pv_chunk *pc;
1733 	vm_page_t m;
1734 	int avail, free;
1735 	bool reclaimed;
1736 
1737 	rw_assert(&pvh_global_lock, RA_LOCKED);
1738 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1739 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
1740 
1741 	/*
1742 	 * Newly allocated PV chunks must be stored in a private list until
1743 	 * the required number of PV chunks have been allocated.  Otherwise,
1744 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
1745 	 * contrast, these chunks must be added to the pmap upon allocation.
1746 	 */
1747 	TAILQ_INIT(&new_tail);
1748 retry:
1749 	avail = 0;
1750 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
1751 		bit_count((bitstr_t *)pc->pc_map, 0,
1752 		    sizeof(pc->pc_map) * NBBY, &free);
1753 		if (free == 0)
1754 			break;
1755 		avail += free;
1756 		if (avail >= needed)
1757 			break;
1758 	}
1759 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
1760 		m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL | VM_ALLOC_NOOBJ |
1761 		    VM_ALLOC_WIRED);
1762 		if (m == NULL) {
1763 			m = reclaim_pv_chunk(pmap, lockp);
1764 			if (m == NULL)
1765 				goto retry;
1766 			reclaimed = true;
1767 		}
1768 		/* XXX PV STATS */
1769 #if 0
1770 		dump_add_page(m->phys_addr);
1771 #endif
1772 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1773 		pc->pc_pmap = pmap;
1774 		pc->pc_map[0] = PC_FREE0;
1775 		pc->pc_map[1] = PC_FREE1;
1776 		pc->pc_map[2] = PC_FREE2;
1777 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1778 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1779 
1780 		/*
1781 		 * The reclaim might have freed a chunk from the current pmap.
1782 		 * If that chunk contained available entries, we need to
1783 		 * re-count the number of available entries.
1784 		 */
1785 		if (reclaimed)
1786 			goto retry;
1787 	}
1788 	if (!TAILQ_EMPTY(&new_tail)) {
1789 		mtx_lock(&pv_chunks_mutex);
1790 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
1791 		mtx_unlock(&pv_chunks_mutex);
1792 	}
1793 }
1794 
1795 /*
1796  * First find and then remove the pv entry for the specified pmap and virtual
1797  * address from the specified pv list.  Returns the pv entry if found and NULL
1798  * otherwise.  This operation can be performed on pv lists for either 4KB or
1799  * 2MB page mappings.
1800  */
1801 static __inline pv_entry_t
1802 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1803 {
1804 	pv_entry_t pv;
1805 
1806 	rw_assert(&pvh_global_lock, RA_LOCKED);
1807 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
1808 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
1809 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
1810 			pvh->pv_gen++;
1811 			break;
1812 		}
1813 	}
1814 	return (pv);
1815 }
1816 
1817 /*
1818  * First find and then destroy the pv entry for the specified pmap and virtual
1819  * address.  This operation can be performed on pv lists for either 4KB or 2MB
1820  * page mappings.
1821  */
1822 static void
1823 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1824 {
1825 	pv_entry_t pv;
1826 
1827 	pv = pmap_pvh_remove(pvh, pmap, va);
1828 
1829 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
1830 	free_pv_entry(pmap, pv);
1831 }
1832 
1833 /*
1834  * Conditionally create the PV entry for a 4KB page mapping if the required
1835  * memory can be allocated without resorting to reclamation.
1836  */
1837 static boolean_t
1838 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
1839     struct rwlock **lockp)
1840 {
1841 	pv_entry_t pv;
1842 
1843 	rw_assert(&pvh_global_lock, RA_LOCKED);
1844 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1845 	/* Pass NULL instead of the lock pointer to disable reclamation. */
1846 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
1847 		pv->pv_va = va;
1848 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
1849 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
1850 		m->md.pv_gen++;
1851 		return (TRUE);
1852 	} else
1853 		return (FALSE);
1854 }
1855 
1856 /*
1857  * After demotion from a 2MB page mapping to 512 4KB page mappings,
1858  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
1859  * entries for each of the 4KB page mappings.
1860  */
1861 static void __unused
1862 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1863     struct rwlock **lockp)
1864 {
1865 	struct md_page *pvh;
1866 	struct pv_chunk *pc;
1867 	pv_entry_t pv;
1868 	vm_page_t m;
1869 	vm_offset_t va_last;
1870 	int bit, field;
1871 
1872 	rw_assert(&pvh_global_lock, RA_LOCKED);
1873 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1874 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1875 
1876 	/*
1877 	 * Transfer the 2mpage's pv entry for this mapping to the first
1878 	 * page's pv list.  Once this transfer begins, the pv list lock
1879 	 * must not be released until the last pv entry is reinstantiated.
1880 	 */
1881 	pvh = pa_to_pvh(pa);
1882 	va &= ~L2_OFFSET;
1883 	pv = pmap_pvh_remove(pvh, pmap, va);
1884 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
1885 	m = PHYS_TO_VM_PAGE(pa);
1886 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
1887 	m->md.pv_gen++;
1888 	/* Instantiate the remaining 511 pv entries. */
1889 	va_last = va + L2_SIZE - PAGE_SIZE;
1890 	for (;;) {
1891 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1892 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
1893 		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
1894 		for (field = 0; field < _NPCM; field++) {
1895 			while (pc->pc_map[field] != 0) {
1896 				bit = ffsl(pc->pc_map[field]) - 1;
1897 				pc->pc_map[field] &= ~(1ul << bit);
1898 				pv = &pc->pc_pventry[field * 64 + bit];
1899 				va += PAGE_SIZE;
1900 				pv->pv_va = va;
1901 				m++;
1902 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
1903 			    ("pmap_pv_demote_l2: page %p is not managed", m));
1904 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
1905 				m->md.pv_gen++;
1906 				if (va == va_last)
1907 					goto out;
1908 			}
1909 		}
1910 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1911 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1912 	}
1913 out:
1914 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
1915 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1916 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
1917 	}
1918 	/* XXX PV stats */
1919 }
1920 
1921 #if VM_NRESERVLEVEL > 0
1922 static void
1923 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
1924     struct rwlock **lockp)
1925 {
1926 	struct md_page *pvh;
1927 	pv_entry_t pv;
1928 	vm_page_t m;
1929 	vm_offset_t va_last;
1930 
1931 	rw_assert(&pvh_global_lock, RA_LOCKED);
1932 	KASSERT((va & L2_OFFSET) == 0,
1933 	    ("pmap_pv_promote_l2: misaligned va %#lx", va));
1934 
1935 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1936 
1937 	m = PHYS_TO_VM_PAGE(pa);
1938 	pv = pmap_pvh_remove(&m->md, pmap, va);
1939 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
1940 	pvh = pa_to_pvh(pa);
1941 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
1942 	pvh->pv_gen++;
1943 
1944 	va_last = va + L2_SIZE - PAGE_SIZE;
1945 	do {
1946 		m++;
1947 		va += PAGE_SIZE;
1948 		pmap_pvh_free(&m->md, pmap, va);
1949 	} while (va < va_last);
1950 }
1951 #endif /* VM_NRESERVLEVEL > 0 */
1952 
1953 /*
1954  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
1955  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
1956  * false if the PV entry cannot be allocated without resorting to reclamation.
1957  */
1958 static bool
1959 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
1960     struct rwlock **lockp)
1961 {
1962 	struct md_page *pvh;
1963 	pv_entry_t pv;
1964 	vm_paddr_t pa;
1965 
1966 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1967 	/* Pass NULL instead of the lock pointer to disable reclamation. */
1968 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
1969 	    NULL : lockp)) == NULL)
1970 		return (false);
1971 	pv->pv_va = va;
1972 	pa = PTE_TO_PHYS(l2e);
1973 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
1974 	pvh = pa_to_pvh(pa);
1975 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
1976 	pvh->pv_gen++;
1977 	return (true);
1978 }
1979 
1980 static void
1981 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
1982 {
1983 	pt_entry_t newl2, oldl2;
1984 	vm_page_t ml3;
1985 	vm_paddr_t ml3pa;
1986 
1987 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
1988 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
1989 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1990 
1991 	ml3 = pmap_remove_pt_page(pmap, va);
1992 	if (ml3 == NULL)
1993 		panic("pmap_remove_kernel_l2: Missing pt page");
1994 
1995 	ml3pa = VM_PAGE_TO_PHYS(ml3);
1996 	newl2 = ml3pa | PTE_V;
1997 
1998 	/*
1999 	 * If this page table page was unmapped by a promotion, then it
2000 	 * contains valid mappings.  Zero it to invalidate those mappings.
2001 	 */
2002 	if (ml3->valid != 0)
2003 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
2004 
2005 	/*
2006 	 * Demote the mapping.
2007 	 */
2008 	oldl2 = pmap_load_store(l2, newl2);
2009 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2010 	    __func__, l2, oldl2));
2011 }
2012 
2013 /*
2014  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
2015  */
2016 static int
2017 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2018     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2019 {
2020 	struct md_page *pvh;
2021 	pt_entry_t oldl2;
2022 	vm_offset_t eva, va;
2023 	vm_page_t m, ml3;
2024 
2025 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2026 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2027 	oldl2 = pmap_load_clear(l2);
2028 	KASSERT((oldl2 & PTE_RWX) != 0,
2029 	    ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
2030 
2031 	/*
2032 	 * The sfence.vma documentation states that it is sufficient to specify
2033 	 * a single address within a superpage mapping.  However, since we do
2034 	 * not perform any invalidation upon promotion, TLBs may still be
2035 	 * caching 4KB mappings within the superpage, so we must invalidate the
2036 	 * entire range.
2037 	 */
2038 	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
2039 	if ((oldl2 & PTE_SW_WIRED) != 0)
2040 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2041 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2042 	if ((oldl2 & PTE_SW_MANAGED) != 0) {
2043 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
2044 		pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
2045 		pmap_pvh_free(pvh, pmap, sva);
2046 		eva = sva + L2_SIZE;
2047 		for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2));
2048 		    va < eva; va += PAGE_SIZE, m++) {
2049 			if ((oldl2 & PTE_D) != 0)
2050 				vm_page_dirty(m);
2051 			if ((oldl2 & PTE_A) != 0)
2052 				vm_page_aflag_set(m, PGA_REFERENCED);
2053 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2054 			    TAILQ_EMPTY(&pvh->pv_list))
2055 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2056 		}
2057 	}
2058 	if (pmap == kernel_pmap) {
2059 		pmap_remove_kernel_l2(pmap, l2, sva);
2060 	} else {
2061 		ml3 = pmap_remove_pt_page(pmap, sva);
2062 		if (ml3 != NULL) {
2063 			KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
2064 			    ("pmap_remove_l2: l3 page not promoted"));
2065 			pmap_resident_count_dec(pmap, 1);
2066 			KASSERT(ml3->ref_count == Ln_ENTRIES,
2067 			    ("pmap_remove_l2: l3 page ref count error"));
2068 			ml3->ref_count = 1;
2069 			vm_page_unwire_noq(ml3);
2070 			pmap_add_delayed_free_list(ml3, free, FALSE);
2071 		}
2072 	}
2073 	return (pmap_unuse_pt(pmap, sva, l1e, free));
2074 }
2075 
2076 /*
2077  * pmap_remove_l3: do the things to unmap a page in a process
2078  */
2079 static int
2080 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2081     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2082 {
2083 	struct md_page *pvh;
2084 	pt_entry_t old_l3;
2085 	vm_paddr_t phys;
2086 	vm_page_t m;
2087 
2088 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2089 	old_l3 = pmap_load_clear(l3);
2090 	pmap_invalidate_page(pmap, va);
2091 	if (old_l3 & PTE_SW_WIRED)
2092 		pmap->pm_stats.wired_count -= 1;
2093 	pmap_resident_count_dec(pmap, 1);
2094 	if (old_l3 & PTE_SW_MANAGED) {
2095 		phys = PTE_TO_PHYS(old_l3);
2096 		m = PHYS_TO_VM_PAGE(phys);
2097 		if ((old_l3 & PTE_D) != 0)
2098 			vm_page_dirty(m);
2099 		if (old_l3 & PTE_A)
2100 			vm_page_aflag_set(m, PGA_REFERENCED);
2101 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2102 		pmap_pvh_free(&m->md, pmap, va);
2103 		if (TAILQ_EMPTY(&m->md.pv_list) &&
2104 		    (m->flags & PG_FICTITIOUS) == 0) {
2105 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2106 			if (TAILQ_EMPTY(&pvh->pv_list))
2107 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2108 		}
2109 	}
2110 
2111 	return (pmap_unuse_pt(pmap, va, l2e, free));
2112 }
2113 
2114 /*
2115  *	Remove the given range of addresses from the specified map.
2116  *
2117  *	It is assumed that the start and end are properly
2118  *	rounded to the page size.
2119  */
2120 void
2121 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2122 {
2123 	struct spglist free;
2124 	struct rwlock *lock;
2125 	vm_offset_t va, va_next;
2126 	pd_entry_t *l1, *l2, l2e;
2127 	pt_entry_t *l3;
2128 
2129 	/*
2130 	 * Perform an unsynchronized read.  This is, however, safe.
2131 	 */
2132 	if (pmap->pm_stats.resident_count == 0)
2133 		return;
2134 
2135 	SLIST_INIT(&free);
2136 
2137 	rw_rlock(&pvh_global_lock);
2138 	PMAP_LOCK(pmap);
2139 
2140 	lock = NULL;
2141 	for (; sva < eva; sva = va_next) {
2142 		if (pmap->pm_stats.resident_count == 0)
2143 			break;
2144 
2145 		l1 = pmap_l1(pmap, sva);
2146 		if (pmap_load(l1) == 0) {
2147 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2148 			if (va_next < sva)
2149 				va_next = eva;
2150 			continue;
2151 		}
2152 
2153 		/*
2154 		 * Calculate index for next page table.
2155 		 */
2156 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2157 		if (va_next < sva)
2158 			va_next = eva;
2159 
2160 		l2 = pmap_l1_to_l2(l1, sva);
2161 		if (l2 == NULL)
2162 			continue;
2163 		if ((l2e = pmap_load(l2)) == 0)
2164 			continue;
2165 		if ((l2e & PTE_RWX) != 0) {
2166 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2167 				(void)pmap_remove_l2(pmap, l2, sva,
2168 				    pmap_load(l1), &free, &lock);
2169 				continue;
2170 			} else if (!pmap_demote_l2_locked(pmap, l2, sva,
2171 			    &lock)) {
2172 				/*
2173 				 * The large page mapping was destroyed.
2174 				 */
2175 				continue;
2176 			}
2177 			l2e = pmap_load(l2);
2178 		}
2179 
2180 		/*
2181 		 * Limit our scan to either the end of the va represented
2182 		 * by the current page table page, or to the end of the
2183 		 * range being removed.
2184 		 */
2185 		if (va_next > eva)
2186 			va_next = eva;
2187 
2188 		va = va_next;
2189 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2190 		    sva += L3_SIZE) {
2191 			if (pmap_load(l3) == 0) {
2192 				if (va != va_next) {
2193 					pmap_invalidate_range(pmap, va, sva);
2194 					va = va_next;
2195 				}
2196 				continue;
2197 			}
2198 			if (va == va_next)
2199 				va = sva;
2200 			if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
2201 				sva += L3_SIZE;
2202 				break;
2203 			}
2204 		}
2205 		if (va != va_next)
2206 			pmap_invalidate_range(pmap, va, sva);
2207 	}
2208 	if (lock != NULL)
2209 		rw_wunlock(lock);
2210 	rw_runlock(&pvh_global_lock);
2211 	PMAP_UNLOCK(pmap);
2212 	vm_page_free_pages_toq(&free, false);
2213 }
2214 
2215 /*
2216  *	Routine:	pmap_remove_all
2217  *	Function:
2218  *		Removes this physical page from
2219  *		all physical maps in which it resides.
2220  *		Reflects back modify bits to the pager.
2221  *
2222  *	Notes:
2223  *		Original versions of this routine were very
2224  *		inefficient because they iteratively called
2225  *		pmap_remove (slow...)
2226  */
2227 
2228 void
2229 pmap_remove_all(vm_page_t m)
2230 {
2231 	struct spglist free;
2232 	struct md_page *pvh;
2233 	pmap_t pmap;
2234 	pt_entry_t *l3, l3e;
2235 	pd_entry_t *l2, l2e;
2236 	pv_entry_t pv;
2237 	vm_offset_t va;
2238 
2239 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2240 	    ("pmap_remove_all: page %p is not managed", m));
2241 	SLIST_INIT(&free);
2242 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2243 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
2244 
2245 	rw_wlock(&pvh_global_lock);
2246 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2247 		pmap = PV_PMAP(pv);
2248 		PMAP_LOCK(pmap);
2249 		va = pv->pv_va;
2250 		l2 = pmap_l2(pmap, va);
2251 		(void)pmap_demote_l2(pmap, l2, va);
2252 		PMAP_UNLOCK(pmap);
2253 	}
2254 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2255 		pmap = PV_PMAP(pv);
2256 		PMAP_LOCK(pmap);
2257 		pmap_resident_count_dec(pmap, 1);
2258 		l2 = pmap_l2(pmap, pv->pv_va);
2259 		KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
2260 		l2e = pmap_load(l2);
2261 
2262 		KASSERT((l2e & PTE_RX) == 0,
2263 		    ("pmap_remove_all: found a superpage in %p's pv list", m));
2264 
2265 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
2266 		l3e = pmap_load_clear(l3);
2267 		pmap_invalidate_page(pmap, pv->pv_va);
2268 		if (l3e & PTE_SW_WIRED)
2269 			pmap->pm_stats.wired_count--;
2270 		if ((l3e & PTE_A) != 0)
2271 			vm_page_aflag_set(m, PGA_REFERENCED);
2272 
2273 		/*
2274 		 * Update the vm_page_t clean and reference bits.
2275 		 */
2276 		if ((l3e & PTE_D) != 0)
2277 			vm_page_dirty(m);
2278 		pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
2279 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2280 		m->md.pv_gen++;
2281 		free_pv_entry(pmap, pv);
2282 		PMAP_UNLOCK(pmap);
2283 	}
2284 	vm_page_aflag_clear(m, PGA_WRITEABLE);
2285 	rw_wunlock(&pvh_global_lock);
2286 	vm_page_free_pages_toq(&free, false);
2287 }
2288 
2289 /*
2290  *	Set the physical protection on the
2291  *	specified range of this map as requested.
2292  */
2293 void
2294 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2295 {
2296 	pd_entry_t *l1, *l2, l2e;
2297 	pt_entry_t *l3, l3e, mask;
2298 	vm_page_t m, mt;
2299 	vm_paddr_t pa;
2300 	vm_offset_t va_next;
2301 	bool anychanged, pv_lists_locked;
2302 
2303 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2304 		pmap_remove(pmap, sva, eva);
2305 		return;
2306 	}
2307 
2308 	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2309 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
2310 		return;
2311 
2312 	anychanged = false;
2313 	pv_lists_locked = false;
2314 	mask = 0;
2315 	if ((prot & VM_PROT_WRITE) == 0)
2316 		mask |= PTE_W | PTE_D;
2317 	if ((prot & VM_PROT_EXECUTE) == 0)
2318 		mask |= PTE_X;
2319 resume:
2320 	PMAP_LOCK(pmap);
2321 	for (; sva < eva; sva = va_next) {
2322 		l1 = pmap_l1(pmap, sva);
2323 		if (pmap_load(l1) == 0) {
2324 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2325 			if (va_next < sva)
2326 				va_next = eva;
2327 			continue;
2328 		}
2329 
2330 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2331 		if (va_next < sva)
2332 			va_next = eva;
2333 
2334 		l2 = pmap_l1_to_l2(l1, sva);
2335 		if (l2 == NULL || (l2e = pmap_load(l2)) == 0)
2336 			continue;
2337 		if ((l2e & PTE_RWX) != 0) {
2338 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2339 retryl2:
2340 				if ((prot & VM_PROT_WRITE) == 0 &&
2341 				    (l2e & (PTE_SW_MANAGED | PTE_D)) ==
2342 				    (PTE_SW_MANAGED | PTE_D)) {
2343 					pa = PTE_TO_PHYS(l2e);
2344 					m = PHYS_TO_VM_PAGE(pa);
2345 					for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
2346 						vm_page_dirty(mt);
2347 				}
2348 				if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
2349 					goto retryl2;
2350 				anychanged = true;
2351 				continue;
2352 			} else {
2353 				if (!pv_lists_locked) {
2354 					pv_lists_locked = true;
2355 					if (!rw_try_rlock(&pvh_global_lock)) {
2356 						if (anychanged)
2357 							pmap_invalidate_all(
2358 							    pmap);
2359 						PMAP_UNLOCK(pmap);
2360 						rw_rlock(&pvh_global_lock);
2361 						goto resume;
2362 					}
2363 				}
2364 				if (!pmap_demote_l2(pmap, l2, sva)) {
2365 					/*
2366 					 * The large page mapping was destroyed.
2367 					 */
2368 					continue;
2369 				}
2370 			}
2371 		}
2372 
2373 		if (va_next > eva)
2374 			va_next = eva;
2375 
2376 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2377 		    sva += L3_SIZE) {
2378 			l3e = pmap_load(l3);
2379 retryl3:
2380 			if ((l3e & PTE_V) == 0)
2381 				continue;
2382 			if ((prot & VM_PROT_WRITE) == 0 &&
2383 			    (l3e & (PTE_SW_MANAGED | PTE_D)) ==
2384 			    (PTE_SW_MANAGED | PTE_D)) {
2385 				m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e));
2386 				vm_page_dirty(m);
2387 			}
2388 			if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
2389 				goto retryl3;
2390 			anychanged = true;
2391 		}
2392 	}
2393 	if (anychanged)
2394 		pmap_invalidate_all(pmap);
2395 	if (pv_lists_locked)
2396 		rw_runlock(&pvh_global_lock);
2397 	PMAP_UNLOCK(pmap);
2398 }
2399 
2400 int
2401 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
2402 {
2403 	pd_entry_t *l2, l2e;
2404 	pt_entry_t bits, *pte, oldpte;
2405 	int rv;
2406 
2407 	rv = 0;
2408 	PMAP_LOCK(pmap);
2409 	l2 = pmap_l2(pmap, va);
2410 	if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
2411 		goto done;
2412 	if ((l2e & PTE_RWX) == 0) {
2413 		pte = pmap_l2_to_l3(l2, va);
2414 		if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0)
2415 			goto done;
2416 	} else {
2417 		pte = l2;
2418 		oldpte = l2e;
2419 	}
2420 
2421 	if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) ||
2422 	    (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) ||
2423 	    (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) ||
2424 	    (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0))
2425 		goto done;
2426 
2427 	bits = PTE_A;
2428 	if (ftype == VM_PROT_WRITE)
2429 		bits |= PTE_D;
2430 
2431 	/*
2432 	 * Spurious faults can occur if the implementation caches invalid
2433 	 * entries in the TLB, or if simultaneous accesses on multiple CPUs
2434 	 * race with each other.
2435 	 */
2436 	if ((oldpte & bits) != bits)
2437 		pmap_store_bits(pte, bits);
2438 	sfence_vma();
2439 	rv = 1;
2440 done:
2441 	PMAP_UNLOCK(pmap);
2442 	return (rv);
2443 }
2444 
2445 static bool
2446 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va)
2447 {
2448 	struct rwlock *lock;
2449 	bool rv;
2450 
2451 	lock = NULL;
2452 	rv = pmap_demote_l2_locked(pmap, l2, va, &lock);
2453 	if (lock != NULL)
2454 		rw_wunlock(lock);
2455 	return (rv);
2456 }
2457 
2458 /*
2459  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
2460  * mapping is invalidated.
2461  */
2462 static bool
2463 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
2464     struct rwlock **lockp)
2465 {
2466 	struct spglist free;
2467 	vm_page_t mpte;
2468 	pd_entry_t newl2, oldl2;
2469 	pt_entry_t *firstl3, newl3;
2470 	vm_paddr_t mptepa;
2471 	int i;
2472 
2473 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2474 
2475 	oldl2 = pmap_load(l2);
2476 	KASSERT((oldl2 & PTE_RWX) != 0,
2477 	    ("pmap_demote_l2_locked: oldl2 is not a leaf entry"));
2478 	if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
2479 	    NULL) {
2480 		if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc(NULL,
2481 		    pmap_l2_pindex(va), (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT :
2482 		    VM_ALLOC_NORMAL) | VM_ALLOC_NOOBJ | VM_ALLOC_WIRED)) ==
2483 		    NULL) {
2484 			SLIST_INIT(&free);
2485 			(void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET,
2486 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
2487 			vm_page_free_pages_toq(&free, true);
2488 			CTR2(KTR_PMAP, "pmap_demote_l2_locked: "
2489 			    "failure for va %#lx in pmap %p", va, pmap);
2490 			return (false);
2491 		}
2492 		if (va < VM_MAXUSER_ADDRESS) {
2493 			mpte->ref_count = Ln_ENTRIES;
2494 			pmap_resident_count_inc(pmap, 1);
2495 		}
2496 	}
2497 	mptepa = VM_PAGE_TO_PHYS(mpte);
2498 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2499 	newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
2500 	KASSERT((oldl2 & PTE_A) != 0,
2501 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_A"));
2502 	KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W,
2503 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_D"));
2504 	newl3 = oldl2;
2505 
2506 	/*
2507 	 * If the page table page is not leftover from an earlier promotion,
2508 	 * initialize it.
2509 	 */
2510 	if (mpte->valid == 0) {
2511 		for (i = 0; i < Ln_ENTRIES; i++)
2512 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
2513 	}
2514 	KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3),
2515 	    ("pmap_demote_l2_locked: firstl3 and newl3 map different physical "
2516 	    "addresses"));
2517 
2518 	/*
2519 	 * If the mapping has changed attributes, update the page table
2520 	 * entries.
2521 	 */
2522 	if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE))
2523 		for (i = 0; i < Ln_ENTRIES; i++)
2524 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
2525 
2526 	/*
2527 	 * The spare PV entries must be reserved prior to demoting the
2528 	 * mapping, that is, prior to changing the L2 entry.  Otherwise, the
2529 	 * state of the L2 entry and the PV lists will be inconsistent, which
2530 	 * can result in reclaim_pv_chunk() attempting to remove a PV entry from
2531 	 * the wrong PV list and pmap_pv_demote_l2() failing to find the
2532 	 * expected PV entry for the 2MB page mapping that is being demoted.
2533 	 */
2534 	if ((oldl2 & PTE_SW_MANAGED) != 0)
2535 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
2536 
2537 	/*
2538 	 * Demote the mapping.
2539 	 */
2540 	pmap_store(l2, newl2);
2541 
2542 	/*
2543 	 * Demote the PV entry.
2544 	 */
2545 	if ((oldl2 & PTE_SW_MANAGED) != 0)
2546 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
2547 
2548 	atomic_add_long(&pmap_l2_demotions, 1);
2549 	CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p",
2550 	    va, pmap);
2551 	return (true);
2552 }
2553 
2554 #if VM_NRESERVLEVEL > 0
2555 static void
2556 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
2557     struct rwlock **lockp)
2558 {
2559 	pt_entry_t *firstl3, firstl3e, *l3, l3e;
2560 	vm_paddr_t pa;
2561 	vm_page_t ml3;
2562 
2563 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2564 
2565 	va &= ~L2_OFFSET;
2566 	KASSERT((pmap_load(l2) & PTE_RWX) == 0,
2567 	    ("pmap_promote_l2: invalid l2 entry %p", l2));
2568 
2569 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
2570 	firstl3e = pmap_load(firstl3);
2571 	pa = PTE_TO_PHYS(firstl3e);
2572 	if ((pa & L2_OFFSET) != 0) {
2573 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
2574 		    va, pmap);
2575 		atomic_add_long(&pmap_l2_p_failures, 1);
2576 		return;
2577 	}
2578 
2579 	/*
2580 	 * Downgrade a clean, writable mapping to read-only to ensure that the
2581 	 * hardware does not set PTE_D while we are comparing PTEs.
2582 	 *
2583 	 * Upon a write access to a clean mapping, the implementation will
2584 	 * either atomically check protections and set PTE_D, or raise a page
2585 	 * fault.  In the latter case, the pmap lock provides atomicity.  Thus,
2586 	 * we do not issue an sfence.vma here and instead rely on pmap_fault()
2587 	 * to do so lazily.
2588 	 */
2589 	while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) {
2590 		if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) {
2591 			firstl3e &= ~PTE_W;
2592 			break;
2593 		}
2594 	}
2595 
2596 	pa += PAGE_SIZE;
2597 	for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) {
2598 		l3e = pmap_load(l3);
2599 		if (PTE_TO_PHYS(l3e) != pa) {
2600 			CTR2(KTR_PMAP,
2601 			    "pmap_promote_l2: failure for va %#lx pmap %p",
2602 			    va, pmap);
2603 			atomic_add_long(&pmap_l2_p_failures, 1);
2604 			return;
2605 		}
2606 		while ((l3e & (PTE_W | PTE_D)) == PTE_W) {
2607 			if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) {
2608 				l3e &= ~PTE_W;
2609 				break;
2610 			}
2611 		}
2612 		if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) {
2613 			CTR2(KTR_PMAP,
2614 			    "pmap_promote_l2: failure for va %#lx pmap %p",
2615 			    va, pmap);
2616 			atomic_add_long(&pmap_l2_p_failures, 1);
2617 			return;
2618 		}
2619 		pa += PAGE_SIZE;
2620 	}
2621 
2622 	ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
2623 	KASSERT(ml3->pindex == pmap_l2_pindex(va),
2624 	    ("pmap_promote_l2: page table page's pindex is wrong"));
2625 	if (pmap_insert_pt_page(pmap, ml3, true)) {
2626 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
2627 		    va, pmap);
2628 		atomic_add_long(&pmap_l2_p_failures, 1);
2629 		return;
2630 	}
2631 
2632 	if ((firstl3e & PTE_SW_MANAGED) != 0)
2633 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp);
2634 
2635 	pmap_store(l2, firstl3e);
2636 
2637 	atomic_add_long(&pmap_l2_promotions, 1);
2638 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
2639 	    pmap);
2640 }
2641 #endif
2642 
2643 /*
2644  *	Insert the given physical page (p) at
2645  *	the specified virtual address (v) in the
2646  *	target physical map with the protection requested.
2647  *
2648  *	If specified, the page will be wired down, meaning
2649  *	that the related pte can not be reclaimed.
2650  *
2651  *	NB:  This is the only routine which MAY NOT lazy-evaluate
2652  *	or lose information.  That is, this routine must actually
2653  *	insert this page into the given map NOW.
2654  */
2655 int
2656 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2657     u_int flags, int8_t psind)
2658 {
2659 	struct rwlock *lock;
2660 	pd_entry_t *l1, *l2, l2e;
2661 	pt_entry_t new_l3, orig_l3;
2662 	pt_entry_t *l3;
2663 	pv_entry_t pv;
2664 	vm_paddr_t opa, pa, l2_pa, l3_pa;
2665 	vm_page_t mpte, om, l2_m, l3_m;
2666 	pt_entry_t entry;
2667 	pn_t l2_pn, l3_pn, pn;
2668 	int rv;
2669 	bool nosleep;
2670 
2671 	va = trunc_page(va);
2672 	if ((m->oflags & VPO_UNMANAGED) == 0)
2673 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
2674 	pa = VM_PAGE_TO_PHYS(m);
2675 	pn = (pa / PAGE_SIZE);
2676 
2677 	new_l3 = PTE_V | PTE_R | PTE_A;
2678 	if (prot & VM_PROT_EXECUTE)
2679 		new_l3 |= PTE_X;
2680 	if (flags & VM_PROT_WRITE)
2681 		new_l3 |= PTE_D;
2682 	if (prot & VM_PROT_WRITE)
2683 		new_l3 |= PTE_W;
2684 	if (va < VM_MAX_USER_ADDRESS)
2685 		new_l3 |= PTE_U;
2686 
2687 	new_l3 |= (pn << PTE_PPN0_S);
2688 	if ((flags & PMAP_ENTER_WIRED) != 0)
2689 		new_l3 |= PTE_SW_WIRED;
2690 
2691 	/*
2692 	 * Set modified bit gratuitously for writeable mappings if
2693 	 * the page is unmanaged. We do not want to take a fault
2694 	 * to do the dirty bit accounting for these mappings.
2695 	 */
2696 	if ((m->oflags & VPO_UNMANAGED) != 0) {
2697 		if (prot & VM_PROT_WRITE)
2698 			new_l3 |= PTE_D;
2699 	} else
2700 		new_l3 |= PTE_SW_MANAGED;
2701 
2702 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
2703 
2704 	lock = NULL;
2705 	mpte = NULL;
2706 	rw_rlock(&pvh_global_lock);
2707 	PMAP_LOCK(pmap);
2708 	if (psind == 1) {
2709 		/* Assert the required virtual and physical alignment. */
2710 		KASSERT((va & L2_OFFSET) == 0,
2711 		    ("pmap_enter: va %#lx unaligned", va));
2712 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
2713 		rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock);
2714 		goto out;
2715 	}
2716 
2717 	l2 = pmap_l2(pmap, va);
2718 	if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
2719 	    ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2,
2720 	    va, &lock))) {
2721 		l3 = pmap_l2_to_l3(l2, va);
2722 		if (va < VM_MAXUSER_ADDRESS) {
2723 			mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
2724 			mpte->ref_count++;
2725 		}
2726 	} else if (va < VM_MAXUSER_ADDRESS) {
2727 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2728 		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
2729 		if (mpte == NULL && nosleep) {
2730 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
2731 			if (lock != NULL)
2732 				rw_wunlock(lock);
2733 			rw_runlock(&pvh_global_lock);
2734 			PMAP_UNLOCK(pmap);
2735 			return (KERN_RESOURCE_SHORTAGE);
2736 		}
2737 		l3 = pmap_l3(pmap, va);
2738 	} else {
2739 		l3 = pmap_l3(pmap, va);
2740 		/* TODO: This is not optimal, but should mostly work */
2741 		if (l3 == NULL) {
2742 			if (l2 == NULL) {
2743 				l2_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2744 				    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED |
2745 				    VM_ALLOC_ZERO);
2746 				if (l2_m == NULL)
2747 					panic("pmap_enter: l2 pte_m == NULL");
2748 				if ((l2_m->flags & PG_ZERO) == 0)
2749 					pmap_zero_page(l2_m);
2750 
2751 				l2_pa = VM_PAGE_TO_PHYS(l2_m);
2752 				l2_pn = (l2_pa / PAGE_SIZE);
2753 
2754 				l1 = pmap_l1(pmap, va);
2755 				entry = (PTE_V);
2756 				entry |= (l2_pn << PTE_PPN0_S);
2757 				pmap_store(l1, entry);
2758 				pmap_distribute_l1(pmap, pmap_l1_index(va), entry);
2759 				l2 = pmap_l1_to_l2(l1, va);
2760 			}
2761 
2762 			l3_m = vm_page_alloc(NULL, 0, VM_ALLOC_NORMAL |
2763 			    VM_ALLOC_NOOBJ | VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2764 			if (l3_m == NULL)
2765 				panic("pmap_enter: l3 pte_m == NULL");
2766 			if ((l3_m->flags & PG_ZERO) == 0)
2767 				pmap_zero_page(l3_m);
2768 
2769 			l3_pa = VM_PAGE_TO_PHYS(l3_m);
2770 			l3_pn = (l3_pa / PAGE_SIZE);
2771 			entry = (PTE_V);
2772 			entry |= (l3_pn << PTE_PPN0_S);
2773 			pmap_store(l2, entry);
2774 			l3 = pmap_l2_to_l3(l2, va);
2775 		}
2776 		pmap_invalidate_page(pmap, va);
2777 	}
2778 
2779 	orig_l3 = pmap_load(l3);
2780 	opa = PTE_TO_PHYS(orig_l3);
2781 	pv = NULL;
2782 
2783 	/*
2784 	 * Is the specified virtual address already mapped?
2785 	 */
2786 	if ((orig_l3 & PTE_V) != 0) {
2787 		/*
2788 		 * Wiring change, just update stats. We don't worry about
2789 		 * wiring PT pages as they remain resident as long as there
2790 		 * are valid mappings in them. Hence, if a user page is wired,
2791 		 * the PT page will be also.
2792 		 */
2793 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
2794 		    (orig_l3 & PTE_SW_WIRED) == 0)
2795 			pmap->pm_stats.wired_count++;
2796 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
2797 		    (orig_l3 & PTE_SW_WIRED) != 0)
2798 			pmap->pm_stats.wired_count--;
2799 
2800 		/*
2801 		 * Remove the extra PT page reference.
2802 		 */
2803 		if (mpte != NULL) {
2804 			mpte->ref_count--;
2805 			KASSERT(mpte->ref_count > 0,
2806 			    ("pmap_enter: missing reference to page table page,"
2807 			     " va: 0x%lx", va));
2808 		}
2809 
2810 		/*
2811 		 * Has the physical page changed?
2812 		 */
2813 		if (opa == pa) {
2814 			/*
2815 			 * No, might be a protection or wiring change.
2816 			 */
2817 			if ((orig_l3 & PTE_SW_MANAGED) != 0 &&
2818 			    (new_l3 & PTE_W) != 0)
2819 				vm_page_aflag_set(m, PGA_WRITEABLE);
2820 			goto validate;
2821 		}
2822 
2823 		/*
2824 		 * The physical page has changed.  Temporarily invalidate
2825 		 * the mapping.  This ensures that all threads sharing the
2826 		 * pmap keep a consistent view of the mapping, which is
2827 		 * necessary for the correct handling of COW faults.  It
2828 		 * also permits reuse of the old mapping's PV entry,
2829 		 * avoiding an allocation.
2830 		 *
2831 		 * For consistency, handle unmanaged mappings the same way.
2832 		 */
2833 		orig_l3 = pmap_load_clear(l3);
2834 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
2835 		    ("pmap_enter: unexpected pa update for %#lx", va));
2836 		if ((orig_l3 & PTE_SW_MANAGED) != 0) {
2837 			om = PHYS_TO_VM_PAGE(opa);
2838 
2839 			/*
2840 			 * The pmap lock is sufficient to synchronize with
2841 			 * concurrent calls to pmap_page_test_mappings() and
2842 			 * pmap_ts_referenced().
2843 			 */
2844 			if ((orig_l3 & PTE_D) != 0)
2845 				vm_page_dirty(om);
2846 			if ((orig_l3 & PTE_A) != 0)
2847 				vm_page_aflag_set(om, PGA_REFERENCED);
2848 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
2849 			pv = pmap_pvh_remove(&om->md, pmap, va);
2850 			KASSERT(pv != NULL,
2851 			    ("pmap_enter: no PV entry for %#lx", va));
2852 			if ((new_l3 & PTE_SW_MANAGED) == 0)
2853 				free_pv_entry(pmap, pv);
2854 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
2855 			    TAILQ_EMPTY(&om->md.pv_list) &&
2856 			    ((om->flags & PG_FICTITIOUS) != 0 ||
2857 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
2858 				vm_page_aflag_clear(om, PGA_WRITEABLE);
2859 		}
2860 		pmap_invalidate_page(pmap, va);
2861 		orig_l3 = 0;
2862 	} else {
2863 		/*
2864 		 * Increment the counters.
2865 		 */
2866 		if ((new_l3 & PTE_SW_WIRED) != 0)
2867 			pmap->pm_stats.wired_count++;
2868 		pmap_resident_count_inc(pmap, 1);
2869 	}
2870 	/*
2871 	 * Enter on the PV list if part of our managed memory.
2872 	 */
2873 	if ((new_l3 & PTE_SW_MANAGED) != 0) {
2874 		if (pv == NULL) {
2875 			pv = get_pv_entry(pmap, &lock);
2876 			pv->pv_va = va;
2877 		}
2878 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
2879 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2880 		m->md.pv_gen++;
2881 		if ((new_l3 & PTE_W) != 0)
2882 			vm_page_aflag_set(m, PGA_WRITEABLE);
2883 	}
2884 
2885 validate:
2886 	/*
2887 	 * Sync the i-cache on all harts before updating the PTE
2888 	 * if the new PTE is executable.
2889 	 */
2890 	if (prot & VM_PROT_EXECUTE)
2891 		pmap_sync_icache(pmap, va, PAGE_SIZE);
2892 
2893 	/*
2894 	 * Update the L3 entry.
2895 	 */
2896 	if (orig_l3 != 0) {
2897 		orig_l3 = pmap_load_store(l3, new_l3);
2898 		pmap_invalidate_page(pmap, va);
2899 		KASSERT(PTE_TO_PHYS(orig_l3) == pa,
2900 		    ("pmap_enter: invalid update"));
2901 		if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) ==
2902 		    (PTE_D | PTE_SW_MANAGED))
2903 			vm_page_dirty(m);
2904 	} else {
2905 		pmap_store(l3, new_l3);
2906 	}
2907 
2908 #if VM_NRESERVLEVEL > 0
2909 	if (mpte != NULL && mpte->ref_count == Ln_ENTRIES &&
2910 	    pmap_ps_enabled(pmap) &&
2911 	    (m->flags & PG_FICTITIOUS) == 0 &&
2912 	    vm_reserv_level_iffullpop(m) == 0)
2913 		pmap_promote_l2(pmap, l2, va, &lock);
2914 #endif
2915 
2916 	rv = KERN_SUCCESS;
2917 out:
2918 	if (lock != NULL)
2919 		rw_wunlock(lock);
2920 	rw_runlock(&pvh_global_lock);
2921 	PMAP_UNLOCK(pmap);
2922 	return (rv);
2923 }
2924 
2925 /*
2926  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
2927  * if successful.  Returns false if (1) a page table page cannot be allocated
2928  * without sleeping, (2) a mapping already exists at the specified virtual
2929  * address, or (3) a PV entry cannot be allocated without reclaiming another
2930  * PV entry.
2931  */
2932 static bool
2933 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2934     struct rwlock **lockp)
2935 {
2936 	pd_entry_t new_l2;
2937 	pn_t pn;
2938 
2939 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2940 
2941 	pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
2942 	new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V);
2943 	if ((m->oflags & VPO_UNMANAGED) == 0)
2944 		new_l2 |= PTE_SW_MANAGED;
2945 	if ((prot & VM_PROT_EXECUTE) != 0)
2946 		new_l2 |= PTE_X;
2947 	if (va < VM_MAXUSER_ADDRESS)
2948 		new_l2 |= PTE_U;
2949 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
2950 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
2951 	    KERN_SUCCESS);
2952 }
2953 
2954 /*
2955  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
2956  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
2957  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
2958  * a mapping already exists at the specified virtual address.  Returns
2959  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
2960  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
2961  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
2962  *
2963  * The parameter "m" is only used when creating a managed, writeable mapping.
2964  */
2965 static int
2966 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
2967     vm_page_t m, struct rwlock **lockp)
2968 {
2969 	struct spglist free;
2970 	pd_entry_t *l2, *l3, oldl2;
2971 	vm_offset_t sva;
2972 	vm_page_t l2pg, mt;
2973 
2974 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2975 
2976 	if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
2977 	    NULL : lockp)) == NULL) {
2978 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
2979 		    va, pmap);
2980 		return (KERN_RESOURCE_SHORTAGE);
2981 	}
2982 
2983 	l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2984 	l2 = &l2[pmap_l2_index(va)];
2985 	if ((oldl2 = pmap_load(l2)) != 0) {
2986 		KASSERT(l2pg->ref_count > 1,
2987 		    ("pmap_enter_l2: l2pg's ref count is too low"));
2988 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
2989 			l2pg->ref_count--;
2990 			CTR2(KTR_PMAP,
2991 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
2992 			    va, pmap);
2993 			return (KERN_FAILURE);
2994 		}
2995 		SLIST_INIT(&free);
2996 		if ((oldl2 & PTE_RWX) != 0)
2997 			(void)pmap_remove_l2(pmap, l2, va,
2998 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
2999 		else
3000 			for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) {
3001 				l3 = pmap_l2_to_l3(l2, sva);
3002 				if ((pmap_load(l3) & PTE_V) != 0 &&
3003 				    pmap_remove_l3(pmap, l3, sva, oldl2, &free,
3004 				    lockp) != 0)
3005 					break;
3006 			}
3007 		vm_page_free_pages_toq(&free, true);
3008 		if (va >= VM_MAXUSER_ADDRESS) {
3009 			/*
3010 			 * Both pmap_remove_l2() and pmap_remove_l3() will
3011 			 * leave the kernel page table page zero filled.
3012 			 */
3013 			mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
3014 			if (pmap_insert_pt_page(pmap, mt, false))
3015 				panic("pmap_enter_l2: trie insert failed");
3016 		} else
3017 			KASSERT(pmap_load(l2) == 0,
3018 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
3019 	}
3020 
3021 	if ((new_l2 & PTE_SW_MANAGED) != 0) {
3022 		/*
3023 		 * Abort this mapping if its PV entry could not be created.
3024 		 */
3025 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
3026 			SLIST_INIT(&free);
3027 			if (pmap_unwire_ptp(pmap, va, l2pg, &free)) {
3028 				/*
3029 				 * Although "va" is not mapped, paging-structure
3030 				 * caches could nonetheless have entries that
3031 				 * refer to the freed page table pages.
3032 				 * Invalidate those entries.
3033 				 */
3034 				pmap_invalidate_page(pmap, va);
3035 				vm_page_free_pages_toq(&free, true);
3036 			}
3037 			CTR2(KTR_PMAP,
3038 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
3039 			    va, pmap);
3040 			return (KERN_RESOURCE_SHORTAGE);
3041 		}
3042 		if ((new_l2 & PTE_W) != 0)
3043 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3044 				vm_page_aflag_set(mt, PGA_WRITEABLE);
3045 	}
3046 
3047 	/*
3048 	 * Increment counters.
3049 	 */
3050 	if ((new_l2 & PTE_SW_WIRED) != 0)
3051 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
3052 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
3053 
3054 	/*
3055 	 * Map the superpage.
3056 	 */
3057 	pmap_store(l2, new_l2);
3058 
3059 	atomic_add_long(&pmap_l2_mappings, 1);
3060 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
3061 	    va, pmap);
3062 
3063 	return (KERN_SUCCESS);
3064 }
3065 
3066 /*
3067  * Maps a sequence of resident pages belonging to the same object.
3068  * The sequence begins with the given page m_start.  This page is
3069  * mapped at the given virtual address start.  Each subsequent page is
3070  * mapped at a virtual address that is offset from start by the same
3071  * amount as the page is offset from m_start within the object.  The
3072  * last page in the sequence is the page with the largest offset from
3073  * m_start that can be mapped at a virtual address less than the given
3074  * virtual address end.  Not every virtual page between start and end
3075  * is mapped; only those for which a resident page exists with the
3076  * corresponding offset from m_start are mapped.
3077  */
3078 void
3079 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3080     vm_page_t m_start, vm_prot_t prot)
3081 {
3082 	struct rwlock *lock;
3083 	vm_offset_t va;
3084 	vm_page_t m, mpte;
3085 	vm_pindex_t diff, psize;
3086 
3087 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3088 
3089 	psize = atop(end - start);
3090 	mpte = NULL;
3091 	m = m_start;
3092 	lock = NULL;
3093 	rw_rlock(&pvh_global_lock);
3094 	PMAP_LOCK(pmap);
3095 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3096 		va = start + ptoa(diff);
3097 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
3098 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
3099 		    pmap_enter_2mpage(pmap, va, m, prot, &lock))
3100 			m = &m[L2_SIZE / PAGE_SIZE - 1];
3101 		else
3102 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
3103 			    &lock);
3104 		m = TAILQ_NEXT(m, listq);
3105 	}
3106 	if (lock != NULL)
3107 		rw_wunlock(lock);
3108 	rw_runlock(&pvh_global_lock);
3109 	PMAP_UNLOCK(pmap);
3110 }
3111 
3112 /*
3113  * this code makes some *MAJOR* assumptions:
3114  * 1. Current pmap & pmap exists.
3115  * 2. Not wired.
3116  * 3. Read access.
3117  * 4. No page table pages.
3118  * but is *MUCH* faster than pmap_enter...
3119  */
3120 
3121 void
3122 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3123 {
3124 	struct rwlock *lock;
3125 
3126 	lock = NULL;
3127 	rw_rlock(&pvh_global_lock);
3128 	PMAP_LOCK(pmap);
3129 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3130 	if (lock != NULL)
3131 		rw_wunlock(lock);
3132 	rw_runlock(&pvh_global_lock);
3133 	PMAP_UNLOCK(pmap);
3134 }
3135 
3136 static vm_page_t
3137 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3138     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3139 {
3140 	struct spglist free;
3141 	vm_paddr_t phys;
3142 	pd_entry_t *l2;
3143 	pt_entry_t *l3, newl3;
3144 
3145 	KASSERT(!VA_IS_CLEANMAP(va) ||
3146 	    (m->oflags & VPO_UNMANAGED) != 0,
3147 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3148 	rw_assert(&pvh_global_lock, RA_LOCKED);
3149 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3150 
3151 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3152 	/*
3153 	 * In the case that a page table page is not
3154 	 * resident, we are creating it here.
3155 	 */
3156 	if (va < VM_MAXUSER_ADDRESS) {
3157 		vm_pindex_t l2pindex;
3158 
3159 		/*
3160 		 * Calculate pagetable page index
3161 		 */
3162 		l2pindex = pmap_l2_pindex(va);
3163 		if (mpte && (mpte->pindex == l2pindex)) {
3164 			mpte->ref_count++;
3165 		} else {
3166 			/*
3167 			 * Get the l2 entry
3168 			 */
3169 			l2 = pmap_l2(pmap, va);
3170 
3171 			/*
3172 			 * If the page table page is mapped, we just increment
3173 			 * the hold count, and activate it.  Otherwise, we
3174 			 * attempt to allocate a page table page.  If this
3175 			 * attempt fails, we don't retry.  Instead, we give up.
3176 			 */
3177 			if (l2 != NULL && pmap_load(l2) != 0) {
3178 				phys = PTE_TO_PHYS(pmap_load(l2));
3179 				mpte = PHYS_TO_VM_PAGE(phys);
3180 				mpte->ref_count++;
3181 			} else {
3182 				/*
3183 				 * Pass NULL instead of the PV list lock
3184 				 * pointer, because we don't intend to sleep.
3185 				 */
3186 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3187 				if (mpte == NULL)
3188 					return (mpte);
3189 			}
3190 		}
3191 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3192 		l3 = &l3[pmap_l3_index(va)];
3193 	} else {
3194 		mpte = NULL;
3195 		l3 = pmap_l3(kernel_pmap, va);
3196 	}
3197 	if (l3 == NULL)
3198 		panic("pmap_enter_quick_locked: No l3");
3199 	if (pmap_load(l3) != 0) {
3200 		if (mpte != NULL) {
3201 			mpte->ref_count--;
3202 			mpte = NULL;
3203 		}
3204 		return (mpte);
3205 	}
3206 
3207 	/*
3208 	 * Enter on the PV list if part of our managed memory.
3209 	 */
3210 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3211 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3212 		if (mpte != NULL) {
3213 			SLIST_INIT(&free);
3214 			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
3215 				pmap_invalidate_page(pmap, va);
3216 				vm_page_free_pages_toq(&free, false);
3217 			}
3218 			mpte = NULL;
3219 		}
3220 		return (mpte);
3221 	}
3222 
3223 	/*
3224 	 * Increment counters
3225 	 */
3226 	pmap_resident_count_inc(pmap, 1);
3227 
3228 	newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
3229 	    PTE_V | PTE_R;
3230 	if ((prot & VM_PROT_EXECUTE) != 0)
3231 		newl3 |= PTE_X;
3232 	if ((m->oflags & VPO_UNMANAGED) == 0)
3233 		newl3 |= PTE_SW_MANAGED;
3234 	if (va < VM_MAX_USER_ADDRESS)
3235 		newl3 |= PTE_U;
3236 
3237 	/*
3238 	 * Sync the i-cache on all harts before updating the PTE
3239 	 * if the new PTE is executable.
3240 	 */
3241 	if (prot & VM_PROT_EXECUTE)
3242 		pmap_sync_icache(pmap, va, PAGE_SIZE);
3243 
3244 	pmap_store(l3, newl3);
3245 
3246 	pmap_invalidate_page(pmap, va);
3247 	return (mpte);
3248 }
3249 
3250 /*
3251  * This code maps large physical mmap regions into the
3252  * processor address space.  Note that some shortcuts
3253  * are taken, but the code works.
3254  */
3255 void
3256 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3257     vm_pindex_t pindex, vm_size_t size)
3258 {
3259 
3260 	VM_OBJECT_ASSERT_WLOCKED(object);
3261 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3262 	    ("pmap_object_init_pt: non-device object"));
3263 }
3264 
3265 /*
3266  *	Clear the wired attribute from the mappings for the specified range of
3267  *	addresses in the given pmap.  Every valid mapping within that range
3268  *	must have the wired attribute set.  In contrast, invalid mappings
3269  *	cannot have the wired attribute set, so they are ignored.
3270  *
3271  *	The wired attribute of the page table entry is not a hardware feature,
3272  *	so there is no need to invalidate any TLB entries.
3273  */
3274 void
3275 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3276 {
3277 	vm_offset_t va_next;
3278 	pd_entry_t *l1, *l2, l2e;
3279 	pt_entry_t *l3, l3e;
3280 	bool pv_lists_locked;
3281 
3282 	pv_lists_locked = false;
3283 retry:
3284 	PMAP_LOCK(pmap);
3285 	for (; sva < eva; sva = va_next) {
3286 		l1 = pmap_l1(pmap, sva);
3287 		if (pmap_load(l1) == 0) {
3288 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3289 			if (va_next < sva)
3290 				va_next = eva;
3291 			continue;
3292 		}
3293 
3294 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3295 		if (va_next < sva)
3296 			va_next = eva;
3297 
3298 		l2 = pmap_l1_to_l2(l1, sva);
3299 		if ((l2e = pmap_load(l2)) == 0)
3300 			continue;
3301 		if ((l2e & PTE_RWX) != 0) {
3302 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3303 				if ((l2e & PTE_SW_WIRED) == 0)
3304 					panic("pmap_unwire: l2 %#jx is missing "
3305 					    "PTE_SW_WIRED", (uintmax_t)l2e);
3306 				pmap_clear_bits(l2, PTE_SW_WIRED);
3307 				continue;
3308 			} else {
3309 				if (!pv_lists_locked) {
3310 					pv_lists_locked = true;
3311 					if (!rw_try_rlock(&pvh_global_lock)) {
3312 						PMAP_UNLOCK(pmap);
3313 						rw_rlock(&pvh_global_lock);
3314 						/* Repeat sva. */
3315 						goto retry;
3316 					}
3317 				}
3318 				if (!pmap_demote_l2(pmap, l2, sva))
3319 					panic("pmap_unwire: demotion failed");
3320 			}
3321 		}
3322 
3323 		if (va_next > eva)
3324 			va_next = eva;
3325 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3326 		    sva += L3_SIZE) {
3327 			if ((l3e = pmap_load(l3)) == 0)
3328 				continue;
3329 			if ((l3e & PTE_SW_WIRED) == 0)
3330 				panic("pmap_unwire: l3 %#jx is missing "
3331 				    "PTE_SW_WIRED", (uintmax_t)l3e);
3332 
3333 			/*
3334 			 * PG_W must be cleared atomically.  Although the pmap
3335 			 * lock synchronizes access to PG_W, another processor
3336 			 * could be setting PG_M and/or PG_A concurrently.
3337 			 */
3338 			pmap_clear_bits(l3, PTE_SW_WIRED);
3339 			pmap->pm_stats.wired_count--;
3340 		}
3341 	}
3342 	if (pv_lists_locked)
3343 		rw_runlock(&pvh_global_lock);
3344 	PMAP_UNLOCK(pmap);
3345 }
3346 
3347 /*
3348  *	Copy the range specified by src_addr/len
3349  *	from the source map to the range dst_addr/len
3350  *	in the destination map.
3351  *
3352  *	This routine is only advisory and need not do anything.
3353  */
3354 
3355 void
3356 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3357     vm_offset_t src_addr)
3358 {
3359 
3360 }
3361 
3362 /*
3363  *	pmap_zero_page zeros the specified hardware page by mapping
3364  *	the page into KVM and using bzero to clear its contents.
3365  */
3366 void
3367 pmap_zero_page(vm_page_t m)
3368 {
3369 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3370 
3371 	pagezero((void *)va);
3372 }
3373 
3374 /*
3375  *	pmap_zero_page_area zeros the specified hardware page by mapping
3376  *	the page into KVM and using bzero to clear its contents.
3377  *
3378  *	off and size may not cover an area beyond a single hardware page.
3379  */
3380 void
3381 pmap_zero_page_area(vm_page_t m, int off, int size)
3382 {
3383 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3384 
3385 	if (off == 0 && size == PAGE_SIZE)
3386 		pagezero((void *)va);
3387 	else
3388 		bzero((char *)va + off, size);
3389 }
3390 
3391 /*
3392  *	pmap_copy_page copies the specified (machine independent)
3393  *	page by mapping the page into virtual memory and using
3394  *	bcopy to copy the page, one machine dependent page at a
3395  *	time.
3396  */
3397 void
3398 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3399 {
3400 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3401 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3402 
3403 	pagecopy((void *)src, (void *)dst);
3404 }
3405 
3406 int unmapped_buf_allowed = 1;
3407 
3408 void
3409 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
3410     vm_offset_t b_offset, int xfersize)
3411 {
3412 	void *a_cp, *b_cp;
3413 	vm_page_t m_a, m_b;
3414 	vm_paddr_t p_a, p_b;
3415 	vm_offset_t a_pg_offset, b_pg_offset;
3416 	int cnt;
3417 
3418 	while (xfersize > 0) {
3419 		a_pg_offset = a_offset & PAGE_MASK;
3420 		m_a = ma[a_offset >> PAGE_SHIFT];
3421 		p_a = m_a->phys_addr;
3422 		b_pg_offset = b_offset & PAGE_MASK;
3423 		m_b = mb[b_offset >> PAGE_SHIFT];
3424 		p_b = m_b->phys_addr;
3425 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
3426 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
3427 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
3428 			panic("!DMAP a %lx", p_a);
3429 		} else {
3430 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
3431 		}
3432 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
3433 			panic("!DMAP b %lx", p_b);
3434 		} else {
3435 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
3436 		}
3437 		bcopy(a_cp, b_cp, cnt);
3438 		a_offset += cnt;
3439 		b_offset += cnt;
3440 		xfersize -= cnt;
3441 	}
3442 }
3443 
3444 vm_offset_t
3445 pmap_quick_enter_page(vm_page_t m)
3446 {
3447 
3448 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
3449 }
3450 
3451 void
3452 pmap_quick_remove_page(vm_offset_t addr)
3453 {
3454 }
3455 
3456 /*
3457  * Returns true if the pmap's pv is one of the first
3458  * 16 pvs linked to from this page.  This count may
3459  * be changed upwards or downwards in the future; it
3460  * is only necessary that true be returned for a small
3461  * subset of pmaps for proper page aging.
3462  */
3463 boolean_t
3464 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3465 {
3466 	struct md_page *pvh;
3467 	struct rwlock *lock;
3468 	pv_entry_t pv;
3469 	int loops = 0;
3470 	boolean_t rv;
3471 
3472 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3473 	    ("pmap_page_exists_quick: page %p is not managed", m));
3474 	rv = FALSE;
3475 	rw_rlock(&pvh_global_lock);
3476 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3477 	rw_rlock(lock);
3478 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3479 		if (PV_PMAP(pv) == pmap) {
3480 			rv = TRUE;
3481 			break;
3482 		}
3483 		loops++;
3484 		if (loops >= 16)
3485 			break;
3486 	}
3487 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
3488 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3489 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3490 			if (PV_PMAP(pv) == pmap) {
3491 				rv = TRUE;
3492 				break;
3493 			}
3494 			loops++;
3495 			if (loops >= 16)
3496 				break;
3497 		}
3498 	}
3499 	rw_runlock(lock);
3500 	rw_runlock(&pvh_global_lock);
3501 	return (rv);
3502 }
3503 
3504 /*
3505  *	pmap_page_wired_mappings:
3506  *
3507  *	Return the number of managed mappings to the given physical page
3508  *	that are wired.
3509  */
3510 int
3511 pmap_page_wired_mappings(vm_page_t m)
3512 {
3513 	struct md_page *pvh;
3514 	struct rwlock *lock;
3515 	pmap_t pmap;
3516 	pd_entry_t *l2;
3517 	pt_entry_t *l3;
3518 	pv_entry_t pv;
3519 	int count, md_gen, pvh_gen;
3520 
3521 	if ((m->oflags & VPO_UNMANAGED) != 0)
3522 		return (0);
3523 	rw_rlock(&pvh_global_lock);
3524 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3525 	rw_rlock(lock);
3526 restart:
3527 	count = 0;
3528 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3529 		pmap = PV_PMAP(pv);
3530 		if (!PMAP_TRYLOCK(pmap)) {
3531 			md_gen = m->md.pv_gen;
3532 			rw_runlock(lock);
3533 			PMAP_LOCK(pmap);
3534 			rw_rlock(lock);
3535 			if (md_gen != m->md.pv_gen) {
3536 				PMAP_UNLOCK(pmap);
3537 				goto restart;
3538 			}
3539 		}
3540 		l2 = pmap_l2(pmap, pv->pv_va);
3541 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
3542 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
3543 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
3544 		if ((pmap_load(l3) & PTE_SW_WIRED) != 0)
3545 			count++;
3546 		PMAP_UNLOCK(pmap);
3547 	}
3548 	if ((m->flags & PG_FICTITIOUS) == 0) {
3549 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3550 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3551 			pmap = PV_PMAP(pv);
3552 			if (!PMAP_TRYLOCK(pmap)) {
3553 				md_gen = m->md.pv_gen;
3554 				pvh_gen = pvh->pv_gen;
3555 				rw_runlock(lock);
3556 				PMAP_LOCK(pmap);
3557 				rw_rlock(lock);
3558 				if (md_gen != m->md.pv_gen ||
3559 				    pvh_gen != pvh->pv_gen) {
3560 					PMAP_UNLOCK(pmap);
3561 					goto restart;
3562 				}
3563 			}
3564 			l2 = pmap_l2(pmap, pv->pv_va);
3565 			if ((pmap_load(l2) & PTE_SW_WIRED) != 0)
3566 				count++;
3567 			PMAP_UNLOCK(pmap);
3568 		}
3569 	}
3570 	rw_runlock(lock);
3571 	rw_runlock(&pvh_global_lock);
3572 	return (count);
3573 }
3574 
3575 /*
3576  * Returns true if the given page is mapped individually or as part of
3577  * a 2mpage.  Otherwise, returns false.
3578  */
3579 bool
3580 pmap_page_is_mapped(vm_page_t m)
3581 {
3582 	struct rwlock *lock;
3583 	bool rv;
3584 
3585 	if ((m->oflags & VPO_UNMANAGED) != 0)
3586 		return (false);
3587 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3588 	rw_rlock(lock);
3589 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
3590 	    ((m->flags & PG_FICTITIOUS) == 0 &&
3591 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
3592 	rw_runlock(lock);
3593 	return (rv);
3594 }
3595 
3596 static void
3597 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv,
3598     struct spglist *free, bool superpage)
3599 {
3600 	struct md_page *pvh;
3601 	vm_page_t mpte, mt;
3602 
3603 	if (superpage) {
3604 		pmap_resident_count_dec(pmap, Ln_ENTRIES);
3605 		pvh = pa_to_pvh(m->phys_addr);
3606 		TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3607 		pvh->pv_gen++;
3608 		if (TAILQ_EMPTY(&pvh->pv_list)) {
3609 			for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
3610 				if (TAILQ_EMPTY(&mt->md.pv_list) &&
3611 				    (mt->a.flags & PGA_WRITEABLE) != 0)
3612 					vm_page_aflag_clear(mt, PGA_WRITEABLE);
3613 		}
3614 		mpte = pmap_remove_pt_page(pmap, pv->pv_va);
3615 		if (mpte != NULL) {
3616 			KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
3617 			    ("pmap_remove_pages: pte page not promoted"));
3618 			pmap_resident_count_dec(pmap, 1);
3619 			KASSERT(mpte->ref_count == Ln_ENTRIES,
3620 			    ("pmap_remove_pages: pte page ref count error"));
3621 			mpte->ref_count = 0;
3622 			pmap_add_delayed_free_list(mpte, free, FALSE);
3623 		}
3624 	} else {
3625 		pmap_resident_count_dec(pmap, 1);
3626 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3627 		m->md.pv_gen++;
3628 		if (TAILQ_EMPTY(&m->md.pv_list) &&
3629 		    (m->a.flags & PGA_WRITEABLE) != 0) {
3630 			pvh = pa_to_pvh(m->phys_addr);
3631 			if (TAILQ_EMPTY(&pvh->pv_list))
3632 				vm_page_aflag_clear(m, PGA_WRITEABLE);
3633 		}
3634 	}
3635 }
3636 
3637 /*
3638  * Destroy all managed, non-wired mappings in the given user-space
3639  * pmap.  This pmap cannot be active on any processor besides the
3640  * caller.
3641  *
3642  * This function cannot be applied to the kernel pmap.  Moreover, it
3643  * is not intended for general use.  It is only to be used during
3644  * process termination.  Consequently, it can be implemented in ways
3645  * that make it faster than pmap_remove().  First, it can more quickly
3646  * destroy mappings by iterating over the pmap's collection of PV
3647  * entries, rather than searching the page table.  Second, it doesn't
3648  * have to test and clear the page table entries atomically, because
3649  * no processor is currently accessing the user address space.  In
3650  * particular, a page table entry's dirty bit won't change state once
3651  * this function starts.
3652  */
3653 void
3654 pmap_remove_pages(pmap_t pmap)
3655 {
3656 	struct spglist free;
3657 	pd_entry_t ptepde;
3658 	pt_entry_t *pte, tpte;
3659 	vm_page_t m, mt;
3660 	pv_entry_t pv;
3661 	struct pv_chunk *pc, *npc;
3662 	struct rwlock *lock;
3663 	int64_t bit;
3664 	uint64_t inuse, bitmask;
3665 	int allfree, field, freed, idx;
3666 	bool superpage;
3667 
3668 	lock = NULL;
3669 
3670 	SLIST_INIT(&free);
3671 	rw_rlock(&pvh_global_lock);
3672 	PMAP_LOCK(pmap);
3673 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3674 		allfree = 1;
3675 		freed = 0;
3676 		for (field = 0; field < _NPCM; field++) {
3677 			inuse = ~pc->pc_map[field] & pc_freemask[field];
3678 			while (inuse != 0) {
3679 				bit = ffsl(inuse) - 1;
3680 				bitmask = 1UL << bit;
3681 				idx = field * 64 + bit;
3682 				pv = &pc->pc_pventry[idx];
3683 				inuse &= ~bitmask;
3684 
3685 				pte = pmap_l1(pmap, pv->pv_va);
3686 				ptepde = pmap_load(pte);
3687 				pte = pmap_l1_to_l2(pte, pv->pv_va);
3688 				tpte = pmap_load(pte);
3689 				if ((tpte & PTE_RWX) != 0) {
3690 					superpage = true;
3691 				} else {
3692 					ptepde = tpte;
3693 					pte = pmap_l2_to_l3(pte, pv->pv_va);
3694 					tpte = pmap_load(pte);
3695 					superpage = false;
3696 				}
3697 
3698 				/*
3699 				 * We cannot remove wired pages from a
3700 				 * process' mapping at this time.
3701 				 */
3702 				if (tpte & PTE_SW_WIRED) {
3703 					allfree = 0;
3704 					continue;
3705 				}
3706 
3707 				m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte));
3708 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
3709 				    m < &vm_page_array[vm_page_array_size],
3710 				    ("pmap_remove_pages: bad pte %#jx",
3711 				    (uintmax_t)tpte));
3712 
3713 				pmap_clear(pte);
3714 
3715 				/*
3716 				 * Update the vm_page_t clean/reference bits.
3717 				 */
3718 				if ((tpte & (PTE_D | PTE_W)) ==
3719 				    (PTE_D | PTE_W)) {
3720 					if (superpage)
3721 						for (mt = m;
3722 						    mt < &m[Ln_ENTRIES]; mt++)
3723 							vm_page_dirty(mt);
3724 					else
3725 						vm_page_dirty(m);
3726 				}
3727 
3728 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
3729 
3730 				/* Mark free */
3731 				pc->pc_map[field] |= bitmask;
3732 
3733 				pmap_remove_pages_pv(pmap, m, pv, &free,
3734 				    superpage);
3735 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
3736 				freed++;
3737 			}
3738 		}
3739 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3740 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3741 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3742 		if (allfree) {
3743 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3744 			free_pv_chunk(pc);
3745 		}
3746 	}
3747 	if (lock != NULL)
3748 		rw_wunlock(lock);
3749 	pmap_invalidate_all(pmap);
3750 	rw_runlock(&pvh_global_lock);
3751 	PMAP_UNLOCK(pmap);
3752 	vm_page_free_pages_toq(&free, false);
3753 }
3754 
3755 static bool
3756 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
3757 {
3758 	struct md_page *pvh;
3759 	struct rwlock *lock;
3760 	pd_entry_t *l2;
3761 	pt_entry_t *l3, mask;
3762 	pv_entry_t pv;
3763 	pmap_t pmap;
3764 	int md_gen, pvh_gen;
3765 	bool rv;
3766 
3767 	mask = 0;
3768 	if (modified)
3769 		mask |= PTE_D;
3770 	if (accessed)
3771 		mask |= PTE_A;
3772 
3773 	rv = FALSE;
3774 	rw_rlock(&pvh_global_lock);
3775 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3776 	rw_rlock(lock);
3777 restart:
3778 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3779 		pmap = PV_PMAP(pv);
3780 		if (!PMAP_TRYLOCK(pmap)) {
3781 			md_gen = m->md.pv_gen;
3782 			rw_runlock(lock);
3783 			PMAP_LOCK(pmap);
3784 			rw_rlock(lock);
3785 			if (md_gen != m->md.pv_gen) {
3786 				PMAP_UNLOCK(pmap);
3787 				goto restart;
3788 			}
3789 		}
3790 		l2 = pmap_l2(pmap, pv->pv_va);
3791 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
3792 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
3793 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
3794 		rv = (pmap_load(l3) & mask) == mask;
3795 		PMAP_UNLOCK(pmap);
3796 		if (rv)
3797 			goto out;
3798 	}
3799 	if ((m->flags & PG_FICTITIOUS) == 0) {
3800 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3801 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3802 			pmap = PV_PMAP(pv);
3803 			if (!PMAP_TRYLOCK(pmap)) {
3804 				md_gen = m->md.pv_gen;
3805 				pvh_gen = pvh->pv_gen;
3806 				rw_runlock(lock);
3807 				PMAP_LOCK(pmap);
3808 				rw_rlock(lock);
3809 				if (md_gen != m->md.pv_gen ||
3810 				    pvh_gen != pvh->pv_gen) {
3811 					PMAP_UNLOCK(pmap);
3812 					goto restart;
3813 				}
3814 			}
3815 			l2 = pmap_l2(pmap, pv->pv_va);
3816 			rv = (pmap_load(l2) & mask) == mask;
3817 			PMAP_UNLOCK(pmap);
3818 			if (rv)
3819 				goto out;
3820 		}
3821 	}
3822 out:
3823 	rw_runlock(lock);
3824 	rw_runlock(&pvh_global_lock);
3825 	return (rv);
3826 }
3827 
3828 /*
3829  *	pmap_is_modified:
3830  *
3831  *	Return whether or not the specified physical page was modified
3832  *	in any physical maps.
3833  */
3834 boolean_t
3835 pmap_is_modified(vm_page_t m)
3836 {
3837 
3838 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3839 	    ("pmap_is_modified: page %p is not managed", m));
3840 
3841 	/*
3842 	 * If the page is not busied then this check is racy.
3843 	 */
3844 	if (!pmap_page_is_write_mapped(m))
3845 		return (FALSE);
3846 	return (pmap_page_test_mappings(m, FALSE, TRUE));
3847 }
3848 
3849 /*
3850  *	pmap_is_prefaultable:
3851  *
3852  *	Return whether or not the specified virtual address is eligible
3853  *	for prefault.
3854  */
3855 boolean_t
3856 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
3857 {
3858 	pt_entry_t *l3;
3859 	boolean_t rv;
3860 
3861 	rv = FALSE;
3862 	PMAP_LOCK(pmap);
3863 	l3 = pmap_l3(pmap, addr);
3864 	if (l3 != NULL && pmap_load(l3) != 0) {
3865 		rv = TRUE;
3866 	}
3867 	PMAP_UNLOCK(pmap);
3868 	return (rv);
3869 }
3870 
3871 /*
3872  *	pmap_is_referenced:
3873  *
3874  *	Return whether or not the specified physical page was referenced
3875  *	in any physical maps.
3876  */
3877 boolean_t
3878 pmap_is_referenced(vm_page_t m)
3879 {
3880 
3881 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3882 	    ("pmap_is_referenced: page %p is not managed", m));
3883 	return (pmap_page_test_mappings(m, TRUE, FALSE));
3884 }
3885 
3886 /*
3887  * Clear the write and modified bits in each of the given page's mappings.
3888  */
3889 void
3890 pmap_remove_write(vm_page_t m)
3891 {
3892 	struct md_page *pvh;
3893 	struct rwlock *lock;
3894 	pmap_t pmap;
3895 	pd_entry_t *l2;
3896 	pt_entry_t *l3, oldl3, newl3;
3897 	pv_entry_t next_pv, pv;
3898 	vm_offset_t va;
3899 	int md_gen, pvh_gen;
3900 
3901 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3902 	    ("pmap_remove_write: page %p is not managed", m));
3903 	vm_page_assert_busied(m);
3904 
3905 	if (!pmap_page_is_write_mapped(m))
3906 		return;
3907 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3908 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
3909 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
3910 	rw_rlock(&pvh_global_lock);
3911 retry_pv_loop:
3912 	rw_wlock(lock);
3913 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
3914 		pmap = PV_PMAP(pv);
3915 		if (!PMAP_TRYLOCK(pmap)) {
3916 			pvh_gen = pvh->pv_gen;
3917 			rw_wunlock(lock);
3918 			PMAP_LOCK(pmap);
3919 			rw_wlock(lock);
3920 			if (pvh_gen != pvh->pv_gen) {
3921 				PMAP_UNLOCK(pmap);
3922 				rw_wunlock(lock);
3923 				goto retry_pv_loop;
3924 			}
3925 		}
3926 		va = pv->pv_va;
3927 		l2 = pmap_l2(pmap, va);
3928 		if ((pmap_load(l2) & PTE_W) != 0)
3929 			(void)pmap_demote_l2_locked(pmap, l2, va, &lock);
3930 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
3931 		    ("inconsistent pv lock %p %p for page %p",
3932 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
3933 		PMAP_UNLOCK(pmap);
3934 	}
3935 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3936 		pmap = PV_PMAP(pv);
3937 		if (!PMAP_TRYLOCK(pmap)) {
3938 			pvh_gen = pvh->pv_gen;
3939 			md_gen = m->md.pv_gen;
3940 			rw_wunlock(lock);
3941 			PMAP_LOCK(pmap);
3942 			rw_wlock(lock);
3943 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3944 				PMAP_UNLOCK(pmap);
3945 				rw_wunlock(lock);
3946 				goto retry_pv_loop;
3947 			}
3948 		}
3949 		l2 = pmap_l2(pmap, pv->pv_va);
3950 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
3951 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
3952 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
3953 		oldl3 = pmap_load(l3);
3954 retry:
3955 		if ((oldl3 & PTE_W) != 0) {
3956 			newl3 = oldl3 & ~(PTE_D | PTE_W);
3957 			if (!atomic_fcmpset_long(l3, &oldl3, newl3))
3958 				goto retry;
3959 			if ((oldl3 & PTE_D) != 0)
3960 				vm_page_dirty(m);
3961 			pmap_invalidate_page(pmap, pv->pv_va);
3962 		}
3963 		PMAP_UNLOCK(pmap);
3964 	}
3965 	rw_wunlock(lock);
3966 	vm_page_aflag_clear(m, PGA_WRITEABLE);
3967 	rw_runlock(&pvh_global_lock);
3968 }
3969 
3970 /*
3971  *	pmap_ts_referenced:
3972  *
3973  *	Return a count of reference bits for a page, clearing those bits.
3974  *	It is not necessary for every reference bit to be cleared, but it
3975  *	is necessary that 0 only be returned when there are truly no
3976  *	reference bits set.
3977  *
3978  *	As an optimization, update the page's dirty field if a modified bit is
3979  *	found while counting reference bits.  This opportunistic update can be
3980  *	performed at low cost and can eliminate the need for some future calls
3981  *	to pmap_is_modified().  However, since this function stops after
3982  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
3983  *	dirty pages.  Those dirty pages will only be detected by a future call
3984  *	to pmap_is_modified().
3985  */
3986 int
3987 pmap_ts_referenced(vm_page_t m)
3988 {
3989 	struct spglist free;
3990 	struct md_page *pvh;
3991 	struct rwlock *lock;
3992 	pv_entry_t pv, pvf;
3993 	pmap_t pmap;
3994 	pd_entry_t *l2, l2e;
3995 	pt_entry_t *l3, l3e;
3996 	vm_paddr_t pa;
3997 	vm_offset_t va;
3998 	int cleared, md_gen, not_cleared, pvh_gen;
3999 
4000 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4001 	    ("pmap_ts_referenced: page %p is not managed", m));
4002 	SLIST_INIT(&free);
4003 	cleared = 0;
4004 	pa = VM_PAGE_TO_PHYS(m);
4005 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4006 
4007 	lock = PHYS_TO_PV_LIST_LOCK(pa);
4008 	rw_rlock(&pvh_global_lock);
4009 	rw_wlock(lock);
4010 retry:
4011 	not_cleared = 0;
4012 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4013 		goto small_mappings;
4014 	pv = pvf;
4015 	do {
4016 		pmap = PV_PMAP(pv);
4017 		if (!PMAP_TRYLOCK(pmap)) {
4018 			pvh_gen = pvh->pv_gen;
4019 			rw_wunlock(lock);
4020 			PMAP_LOCK(pmap);
4021 			rw_wlock(lock);
4022 			if (pvh_gen != pvh->pv_gen) {
4023 				PMAP_UNLOCK(pmap);
4024 				goto retry;
4025 			}
4026 		}
4027 		va = pv->pv_va;
4028 		l2 = pmap_l2(pmap, va);
4029 		l2e = pmap_load(l2);
4030 		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
4031 			/*
4032 			 * Although l2e is mapping a 2MB page, because
4033 			 * this function is called at a 4KB page granularity,
4034 			 * we only update the 4KB page under test.
4035 			 */
4036 			vm_page_dirty(m);
4037 		}
4038 		if ((l2e & PTE_A) != 0) {
4039 			/*
4040 			 * Since this reference bit is shared by 512 4KB
4041 			 * pages, it should not be cleared every time it is
4042 			 * tested.  Apply a simple "hash" function on the
4043 			 * physical page number, the virtual superpage number,
4044 			 * and the pmap address to select one 4KB page out of
4045 			 * the 512 on which testing the reference bit will
4046 			 * result in clearing that reference bit.  This
4047 			 * function is designed to avoid the selection of the
4048 			 * same 4KB page for every 2MB page mapping.
4049 			 *
4050 			 * On demotion, a mapping that hasn't been referenced
4051 			 * is simply destroyed.  To avoid the possibility of a
4052 			 * subsequent page fault on a demoted wired mapping,
4053 			 * always leave its reference bit set.  Moreover,
4054 			 * since the superpage is wired, the current state of
4055 			 * its reference bit won't affect page replacement.
4056 			 */
4057 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4058 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4059 			    (l2e & PTE_SW_WIRED) == 0) {
4060 				pmap_clear_bits(l2, PTE_A);
4061 				pmap_invalidate_page(pmap, va);
4062 				cleared++;
4063 			} else
4064 				not_cleared++;
4065 		}
4066 		PMAP_UNLOCK(pmap);
4067 		/* Rotate the PV list if it has more than one entry. */
4068 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4069 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4070 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4071 			pvh->pv_gen++;
4072 		}
4073 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4074 			goto out;
4075 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4076 small_mappings:
4077 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4078 		goto out;
4079 	pv = pvf;
4080 	do {
4081 		pmap = PV_PMAP(pv);
4082 		if (!PMAP_TRYLOCK(pmap)) {
4083 			pvh_gen = pvh->pv_gen;
4084 			md_gen = m->md.pv_gen;
4085 			rw_wunlock(lock);
4086 			PMAP_LOCK(pmap);
4087 			rw_wlock(lock);
4088 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4089 				PMAP_UNLOCK(pmap);
4090 				goto retry;
4091 			}
4092 		}
4093 		l2 = pmap_l2(pmap, pv->pv_va);
4094 
4095 		KASSERT((pmap_load(l2) & PTE_RX) == 0,
4096 		    ("pmap_ts_referenced: found an invalid l2 table"));
4097 
4098 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4099 		l3e = pmap_load(l3);
4100 		if ((l3e & PTE_D) != 0)
4101 			vm_page_dirty(m);
4102 		if ((l3e & PTE_A) != 0) {
4103 			if ((l3e & PTE_SW_WIRED) == 0) {
4104 				/*
4105 				 * Wired pages cannot be paged out so
4106 				 * doing accessed bit emulation for
4107 				 * them is wasted effort. We do the
4108 				 * hard work for unwired pages only.
4109 				 */
4110 				pmap_clear_bits(l3, PTE_A);
4111 				pmap_invalidate_page(pmap, pv->pv_va);
4112 				cleared++;
4113 			} else
4114 				not_cleared++;
4115 		}
4116 		PMAP_UNLOCK(pmap);
4117 		/* Rotate the PV list if it has more than one entry. */
4118 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4119 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4120 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4121 			m->md.pv_gen++;
4122 		}
4123 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4124 	    not_cleared < PMAP_TS_REFERENCED_MAX);
4125 out:
4126 	rw_wunlock(lock);
4127 	rw_runlock(&pvh_global_lock);
4128 	vm_page_free_pages_toq(&free, false);
4129 	return (cleared + not_cleared);
4130 }
4131 
4132 /*
4133  *	Apply the given advice to the specified range of addresses within the
4134  *	given pmap.  Depending on the advice, clear the referenced and/or
4135  *	modified flags in each mapping and set the mapped page's dirty field.
4136  */
4137 void
4138 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4139 {
4140 }
4141 
4142 /*
4143  *	Clear the modify bits on the specified physical page.
4144  */
4145 void
4146 pmap_clear_modify(vm_page_t m)
4147 {
4148 	struct md_page *pvh;
4149 	struct rwlock *lock;
4150 	pmap_t pmap;
4151 	pv_entry_t next_pv, pv;
4152 	pd_entry_t *l2, oldl2;
4153 	pt_entry_t *l3;
4154 	vm_offset_t va;
4155 	int md_gen, pvh_gen;
4156 
4157 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4158 	    ("pmap_clear_modify: page %p is not managed", m));
4159 	vm_page_assert_busied(m);
4160 
4161 	if (!pmap_page_is_write_mapped(m))
4162 	        return;
4163 
4164 	/*
4165 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PG_M set.
4166 	 * If the object containing the page is locked and the page is not
4167 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
4168 	 */
4169 	if ((m->a.flags & PGA_WRITEABLE) == 0)
4170 		return;
4171 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4172 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4173 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4174 	rw_rlock(&pvh_global_lock);
4175 	rw_wlock(lock);
4176 restart:
4177 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4178 		pmap = PV_PMAP(pv);
4179 		if (!PMAP_TRYLOCK(pmap)) {
4180 			pvh_gen = pvh->pv_gen;
4181 			rw_wunlock(lock);
4182 			PMAP_LOCK(pmap);
4183 			rw_wlock(lock);
4184 			if (pvh_gen != pvh->pv_gen) {
4185 				PMAP_UNLOCK(pmap);
4186 				goto restart;
4187 			}
4188 		}
4189 		va = pv->pv_va;
4190 		l2 = pmap_l2(pmap, va);
4191 		oldl2 = pmap_load(l2);
4192 		/* If oldl2 has PTE_W set, then it also has PTE_D set. */
4193 		if ((oldl2 & PTE_W) != 0 &&
4194 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
4195 		    (oldl2 & PTE_SW_WIRED) == 0) {
4196 			/*
4197 			 * Write protect the mapping to a single page so that
4198 			 * a subsequent write access may repromote.
4199 			 */
4200 			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
4201 			l3 = pmap_l2_to_l3(l2, va);
4202 			pmap_clear_bits(l3, PTE_D | PTE_W);
4203 			vm_page_dirty(m);
4204 			pmap_invalidate_page(pmap, va);
4205 		}
4206 		PMAP_UNLOCK(pmap);
4207 	}
4208 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4209 		pmap = PV_PMAP(pv);
4210 		if (!PMAP_TRYLOCK(pmap)) {
4211 			md_gen = m->md.pv_gen;
4212 			pvh_gen = pvh->pv_gen;
4213 			rw_wunlock(lock);
4214 			PMAP_LOCK(pmap);
4215 			rw_wlock(lock);
4216 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4217 				PMAP_UNLOCK(pmap);
4218 				goto restart;
4219 			}
4220 		}
4221 		l2 = pmap_l2(pmap, pv->pv_va);
4222 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4223 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4224 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4225 		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
4226 			pmap_clear_bits(l3, PTE_D | PTE_W);
4227 			pmap_invalidate_page(pmap, pv->pv_va);
4228 		}
4229 		PMAP_UNLOCK(pmap);
4230 	}
4231 	rw_wunlock(lock);
4232 	rw_runlock(&pvh_global_lock);
4233 }
4234 
4235 void *
4236 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4237 {
4238 
4239         return ((void *)PHYS_TO_DMAP(pa));
4240 }
4241 
4242 void
4243 pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
4244 {
4245 }
4246 
4247 /*
4248  * Sets the memory attribute for the specified page.
4249  */
4250 void
4251 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4252 {
4253 
4254 	m->md.pv_memattr = ma;
4255 }
4256 
4257 /*
4258  * Perform the pmap work for mincore(2).  If the page is not both referenced and
4259  * modified by this pmap, returns its physical address so that the caller can
4260  * find other mappings.
4261  */
4262 int
4263 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
4264 {
4265 	pt_entry_t *l2, *l3, tpte;
4266 	vm_paddr_t pa;
4267 	int val;
4268 	bool managed;
4269 
4270 	PMAP_LOCK(pmap);
4271 	l2 = pmap_l2(pmap, addr);
4272 	if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
4273 		if ((tpte & PTE_RWX) != 0) {
4274 			pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
4275 			val = MINCORE_INCORE | MINCORE_PSIND(1);
4276 		} else {
4277 			l3 = pmap_l2_to_l3(l2, addr);
4278 			tpte = pmap_load(l3);
4279 			if ((tpte & PTE_V) == 0) {
4280 				PMAP_UNLOCK(pmap);
4281 				return (0);
4282 			}
4283 			pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
4284 			val = MINCORE_INCORE;
4285 		}
4286 
4287 		if ((tpte & PTE_D) != 0)
4288 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4289 		if ((tpte & PTE_A) != 0)
4290 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4291 		managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
4292 	} else {
4293 		managed = false;
4294 		val = 0;
4295 	}
4296 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
4297 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
4298 		*pap = pa;
4299 	}
4300 	PMAP_UNLOCK(pmap);
4301 	return (val);
4302 }
4303 
4304 void
4305 pmap_activate_sw(struct thread *td)
4306 {
4307 	pmap_t oldpmap, pmap;
4308 	u_int hart;
4309 
4310 	oldpmap = PCPU_GET(curpmap);
4311 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4312 	if (pmap == oldpmap)
4313 		return;
4314 	load_satp(pmap->pm_satp);
4315 
4316 	hart = PCPU_GET(hart);
4317 #ifdef SMP
4318 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
4319 	CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
4320 #else
4321 	CPU_SET(hart, &pmap->pm_active);
4322 	CPU_CLR(hart, &oldpmap->pm_active);
4323 #endif
4324 	PCPU_SET(curpmap, pmap);
4325 
4326 	sfence_vma();
4327 }
4328 
4329 void
4330 pmap_activate(struct thread *td)
4331 {
4332 
4333 	critical_enter();
4334 	pmap_activate_sw(td);
4335 	critical_exit();
4336 }
4337 
4338 void
4339 pmap_activate_boot(pmap_t pmap)
4340 {
4341 	u_int hart;
4342 
4343 	hart = PCPU_GET(hart);
4344 #ifdef SMP
4345 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
4346 #else
4347 	CPU_SET(hart, &pmap->pm_active);
4348 #endif
4349 	PCPU_SET(curpmap, pmap);
4350 }
4351 
4352 void
4353 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
4354 {
4355 	cpuset_t mask;
4356 
4357 	/*
4358 	 * From the RISC-V User-Level ISA V2.2:
4359 	 *
4360 	 * "To make a store to instruction memory visible to all
4361 	 * RISC-V harts, the writing hart has to execute a data FENCE
4362 	 * before requesting that all remote RISC-V harts execute a
4363 	 * FENCE.I."
4364 	 *
4365 	 * However, this is slightly misleading; we still need to
4366 	 * perform a FENCE.I for the local hart, as FENCE does nothing
4367 	 * for its icache. FENCE.I alone is also sufficient for the
4368 	 * local hart.
4369 	 */
4370 	sched_pin();
4371 	mask = all_harts;
4372 	CPU_CLR(PCPU_GET(hart), &mask);
4373 	fence_i();
4374 	if (!CPU_EMPTY(&mask) && smp_started) {
4375 		fence();
4376 		sbi_remote_fence_i(mask.__bits);
4377 	}
4378 	sched_unpin();
4379 }
4380 
4381 /*
4382  *	Increase the starting virtual address of the given mapping if a
4383  *	different alignment might result in more superpage mappings.
4384  */
4385 void
4386 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
4387     vm_offset_t *addr, vm_size_t size)
4388 {
4389 	vm_offset_t superpage_offset;
4390 
4391 	if (size < L2_SIZE)
4392 		return;
4393 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
4394 		offset += ptoa(object->pg_color);
4395 	superpage_offset = offset & L2_OFFSET;
4396 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
4397 	    (*addr & L2_OFFSET) == superpage_offset)
4398 		return;
4399 	if ((*addr & L2_OFFSET) < superpage_offset)
4400 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
4401 	else
4402 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
4403 }
4404 
4405 /**
4406  * Get the kernel virtual address of a set of physical pages. If there are
4407  * physical addresses not covered by the DMAP perform a transient mapping
4408  * that will be removed when calling pmap_unmap_io_transient.
4409  *
4410  * \param page        The pages the caller wishes to obtain the virtual
4411  *                    address on the kernel memory map.
4412  * \param vaddr       On return contains the kernel virtual memory address
4413  *                    of the pages passed in the page parameter.
4414  * \param count       Number of pages passed in.
4415  * \param can_fault   TRUE if the thread using the mapped pages can take
4416  *                    page faults, FALSE otherwise.
4417  *
4418  * \returns TRUE if the caller must call pmap_unmap_io_transient when
4419  *          finished or FALSE otherwise.
4420  *
4421  */
4422 boolean_t
4423 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
4424     boolean_t can_fault)
4425 {
4426 	vm_paddr_t paddr;
4427 	boolean_t needs_mapping;
4428 	int error, i;
4429 
4430 	/*
4431 	 * Allocate any KVA space that we need, this is done in a separate
4432 	 * loop to prevent calling vmem_alloc while pinned.
4433 	 */
4434 	needs_mapping = FALSE;
4435 	for (i = 0; i < count; i++) {
4436 		paddr = VM_PAGE_TO_PHYS(page[i]);
4437 		if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
4438 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
4439 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
4440 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
4441 			needs_mapping = TRUE;
4442 		} else {
4443 			vaddr[i] = PHYS_TO_DMAP(paddr);
4444 		}
4445 	}
4446 
4447 	/* Exit early if everything is covered by the DMAP */
4448 	if (!needs_mapping)
4449 		return (FALSE);
4450 
4451 	if (!can_fault)
4452 		sched_pin();
4453 	for (i = 0; i < count; i++) {
4454 		paddr = VM_PAGE_TO_PHYS(page[i]);
4455 		if (paddr >= DMAP_MAX_PHYSADDR) {
4456 			panic(
4457 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
4458 		}
4459 	}
4460 
4461 	return (needs_mapping);
4462 }
4463 
4464 void
4465 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
4466     boolean_t can_fault)
4467 {
4468 	vm_paddr_t paddr;
4469 	int i;
4470 
4471 	if (!can_fault)
4472 		sched_unpin();
4473 	for (i = 0; i < count; i++) {
4474 		paddr = VM_PAGE_TO_PHYS(page[i]);
4475 		if (paddr >= DMAP_MAX_PHYSADDR) {
4476 			panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
4477 		}
4478 	}
4479 }
4480 
4481 boolean_t
4482 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
4483 {
4484 
4485 	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK);
4486 }
4487 
4488 bool
4489 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
4490     pt_entry_t **l3)
4491 {
4492 	pd_entry_t *l1p, *l2p;
4493 
4494 	/* Get l1 directory entry. */
4495 	l1p = pmap_l1(pmap, va);
4496 	*l1 = l1p;
4497 
4498 	if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0)
4499 		return (false);
4500 
4501 	if ((pmap_load(l1p) & PTE_RX) != 0) {
4502 		*l2 = NULL;
4503 		*l3 = NULL;
4504 		return (true);
4505 	}
4506 
4507 	/* Get l2 directory entry. */
4508 	l2p = pmap_l1_to_l2(l1p, va);
4509 	*l2 = l2p;
4510 
4511 	if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0)
4512 		return (false);
4513 
4514 	if ((pmap_load(l2p) & PTE_RX) != 0) {
4515 		*l3 = NULL;
4516 		return (true);
4517 	}
4518 
4519 	/* Get l3 page table entry. */
4520 	*l3 = pmap_l2_to_l3(l2p, va);
4521 
4522 	return (true);
4523 }
4524 
4525 /*
4526  * Track a range of the kernel's virtual address space that is contiguous
4527  * in various mapping attributes.
4528  */
4529 struct pmap_kernel_map_range {
4530 	vm_offset_t sva;
4531 	pt_entry_t attrs;
4532 	int l3pages;
4533 	int l2pages;
4534 	int l1pages;
4535 };
4536 
4537 static void
4538 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
4539     vm_offset_t eva)
4540 {
4541 
4542 	if (eva <= range->sva)
4543 		return;
4544 
4545 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n",
4546 	    range->sva, eva,
4547 	    (range->attrs & PTE_W) == PTE_W ? 'w' : '-',
4548 	    (range->attrs & PTE_X) == PTE_X ? 'x' : '-',
4549 	    (range->attrs & PTE_U) == PTE_U ? 'u' : 's',
4550 	    (range->attrs & PTE_G) == PTE_G ? 'g' : '-',
4551 	    range->l1pages, range->l2pages, range->l3pages);
4552 
4553 	/* Reset to sentinel value. */
4554 	range->sva = 0xfffffffffffffffful;
4555 }
4556 
4557 /*
4558  * Determine whether the attributes specified by a page table entry match those
4559  * being tracked by the current range.
4560  */
4561 static bool
4562 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
4563 {
4564 
4565 	return (range->attrs == attrs);
4566 }
4567 
4568 static void
4569 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
4570     pt_entry_t attrs)
4571 {
4572 
4573 	memset(range, 0, sizeof(*range));
4574 	range->sva = va;
4575 	range->attrs = attrs;
4576 }
4577 
4578 /*
4579  * Given a leaf PTE, derive the mapping's attributes. If they do not match
4580  * those of the current run, dump the address range and its attributes, and
4581  * begin a new run.
4582  */
4583 static void
4584 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
4585     vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e)
4586 {
4587 	pt_entry_t attrs;
4588 
4589 	/* The PTE global bit is inherited by lower levels. */
4590 	attrs = l1e & PTE_G;
4591 	if ((l1e & PTE_RWX) != 0)
4592 		attrs |= l1e & (PTE_RWX | PTE_U);
4593 	else if (l2e != 0)
4594 		attrs |= l2e & PTE_G;
4595 	if ((l2e & PTE_RWX) != 0)
4596 		attrs |= l2e & (PTE_RWX | PTE_U);
4597 	else if (l3e != 0)
4598 		attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);
4599 
4600 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
4601 		sysctl_kmaps_dump(sb, range, va);
4602 		sysctl_kmaps_reinit(range, va, attrs);
4603 	}
4604 }
4605 
4606 static int
4607 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
4608 {
4609 	struct pmap_kernel_map_range range;
4610 	struct sbuf sbuf, *sb;
4611 	pd_entry_t l1e, *l2, l2e;
4612 	pt_entry_t *l3, l3e;
4613 	vm_offset_t sva;
4614 	vm_paddr_t pa;
4615 	int error, i, j, k;
4616 
4617 	error = sysctl_wire_old_buffer(req, 0);
4618 	if (error != 0)
4619 		return (error);
4620 	sb = &sbuf;
4621 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
4622 
4623 	/* Sentinel value. */
4624 	range.sva = 0xfffffffffffffffful;
4625 
4626 	/*
4627 	 * Iterate over the kernel page tables without holding the kernel pmap
4628 	 * lock. Kernel page table pages are never freed, so at worst we will
4629 	 * observe inconsistencies in the output.
4630 	 */
4631 	sva = VM_MIN_KERNEL_ADDRESS;
4632 	for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) {
4633 		if (i == pmap_l1_index(DMAP_MIN_ADDRESS))
4634 			sbuf_printf(sb, "\nDirect map:\n");
4635 		else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS))
4636 			sbuf_printf(sb, "\nKernel map:\n");
4637 
4638 		l1e = kernel_pmap->pm_l1[i];
4639 		if ((l1e & PTE_V) == 0) {
4640 			sysctl_kmaps_dump(sb, &range, sva);
4641 			sva += L1_SIZE;
4642 			continue;
4643 		}
4644 		if ((l1e & PTE_RWX) != 0) {
4645 			sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0);
4646 			range.l1pages++;
4647 			sva += L1_SIZE;
4648 			continue;
4649 		}
4650 		pa = PTE_TO_PHYS(l1e);
4651 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
4652 
4653 		for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) {
4654 			l2e = l2[j];
4655 			if ((l2e & PTE_V) == 0) {
4656 				sysctl_kmaps_dump(sb, &range, sva);
4657 				sva += L2_SIZE;
4658 				continue;
4659 			}
4660 			if ((l2e & PTE_RWX) != 0) {
4661 				sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0);
4662 				range.l2pages++;
4663 				sva += L2_SIZE;
4664 				continue;
4665 			}
4666 			pa = PTE_TO_PHYS(l2e);
4667 			l3 = (pd_entry_t *)PHYS_TO_DMAP(pa);
4668 
4669 			for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++,
4670 			    sva += L3_SIZE) {
4671 				l3e = l3[k];
4672 				if ((l3e & PTE_V) == 0) {
4673 					sysctl_kmaps_dump(sb, &range, sva);
4674 					continue;
4675 				}
4676 				sysctl_kmaps_check(sb, &range, sva,
4677 				    l1e, l2e, l3e);
4678 				range.l3pages++;
4679 			}
4680 		}
4681 	}
4682 
4683 	error = sbuf_finish(sb);
4684 	sbuf_delete(sb);
4685 	return (error);
4686 }
4687 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
4688     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
4689     NULL, 0, sysctl_kmaps, "A",
4690     "Dump kernel address layout");
4691