xref: /freebsd/sys/riscv/riscv/pmap.c (revision c7046f76)
1 /*-
2  * SPDX-License-Identifier: BSD-4-Clause
3  *
4  * Copyright (c) 1991 Regents of the University of California.
5  * All rights reserved.
6  * Copyright (c) 1994 John S. Dyson
7  * All rights reserved.
8  * Copyright (c) 1994 David Greenman
9  * All rights reserved.
10  * Copyright (c) 2003 Peter Wemm
11  * All rights reserved.
12  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
13  * All rights reserved.
14  * Copyright (c) 2014 Andrew Turner
15  * All rights reserved.
16  * Copyright (c) 2014 The FreeBSD Foundation
17  * All rights reserved.
18  * Copyright (c) 2015-2018 Ruslan Bukin <br@bsdpad.com>
19  * All rights reserved.
20  *
21  * This code is derived from software contributed to Berkeley by
22  * the Systems Programming Group of the University of Utah Computer
23  * Science Department and William Jolitz of UUNET Technologies Inc.
24  *
25  * Portions of this software were developed by Andrew Turner under
26  * sponsorship from The FreeBSD Foundation.
27  *
28  * Portions of this software were developed by SRI International and the
29  * University of Cambridge Computer Laboratory under DARPA/AFRL contract
30  * FA8750-10-C-0237 ("CTSRD"), as part of the DARPA CRASH research programme.
31  *
32  * Portions of this software were developed by the University of Cambridge
33  * Computer Laboratory as part of the CTSRD Project, with support from the
34  * UK Higher Education Innovation Fund (HEIF).
35  *
36  * Redistribution and use in source and binary forms, with or without
37  * modification, are permitted provided that the following conditions
38  * are met:
39  * 1. Redistributions of source code must retain the above copyright
40  *    notice, this list of conditions and the following disclaimer.
41  * 2. Redistributions in binary form must reproduce the above copyright
42  *    notice, this list of conditions and the following disclaimer in the
43  *    documentation and/or other materials provided with the distribution.
44  * 3. All advertising materials mentioning features or use of this software
45  *    must display the following acknowledgement:
46  *	This product includes software developed by the University of
47  *	California, Berkeley and its contributors.
48  * 4. Neither the name of the University nor the names of its contributors
49  *    may be used to endorse or promote products derived from this software
50  *    without specific prior written permission.
51  *
52  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
53  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
54  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
55  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
56  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
57  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
58  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
59  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
60  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
61  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
62  * SUCH DAMAGE.
63  *
64  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
65  */
66 /*-
67  * Copyright (c) 2003 Networks Associates Technology, Inc.
68  * All rights reserved.
69  *
70  * This software was developed for the FreeBSD Project by Jake Burkholder,
71  * Safeport Network Services, and Network Associates Laboratories, the
72  * Security Research Division of Network Associates, Inc. under
73  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
74  * CHATS research program.
75  *
76  * Redistribution and use in source and binary forms, with or without
77  * modification, are permitted provided that the following conditions
78  * are met:
79  * 1. Redistributions of source code must retain the above copyright
80  *    notice, this list of conditions and the following disclaimer.
81  * 2. Redistributions in binary form must reproduce the above copyright
82  *    notice, this list of conditions and the following disclaimer in the
83  *    documentation and/or other materials provided with the distribution.
84  *
85  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
86  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
87  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
88  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
89  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
90  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
91  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
92  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
93  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
94  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
95  * SUCH DAMAGE.
96  */
97 
98 #include <sys/cdefs.h>
99 __FBSDID("$FreeBSD$");
100 
101 /*
102  *	Manages physical address maps.
103  *
104  *	Since the information managed by this module is
105  *	also stored by the logical address mapping module,
106  *	this module may throw away valid virtual-to-physical
107  *	mappings at almost any time.  However, invalidations
108  *	of virtual-to-physical mappings must be done as
109  *	requested.
110  *
111  *	In order to cope with hardware architectures which
112  *	make virtual-to-physical map invalidates expensive,
113  *	this module may delay invalidation or protection-reduction
114  *	operations until such time as they are actually
115  *	necessary.  This module is given full information as
116  *	to which processors are currently using which maps,
117  *	and as to when physical maps must be made correct.
118  */
119 
120 #include <sys/param.h>
121 #include <sys/systm.h>
122 #include <sys/bitstring.h>
123 #include <sys/bus.h>
124 #include <sys/cpuset.h>
125 #include <sys/kernel.h>
126 #include <sys/ktr.h>
127 #include <sys/lock.h>
128 #include <sys/malloc.h>
129 #include <sys/mman.h>
130 #include <sys/msgbuf.h>
131 #include <sys/mutex.h>
132 #include <sys/physmem.h>
133 #include <sys/proc.h>
134 #include <sys/rwlock.h>
135 #include <sys/sbuf.h>
136 #include <sys/sx.h>
137 #include <sys/vmem.h>
138 #include <sys/vmmeter.h>
139 #include <sys/sched.h>
140 #include <sys/sysctl.h>
141 #include <sys/smp.h>
142 
143 #include <vm/vm.h>
144 #include <vm/vm_param.h>
145 #include <vm/vm_kern.h>
146 #include <vm/vm_page.h>
147 #include <vm/vm_map.h>
148 #include <vm/vm_object.h>
149 #include <vm/vm_extern.h>
150 #include <vm/vm_pageout.h>
151 #include <vm/vm_pager.h>
152 #include <vm/vm_phys.h>
153 #include <vm/vm_radix.h>
154 #include <vm/vm_reserv.h>
155 #include <vm/vm_dumpset.h>
156 #include <vm/uma.h>
157 
158 #include <machine/machdep.h>
159 #include <machine/md_var.h>
160 #include <machine/pcb.h>
161 #include <machine/sbi.h>
162 
163 /*
164  * Boundary values for the page table page index space:
165  *
166  * L3 pages: [0, NUL2E)
167  * L2 pages: [NUL2E, NUL2E + NUL1E)
168  * L1 pages: [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E)
169  *
170  * Note that these ranges are used in both SV39 and SV48 mode.  In SV39 mode the
171  * ranges are not fully populated since there are at most Ln_ENTRIES^2 L3 pages
172  * in a set of page tables.
173  */
174 #define	NUL0E		Ln_ENTRIES
175 #define	NUL1E		(Ln_ENTRIES * NUL0E)
176 #define	NUL2E		(Ln_ENTRIES * NUL1E)
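/*
 * With Ln_ENTRIES == 512, NUL2E == 2^27, NUL1E == 2^18 and NUL0E == 2^9:
 * one index per 2MB of VA for L3 pages, per 1GB for L2 pages, and per
 * 512GB for L1 pages.
 */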
177 
178 #if !defined(DIAGNOSTIC)
179 #ifdef __GNUC_GNU_INLINE__
180 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
181 #else
182 #define PMAP_INLINE	extern inline
183 #endif
184 #else
185 #define PMAP_INLINE
186 #endif
187 
188 #ifdef PV_STATS
189 #define PV_STAT(x)	do { x ; } while (0)
190 #else
191 #define PV_STAT(x)	do { } while (0)
192 #endif
193 
194 #define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
195 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
196 #define	pa_to_pvh(pa)		(&pv_table[pa_index(pa)])
197 
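/*
 * PV list locks are assigned by hashing the 2MB-aligned physical address,
 * so a 2MB superpage and all of the 4KB pages that it contains share a
 * single lock.
 */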
198 #define	NPV_LIST_LOCKS	MAXCPU
199 
200 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
201 			(&pv_list_locks[pmap_l2_pindex(pa) % NPV_LIST_LOCKS])
202 
203 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
204 	struct rwlock **_lockp = (lockp);		\
205 	struct rwlock *_new_lock;			\
206 							\
207 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
208 	if (_new_lock != *_lockp) {			\
209 		if (*_lockp != NULL)			\
210 			rw_wunlock(*_lockp);		\
211 		*_lockp = _new_lock;			\
212 		rw_wlock(*_lockp);			\
213 	}						\
214 } while (0)
215 
216 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
217 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
218 
219 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
220 	struct rwlock **_lockp = (lockp);		\
221 							\
222 	if (*_lockp != NULL) {				\
223 		rw_wunlock(*_lockp);			\
224 		*_lockp = NULL;				\
225 	}						\
226 } while (0)
227 
228 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
229 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
230 
231 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
232     "VM/pmap parameters");
233 
234 /* The list of all the user pmaps */
235 LIST_HEAD(pmaplist, pmap);
236 static struct pmaplist allpmaps = LIST_HEAD_INITIALIZER();
237 
238 enum pmap_mode __read_frequently pmap_mode = PMAP_MODE_SV39;
239 SYSCTL_INT(_vm_pmap, OID_AUTO, mode, CTLFLAG_RDTUN | CTLFLAG_NOFETCH,
240     &pmap_mode, 0,
241     "translation mode, 0 = SV39, 1 = SV48");
242 
243 struct pmap kernel_pmap_store;
244 
245 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
246 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
247 vm_offset_t kernel_vm_end = 0;
248 
249 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
250 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
251 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
252 
253 /* This code assumes all L1 DMAP entries will be used */
254 CTASSERT((DMAP_MIN_ADDRESS  & ~L1_OFFSET) == DMAP_MIN_ADDRESS);
255 CTASSERT((DMAP_MAX_ADDRESS  & ~L1_OFFSET) == DMAP_MAX_ADDRESS);
256 
257 static struct rwlock_padalign pvh_global_lock;
258 static struct mtx_padalign allpmaps_lock;
259 
260 static int superpages_enabled = 1;
261 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
262     CTLFLAG_RDTUN, &superpages_enabled, 0,
263     "Enable support for transparent superpages");
264 
265 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
266     "2MB page mapping counters");
267 
268 static u_long pmap_l2_demotions;
269 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
270     &pmap_l2_demotions, 0,
271     "2MB page demotions");
272 
273 static u_long pmap_l2_mappings;
274 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
275     &pmap_l2_mappings, 0,
276     "2MB page mappings");
277 
278 static u_long pmap_l2_p_failures;
279 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
280     &pmap_l2_p_failures, 0,
281     "2MB page promotion failures");
282 
283 static u_long pmap_l2_promotions;
284 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
285     &pmap_l2_promotions, 0,
286     "2MB page promotions");
287 
288 /*
289  * Data for the pv entry allocation mechanism
290  */
291 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
292 static struct mtx pv_chunks_mutex;
293 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
294 static struct md_page *pv_table;
295 static struct md_page pv_dummy;
296 
297 extern cpuset_t all_harts;
298 
299 /*
300  * Internal flags for pmap_enter()'s helper functions.
301  */
302 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
303 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
304 
305 static void	free_pv_chunk(struct pv_chunk *pc);
306 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
307 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
308 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
309 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
310 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
311 		    vm_offset_t va);
312 static bool	pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va);
313 static bool	pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2,
314 		    vm_offset_t va, struct rwlock **lockp);
315 static int	pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
316 		    u_int flags, vm_page_t m, struct rwlock **lockp);
317 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
318     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
319 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
320     pd_entry_t ptepde, struct spglist *free, struct rwlock **lockp);
321 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
322     vm_page_t m, struct rwlock **lockp);
323 
324 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
325 		struct rwlock **lockp);
326 
327 static void _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m,
328     struct spglist *free);
329 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
330 
331 static int pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode);
332 
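/*
 * Page-table entries are accessed with 64-bit atomics so that neither the
 * hardware page-table walker nor another hart can observe a torn entry.
 */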
333 #define	pmap_clear(pte)			pmap_store(pte, 0)
334 #define	pmap_clear_bits(pte, bits)	atomic_clear_64(pte, bits)
335 #define	pmap_load_store(pte, entry)	atomic_swap_64(pte, entry)
336 #define	pmap_load_clear(pte)		pmap_load_store(pte, 0)
337 #define	pmap_load(pte)			atomic_load_64(pte)
338 #define	pmap_store(pte, entry)		atomic_store_64(pte, entry)
339 #define	pmap_store_bits(pte, bits)	atomic_set_64(pte, bits)
340 
341 /********************/
342 /* Inline functions */
343 /********************/
344 
345 static __inline void
346 pagecopy(void *s, void *d)
347 {
348 
349 	memcpy(d, s, PAGE_SIZE);
350 }
351 
352 static __inline void
353 pagezero(void *p)
354 {
355 
356 	bzero(p, PAGE_SIZE);
357 }
358 
359 #define	pmap_l0_index(va)	(((va) >> L0_SHIFT) & Ln_ADDR_MASK)
360 #define	pmap_l1_index(va)	(((va) >> L1_SHIFT) & Ln_ADDR_MASK)
361 #define	pmap_l2_index(va)	(((va) >> L2_SHIFT) & Ln_ADDR_MASK)
362 #define	pmap_l3_index(va)	(((va) >> L3_SHIFT) & Ln_ADDR_MASK)
363 
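/*
 * A leaf PTE stores its physical page number (PPN) starting at bit
 * PTE_PPN0_S, above the V/R/W/X/U/G/A/D flag bits.  PTE_TO_PHYS() strips
 * the flags and converts the PPN back into a byte address; L2PTE_TO_PHYS()
 * does the same for a 2MB superpage entry, keeping only the upper PPN
 * fields.  For example, a 4KB kernel mapping of physical address
 * 0x80000000 is encoded as
 *
 *	pte = ((0x80000000 >> PAGE_SHIFT) << PTE_PPN0_S) | PTE_KERN;
 *
 * and PTE_TO_PHYS(pte) recovers 0x80000000.
 */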
364 #define	PTE_TO_PHYS(pte) \
365     ((((pte) & ~PTE_HI_MASK) >> PTE_PPN0_S) * PAGE_SIZE)
366 #define	L2PTE_TO_PHYS(l2) \
367     ((((l2) & ~PTE_HI_MASK) >> PTE_PPN1_S) << L2_SHIFT)
368 
369 static __inline pd_entry_t *
370 pmap_l0(pmap_t pmap, vm_offset_t va)
371 {
372 	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
373 	KASSERT(VIRT_IS_VALID(va),
374 	    ("%s: malformed virtual address %#lx", __func__, va));
375 	return (&pmap->pm_top[pmap_l0_index(va)]);
376 }
377 
378 static __inline pd_entry_t *
379 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
380 {
381 	vm_paddr_t phys;
382 	pd_entry_t *l1;
383 
384 	KASSERT(pmap_mode != PMAP_MODE_SV39, ("%s: in SV39 mode", __func__));
385 	phys = PTE_TO_PHYS(pmap_load(l0));
386 	l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
387 
388 	return (&l1[pmap_l1_index(va)]);
389 }
390 
391 static __inline pd_entry_t *
392 pmap_l1(pmap_t pmap, vm_offset_t va)
393 {
394 	pd_entry_t *l0;
395 
396 	KASSERT(VIRT_IS_VALID(va),
397 	    ("%s: malformed virtual address %#lx", __func__, va));
398 	if (pmap_mode == PMAP_MODE_SV39) {
399 		return (&pmap->pm_top[pmap_l1_index(va)]);
400 	} else {
401 		l0 = pmap_l0(pmap, va);
402 		if ((pmap_load(l0) & PTE_V) == 0)
403 			return (NULL);
404 		if ((pmap_load(l0) & PTE_RX) != 0)
405 			return (NULL);
406 		return (pmap_l0_to_l1(l0, va));
407 	}
408 }
409 
410 static __inline pd_entry_t *
411 pmap_l1_to_l2(pd_entry_t *l1, vm_offset_t va)
412 {
413 	vm_paddr_t phys;
414 	pd_entry_t *l2;
415 
416 	phys = PTE_TO_PHYS(pmap_load(l1));
417 	l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
418 
419 	return (&l2[pmap_l2_index(va)]);
420 }
421 
422 static __inline pd_entry_t *
423 pmap_l2(pmap_t pmap, vm_offset_t va)
424 {
425 	pd_entry_t *l1;
426 
427 	l1 = pmap_l1(pmap, va);
428 	if (l1 == NULL)
429 		return (NULL);
430 	if ((pmap_load(l1) & PTE_V) == 0)
431 		return (NULL);
432 	if ((pmap_load(l1) & PTE_RX) != 0)
433 		return (NULL);
434 
435 	return (pmap_l1_to_l2(l1, va));
436 }
437 
438 static __inline pt_entry_t *
439 pmap_l2_to_l3(pd_entry_t *l2, vm_offset_t va)
440 {
441 	vm_paddr_t phys;
442 	pt_entry_t *l3;
443 
444 	phys = PTE_TO_PHYS(pmap_load(l2));
445 	l3 = (pt_entry_t *)PHYS_TO_DMAP(phys);
446 
447 	return (&l3[pmap_l3_index(va)]);
448 }
449 
450 static __inline pt_entry_t *
451 pmap_l3(pmap_t pmap, vm_offset_t va)
452 {
453 	pd_entry_t *l2;
454 
455 	l2 = pmap_l2(pmap, va);
456 	if (l2 == NULL)
457 		return (NULL);
458 	if ((pmap_load(l2) & PTE_V) == 0)
459 		return (NULL);
460 	if ((pmap_load(l2) & PTE_RX) != 0)
461 		return (NULL);
462 
463 	return (pmap_l2_to_l3(l2, va));
464 }
465 
466 static __inline void
467 pmap_resident_count_inc(pmap_t pmap, int count)
468 {
469 
470 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
471 	pmap->pm_stats.resident_count += count;
472 }
473 
474 static __inline void
475 pmap_resident_count_dec(pmap_t pmap, int count)
476 {
477 
478 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
479 	KASSERT(pmap->pm_stats.resident_count >= count,
480 	    ("pmap %p resident count underflow %ld %d", pmap,
481 	    pmap->pm_stats.resident_count, count));
482 	pmap->pm_stats.resident_count -= count;
483 }
484 
485 static void
486 pmap_distribute_l1(struct pmap *pmap, vm_pindex_t l1index,
487     pt_entry_t entry)
488 {
489 	struct pmap *user_pmap;
490 	pd_entry_t *l1;
491 
492 	/*
493 	 * Distribute new kernel L1 entry to all the user pmaps.  This is only
494 	 * necessary with three-level paging configured: with four-level paging
495 	 * the kernel's half of the top-level page table page is static and can
496 	 * simply be copied at pmap initialization time.
497 	 */
498 	if (pmap != kernel_pmap || pmap_mode != PMAP_MODE_SV39)
499 		return;
500 
501 	mtx_lock(&allpmaps_lock);
502 	LIST_FOREACH(user_pmap, &allpmaps, pm_list) {
503 		l1 = &user_pmap->pm_top[l1index];
504 		pmap_store(l1, entry);
505 	}
506 	mtx_unlock(&allpmaps_lock);
507 }
508 
509 static pt_entry_t *
510 pmap_early_page_idx(vm_offset_t l1pt, vm_offset_t va, u_int *l1_slot,
511     u_int *l2_slot)
512 {
513 	pt_entry_t *l2;
514 	pd_entry_t *l1 __diagused;
515 
516 	l1 = (pd_entry_t *)l1pt;
517 	*l1_slot = (va >> L1_SHIFT) & Ln_ADDR_MASK;
518 
519 	/* Check that locore used a non-leaf (table) L1 entry */
520 	KASSERT((l1[*l1_slot] & PTE_RX) == 0,
521 		("Invalid bootstrap L1 table"));
522 
523 	/* Find the address of the L2 table */
524 	l2 = (pt_entry_t *)init_pt_va;
525 	*l2_slot = pmap_l2_index(va);
526 
527 	return (l2);
528 }
529 
530 static vm_paddr_t
531 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
532 {
533 	u_int l1_slot, l2_slot;
534 	pt_entry_t *l2;
535 	vm_paddr_t ret;
536 
537 	l2 = pmap_early_page_idx(l1pt, va, &l1_slot, &l2_slot);
538 
539 	/* Check that locore used L2 superpage mappings */
540 	KASSERT((l2[l2_slot] & PTE_RX) != 0,
541 		("Invalid bootstrap L2 table"));
542 
543 	/* The L2 entry is a superpage */
544 	ret = L2PTE_TO_PHYS(l2[l2_slot]);
545 	ret += (va & L2_OFFSET);
546 
547 	return (ret);
548 }
549 
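/*
 * Construct the direct map (DMAP): map physical memory from min_pa, rounded
 * down to a 1GB boundary, up to max_pa at DMAP_MIN_ADDRESS using writable
 * 1GB L1 superpage entries in the bootstrap L1 table.
 */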
550 static void
551 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa, vm_paddr_t max_pa)
552 {
553 	vm_offset_t va;
554 	vm_paddr_t pa;
555 	pd_entry_t *l1;
556 	u_int l1_slot;
557 	pt_entry_t entry;
558 	pn_t pn;
559 
560 	pa = dmap_phys_base = min_pa & ~L1_OFFSET;
561 	va = DMAP_MIN_ADDRESS;
562 	l1 = (pd_entry_t *)kern_l1;
563 	l1_slot = pmap_l1_index(DMAP_MIN_ADDRESS);
564 
565 	for (; va < DMAP_MAX_ADDRESS && pa < max_pa;
566 	    pa += L1_SIZE, va += L1_SIZE, l1_slot++) {
567 		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
568 
569 		/* superpages */
570 		pn = (pa / PAGE_SIZE);
571 		entry = PTE_KERN;
572 		entry |= (pn << PTE_PPN0_S);
573 		pmap_store(&l1[l1_slot], entry);
574 	}
575 
576 	/* Set the upper limit of the DMAP region */
577 	dmap_phys_max = pa;
578 	dmap_max_addr = va;
579 
580 	sfence_vma();
581 }
582 
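/*
 * Install L3 page tables covering the kernel VA range [va,
 * VM_MAX_KERNEL_ADDRESS), taking the backing pages from the boot-time free
 * memory beginning at l3_start.  Returns the first address past the pages
 * that were consumed.  This is used to prepare the early devmap region.
 */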
583 static vm_offset_t
584 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
585 {
586 	vm_offset_t l3pt;
587 	pt_entry_t entry;
588 	pd_entry_t *l2;
589 	vm_paddr_t pa;
590 	u_int l2_slot;
591 	pn_t pn;
592 
593 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
594 
595 	l2 = pmap_l2(kernel_pmap, va);
596 	l2 = (pd_entry_t *)((uintptr_t)l2 & ~(PAGE_SIZE - 1));
597 	l2_slot = pmap_l2_index(va);
598 	l3pt = l3_start;
599 
600 	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
601 		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
602 
603 		pa = pmap_early_vtophys(l1pt, l3pt);
604 		pn = (pa / PAGE_SIZE);
605 		entry = (PTE_V);
606 		entry |= (pn << PTE_PPN0_S);
607 		pmap_store(&l2[l2_slot], entry);
608 		l3pt += PAGE_SIZE;
609 	}
610 
611 	/* Zero the newly allocated L3 page tables */
612 	memset((void *)l3_start, 0, l3pt - l3_start);
613 
614 	return (l3pt);
615 }
616 
617 /*
618  *	Bootstrap the system enough to run with virtual memory.
619  */
620 void
621 pmap_bootstrap(vm_offset_t l1pt, vm_paddr_t kernstart, vm_size_t kernlen)
622 {
623 	vm_paddr_t physmap[PHYS_AVAIL_ENTRIES];
624 	uint64_t satp;
625 	vm_offset_t dpcpu, freemempos, l0pv, msgbufpv;
626 	vm_paddr_t l0pa, l1pa, max_pa, min_pa, pa;
627 	pd_entry_t *l0p;
628 	pt_entry_t *l2p;
629 	u_int l1_slot, l2_slot;
630 	u_int physmap_idx;
631 	int i, mode;
632 
633 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
634 
635 	/* Set this early so we can use the pagetable walking functions */
636 	kernel_pmap_store.pm_top = (pd_entry_t *)l1pt;
637 	PMAP_LOCK_INIT(kernel_pmap);
638 
639 	rw_init(&pvh_global_lock, "pmap pv global");
640 
641 	/*
642 	 * Set the current CPU as active in the kernel pmap. Secondary cores
643 	 * will add themselves later in init_secondary(). The SBI firmware
644 	 * may rely on this mask being precise, so CPU_FILL() is not used.
645 	 */
646 	CPU_SET(PCPU_GET(hart), &kernel_pmap->pm_active);
647 
648 	/* Assume the address we were loaded to is a valid physical address. */
649 	min_pa = max_pa = kernstart;
650 
651 	physmap_idx = physmem_avail(physmap, nitems(physmap));
652 	physmap_idx /= 2;
653 
654 	/*
655 	 * Find the minimum physical address. physmap is sorted,
656 	 * but may contain empty ranges.
657 	 */
658 	for (i = 0; i < physmap_idx * 2; i += 2) {
659 		if (physmap[i] == physmap[i + 1])
660 			continue;
661 		if (physmap[i] <= min_pa)
662 			min_pa = physmap[i];
663 		if (physmap[i + 1] > max_pa)
664 			max_pa = physmap[i + 1];
665 	}
666 	printf("physmap_idx %u\n", physmap_idx);
667 	printf("min_pa %lx\n", min_pa);
668 	printf("max_pa %lx\n", max_pa);
669 
670 	/* Create a direct map region early so we can use it for pa -> va */
671 	pmap_bootstrap_dmap(l1pt, min_pa, max_pa);
672 
673 	/*
674 	 * Read the page table to find out what is already mapped.
675 	 * This assumes we have mapped a block of memory from KERNBASE
676 	 * using a single L1 entry.
677 	 */
678 	(void)pmap_early_page_idx(l1pt, KERNBASE, &l1_slot, &l2_slot);
679 
680 	/* Sanity check the index; KERNBASE should be the first VA */
681 	KASSERT(l2_slot == 0, ("The L2 index is non-zero"));
682 
683 	freemempos = roundup2(KERNBASE + kernlen, PAGE_SIZE);
684 
685 	/* Create the l3 tables for the early devmap */
686 	freemempos = pmap_bootstrap_l3(l1pt,
687 	    VM_MAX_KERNEL_ADDRESS - L2_SIZE, freemempos);
688 
689 	/*
690 	 * Invalidate the mapping we created for the DTB. At this point a copy
691 	 * has been created, and we no longer need it. We want to avoid the
692 	 * possibility of an aliased mapping in the future.
693 	 */
694 	l2p = pmap_l2(kernel_pmap, VM_EARLY_DTB_ADDRESS);
695 	if ((pmap_load(l2p) & PTE_V) != 0)
696 		pmap_clear(l2p);
697 
698 	sfence_vma();
699 
700 #define alloc_pages(var, np)						\
701 	(var) = freemempos;						\
702 	freemempos += (np * PAGE_SIZE);					\
703 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
704 
705 	mode = 0;
706 	TUNABLE_INT_FETCH("vm.pmap.mode", &mode);
707 	if (mode == PMAP_MODE_SV48) {
708 		/*
709 		 * Enable SV48 mode: allocate an L0 page and set SV48 mode in
710 		 * SATP.  If the implementation does not provide SV48 mode,
711 		 * the mode read back from the (WARL) SATP register will be
712 		 * unchanged, and we continue in SV39 mode.
713 		 */
714 		alloc_pages(l0pv, 1);
715 		l0p = (void *)l0pv;
716 		l1pa = pmap_early_vtophys(l1pt, l1pt);
717 		l0p[pmap_l0_index(KERNBASE)] = PTE_V | PTE_A | PTE_D |
718 		    ((l1pa >> PAGE_SHIFT) << PTE_PPN0_S);
719 
720 		l0pa = pmap_early_vtophys(l1pt, l0pv);
721 		csr_write(satp, (l0pa >> PAGE_SHIFT) | SATP_MODE_SV48);
722 		satp = csr_read(satp);
723 		if ((satp & SATP_MODE_M) == SATP_MODE_SV48) {
724 			pmap_mode = PMAP_MODE_SV48;
725 			kernel_pmap_store.pm_top = l0p;
726 		} else {
727 			/* Mode didn't change, give the page back. */
728 			freemempos -= PAGE_SIZE;
729 		}
730 	}
731 
732 	/* Allocate dynamic per-cpu area. */
733 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
734 	dpcpu_init((void *)dpcpu, 0);
735 
736 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
737 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
738 	msgbufp = (void *)msgbufpv;
739 
740 	virtual_avail = roundup2(freemempos, L2_SIZE);
741 	virtual_end = VM_MAX_KERNEL_ADDRESS - L2_SIZE;
742 	kernel_vm_end = virtual_avail;
743 
744 	pa = pmap_early_vtophys(l1pt, freemempos);
745 
746 	physmem_exclude_region(kernstart, pa - kernstart, EXFLAG_NOALLOC);
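	/*
	 * The range [kernstart, pa) holds the kernel image and the bootstrap
	 * allocations made above; keep it out of the free page pool.
	 */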
747 }
748 
749 /*
750  *	Initialize a vm_page's machine-dependent fields.
751  */
752 void
753 pmap_page_init(vm_page_t m)
754 {
755 
756 	TAILQ_INIT(&m->md.pv_list);
757 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
758 }
759 
760 /*
761  *	Initialize the pmap module.
762  *	Called by vm_init, to initialize any structures that the pmap
763  *	system needs to map virtual memory.
764  */
765 void
766 pmap_init(void)
767 {
768 	vm_size_t s;
769 	int i, pv_npg;
770 
771 	/*
772 	 * Initialize the pv chunk and pmap list mutexes.
773 	 */
774 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
775 	mtx_init(&allpmaps_lock, "allpmaps", NULL, MTX_DEF);
776 
777 	/*
778 	 * Initialize the pool of pv list locks.
779 	 */
780 	for (i = 0; i < NPV_LIST_LOCKS; i++)
781 		rw_init(&pv_list_locks[i], "pmap pv list");
782 
783 	/*
784 	 * Calculate the size of the pv head table for superpages.
785 	 */
786 	pv_npg = howmany(vm_phys_segs[vm_phys_nsegs - 1].end, L2_SIZE);
787 
788 	/*
789 	 * Allocate memory for the pv head table for superpages.
790 	 */
791 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
792 	s = round_page(s);
793 	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
794 	for (i = 0; i < pv_npg; i++)
795 		TAILQ_INIT(&pv_table[i].pv_list);
796 	TAILQ_INIT(&pv_dummy.pv_list);
797 
798 	if (superpages_enabled)
799 		pagesizes[1] = L2_SIZE;
800 }
801 
802 #ifdef SMP
803 /*
804  * For SMP, these functions have to use IPIs for coherence.
805  *
806  * In general, the calling thread uses a plain fence to order the
807  * writes to the page tables before making an SBI call that executes
808  * sfence_vma() on the remote harts.
809  */
810 static void
811 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
812 {
813 	cpuset_t mask;
814 
815 	sched_pin();
816 	mask = pmap->pm_active;
817 	CPU_CLR(PCPU_GET(hart), &mask);
818 	fence();
819 	if (!CPU_EMPTY(&mask) && smp_started)
820 		sbi_remote_sfence_vma(mask.__bits, va, 1);
821 	sfence_vma_page(va);
822 	sched_unpin();
823 }
824 
825 static void
826 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
827 {
828 	cpuset_t mask;
829 
830 	sched_pin();
831 	mask = pmap->pm_active;
832 	CPU_CLR(PCPU_GET(hart), &mask);
833 	fence();
834 	if (!CPU_EMPTY(&mask) && smp_started)
835 		sbi_remote_sfence_vma(mask.__bits, sva, eva - sva + 1);
836 
837 	/*
838 	 * Might consider a loop of sfence_vma_page() for a small
839 	 * number of pages in the future.
840 	 */
841 	sfence_vma();
842 	sched_unpin();
843 }
844 
845 static void
846 pmap_invalidate_all(pmap_t pmap)
847 {
848 	cpuset_t mask;
849 
850 	sched_pin();
851 	mask = pmap->pm_active;
852 	CPU_CLR(PCPU_GET(hart), &mask);
853 
854 	/*
855 	 * XXX: The SBI doc doesn't detail how to specify x0 as the
856 	 * address to perform a global fence.  BBL currently treats
857 	 * all sfence_vma requests as global however.
858 	 */
859 	fence();
860 	if (!CPU_EMPTY(&mask) && smp_started)
861 		sbi_remote_sfence_vma(mask.__bits, 0, 0);
862 	sfence_vma();
863 	sched_unpin();
864 }
865 #else
866 /*
867  * Normal, non-SMP, invalidation functions.
868  * We inline these within pmap.c for speed.
869  */
870 static __inline void
871 pmap_invalidate_page(pmap_t pmap, vm_offset_t va)
872 {
873 
874 	sfence_vma_page(va);
875 }
876 
877 static __inline void
878 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
879 {
880 
881 	/*
882 	 * Might consider a loop of sfence_vma_page() for a small
883 	 * number of pages in the future.
884 	 */
885 	sfence_vma();
886 }
887 
888 static __inline void
889 pmap_invalidate_all(pmap_t pmap)
890 {
891 
892 	sfence_vma();
893 }
894 #endif
895 
896 /*
897  *	Routine:	pmap_extract
898  *	Function:
899  *		Extract the physical page address associated
900  *		with the given map/virtual_address pair.
901  */
902 vm_paddr_t
903 pmap_extract(pmap_t pmap, vm_offset_t va)
904 {
905 	pd_entry_t *l2p, l2;
906 	pt_entry_t *l3p, l3;
907 	vm_paddr_t pa;
908 
909 	pa = 0;
910 	PMAP_LOCK(pmap);
911 	/*
912 	 * Start with the L2 table.  This pmap never allocates pages at the
913 	 * L1 level, so it does not need to be checked here.
914 	 */
915 	l2p = pmap_l2(pmap, va);
916 	if (l2p != NULL) {
917 		l2 = pmap_load(l2p);
918 		if ((l2 & PTE_RX) == 0) {
919 			l3p = pmap_l2_to_l3(l2p, va);
920 			if (l3p != NULL) {
921 				l3 = pmap_load(l3p);
922 				pa = PTE_TO_PHYS(l3);
923 				pa |= (va & L3_OFFSET);
924 			}
925 		} else {
926 			/* The L2 entry is a superpage */
927 			pa = L2PTE_TO_PHYS(l2);
928 			pa |= (va & L2_OFFSET);
929 		}
930 	}
931 	PMAP_UNLOCK(pmap);
932 	return (pa);
933 }
934 
935 /*
936  *	Routine:	pmap_extract_and_hold
937  *	Function:
938  *		Atomically extract and hold the physical page
939  *		with the given pmap and virtual address pair
940  *		if that mapping permits the given protection.
941  */
942 vm_page_t
943 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
944 {
945 	pt_entry_t *l3p, l3;
946 	vm_paddr_t phys;
947 	vm_page_t m;
948 
949 	m = NULL;
950 	PMAP_LOCK(pmap);
951 	l3p = pmap_l3(pmap, va);
952 	if (l3p != NULL && (l3 = pmap_load(l3p)) != 0) {
953 		if ((l3 & PTE_W) != 0 || (prot & VM_PROT_WRITE) == 0) {
954 			phys = PTE_TO_PHYS(l3);
955 			m = PHYS_TO_VM_PAGE(phys);
956 			if (!vm_page_wire_mapped(m))
957 				m = NULL;
958 		}
959 	}
960 	PMAP_UNLOCK(pmap);
961 	return (m);
962 }
963 
964 vm_paddr_t
965 pmap_kextract(vm_offset_t va)
966 {
967 	pd_entry_t *l2, l2e;
968 	pt_entry_t *l3;
969 	vm_paddr_t pa;
970 
971 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS) {
972 		pa = DMAP_TO_PHYS(va);
973 	} else {
974 		l2 = pmap_l2(kernel_pmap, va);
975 		if (l2 == NULL)
976 			panic("pmap_kextract: No l2");
977 		l2e = pmap_load(l2);
978 		/*
979 		 * Beware of concurrent promotion and demotion! We must
980 		 * use l2e rather than loading from l2 multiple times to
981 		 * ensure we see a consistent state, including the
982 		 * implicit load in pmap_l2_to_l3.  It is, however, safe
983 		 * to use an old l2e because the L3 page is preserved by
984 		 * promotion.
985 		 */
986 		if ((l2e & PTE_RX) != 0) {
987 			/* superpages */
988 			pa = L2PTE_TO_PHYS(l2e);
989 			pa |= (va & L2_OFFSET);
990 			return (pa);
991 		}
992 
993 		l3 = pmap_l2_to_l3(&l2e, va);
994 		if (l3 == NULL)
995 			panic("pmap_kextract: No l3...");
996 		pa = PTE_TO_PHYS(pmap_load(l3));
997 		pa |= (va & PAGE_MASK);
998 	}
999 	return (pa);
1000 }
1001 
1002 /***************************************************
1003  * Low level mapping routines.....
1004  ***************************************************/
1005 
1006 void
1007 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode __unused)
1008 {
1009 	pt_entry_t entry;
1010 	pt_entry_t *l3;
1011 	vm_offset_t va;
1012 	pn_t pn;
1013 
1014 	KASSERT((pa & L3_OFFSET) == 0,
1015 	   ("pmap_kenter: Invalid physical address"));
1016 	KASSERT((sva & L3_OFFSET) == 0,
1017 	   ("pmap_kenter: Invalid virtual address"));
1018 	KASSERT((size & PAGE_MASK) == 0,
1019 	    ("pmap_kenter: Mapping is not page-sized"));
1020 
1021 	va = sva;
1022 	while (size != 0) {
1023 		l3 = pmap_l3(kernel_pmap, va);
1024 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1025 
1026 		pn = (pa / PAGE_SIZE);
1027 		entry = PTE_KERN;
1028 		entry |= (pn << PTE_PPN0_S);
1029 		pmap_store(l3, entry);
1030 
1031 		va += PAGE_SIZE;
1032 		pa += PAGE_SIZE;
1033 		size -= PAGE_SIZE;
1034 	}
1035 	pmap_invalidate_range(kernel_pmap, sva, va);
1036 }
1037 
1038 void
1039 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1040 {
1041 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
1042 }
1043 
1044 /*
1045  * Remove a page from the kernel pagetables.
1046  * Note: not SMP coherent.
1047  */
1048 PMAP_INLINE void
1049 pmap_kremove(vm_offset_t va)
1050 {
1051 	pt_entry_t *l3;
1052 
1053 	l3 = pmap_l3(kernel_pmap, va);
1054 	KASSERT(l3 != NULL, ("pmap_kremove: Invalid address"));
1055 
1056 	pmap_clear(l3);
1057 	sfence_vma();
1058 }
1059 
1060 void
1061 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1062 {
1063 	pt_entry_t *l3;
1064 	vm_offset_t va;
1065 
1066 	KASSERT((sva & L3_OFFSET) == 0,
1067 	   ("pmap_kremove_device: Invalid virtual address"));
1068 	KASSERT((size & PAGE_MASK) == 0,
1069 	    ("pmap_kremove_device: Mapping is not page-sized"));
1070 
1071 	va = sva;
1072 	while (size != 0) {
1073 		l3 = pmap_l3(kernel_pmap, va);
1074 		KASSERT(l3 != NULL, ("Invalid page table, va: 0x%lx", va));
1075 		pmap_clear(l3);
1076 
1077 		va += PAGE_SIZE;
1078 		size -= PAGE_SIZE;
1079 	}
1080 
1081 	pmap_invalidate_range(kernel_pmap, sva, va);
1082 }
1083 
1084 /*
1085  *	Used to map a range of physical addresses into kernel
1086  *	virtual address space.
1087  *
1088  *	The value passed in '*virt' is a suggested virtual address for
1089  *	the mapping. Architectures which can support a direct-mapped
1090  *	physical to virtual region can return the appropriate address
1091  *	within that region, leaving '*virt' unchanged. Other
1092  *	architectures should map the pages starting at '*virt' and
1093  *	update '*virt' with the first usable address after the mapped
1094  *	region.
1095  */
1096 vm_offset_t
1097 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1098 {
1099 
1100 	return PHYS_TO_DMAP(start);
1101 }
1102 
1103 /*
1104  * Add a list of wired pages to the kva.  This
1105  * routine is only used for temporary
1106  * kernel mappings that do not need to have
1107  * page modification or references recorded.
1108  * Note that old mappings are simply written
1109  * over.  The page *must* be wired.
1110  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1111  */
1112 void
1113 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1114 {
1115 	pt_entry_t *l3, pa;
1116 	vm_offset_t va;
1117 	vm_page_t m;
1118 	pt_entry_t entry;
1119 	pn_t pn;
1120 	int i;
1121 
1122 	va = sva;
1123 	for (i = 0; i < count; i++) {
1124 		m = ma[i];
1125 		pa = VM_PAGE_TO_PHYS(m);
1126 		pn = (pa / PAGE_SIZE);
1127 		l3 = pmap_l3(kernel_pmap, va);
1128 
1129 		entry = PTE_KERN;
1130 		entry |= (pn << PTE_PPN0_S);
1131 		pmap_store(l3, entry);
1132 
1133 		va += L3_SIZE;
1134 	}
1135 	pmap_invalidate_range(kernel_pmap, sva, va);
1136 }
1137 
1138 /*
1139  * This routine tears out page mappings from the
1140  * kernel -- it is meant only for temporary mappings.
1141  * Note: SMP coherent.  Uses a ranged shootdown IPI.
1142  */
1143 void
1144 pmap_qremove(vm_offset_t sva, int count)
1145 {
1146 	pt_entry_t *l3;
1147 	vm_offset_t va;
1148 
1149 	KASSERT(sva >= VM_MIN_KERNEL_ADDRESS, ("usermode va %lx", sva));
1150 
1151 	for (va = sva; count-- > 0; va += PAGE_SIZE) {
1152 		l3 = pmap_l3(kernel_pmap, va);
1153 		KASSERT(l3 != NULL, ("pmap_qremove: Invalid address"));
1154 		pmap_clear(l3);
1155 	}
1156 	pmap_invalidate_range(kernel_pmap, sva, va);
1157 }
1158 
1159 bool
1160 pmap_ps_enabled(pmap_t pmap __unused)
1161 {
1162 
1163 	return (superpages_enabled);
1164 }
1165 
1166 /***************************************************
1167  * Page table page management routines.....
1168  ***************************************************/
1169 /*
1170  * Schedule the specified unused page table page to be freed.  Specifically,
1171  * add the page to the specified list of pages that will be released to the
1172  * physical memory manager after the TLB has been updated.
1173  */
1174 static __inline void
1175 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1176     boolean_t set_PG_ZERO)
1177 {
1178 
1179 	if (set_PG_ZERO)
1180 		m->flags |= PG_ZERO;
1181 	else
1182 		m->flags &= ~PG_ZERO;
1183 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1184 }
1185 
1186 /*
1187  * Inserts the specified page table page into the specified pmap's collection
1188  * of idle page table pages.  Each of a pmap's page table pages is responsible
1189  * for mapping a distinct range of virtual addresses.  The pmap's collection is
1190  * ordered by this virtual address range.
1191  *
1192  * If "promoted" is false, then the page table page "ml3" must be zero filled.
1193  */
1194 static __inline int
1195 pmap_insert_pt_page(pmap_t pmap, vm_page_t ml3, bool promoted)
1196 {
1197 
1198 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1199 	ml3->valid = promoted ? VM_PAGE_BITS_ALL : 0;
1200 	return (vm_radix_insert(&pmap->pm_root, ml3));
1201 }
1202 
1203 /*
1204  * Removes the page table page mapping the specified virtual address from the
1205  * specified pmap's collection of idle page table pages, and returns it.
1206  * Otherwise, returns NULL if there is no page table page corresponding to the
1207  * specified virtual address.
1208  */
1209 static __inline vm_page_t
1210 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
1211 {
1212 
1213 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1214 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
1215 }
1216 
1217 /*
1218  * Decrements a page table page's reference count, which is used to record the
1219  * number of valid page table entries within the page.  If the reference count
1220  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1221  * page table page was unmapped and FALSE otherwise.
1222  */
1223 static inline boolean_t
1224 pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1225 {
1226 	KASSERT(m->ref_count > 0,
1227 	    ("%s: page %p ref count underflow", __func__, m));
1228 
1229 	--m->ref_count;
1230 	if (m->ref_count == 0) {
1231 		_pmap_unwire_ptp(pmap, va, m, free);
1232 		return (TRUE);
1233 	} else {
1234 		return (FALSE);
1235 	}
1236 }
1237 
1238 static void
1239 _pmap_unwire_ptp(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1240 {
1241 	vm_paddr_t phys;
1242 
1243 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1244 	if (m->pindex >= NUL2E + NUL1E) {
1245 		pd_entry_t *l0;
1246 		l0 = pmap_l0(pmap, va);
1247 		pmap_clear(l0);
1248 	} else if (m->pindex >= NUL2E) {
1249 		pd_entry_t *l1;
1250 		l1 = pmap_l1(pmap, va);
1251 		pmap_clear(l1);
1252 		pmap_distribute_l1(pmap, pmap_l1_index(va), 0);
1253 	} else {
1254 		pd_entry_t *l2;
1255 		l2 = pmap_l2(pmap, va);
1256 		pmap_clear(l2);
1257 	}
1258 	pmap_resident_count_dec(pmap, 1);
1259 	if (m->pindex < NUL2E) {
1260 		pd_entry_t *l1;
1261 		vm_page_t pdpg;
1262 
1263 		l1 = pmap_l1(pmap, va);
1264 		phys = PTE_TO_PHYS(pmap_load(l1));
1265 		pdpg = PHYS_TO_VM_PAGE(phys);
1266 		pmap_unwire_ptp(pmap, va, pdpg, free);
1267 	} else if (m->pindex < NUL2E + NUL1E && pmap_mode != PMAP_MODE_SV39) {
1268 		pd_entry_t *l0;
1269 		vm_page_t pdpg;
1270 
1271 		MPASS(pmap_mode != PMAP_MODE_SV39);
1272 		l0 = pmap_l0(pmap, va);
1273 		phys = PTE_TO_PHYS(pmap_load(l0));
1274 		pdpg = PHYS_TO_VM_PAGE(phys);
1275 		pmap_unwire_ptp(pmap, va, pdpg, free);
1276 	}
1277 	pmap_invalidate_page(pmap, va);
1278 
1279 	vm_wire_sub(1);
1280 
1281 	/*
1282 	 * Put the page on a list so that it is released only after
1283 	 * *ALL* TLB shootdowns are done.
1284 	 */
1285 	pmap_add_delayed_free_list(m, free, TRUE);
1286 }
1287 
1288 /*
1289  * After removing a page table entry, this routine is used to
1290  * conditionally free the page, and manage the reference count.
1291  */
1292 static int
1293 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1294     struct spglist *free)
1295 {
1296 	vm_page_t mpte;
1297 
1298 	if (va >= VM_MAXUSER_ADDRESS)
1299 		return (0);
1300 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1301 	mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(ptepde));
1302 	return (pmap_unwire_ptp(pmap, va, mpte, free));
1303 }
1304 
1305 static uint64_t
1306 pmap_satp_mode(void)
1307 {
1308 	return (pmap_mode == PMAP_MODE_SV39 ? SATP_MODE_SV39 : SATP_MODE_SV48);
1309 }
1310 
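/*
 * Initialize the pmap used by process 0, which shares the kernel's
 * top-level page-table page.  pm_satp caches the value loaded into the
 * satp CSR when the pmap is activated: the paging mode in the upper bits
 * and the physical page number of the root page-table page in the lower
 * bits.
 */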
1311 void
1312 pmap_pinit0(pmap_t pmap)
1313 {
1314 	PMAP_LOCK_INIT(pmap);
1315 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1316 	pmap->pm_top = kernel_pmap->pm_top;
1317 	pmap->pm_satp = pmap_satp_mode() |
1318 	    (vtophys(pmap->pm_top) >> PAGE_SHIFT);
1319 	CPU_ZERO(&pmap->pm_active);
1320 	pmap_activate_boot(pmap);
1321 }
1322 
1323 int
1324 pmap_pinit(pmap_t pmap)
1325 {
1326 	vm_paddr_t topphys;
1327 	vm_page_t mtop;
1328 	size_t i;
1329 
1330 	mtop = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO |
1331 	    VM_ALLOC_WAITOK);
1332 
1333 	topphys = VM_PAGE_TO_PHYS(mtop);
1334 	pmap->pm_top = (pd_entry_t *)PHYS_TO_DMAP(topphys);
1335 	pmap->pm_satp = pmap_satp_mode() | (topphys >> PAGE_SHIFT);
1336 
1337 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1338 
1339 	CPU_ZERO(&pmap->pm_active);
1340 
1341 	if (pmap_mode == PMAP_MODE_SV39) {
1342 		/*
1343 		 * Copy L1 entries from the kernel pmap.  This must be done with
1344 		 * the allpmaps lock held to avoid races with
1345 		 * pmap_distribute_l1().
1346 		 */
1347 		mtx_lock(&allpmaps_lock);
1348 		LIST_INSERT_HEAD(&allpmaps, pmap, pm_list);
1349 		for (i = pmap_l1_index(VM_MIN_KERNEL_ADDRESS);
1350 		    i < pmap_l1_index(VM_MAX_KERNEL_ADDRESS); i++)
1351 			pmap->pm_top[i] = kernel_pmap->pm_top[i];
1352 		for (i = pmap_l1_index(DMAP_MIN_ADDRESS);
1353 		    i < pmap_l1_index(DMAP_MAX_ADDRESS); i++)
1354 			pmap->pm_top[i] = kernel_pmap->pm_top[i];
1355 		mtx_unlock(&allpmaps_lock);
1356 	} else {
1357 		i = pmap_l0_index(VM_MIN_KERNEL_ADDRESS);
1358 		pmap->pm_top[i] = kernel_pmap->pm_top[i];
1359 	}
1360 
1361 	vm_radix_init(&pmap->pm_root);
1362 
1363 	return (1);
1364 }
1365 
1366 /*
1367  * This routine is called if the desired page table page does not exist.
1368  *
1369  * If page table page allocation fails, this routine may sleep before
1370  * returning NULL.  It sleeps only if a lock pointer was given.
1371  *
1372  * Note: If a page allocation fails at page table level two or three,
1373  * one or two pages may be held during the wait, only to be released
1374  * afterwards.  This conservative approach is easily argued to avoid
1375  * race conditions.
1376  */
1377 static vm_page_t
1378 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1379 {
1380 	vm_page_t m, pdpg;
1381 	pt_entry_t entry;
1382 	vm_paddr_t phys;
1383 	pn_t pn;
1384 
1385 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1386 
1387 	/*
1388 	 * Allocate a page table page.
1389 	 */
1390 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1391 	if (m == NULL) {
1392 		if (lockp != NULL) {
1393 			RELEASE_PV_LIST_LOCK(lockp);
1394 			PMAP_UNLOCK(pmap);
1395 			rw_runlock(&pvh_global_lock);
1396 			vm_wait(NULL);
1397 			rw_rlock(&pvh_global_lock);
1398 			PMAP_LOCK(pmap);
1399 		}
1400 
1401 		/*
1402 		 * Indicate the need to retry.  While waiting, the page table
1403 		 * page may have been allocated.
1404 		 */
1405 		return (NULL);
1406 	}
1407 	m->pindex = ptepindex;
1408 
1409 	/*
1410 	 * Map the pagetable page into the process address space, if
1411 	 * it isn't already there.
1412 	 */
1413 	pn = VM_PAGE_TO_PHYS(m) >> PAGE_SHIFT;
1414 	if (ptepindex >= NUL2E + NUL1E) {
1415 		pd_entry_t *l0;
1416 		vm_pindex_t l0index;
1417 
1418 		KASSERT(pmap_mode != PMAP_MODE_SV39,
1419 		    ("%s: pindex %#lx in SV39 mode", __func__, ptepindex));
1420 		KASSERT(ptepindex < NUL2E + NUL1E + NUL0E,
1421 		    ("%s: pindex %#lx out of range", __func__, ptepindex));
1422 
1423 		l0index = ptepindex - (NUL2E + NUL1E);
1424 		l0 = &pmap->pm_top[l0index];
1425 		KASSERT((pmap_load(l0) & PTE_V) == 0,
1426 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0)));
1427 
1428 		entry = PTE_V | (pn << PTE_PPN0_S);
1429 		pmap_store(l0, entry);
1430 	} else if (ptepindex >= NUL2E) {
1431 		pd_entry_t *l0, *l1;
1432 		vm_pindex_t l0index, l1index;
1433 
1434 		l1index = ptepindex - NUL2E;
1435 		if (pmap_mode == PMAP_MODE_SV39) {
1436 			l1 = &pmap->pm_top[l1index];
1437 		} else {
1438 			l0index = l1index >> Ln_ENTRIES_SHIFT;
1439 			l0 = &pmap->pm_top[l0index];
1440 			if (pmap_load(l0) == 0) {
1441 				/* Recurse to allocate the L1 page. */
1442 				if (_pmap_alloc_l3(pmap,
1443 				    NUL2E + NUL1E + l0index, lockp) == NULL)
1444 					goto fail;
1445 				phys = PTE_TO_PHYS(pmap_load(l0));
1446 			} else {
1447 				phys = PTE_TO_PHYS(pmap_load(l0));
1448 				pdpg = PHYS_TO_VM_PAGE(phys);
1449 				pdpg->ref_count++;
1450 			}
1451 			l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1452 			l1 = &l1[ptepindex & Ln_ADDR_MASK];
1453 		}
1454 		KASSERT((pmap_load(l1) & PTE_V) == 0,
1455 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
1456 
1457 		entry = PTE_V | (pn << PTE_PPN0_S);
1458 		pmap_store(l1, entry);
1459 		pmap_distribute_l1(pmap, l1index, entry);
1460 	} else {
1461 		vm_pindex_t l0index, l1index;
1462 		pd_entry_t *l0, *l1, *l2;
1463 
1464 		l1index = ptepindex >> (L1_SHIFT - L2_SHIFT);
1465 		if (pmap_mode == PMAP_MODE_SV39) {
1466 			l1 = &pmap->pm_top[l1index];
1467 			if (pmap_load(l1) == 0) {
1468 				/* recurse for allocating page dir */
1469 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1470 				    lockp) == NULL)
1471 					goto fail;
1472 			} else {
1473 				phys = PTE_TO_PHYS(pmap_load(l1));
1474 				pdpg = PHYS_TO_VM_PAGE(phys);
1475 				pdpg->ref_count++;
1476 			}
1477 		} else {
1478 			l0index = l1index >> Ln_ENTRIES_SHIFT;
1479 			l0 = &pmap->pm_top[l0index];
1480 			if (pmap_load(l0) == 0) {
1481 				/* Recurse to allocate the L1 entry. */
1482 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1483 				    lockp) == NULL)
1484 					goto fail;
1485 				phys = PTE_TO_PHYS(pmap_load(l0));
1486 				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1487 				l1 = &l1[l1index & Ln_ADDR_MASK];
1488 			} else {
1489 				phys = PTE_TO_PHYS(pmap_load(l0));
1490 				l1 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1491 				l1 = &l1[l1index & Ln_ADDR_MASK];
1492 				if (pmap_load(l1) == 0) {
1493 					/* Recurse to allocate the L2 page. */
1494 					if (_pmap_alloc_l3(pmap,
1495 					    NUL2E + l1index, lockp) == NULL)
1496 						goto fail;
1497 				} else {
1498 					phys = PTE_TO_PHYS(pmap_load(l1));
1499 					pdpg = PHYS_TO_VM_PAGE(phys);
1500 					pdpg->ref_count++;
1501 				}
1502 			}
1503 		}
1504 
1505 		phys = PTE_TO_PHYS(pmap_load(l1));
1506 		l2 = (pd_entry_t *)PHYS_TO_DMAP(phys);
1507 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
1508 		KASSERT((pmap_load(l2) & PTE_V) == 0,
1509 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
1510 
1511 		entry = PTE_V | (pn << PTE_PPN0_S);
1512 		pmap_store(l2, entry);
1513 	}
1514 
1515 	pmap_resident_count_inc(pmap, 1);
1516 
1517 	return (m);
1518 
1519 fail:
1520 	vm_page_unwire_noq(m);
1521 	vm_page_free_zero(m);
1522 	return (NULL);
1523 }
1524 
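/*
 * Return the page-table page that holds the L2 entries for va, allocating
 * it (and any missing upper-level pages) via _pmap_alloc_l3() if it does
 * not yet exist.  Returns NULL only if the allocation fails and no PV list
 * lock pointer was supplied.
 */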
1525 static vm_page_t
1526 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1527 {
1528 	pd_entry_t *l1;
1529 	vm_page_t l2pg;
1530 	vm_pindex_t l2pindex;
1531 
1532 retry:
1533 	l1 = pmap_l1(pmap, va);
1534 	if (l1 != NULL && (pmap_load(l1) & PTE_V) != 0) {
1535 		KASSERT((pmap_load(l1) & PTE_RWX) == 0,
1536 		    ("%s: L1 entry %#lx for VA %#lx is a leaf", __func__,
1537 		    pmap_load(l1), va));
1538 		/* Add a reference to the L2 page. */
1539 		l2pg = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l1)));
1540 		l2pg->ref_count++;
1541 	} else {
1542 		/* Allocate a L2 page. */
1543 		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
1544 		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
1545 		if (l2pg == NULL && lockp != NULL)
1546 			goto retry;
1547 	}
1548 	return (l2pg);
1549 }
1550 
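/*
 * Return the page-table page that will hold the L3 entry for va,
 * allocating it (and any missing upper-level pages) if necessary.
 */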
1551 static vm_page_t
1552 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
1553 {
1554 	vm_pindex_t ptepindex;
1555 	pd_entry_t *l2;
1556 	vm_paddr_t phys;
1557 	vm_page_t m;
1558 
1559 	/*
1560 	 * Calculate pagetable page index
1561 	 */
1562 	ptepindex = pmap_l2_pindex(va);
1563 retry:
1564 	/*
1565 	 * Get the page directory entry
1566 	 */
1567 	l2 = pmap_l2(pmap, va);
1568 
1569 	/*
1570 	 * If the page table page is mapped, we just increment the
1571 	 * hold count, and activate it.
1572 	 */
1573 	if (l2 != NULL && pmap_load(l2) != 0) {
1574 		phys = PTE_TO_PHYS(pmap_load(l2));
1575 		m = PHYS_TO_VM_PAGE(phys);
1576 		m->ref_count++;
1577 	} else {
1578 		/*
1579 		 * We get here if the PTE page isn't mapped, or if it has
1580 		 * been deallocated.
1581 		 */
1582 		m = _pmap_alloc_l3(pmap, ptepindex, lockp);
1583 		if (m == NULL && lockp != NULL)
1584 			goto retry;
1585 	}
1586 	return (m);
1587 }
1588 
1589 /***************************************************
1590  * Pmap allocation/deallocation routines.
1591  ***************************************************/
1592 
1593 /*
1594  * Release any resources held by the given physical map.
1595  * Called when a pmap initialized by pmap_pinit is being released.
1596  * Should only be called if the map contains no valid mappings.
1597  */
1598 void
1599 pmap_release(pmap_t pmap)
1600 {
1601 	vm_page_t m;
1602 
1603 	KASSERT(pmap->pm_stats.resident_count == 0,
1604 	    ("pmap_release: pmap resident count %ld != 0",
1605 	    pmap->pm_stats.resident_count));
1606 	KASSERT(CPU_EMPTY(&pmap->pm_active),
1607 	    ("releasing active pmap %p", pmap));
1608 
1609 	if (pmap_mode == PMAP_MODE_SV39) {
1610 		mtx_lock(&allpmaps_lock);
1611 		LIST_REMOVE(pmap, pm_list);
1612 		mtx_unlock(&allpmaps_lock);
1613 	}
1614 
1615 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pmap->pm_top));
1616 	vm_page_unwire_noq(m);
1617 	vm_page_free(m);
1618 }
1619 
1620 static int
1621 kvm_size(SYSCTL_HANDLER_ARGS)
1622 {
1623 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
1624 
1625 	return sysctl_handle_long(oidp, &ksize, 0, req);
1626 }
1627 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1628     0, 0, kvm_size, "LU",
1629     "Size of KVM");
1630 
1631 static int
1632 kvm_free(SYSCTL_HANDLER_ARGS)
1633 {
1634 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
1635 
1636 	return sysctl_handle_long(oidp, &kfree, 0, req);
1637 }
1638 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
1639     0, 0, kvm_free, "LU",
1640     "Amount of KVM free");
1641 
1642 /*
1643  * grow the number of kernel page table entries, if needed
1644  */
1645 void
1646 pmap_growkernel(vm_offset_t addr)
1647 {
1648 	vm_paddr_t paddr;
1649 	vm_page_t nkpg;
1650 	pd_entry_t *l1, *l2;
1651 	pt_entry_t entry;
1652 	pn_t pn;
1653 
1654 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
1655 
1656 	addr = roundup2(addr, L2_SIZE);
1657 	if (addr - 1 >= vm_map_max(kernel_map))
1658 		addr = vm_map_max(kernel_map);
1659 	while (kernel_vm_end < addr) {
1660 		l1 = pmap_l1(kernel_pmap, kernel_vm_end);
1661 		if (pmap_load(l1) == 0) {
1662 			/* We need a new PDP entry */
1663 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
1664 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
1665 			if (nkpg == NULL)
1666 				panic("pmap_growkernel: no memory to grow kernel");
1667 			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
1668 			paddr = VM_PAGE_TO_PHYS(nkpg);
1669 
1670 			pn = (paddr / PAGE_SIZE);
1671 			entry = (PTE_V);
1672 			entry |= (pn << PTE_PPN0_S);
1673 			pmap_store(l1, entry);
1674 			pmap_distribute_l1(kernel_pmap,
1675 			    pmap_l1_index(kernel_vm_end), entry);
1676 			continue; /* try again */
1677 		}
1678 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
1679 		if ((pmap_load(l2) & PTE_V) != 0 &&
1680 		    (pmap_load(l2) & PTE_RWX) == 0) {
1681 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1682 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1683 				kernel_vm_end = vm_map_max(kernel_map);
1684 				break;
1685 			}
1686 			continue;
1687 		}
1688 
1689 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
1690 		    VM_ALLOC_ZERO);
1691 		if (nkpg == NULL)
1692 			panic("pmap_growkernel: no memory to grow kernel");
1693 		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
1694 		paddr = VM_PAGE_TO_PHYS(nkpg);
1695 
1696 		pn = (paddr / PAGE_SIZE);
1697 		entry = (PTE_V);
1698 		entry |= (pn << PTE_PPN0_S);
1699 		pmap_store(l2, entry);
1700 
1701 		pmap_invalidate_page(kernel_pmap, kernel_vm_end);
1702 
1703 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
1704 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
1705 			kernel_vm_end = vm_map_max(kernel_map);
1706 			break;
1707 		}
1708 	}
1709 }
1710 
1711 /***************************************************
1712  * page management routines.
1713  ***************************************************/
1714 
1715 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
1716 CTASSERT(_NPCM == 3);
1717 CTASSERT(_NPCPV == 168);
1718 
1719 static __inline struct pv_chunk *
1720 pv_to_chunk(pv_entry_t pv)
1721 {
1722 
1723 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
1724 }
1725 
1726 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
1727 
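/*
 * Each pv_chunk is a page-sized block holding _NPCPV (168) pv entries,
 * tracked by a free bitmap of _NPCM (3) 64-bit words in which a set bit
 * marks a free slot.  Since 168 == 64 + 64 + 40, the third word of a fully
 * free chunk has only its low 40 bits set, which is what PC_FREE2 encodes.
 */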
1728 #define	PC_FREE0	0xfffffffffffffffful
1729 #define	PC_FREE1	0xfffffffffffffffful
1730 #define	PC_FREE2	((1ul << (_NPCPV % 64)) - 1)
1731 
1732 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
1733 
1734 #if 0
1735 #ifdef PV_STATS
1736 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
1737 
1738 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
1739 	"Current number of pv entry chunks");
1740 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
1741 	"Current number of pv entry chunks allocated");
1742 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
1743 	"Current number of pv entry chunks frees");
1744 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
1745 	"Number of times tried to get a chunk page but failed.");
1746 
1747 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
1748 static int pv_entry_spare;
1749 
1750 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
1751 	"Current number of pv entry frees");
1752 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
1753 	"Current number of pv entry allocs");
1754 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
1755 	"Current number of pv entries");
1756 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
1757 	"Current number of spare pv entries");
1758 #endif
1759 #endif /* 0 */
1760 
1761 /*
1762  * We are in a serious low memory condition.  Resort to
1763  * drastic measures to free some pages so we can allocate
1764  * another pv entry chunk.
1765  *
1766  * Returns NULL if PV entries were reclaimed from the specified pmap.
1767  *
1768  * We do not, however, unmap 2mpages because subsequent accesses will
1769  * allocate per-page pv entries until repromotion occurs, thereby
1770  * exacerbating the shortage of free pv entries.
1771  */
1772 static vm_page_t
1773 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
1774 {
1775 
1776 	panic("RISCVTODO: reclaim_pv_chunk");
1777 }
1778 
1779 /*
1780  * free the pv_entry back to the free list
1781  */
1782 static void
1783 free_pv_entry(pmap_t pmap, pv_entry_t pv)
1784 {
1785 	struct pv_chunk *pc;
1786 	int idx, field, bit;
1787 
1788 	rw_assert(&pvh_global_lock, RA_LOCKED);
1789 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1790 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
1791 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
1792 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
1793 	pc = pv_to_chunk(pv);
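	/*
	 * Locate this entry's slot within its chunk and set the
	 * corresponding bit in the free bitmap; get_pv_entry() performs the
	 * inverse lookup via &pc->pc_pventry[field * 64 + bit].
	 */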
1794 	idx = pv - &pc->pc_pventry[0];
1795 	field = idx / 64;
1796 	bit = idx % 64;
1797 	pc->pc_map[field] |= 1ul << bit;
1798 	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
1799 	    pc->pc_map[2] != PC_FREE2) {
1800 		/* 98% of the time, pc is already at the head of the list. */
1801 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
1802 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1803 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1804 		}
1805 		return;
1806 	}
1807 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1808 	free_pv_chunk(pc);
1809 }
1810 
1811 static void
1812 free_pv_chunk(struct pv_chunk *pc)
1813 {
1814 	vm_page_t m;
1815 
1816 	mtx_lock(&pv_chunks_mutex);
1817 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
1818 	mtx_unlock(&pv_chunks_mutex);
1819 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
1820 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
1821 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
1822 	/* entire chunk is free, return it */
1823 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
1824 	dump_drop_page(m->phys_addr);
1825 	vm_page_unwire_noq(m);
1826 	vm_page_free(m);
1827 }
1828 
1829 /*
1830  * Returns a new PV entry, allocating a new PV chunk from the system when
1831  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
1832  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
1833  * returned.
1834  *
1835  * The given PV list lock may be released.
1836  */
1837 static pv_entry_t
1838 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
1839 {
1840 	int bit, field;
1841 	pv_entry_t pv;
1842 	struct pv_chunk *pc;
1843 	vm_page_t m;
1844 
1845 	rw_assert(&pvh_global_lock, RA_LOCKED);
1846 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1847 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
1848 retry:
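	/*
	 * Look for a free entry in the chunk at the head of this pmap's
	 * chunk list; a set bit in pc_map[] identifies a free slot.
	 */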
1849 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
1850 	if (pc != NULL) {
1851 		for (field = 0; field < _NPCM; field++) {
1852 			if (pc->pc_map[field]) {
1853 				bit = ffsl(pc->pc_map[field]) - 1;
1854 				break;
1855 			}
1856 		}
1857 		if (field < _NPCM) {
1858 			pv = &pc->pc_pventry[field * 64 + bit];
1859 			pc->pc_map[field] &= ~(1ul << bit);
1860 			/* The chunk is now full; move it to the list's tail. */
1861 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
1862 			    pc->pc_map[2] == 0) {
1863 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
1864 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
1865 				    pc_list);
1866 			}
1867 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
1868 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
1869 			return (pv);
1870 		}
1871 	}
1872 	/* No free items, allocate another chunk */
1873 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
1874 	if (m == NULL) {
1875 		if (lockp == NULL) {
1876 			PV_STAT(pc_chunk_tryfail++);
1877 			return (NULL);
1878 		}
1879 		m = reclaim_pv_chunk(pmap, lockp);
1880 		if (m == NULL)
1881 			goto retry;
1882 	}
1883 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
1884 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
1885 	dump_add_page(m->phys_addr);
1886 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1887 	pc->pc_pmap = pmap;
1888 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
1889 	pc->pc_map[1] = PC_FREE1;
1890 	pc->pc_map[2] = PC_FREE2;
1891 	mtx_lock(&pv_chunks_mutex);
1892 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
1893 	mtx_unlock(&pv_chunks_mutex);
1894 	pv = &pc->pc_pventry[0];
1895 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1896 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
1897 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
1898 	return (pv);
1899 }
1900 
1901 /*
1902  * Ensure that the number of spare PV entries in the specified pmap meets or
1903  * exceeds the given count, "needed".
1904  *
1905  * The given PV list lock may be released.
1906  */
1907 static void
1908 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
1909 {
1910 	struct pch new_tail;
1911 	struct pv_chunk *pc;
1912 	vm_page_t m;
1913 	int avail, free;
1914 	bool reclaimed;
1915 
1916 	rw_assert(&pvh_global_lock, RA_LOCKED);
1917 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1918 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
1919 
1920 	/*
1921 	 * Newly allocated PV chunks must be stored in a private list until
1922 	 * the required number of PV chunks have been allocated.  Otherwise,
1923 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
1924 	 * contrast, these chunks must be added to the pmap upon allocation.
1925 	 */
1926 	TAILQ_INIT(&new_tail);
1927 retry:
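	/*
	 * Count the PV entries that are already free in this pmap's chunks;
	 * bit_count() sums the set bits in each chunk's free bitmap.
	 */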
1928 	avail = 0;
1929 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
1930 		bit_count((bitstr_t *)pc->pc_map, 0,
1931 		    sizeof(pc->pc_map) * NBBY, &free);
1932 		if (free == 0)
1933 			break;
1934 		avail += free;
1935 		if (avail >= needed)
1936 			break;
1937 	}
1938 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
1939 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
1940 		if (m == NULL) {
1941 			m = reclaim_pv_chunk(pmap, lockp);
1942 			if (m == NULL)
1943 				goto retry;
1944 			reclaimed = true;
1945 		}
1946 		/* XXX PV STATS */
1947 #if 0
1948 		dump_add_page(m->phys_addr);
1949 #endif
1950 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
1951 		pc->pc_pmap = pmap;
1952 		pc->pc_map[0] = PC_FREE0;
1953 		pc->pc_map[1] = PC_FREE1;
1954 		pc->pc_map[2] = PC_FREE2;
1955 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
1956 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
1957 
1958 		/*
1959 		 * The reclaim might have freed a chunk from the current pmap.
1960 		 * If that chunk contained available entries, we need to
1961 		 * re-count the number of available entries.
1962 		 */
1963 		if (reclaimed)
1964 			goto retry;
1965 	}
1966 	if (!TAILQ_EMPTY(&new_tail)) {
1967 		mtx_lock(&pv_chunks_mutex);
1968 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
1969 		mtx_unlock(&pv_chunks_mutex);
1970 	}
1971 }
1972 
1973 /*
1974  * First find and then remove the pv entry for the specified pmap and virtual
1975  * address from the specified pv list.  Returns the pv entry if found and NULL
1976  * otherwise.  This operation can be performed on pv lists for either 4KB or
1977  * 2MB page mappings.
1978  */
1979 static __inline pv_entry_t
1980 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
1981 {
1982 	pv_entry_t pv;
1983 
1984 	rw_assert(&pvh_global_lock, RA_LOCKED);
1985 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
1986 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
1987 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
1988 			pvh->pv_gen++;
1989 			break;
1990 		}
1991 	}
1992 	return (pv);
1993 }
1994 
1995 /*
1996  * First find and then destroy the pv entry for the specified pmap and virtual
1997  * address.  This operation can be performed on pv lists for either 4KB or 2MB
1998  * page mappings.
1999  */
2000 static void
2001 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2002 {
2003 	pv_entry_t pv;
2004 
2005 	pv = pmap_pvh_remove(pvh, pmap, va);
2006 
2007 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found for %#lx", va));
2008 	free_pv_entry(pmap, pv);
2009 }
2010 
2011 /*
2012  * Conditionally create the PV entry for a 4KB page mapping if the required
2013  * memory can be allocated without resorting to reclamation.
2014  */
2015 static boolean_t
2016 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2017     struct rwlock **lockp)
2018 {
2019 	pv_entry_t pv;
2020 
2021 	rw_assert(&pvh_global_lock, RA_LOCKED);
2022 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2023 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2024 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2025 		pv->pv_va = va;
2026 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2027 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2028 		m->md.pv_gen++;
2029 		return (TRUE);
2030 	} else
2031 		return (FALSE);
2032 }
2033 
2034 /*
2035  * After demotion from a 2MB page mapping to 512 4KB page mappings,
2036  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2037  * entries for each of the 4KB page mappings.
2038  */
2039 static void __unused
2040 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2041     struct rwlock **lockp)
2042 {
2043 	struct md_page *pvh;
2044 	struct pv_chunk *pc;
2045 	pv_entry_t pv;
2046 	vm_page_t m;
2047 	vm_offset_t va_last;
2048 	int bit, field;
2049 
2050 	rw_assert(&pvh_global_lock, RA_LOCKED);
2051 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2052 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2053 
2054 	/*
2055 	 * Transfer the 2mpage's pv entry for this mapping to the first
2056 	 * page's pv list.  Once this transfer begins, the pv list lock
2057 	 * must not be released until the last pv entry is reinstantiated.
2058 	 */
2059 	pvh = pa_to_pvh(pa);
2060 	va &= ~L2_OFFSET;
2061 	pv = pmap_pvh_remove(pvh, pmap, va);
2062 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2063 	m = PHYS_TO_VM_PAGE(pa);
2064 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2065 	m->md.pv_gen++;
2066 	/* Instantiate the remaining 511 pv entries. */
2067 	va_last = va + L2_SIZE - PAGE_SIZE;
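	/*
	 * The spare PV entries consumed below were reserved by the caller
	 * (see the reserve_pv_entries() call in pmap_demote_l2_locked()),
	 * so the KASSERT can insist that a spare is always available.
	 */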
2068 	for (;;) {
2069 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2070 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
2071 		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
2072 		for (field = 0; field < _NPCM; field++) {
2073 			while (pc->pc_map[field] != 0) {
2074 				bit = ffsl(pc->pc_map[field]) - 1;
2075 				pc->pc_map[field] &= ~(1ul << bit);
2076 				pv = &pc->pc_pventry[field * 64 + bit];
2077 				va += PAGE_SIZE;
2078 				pv->pv_va = va;
2079 				m++;
2080 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2081 			    ("pmap_pv_demote_l2: page %p is not managed", m));
2082 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2083 				m->md.pv_gen++;
2084 				if (va == va_last)
2085 					goto out;
2086 			}
2087 		}
2088 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2089 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2090 	}
2091 out:
2092 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
2093 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2094 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2095 	}
2096 	/* XXX PV stats */
2097 }
2098 
2099 #if VM_NRESERVLEVEL > 0
2100 static void
2101 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2102     struct rwlock **lockp)
2103 {
2104 	struct md_page *pvh;
2105 	pv_entry_t pv;
2106 	vm_page_t m;
2107 	vm_offset_t va_last;
2108 
2109 	rw_assert(&pvh_global_lock, RA_LOCKED);
2110 	KASSERT((va & L2_OFFSET) == 0,
2111 	    ("pmap_pv_promote_l2: misaligned va %#lx", va));
2112 
2113 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2114 
2115 	m = PHYS_TO_VM_PAGE(pa);
2116 	pv = pmap_pvh_remove(&m->md, pmap, va);
2117 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv for %#lx not found", va));
2118 	pvh = pa_to_pvh(pa);
2119 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2120 	pvh->pv_gen++;
2121 
2122 	va_last = va + L2_SIZE - PAGE_SIZE;
2123 	do {
2124 		m++;
2125 		va += PAGE_SIZE;
2126 		pmap_pvh_free(&m->md, pmap, va);
2127 	} while (va < va_last);
2128 }
2129 #endif /* VM_NRESERVLEVEL > 0 */
2130 
2131 /*
2132  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
2133  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
2134  * false if the PV entry cannot be allocated without resorting to reclamation.
2135  */
2136 static bool
2137 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2138     struct rwlock **lockp)
2139 {
2140 	struct md_page *pvh;
2141 	pv_entry_t pv;
2142 	vm_paddr_t pa;
2143 
2144 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2145 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2146 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2147 	    NULL : lockp)) == NULL)
2148 		return (false);
2149 	pv->pv_va = va;
2150 	pa = PTE_TO_PHYS(l2e);
2151 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2152 	pvh = pa_to_pvh(pa);
2153 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2154 	pvh->pv_gen++;
2155 	return (true);
2156 }
2157 
2158 static void
2159 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
2160 {
2161 	pt_entry_t newl2, oldl2 __diagused;
2162 	vm_page_t ml3;
2163 	vm_paddr_t ml3pa;
2164 
2165 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
2166 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
2167 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2168 
2169 	ml3 = pmap_remove_pt_page(pmap, va);
2170 	if (ml3 == NULL)
2171 		panic("pmap_remove_kernel_l2: Missing pt page");
2172 
2173 	ml3pa = VM_PAGE_TO_PHYS(ml3);
2174 	newl2 = ml3pa | PTE_V;
2175 
2176 	/*
2177 	 * If this page table page was unmapped by a promotion, then it
2178 	 * contains valid mappings.  Zero it to invalidate those mappings.
2179 	 */
2180 	if (ml3->valid != 0)
2181 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
2182 
2183 	/*
2184 	 * Demote the mapping.
2185 	 */
2186 	oldl2 = pmap_load_store(l2, newl2);
2187 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2188 	    __func__, l2, oldl2));
2189 }
2190 
2191 /*
2192  * pmap_remove_l2: Remove a 2MB (level 2) superpage mapping.
2193  */
2194 static int
2195 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2196     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2197 {
2198 	struct md_page *pvh;
2199 	pt_entry_t oldl2;
2200 	vm_offset_t eva, va;
2201 	vm_page_t m, ml3;
2202 
2203 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2204 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2205 	oldl2 = pmap_load_clear(l2);
2206 	KASSERT((oldl2 & PTE_RWX) != 0,
2207 	    ("pmap_remove_l2: L2e %lx is not a superpage mapping", oldl2));
2208 
2209 	/*
2210 	 * The sfence.vma documentation states that it is sufficient to specify
2211 	 * a single address within a superpage mapping.  However, since we do
2212 	 * not perform any invalidation upon promotion, TLBs may still be
2213 	 * caching 4KB mappings within the superpage, so we must invalidate the
2214 	 * entire range.
2215 	 */
2216 	pmap_invalidate_range(pmap, sva, sva + L2_SIZE);
2217 	if ((oldl2 & PTE_SW_WIRED) != 0)
2218 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2219 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2220 	if ((oldl2 & PTE_SW_MANAGED) != 0) {
2221 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, PTE_TO_PHYS(oldl2));
2222 		pvh = pa_to_pvh(PTE_TO_PHYS(oldl2));
2223 		pmap_pvh_free(pvh, pmap, sva);
2224 		eva = sva + L2_SIZE;
2225 		for (va = sva, m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(oldl2));
2226 		    va < eva; va += PAGE_SIZE, m++) {
2227 			if ((oldl2 & PTE_D) != 0)
2228 				vm_page_dirty(m);
2229 			if ((oldl2 & PTE_A) != 0)
2230 				vm_page_aflag_set(m, PGA_REFERENCED);
2231 			if (TAILQ_EMPTY(&m->md.pv_list) &&
2232 			    TAILQ_EMPTY(&pvh->pv_list))
2233 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2234 		}
2235 	}
2236 	if (pmap == kernel_pmap) {
2237 		pmap_remove_kernel_l2(pmap, l2, sva);
2238 	} else {
2239 		ml3 = pmap_remove_pt_page(pmap, sva);
2240 		if (ml3 != NULL) {
2241 			KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
2242 			    ("pmap_remove_l2: l3 page not promoted"));
2243 			pmap_resident_count_dec(pmap, 1);
2244 			KASSERT(ml3->ref_count == Ln_ENTRIES,
2245 			    ("pmap_remove_l2: l3 page ref count error"));
2246 			ml3->ref_count = 1;
2247 			vm_page_unwire_noq(ml3);
2248 			pmap_add_delayed_free_list(ml3, free, FALSE);
2249 		}
2250 	}
2251 	return (pmap_unuse_pt(pmap, sva, l1e, free));
2252 }
2253 
2254 /*
2255  * pmap_remove_l3: Remove a single 4KB page mapping from the given pmap.
2256  */
2257 static int
2258 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2259     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2260 {
2261 	struct md_page *pvh;
2262 	pt_entry_t old_l3;
2263 	vm_paddr_t phys;
2264 	vm_page_t m;
2265 
2266 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2267 	old_l3 = pmap_load_clear(l3);
2268 	pmap_invalidate_page(pmap, va);
2269 	if (old_l3 & PTE_SW_WIRED)
2270 		pmap->pm_stats.wired_count -= 1;
2271 	pmap_resident_count_dec(pmap, 1);
2272 	if (old_l3 & PTE_SW_MANAGED) {
2273 		phys = PTE_TO_PHYS(old_l3);
2274 		m = PHYS_TO_VM_PAGE(phys);
2275 		if ((old_l3 & PTE_D) != 0)
2276 			vm_page_dirty(m);
2277 		if (old_l3 & PTE_A)
2278 			vm_page_aflag_set(m, PGA_REFERENCED);
2279 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2280 		pmap_pvh_free(&m->md, pmap, va);
2281 		if (TAILQ_EMPTY(&m->md.pv_list) &&
2282 		    (m->flags & PG_FICTITIOUS) == 0) {
2283 			pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
2284 			if (TAILQ_EMPTY(&pvh->pv_list))
2285 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2286 		}
2287 	}
2288 
2289 	return (pmap_unuse_pt(pmap, va, l2e, free));
2290 }
2291 
2292 /*
2293  *	Remove the given range of addresses from the specified map.
2294  *
2295  *	It is assumed that the start and end are properly
2296  *	rounded to the page size.
2297  */
2298 void
2299 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
2300 {
2301 	struct spglist free;
2302 	struct rwlock *lock;
2303 	vm_offset_t va, va_next;
2304 	pd_entry_t *l0, *l1, *l2, l2e;
2305 	pt_entry_t *l3;
2306 
2307 	/*
2308 	 * Perform an unsynchronized read.  This is, however, safe.
2309 	 */
2310 	if (pmap->pm_stats.resident_count == 0)
2311 		return;
2312 
2313 	SLIST_INIT(&free);
2314 
2315 	rw_rlock(&pvh_global_lock);
2316 	PMAP_LOCK(pmap);
2317 
2318 	lock = NULL;
2319 	for (; sva < eva; sva = va_next) {
2320 		if (pmap->pm_stats.resident_count == 0)
2321 			break;
2322 
2323 		if (pmap_mode == PMAP_MODE_SV48) {
2324 			l0 = pmap_l0(pmap, sva);
2325 			if (pmap_load(l0) == 0) {
2326 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2327 				if (va_next < sva)
2328 					va_next = eva;
2329 				continue;
2330 			}
2331 			l1 = pmap_l0_to_l1(l0, sva);
2332 		} else {
2333 			l1 = pmap_l1(pmap, sva);
2334 		}
2335 
2336 		if (pmap_load(l1) == 0) {
2337 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2338 			if (va_next < sva)
2339 				va_next = eva;
2340 			continue;
2341 		}
2342 
2343 		/*
2344 		 * Calculate the virtual address of the next 2MB (L2) boundary.
2345 		 */
2346 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2347 		if (va_next < sva)
2348 			va_next = eva;
2349 
2350 		l2 = pmap_l1_to_l2(l1, sva);
2351 		if (l2 == NULL)
2352 			continue;
2353 		if ((l2e = pmap_load(l2)) == 0)
2354 			continue;
2355 		if ((l2e & PTE_RWX) != 0) {
2356 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2357 				(void)pmap_remove_l2(pmap, l2, sva,
2358 				    pmap_load(l1), &free, &lock);
2359 				continue;
2360 			} else if (!pmap_demote_l2_locked(pmap, l2, sva,
2361 			    &lock)) {
2362 				/*
2363 				 * The large page mapping was destroyed.
2364 				 */
2365 				continue;
2366 			}
2367 			l2e = pmap_load(l2);
2368 		}
2369 
2370 		/*
2371 		 * Limit our scan to either the end of the va represented
2372 		 * by the current page table page, or to the end of the
2373 		 * range being removed.
2374 		 */
2375 		if (va_next > eva)
2376 			va_next = eva;
2377 
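		/*
		 * "va" tracks the start of the current run of removed
		 * mappings so that the whole run can be flushed with a
		 * single pmap_invalidate_range() call.
		 */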
2378 		va = va_next;
2379 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2380 		    sva += L3_SIZE) {
2381 			if (pmap_load(l3) == 0) {
2382 				if (va != va_next) {
2383 					pmap_invalidate_range(pmap, va, sva);
2384 					va = va_next;
2385 				}
2386 				continue;
2387 			}
2388 			if (va == va_next)
2389 				va = sva;
2390 			if (pmap_remove_l3(pmap, l3, sva, l2e, &free, &lock)) {
2391 				sva += L3_SIZE;
2392 				break;
2393 			}
2394 		}
2395 		if (va != va_next)
2396 			pmap_invalidate_range(pmap, va, sva);
2397 	}
2398 	if (lock != NULL)
2399 		rw_wunlock(lock);
2400 	rw_runlock(&pvh_global_lock);
2401 	PMAP_UNLOCK(pmap);
2402 	vm_page_free_pages_toq(&free, false);
2403 }
2404 
2405 /*
2406  *	Routine:	pmap_remove_all
2407  *	Function:
2408  *		Removes this physical page from
2409  *		all physical maps in which it resides.
2410  *		Reflects back modify bits to the pager.
2411  *
2412  *	Notes:
2413  *		Original versions of this routine were very
2414  *		inefficient because they iteratively called
2415  *		pmap_remove (slow...)
2416  */
2417 
2418 void
2419 pmap_remove_all(vm_page_t m)
2420 {
2421 	struct spglist free;
2422 	struct md_page *pvh;
2423 	pmap_t pmap;
2424 	pt_entry_t *l3, l3e;
2425 	pd_entry_t *l2, l2e __diagused;
2426 	pv_entry_t pv;
2427 	vm_offset_t va;
2428 
2429 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2430 	    ("pmap_remove_all: page %p is not managed", m));
2431 	SLIST_INIT(&free);
2432 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
2433 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
2434 
2435 	rw_wlock(&pvh_global_lock);
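	/*
	 * First demote any 2MB mappings of the page so that only 4KB
	 * mappings remain on the page's pv list.
	 */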
2436 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
2437 		pmap = PV_PMAP(pv);
2438 		PMAP_LOCK(pmap);
2439 		va = pv->pv_va;
2440 		l2 = pmap_l2(pmap, va);
2441 		(void)pmap_demote_l2(pmap, l2, va);
2442 		PMAP_UNLOCK(pmap);
2443 	}
2444 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
2445 		pmap = PV_PMAP(pv);
2446 		PMAP_LOCK(pmap);
2447 		pmap_resident_count_dec(pmap, 1);
2448 		l2 = pmap_l2(pmap, pv->pv_va);
2449 		KASSERT(l2 != NULL, ("pmap_remove_all: no l2 table found"));
2450 		l2e = pmap_load(l2);
2451 
2452 		KASSERT((l2e & PTE_RX) == 0,
2453 		    ("pmap_remove_all: found a superpage in %p's pv list", m));
2454 
2455 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
2456 		l3e = pmap_load_clear(l3);
2457 		pmap_invalidate_page(pmap, pv->pv_va);
2458 		if (l3e & PTE_SW_WIRED)
2459 			pmap->pm_stats.wired_count--;
2460 		if ((l3e & PTE_A) != 0)
2461 			vm_page_aflag_set(m, PGA_REFERENCED);
2462 
2463 		/*
2464 		 * Update the vm_page_t clean and reference bits.
2465 		 */
2466 		if ((l3e & PTE_D) != 0)
2467 			vm_page_dirty(m);
2468 		pmap_unuse_pt(pmap, pv->pv_va, pmap_load(l2), &free);
2469 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2470 		m->md.pv_gen++;
2471 		free_pv_entry(pmap, pv);
2472 		PMAP_UNLOCK(pmap);
2473 	}
2474 	vm_page_aflag_clear(m, PGA_WRITEABLE);
2475 	rw_wunlock(&pvh_global_lock);
2476 	vm_page_free_pages_toq(&free, false);
2477 }
2478 
2479 /*
2480  *	Set the physical protection on the
2481  *	specified range of this map as requested.
2482  */
2483 void
2484 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
2485 {
2486 	pd_entry_t *l0, *l1, *l2, l2e;
2487 	pt_entry_t *l3, l3e, mask;
2488 	vm_page_t m, mt;
2489 	vm_paddr_t pa;
2490 	vm_offset_t va_next;
2491 	bool anychanged, pv_lists_locked;
2492 
2493 	if ((prot & VM_PROT_READ) == VM_PROT_NONE) {
2494 		pmap_remove(pmap, sva, eva);
2495 		return;
2496 	}
2497 
2498 	if ((prot & (VM_PROT_WRITE | VM_PROT_EXECUTE)) ==
2499 	    (VM_PROT_WRITE | VM_PROT_EXECUTE))
2500 		return;
2501 
2502 	anychanged = false;
2503 	pv_lists_locked = false;
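	/*
	 * Compute the PTE bits to clear.  Revoking write access also clears
	 * PTE_D; the loops below transfer any dirty state to the vm_page
	 * before the bits are removed.
	 */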
2504 	mask = 0;
2505 	if ((prot & VM_PROT_WRITE) == 0)
2506 		mask |= PTE_W | PTE_D;
2507 	if ((prot & VM_PROT_EXECUTE) == 0)
2508 		mask |= PTE_X;
2509 resume:
2510 	PMAP_LOCK(pmap);
2511 	for (; sva < eva; sva = va_next) {
2512 		if (pmap_mode == PMAP_MODE_SV48) {
2513 			l0 = pmap_l0(pmap, sva);
2514 			if (pmap_load(l0) == 0) {
2515 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
2516 				if (va_next < sva)
2517 					va_next = eva;
2518 				continue;
2519 			}
2520 			l1 = pmap_l0_to_l1(l0, sva);
2521 		} else {
2522 			l1 = pmap_l1(pmap, sva);
2523 		}
2524 
2525 		if (pmap_load(l1) == 0) {
2526 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
2527 			if (va_next < sva)
2528 				va_next = eva;
2529 			continue;
2530 		}
2531 
2532 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
2533 		if (va_next < sva)
2534 			va_next = eva;
2535 
2536 		l2 = pmap_l1_to_l2(l1, sva);
2537 		if (l2 == NULL || (l2e = pmap_load(l2)) == 0)
2538 			continue;
2539 		if ((l2e & PTE_RWX) != 0) {
2540 			if (sva + L2_SIZE == va_next && eva >= va_next) {
2541 retryl2:
2542 				if ((prot & VM_PROT_WRITE) == 0 &&
2543 				    (l2e & (PTE_SW_MANAGED | PTE_D)) ==
2544 				    (PTE_SW_MANAGED | PTE_D)) {
2545 					pa = PTE_TO_PHYS(l2e);
2546 					m = PHYS_TO_VM_PAGE(pa);
2547 					for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
2548 						vm_page_dirty(mt);
2549 				}
2550 				if (!atomic_fcmpset_long(l2, &l2e, l2e & ~mask))
2551 					goto retryl2;
2552 				anychanged = true;
2553 				continue;
2554 			} else {
2555 				if (!pv_lists_locked) {
2556 					pv_lists_locked = true;
2557 					if (!rw_try_rlock(&pvh_global_lock)) {
2558 						if (anychanged)
2559 							pmap_invalidate_all(
2560 							    pmap);
2561 						PMAP_UNLOCK(pmap);
2562 						rw_rlock(&pvh_global_lock);
2563 						goto resume;
2564 					}
2565 				}
2566 				if (!pmap_demote_l2(pmap, l2, sva)) {
2567 					/*
2568 					 * The large page mapping was destroyed.
2569 					 */
2570 					continue;
2571 				}
2572 			}
2573 		}
2574 
2575 		if (va_next > eva)
2576 			va_next = eva;
2577 
2578 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
2579 		    sva += L3_SIZE) {
2580 			l3e = pmap_load(l3);
2581 retryl3:
2582 			if ((l3e & PTE_V) == 0)
2583 				continue;
2584 			if ((prot & VM_PROT_WRITE) == 0 &&
2585 			    (l3e & (PTE_SW_MANAGED | PTE_D)) ==
2586 			    (PTE_SW_MANAGED | PTE_D)) {
2587 				m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(l3e));
2588 				vm_page_dirty(m);
2589 			}
2590 			if (!atomic_fcmpset_long(l3, &l3e, l3e & ~mask))
2591 				goto retryl3;
2592 			anychanged = true;
2593 		}
2594 	}
2595 	if (anychanged)
2596 		pmap_invalidate_all(pmap);
2597 	if (pv_lists_locked)
2598 		rw_runlock(&pvh_global_lock);
2599 	PMAP_UNLOCK(pmap);
2600 }
2601 
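/*
 * Resolve a fault caused by a missing PTE_A or PTE_D bit (or by a stale or
 * spurious TLB entry): if the faulting access is permitted by the existing
 * mapping, set PTE_A (and PTE_D for a write) and return 1 to report that
 * the fault was handled; otherwise return 0.
 */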
2602 int
2603 pmap_fault(pmap_t pmap, vm_offset_t va, vm_prot_t ftype)
2604 {
2605 	pd_entry_t *l2, l2e;
2606 	pt_entry_t bits, *pte, oldpte;
2607 	int rv;
2608 
2609 	KASSERT(VIRT_IS_VALID(va), ("pmap_fault: invalid va %#lx", va));
2610 
2611 	rv = 0;
2612 	PMAP_LOCK(pmap);
2613 	l2 = pmap_l2(pmap, va);
2614 	if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
2615 		goto done;
2616 	if ((l2e & PTE_RWX) == 0) {
2617 		pte = pmap_l2_to_l3(l2, va);
2618 		if (pte == NULL || ((oldpte = pmap_load(pte)) & PTE_V) == 0)
2619 			goto done;
2620 	} else {
2621 		pte = l2;
2622 		oldpte = l2e;
2623 	}
2624 
2625 	if ((pmap != kernel_pmap && (oldpte & PTE_U) == 0) ||
2626 	    (ftype == VM_PROT_WRITE && (oldpte & PTE_W) == 0) ||
2627 	    (ftype == VM_PROT_EXECUTE && (oldpte & PTE_X) == 0) ||
2628 	    (ftype == VM_PROT_READ && (oldpte & PTE_R) == 0))
2629 		goto done;
2630 
2631 	bits = PTE_A;
2632 	if (ftype == VM_PROT_WRITE)
2633 		bits |= PTE_D;
2634 
2635 	/*
2636 	 * Spurious faults can occur if the implementation caches invalid
2637 	 * entries in the TLB, or if simultaneous accesses on multiple CPUs
2638 	 * race with each other.
2639 	 */
2640 	if ((oldpte & bits) != bits)
2641 		pmap_store_bits(pte, bits);
2642 	sfence_vma();
2643 	rv = 1;
2644 done:
2645 	PMAP_UNLOCK(pmap);
2646 	return (rv);
2647 }
2648 
2649 static bool
2650 pmap_demote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va)
2651 {
2652 	struct rwlock *lock;
2653 	bool rv;
2654 
2655 	lock = NULL;
2656 	rv = pmap_demote_l2_locked(pmap, l2, va, &lock);
2657 	if (lock != NULL)
2658 		rw_wunlock(lock);
2659 	return (rv);
2660 }
2661 
2662 /*
2663  * Tries to demote a 2MB page mapping.  If demotion fails, the 2MB page
2664  * mapping is invalidated.
2665  */
2666 static bool
2667 pmap_demote_l2_locked(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
2668     struct rwlock **lockp)
2669 {
2670 	struct spglist free;
2671 	vm_page_t mpte;
2672 	pd_entry_t newl2, oldl2;
2673 	pt_entry_t *firstl3, newl3;
2674 	vm_paddr_t mptepa;
2675 	int i;
2676 
2677 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2678 
2679 	oldl2 = pmap_load(l2);
2680 	KASSERT((oldl2 & PTE_RWX) != 0,
2681 	    ("pmap_demote_l2_locked: oldl2 is not a leaf entry"));
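	/*
	 * If the old L2 entry was never accessed, do not bother demoting it;
	 * the mapping is simply removed below.  Otherwise reuse the page
	 * table page saved by an earlier promotion, or allocate a new one.
	 */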
2682 	if ((oldl2 & PTE_A) == 0 || (mpte = pmap_remove_pt_page(pmap, va)) ==
2683 	    NULL) {
2684 		if ((oldl2 & PTE_A) == 0 || (mpte = vm_page_alloc_noobj(
2685 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
2686 		    VM_ALLOC_WIRED)) == NULL) {
2687 			SLIST_INIT(&free);
2688 			(void)pmap_remove_l2(pmap, l2, va & ~L2_OFFSET,
2689 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
2690 			vm_page_free_pages_toq(&free, true);
2691 			CTR2(KTR_PMAP, "pmap_demote_l2_locked: "
2692 			    "failure for va %#lx in pmap %p", va, pmap);
2693 			return (false);
2694 		}
2695 		mpte->pindex = pmap_l2_pindex(va);
2696 		if (va < VM_MAXUSER_ADDRESS) {
2697 			mpte->ref_count = Ln_ENTRIES;
2698 			pmap_resident_count_inc(pmap, 1);
2699 		}
2700 	}
2701 	mptepa = VM_PAGE_TO_PHYS(mpte);
2702 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(mptepa);
2703 	newl2 = ((mptepa / PAGE_SIZE) << PTE_PPN0_S) | PTE_V;
2704 	KASSERT((oldl2 & PTE_A) != 0,
2705 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_A"));
2706 	KASSERT((oldl2 & (PTE_D | PTE_W)) != PTE_W,
2707 	    ("pmap_demote_l2_locked: oldl2 is missing PTE_D"));
2708 	newl3 = oldl2;
2709 
2710 	/*
2711 	 * If the page table page is not leftover from an earlier promotion,
2712 	 * initialize it.
2713 	 */
2714 	if (mpte->valid == 0) {
2715 		for (i = 0; i < Ln_ENTRIES; i++)
2716 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
2717 	}
2718 	KASSERT(PTE_TO_PHYS(pmap_load(firstl3)) == PTE_TO_PHYS(newl3),
2719 	    ("pmap_demote_l2_locked: firstl3 and newl3 map different physical "
2720 	    "addresses"));
2721 
2722 	/*
2723 	 * If the mapping has changed attributes, update the page table
2724 	 * entries.
2725 	 */
2726 	if ((pmap_load(firstl3) & PTE_PROMOTE) != (newl3 & PTE_PROMOTE))
2727 		for (i = 0; i < Ln_ENTRIES; i++)
2728 			pmap_store(firstl3 + i, newl3 + (i << PTE_PPN0_S));
2729 
2730 	/*
2731 	 * The spare PV entries must be reserved prior to demoting the
2732 	 * mapping, that is, prior to changing the L2 entry.  Otherwise, the
2733 	 * state of the L2 entry and the PV lists will be inconsistent, which
2734 	 * can result in reclaim_pv_chunk() attempting to remove a PV entry from
2735 	 * the wrong PV list and pmap_pv_demote_l2() failing to find the
2736 	 * expected PV entry for the 2MB page mapping that is being demoted.
2737 	 */
2738 	if ((oldl2 & PTE_SW_MANAGED) != 0)
2739 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
2740 
2741 	/*
2742 	 * Demote the mapping.
2743 	 */
2744 	pmap_store(l2, newl2);
2745 
2746 	/*
2747 	 * Demote the PV entry.
2748 	 */
2749 	if ((oldl2 & PTE_SW_MANAGED) != 0)
2750 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
2751 
2752 	atomic_add_long(&pmap_l2_demotions, 1);
2753 	CTR2(KTR_PMAP, "pmap_demote_l2_locked: success for va %#lx in pmap %p",
2754 	    va, pmap);
2755 	return (true);
2756 }
2757 
2758 #if VM_NRESERVLEVEL > 0
2759 static void
2760 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
2761     struct rwlock **lockp)
2762 {
2763 	pt_entry_t *firstl3, firstl3e, *l3, l3e;
2764 	vm_paddr_t pa;
2765 	vm_page_t ml3;
2766 
2767 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2768 
2769 	va &= ~L2_OFFSET;
2770 	KASSERT((pmap_load(l2) & PTE_RWX) == 0,
2771 	    ("pmap_promote_l2: invalid l2 entry %p", l2));
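	/*
	 * Promotion requires that all 512 L3 entries map a physically
	 * contiguous, 2MB-aligned range with identical attributes; clean,
	 * writable entries are first downgraded to read-only, as explained
	 * below.
	 */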
2772 
2773 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
2774 	firstl3e = pmap_load(firstl3);
2775 	pa = PTE_TO_PHYS(firstl3e);
2776 	if ((pa & L2_OFFSET) != 0) {
2777 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
2778 		    va, pmap);
2779 		atomic_add_long(&pmap_l2_p_failures, 1);
2780 		return;
2781 	}
2782 
2783 	/*
2784 	 * Downgrade a clean, writable mapping to read-only to ensure that the
2785 	 * hardware does not set PTE_D while we are comparing PTEs.
2786 	 *
2787 	 * Upon a write access to a clean mapping, the implementation will
2788 	 * either atomically check protections and set PTE_D, or raise a page
2789 	 * fault.  In the latter case, the pmap lock provides atomicity.  Thus,
2790 	 * we do not issue an sfence.vma here and instead rely on pmap_fault()
2791 	 * to do so lazily.
2792 	 */
2793 	while ((firstl3e & (PTE_W | PTE_D)) == PTE_W) {
2794 		if (atomic_fcmpset_64(firstl3, &firstl3e, firstl3e & ~PTE_W)) {
2795 			firstl3e &= ~PTE_W;
2796 			break;
2797 		}
2798 	}
2799 
2800 	pa += PAGE_SIZE;
2801 	for (l3 = firstl3 + 1; l3 < firstl3 + Ln_ENTRIES; l3++) {
2802 		l3e = pmap_load(l3);
2803 		if (PTE_TO_PHYS(l3e) != pa) {
2804 			CTR2(KTR_PMAP,
2805 			    "pmap_promote_l2: failure for va %#lx pmap %p",
2806 			    va, pmap);
2807 			atomic_add_long(&pmap_l2_p_failures, 1);
2808 			return;
2809 		}
2810 		while ((l3e & (PTE_W | PTE_D)) == PTE_W) {
2811 			if (atomic_fcmpset_64(l3, &l3e, l3e & ~PTE_W)) {
2812 				l3e &= ~PTE_W;
2813 				break;
2814 			}
2815 		}
2816 		if ((l3e & PTE_PROMOTE) != (firstl3e & PTE_PROMOTE)) {
2817 			CTR2(KTR_PMAP,
2818 			    "pmap_promote_l2: failure for va %#lx pmap %p",
2819 			    va, pmap);
2820 			atomic_add_long(&pmap_l2_p_failures, 1);
2821 			return;
2822 		}
2823 		pa += PAGE_SIZE;
2824 	}
2825 
2826 	ml3 = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
2827 	KASSERT(ml3->pindex == pmap_l2_pindex(va),
2828 	    ("pmap_promote_l2: page table page's pindex is wrong"));
2829 	if (pmap_insert_pt_page(pmap, ml3, true)) {
2830 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx pmap %p",
2831 		    va, pmap);
2832 		atomic_add_long(&pmap_l2_p_failures, 1);
2833 		return;
2834 	}
2835 
2836 	if ((firstl3e & PTE_SW_MANAGED) != 0)
2837 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(firstl3e), lockp);
2838 
2839 	pmap_store(l2, firstl3e);
2840 
2841 	atomic_add_long(&pmap_l2_promotions, 1);
2842 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
2843 	    pmap);
2844 }
2845 #endif
2846 
2847 /*
2848  *	Insert the given physical page (p) at
2849  *	the specified virtual address (v) in the
2850  *	target physical map with the protection requested.
2851  *
2852  *	If specified, the page will be wired down, meaning
2853  *	that the related pte cannot be reclaimed.
2854  *
2855  *	NB:  This is the only routine which MAY NOT lazy-evaluate
2856  *	or lose information.  That is, this routine must actually
2857  *	insert this page into the given map NOW.
2858  */
2859 int
2860 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
2861     u_int flags, int8_t psind)
2862 {
2863 	struct rwlock *lock;
2864 	pd_entry_t *l1, *l2, l2e;
2865 	pt_entry_t new_l3, orig_l3;
2866 	pt_entry_t *l3;
2867 	pv_entry_t pv;
2868 	vm_paddr_t opa, pa, l2_pa, l3_pa;
2869 	vm_page_t mpte, om, l2_m, l3_m;
2870 	pt_entry_t entry;
2871 	pn_t l2_pn, l3_pn, pn;
2872 	int rv;
2873 	bool nosleep;
2874 
2875 	va = trunc_page(va);
2876 	if ((m->oflags & VPO_UNMANAGED) == 0)
2877 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
2878 	pa = VM_PAGE_TO_PHYS(m);
2879 	pn = (pa / PAGE_SIZE);
2880 
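	/*
	 * Construct the new L3 entry: permission bits derived from "prot"
	 * and "flags", with the physical page number in the PPN field.
	 */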
2881 	new_l3 = PTE_V | PTE_R | PTE_A;
2882 	if (prot & VM_PROT_EXECUTE)
2883 		new_l3 |= PTE_X;
2884 	if (flags & VM_PROT_WRITE)
2885 		new_l3 |= PTE_D;
2886 	if (prot & VM_PROT_WRITE)
2887 		new_l3 |= PTE_W;
2888 	if (va < VM_MAX_USER_ADDRESS)
2889 		new_l3 |= PTE_U;
2890 
2891 	new_l3 |= (pn << PTE_PPN0_S);
2892 	if ((flags & PMAP_ENTER_WIRED) != 0)
2893 		new_l3 |= PTE_SW_WIRED;
2894 
2895 	/*
2896 	 * Set modified bit gratuitously for writeable mappings if
2897 	 * the page is unmanaged. We do not want to take a fault
2898 	 * to do the dirty bit accounting for these mappings.
2899 	 */
2900 	if ((m->oflags & VPO_UNMANAGED) != 0) {
2901 		if (prot & VM_PROT_WRITE)
2902 			new_l3 |= PTE_D;
2903 	} else
2904 		new_l3 |= PTE_SW_MANAGED;
2905 
2906 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
2907 
2908 	lock = NULL;
2909 	mpte = NULL;
2910 	rw_rlock(&pvh_global_lock);
2911 	PMAP_LOCK(pmap);
2912 	if (psind == 1) {
2913 		/* Assert the required virtual and physical alignment. */
2914 		KASSERT((va & L2_OFFSET) == 0,
2915 		    ("pmap_enter: va %#lx unaligned", va));
2916 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
2917 		rv = pmap_enter_l2(pmap, va, new_l3, flags, m, &lock);
2918 		goto out;
2919 	}
2920 
2921 	l2 = pmap_l2(pmap, va);
2922 	if (l2 != NULL && ((l2e = pmap_load(l2)) & PTE_V) != 0 &&
2923 	    ((l2e & PTE_RWX) == 0 || pmap_demote_l2_locked(pmap, l2,
2924 	    va, &lock))) {
2925 		l3 = pmap_l2_to_l3(l2, va);
2926 		if (va < VM_MAXUSER_ADDRESS) {
2927 			mpte = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
2928 			mpte->ref_count++;
2929 		}
2930 	} else if (va < VM_MAXUSER_ADDRESS) {
2931 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
2932 		mpte = pmap_alloc_l3(pmap, va, nosleep ? NULL : &lock);
2933 		if (mpte == NULL && nosleep) {
2934 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
2935 			if (lock != NULL)
2936 				rw_wunlock(lock);
2937 			rw_runlock(&pvh_global_lock);
2938 			PMAP_UNLOCK(pmap);
2939 			return (KERN_RESOURCE_SHORTAGE);
2940 		}
2941 		l3 = pmap_l3(pmap, va);
2942 	} else {
2943 		l3 = pmap_l3(pmap, va);
2944 		/* TODO: This is not optimal, but should mostly work */
2945 		if (l3 == NULL) {
2946 			if (l2 == NULL) {
2947 				l2_m = vm_page_alloc_noobj(VM_ALLOC_WIRED |
2948 				    VM_ALLOC_ZERO);
2949 				if (l2_m == NULL)
2950 					panic("pmap_enter: l2 pte_m == NULL");
2951 
2952 				l2_pa = VM_PAGE_TO_PHYS(l2_m);
2953 				l2_pn = (l2_pa / PAGE_SIZE);
2954 
2955 				l1 = pmap_l1(pmap, va);
2956 				entry = (PTE_V);
2957 				entry |= (l2_pn << PTE_PPN0_S);
2958 				pmap_store(l1, entry);
2959 				pmap_distribute_l1(pmap, pmap_l1_index(va), entry);
2960 				l2 = pmap_l1_to_l2(l1, va);
2961 			}
2962 
2963 			l3_m = vm_page_alloc_noobj(VM_ALLOC_WIRED |
2964 			    VM_ALLOC_ZERO);
2965 			if (l3_m == NULL)
2966 				panic("pmap_enter: l3 pte_m == NULL");
2967 
2968 			l3_pa = VM_PAGE_TO_PHYS(l3_m);
2969 			l3_pn = (l3_pa / PAGE_SIZE);
2970 			entry = (PTE_V);
2971 			entry |= (l3_pn << PTE_PPN0_S);
2972 			pmap_store(l2, entry);
2973 			l3 = pmap_l2_to_l3(l2, va);
2974 		}
2975 		pmap_invalidate_page(pmap, va);
2976 	}
2977 
2978 	orig_l3 = pmap_load(l3);
2979 	opa = PTE_TO_PHYS(orig_l3);
2980 	pv = NULL;
2981 
2982 	/*
2983 	 * Is the specified virtual address already mapped?
2984 	 */
2985 	if ((orig_l3 & PTE_V) != 0) {
2986 		/*
2987 		 * Wiring change, just update stats. We don't worry about
2988 		 * wiring PT pages as they remain resident as long as there
2989 		 * are valid mappings in them. Hence, if a user page is wired,
2990 		 * the PT page will be also.
2991 		 */
2992 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
2993 		    (orig_l3 & PTE_SW_WIRED) == 0)
2994 			pmap->pm_stats.wired_count++;
2995 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
2996 		    (orig_l3 & PTE_SW_WIRED) != 0)
2997 			pmap->pm_stats.wired_count--;
2998 
2999 		/*
3000 		 * Remove the extra PT page reference.
3001 		 */
3002 		if (mpte != NULL) {
3003 			mpte->ref_count--;
3004 			KASSERT(mpte->ref_count > 0,
3005 			    ("pmap_enter: missing reference to page table page,"
3006 			     " va: 0x%lx", va));
3007 		}
3008 
3009 		/*
3010 		 * Has the physical page changed?
3011 		 */
3012 		if (opa == pa) {
3013 			/*
3014 			 * No, might be a protection or wiring change.
3015 			 */
3016 			if ((orig_l3 & PTE_SW_MANAGED) != 0 &&
3017 			    (new_l3 & PTE_W) != 0)
3018 				vm_page_aflag_set(m, PGA_WRITEABLE);
3019 			goto validate;
3020 		}
3021 
3022 		/*
3023 		 * The physical page has changed.  Temporarily invalidate
3024 		 * the mapping.  This ensures that all threads sharing the
3025 		 * pmap keep a consistent view of the mapping, which is
3026 		 * necessary for the correct handling of COW faults.  It
3027 		 * also permits reuse of the old mapping's PV entry,
3028 		 * avoiding an allocation.
3029 		 *
3030 		 * For consistency, handle unmanaged mappings the same way.
3031 		 */
3032 		orig_l3 = pmap_load_clear(l3);
3033 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
3034 		    ("pmap_enter: unexpected pa update for %#lx", va));
3035 		if ((orig_l3 & PTE_SW_MANAGED) != 0) {
3036 			om = PHYS_TO_VM_PAGE(opa);
3037 
3038 			/*
3039 			 * The pmap lock is sufficient to synchronize with
3040 			 * concurrent calls to pmap_page_test_mappings() and
3041 			 * pmap_ts_referenced().
3042 			 */
3043 			if ((orig_l3 & PTE_D) != 0)
3044 				vm_page_dirty(om);
3045 			if ((orig_l3 & PTE_A) != 0)
3046 				vm_page_aflag_set(om, PGA_REFERENCED);
3047 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3048 			pv = pmap_pvh_remove(&om->md, pmap, va);
3049 			KASSERT(pv != NULL,
3050 			    ("pmap_enter: no PV entry for %#lx", va));
3051 			if ((new_l3 & PTE_SW_MANAGED) == 0)
3052 				free_pv_entry(pmap, pv);
3053 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3054 			    TAILQ_EMPTY(&om->md.pv_list) &&
3055 			    ((om->flags & PG_FICTITIOUS) != 0 ||
3056 			    TAILQ_EMPTY(&pa_to_pvh(opa)->pv_list)))
3057 				vm_page_aflag_clear(om, PGA_WRITEABLE);
3058 		}
3059 		pmap_invalidate_page(pmap, va);
3060 		orig_l3 = 0;
3061 	} else {
3062 		/*
3063 		 * Increment the counters.
3064 		 */
3065 		if ((new_l3 & PTE_SW_WIRED) != 0)
3066 			pmap->pm_stats.wired_count++;
3067 		pmap_resident_count_inc(pmap, 1);
3068 	}
3069 	/*
3070 	 * Enter on the PV list if part of our managed memory.
3071 	 */
3072 	if ((new_l3 & PTE_SW_MANAGED) != 0) {
3073 		if (pv == NULL) {
3074 			pv = get_pv_entry(pmap, &lock);
3075 			pv->pv_va = va;
3076 		}
3077 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
3078 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3079 		m->md.pv_gen++;
3080 		if ((new_l3 & PTE_W) != 0)
3081 			vm_page_aflag_set(m, PGA_WRITEABLE);
3082 	}
3083 
3084 validate:
3085 	/*
3086 	 * Sync the i-cache on all harts before updating the PTE
3087 	 * if the new PTE is executable.
3088 	 */
3089 	if (prot & VM_PROT_EXECUTE)
3090 		pmap_sync_icache(pmap, va, PAGE_SIZE);
3091 
3092 	/*
3093 	 * Update the L3 entry.
3094 	 */
3095 	if (orig_l3 != 0) {
3096 		orig_l3 = pmap_load_store(l3, new_l3);
3097 		pmap_invalidate_page(pmap, va);
3098 		KASSERT(PTE_TO_PHYS(orig_l3) == pa,
3099 		    ("pmap_enter: invalid update"));
3100 		if ((orig_l3 & (PTE_D | PTE_SW_MANAGED)) ==
3101 		    (PTE_D | PTE_SW_MANAGED))
3102 			vm_page_dirty(m);
3103 	} else {
3104 		pmap_store(l3, new_l3);
3105 	}
3106 
3107 #if VM_NRESERVLEVEL > 0
3108 	if (mpte != NULL && mpte->ref_count == Ln_ENTRIES &&
3109 	    pmap_ps_enabled(pmap) &&
3110 	    (m->flags & PG_FICTITIOUS) == 0 &&
3111 	    vm_reserv_level_iffullpop(m) == 0)
3112 		pmap_promote_l2(pmap, l2, va, &lock);
3113 #endif
3114 
3115 	rv = KERN_SUCCESS;
3116 out:
3117 	if (lock != NULL)
3118 		rw_wunlock(lock);
3119 	rw_runlock(&pvh_global_lock);
3120 	PMAP_UNLOCK(pmap);
3121 	return (rv);
3122 }
3123 
3124 /*
3125  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
3126  * if successful.  Returns false if (1) a page table page cannot be allocated
3127  * without sleeping, (2) a mapping already exists at the specified virtual
3128  * address, or (3) a PV entry cannot be allocated without reclaiming another
3129  * PV entry.
3130  */
3131 static bool
3132 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3133     struct rwlock **lockp)
3134 {
3135 	pd_entry_t new_l2;
3136 	pn_t pn;
3137 
3138 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3139 
3140 	pn = VM_PAGE_TO_PHYS(m) / PAGE_SIZE;
3141 	new_l2 = (pd_entry_t)((pn << PTE_PPN0_S) | PTE_R | PTE_V);
3142 	if ((m->oflags & VPO_UNMANAGED) == 0)
3143 		new_l2 |= PTE_SW_MANAGED;
3144 	if ((prot & VM_PROT_EXECUTE) != 0)
3145 		new_l2 |= PTE_X;
3146 	if (va < VM_MAXUSER_ADDRESS)
3147 		new_l2 |= PTE_U;
3148 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
3149 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, NULL, lockp) ==
3150 	    KERN_SUCCESS);
3151 }
3152 
3153 /*
3154  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
3155  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
3156  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
3157  * a mapping already exists at the specified virtual address.  Returns
3158  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
3159  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
3160  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
3161  *
3162  * The parameter "m" is only used when creating a managed, writeable mapping.
3163  */
3164 static int
3165 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
3166     vm_page_t m, struct rwlock **lockp)
3167 {
3168 	struct spglist free;
3169 	pd_entry_t *l2, *l3, oldl2;
3170 	vm_offset_t sva;
3171 	vm_page_t l2pg, mt;
3172 
3173 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3174 
3175 	if ((l2pg = pmap_alloc_l2(pmap, va, (flags & PMAP_ENTER_NOSLEEP) != 0 ?
3176 	    NULL : lockp)) == NULL) {
3177 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
3178 		    va, pmap);
3179 		return (KERN_RESOURCE_SHORTAGE);
3180 	}
3181 
3182 	l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
3183 	l2 = &l2[pmap_l2_index(va)];
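	/*
	 * If a mapping already exists and PMAP_ENTER_NOREPLACE was not
	 * specified, remove it first: a superpage via pmap_remove_l2(),
	 * otherwise each constituent 4KB mapping via pmap_remove_l3().
	 */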
3184 	if ((oldl2 = pmap_load(l2)) != 0) {
3185 		KASSERT(l2pg->ref_count > 1,
3186 		    ("pmap_enter_l2: l2pg's ref count is too low"));
3187 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
3188 			l2pg->ref_count--;
3189 			CTR2(KTR_PMAP,
3190 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
3191 			    va, pmap);
3192 			return (KERN_FAILURE);
3193 		}
3194 		SLIST_INIT(&free);
3195 		if ((oldl2 & PTE_RWX) != 0)
3196 			(void)pmap_remove_l2(pmap, l2, va,
3197 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
3198 		else
3199 			for (sva = va; sva < va + L2_SIZE; sva += PAGE_SIZE) {
3200 				l3 = pmap_l2_to_l3(l2, sva);
3201 				if ((pmap_load(l3) & PTE_V) != 0 &&
3202 				    pmap_remove_l3(pmap, l3, sva, oldl2, &free,
3203 				    lockp) != 0)
3204 					break;
3205 			}
3206 		vm_page_free_pages_toq(&free, true);
3207 		if (va >= VM_MAXUSER_ADDRESS) {
3208 			/*
3209 			 * Both pmap_remove_l2() and pmap_remove_l3() will
3210 			 * leave the kernel page table page zero filled.
3211 			 */
3212 			mt = PHYS_TO_VM_PAGE(PTE_TO_PHYS(pmap_load(l2)));
3213 			if (pmap_insert_pt_page(pmap, mt, false))
3214 				panic("pmap_enter_l2: trie insert failed");
3215 		} else
3216 			KASSERT(pmap_load(l2) == 0,
3217 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
3218 	}
3219 
3220 	if ((new_l2 & PTE_SW_MANAGED) != 0) {
3221 		/*
3222 		 * Abort this mapping if its PV entry could not be created.
3223 		 */
3224 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
3225 			SLIST_INIT(&free);
3226 			if (pmap_unwire_ptp(pmap, va, l2pg, &free)) {
3227 				/*
3228 				 * Although "va" is not mapped, paging-structure
3229 				 * caches could nonetheless have entries that
3230 				 * refer to the freed page table pages.
3231 				 * Invalidate those entries.
3232 				 */
3233 				pmap_invalidate_page(pmap, va);
3234 				vm_page_free_pages_toq(&free, true);
3235 			}
3236 			CTR2(KTR_PMAP,
3237 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
3238 			    va, pmap);
3239 			return (KERN_RESOURCE_SHORTAGE);
3240 		}
3241 		if ((new_l2 & PTE_W) != 0)
3242 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3243 				vm_page_aflag_set(mt, PGA_WRITEABLE);
3244 	}
3245 
3246 	/*
3247 	 * Increment counters.
3248 	 */
3249 	if ((new_l2 & PTE_SW_WIRED) != 0)
3250 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
3251 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
3252 
3253 	/*
3254 	 * Map the superpage.
3255 	 */
3256 	pmap_store(l2, new_l2);
3257 
3258 	atomic_add_long(&pmap_l2_mappings, 1);
3259 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
3260 	    va, pmap);
3261 
3262 	return (KERN_SUCCESS);
3263 }
3264 
3265 /*
3266  * Maps a sequence of resident pages belonging to the same object.
3267  * The sequence begins with the given page m_start.  This page is
3268  * mapped at the given virtual address start.  Each subsequent page is
3269  * mapped at a virtual address that is offset from start by the same
3270  * amount as the page is offset from m_start within the object.  The
3271  * last page in the sequence is the page with the largest offset from
3272  * m_start that can be mapped at a virtual address less than the given
3273  * virtual address end.  Not every virtual page between start and end
3274  * is mapped; only those for which a resident page exists with the
3275  * corresponding offset from m_start are mapped.
3276  */
3277 void
3278 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
3279     vm_page_t m_start, vm_prot_t prot)
3280 {
3281 	struct rwlock *lock;
3282 	vm_offset_t va;
3283 	vm_page_t m, mpte;
3284 	vm_pindex_t diff, psize;
3285 
3286 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
3287 
3288 	psize = atop(end - start);
3289 	mpte = NULL;
3290 	m = m_start;
3291 	lock = NULL;
3292 	rw_rlock(&pvh_global_lock);
3293 	PMAP_LOCK(pmap);
3294 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
3295 		va = start + ptoa(diff);
3296 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
3297 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
3298 		    pmap_enter_2mpage(pmap, va, m, prot, &lock))
3299 			m = &m[L2_SIZE / PAGE_SIZE - 1];
3300 		else
3301 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
3302 			    &lock);
3303 		m = TAILQ_NEXT(m, listq);
3304 	}
3305 	if (lock != NULL)
3306 		rw_wunlock(lock);
3307 	rw_runlock(&pvh_global_lock);
3308 	PMAP_UNLOCK(pmap);
3309 }
3310 
3311 /*
3312  * This code makes some *MAJOR* simplifying assumptions:
3313  * 1. The current pmap and the target pmap exist.
3314  * 2. The mapping is not wired.
3315  * 3. Only read access is required.
3316  * 4. No page table pages.
3317  * It is, however, *MUCH* faster than pmap_enter...
3318  */
3319 
3320 void
3321 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
3322 {
3323 	struct rwlock *lock;
3324 
3325 	lock = NULL;
3326 	rw_rlock(&pvh_global_lock);
3327 	PMAP_LOCK(pmap);
3328 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
3329 	if (lock != NULL)
3330 		rw_wunlock(lock);
3331 	rw_runlock(&pvh_global_lock);
3332 	PMAP_UNLOCK(pmap);
3333 }
3334 
3335 static vm_page_t
3336 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
3337     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
3338 {
3339 	struct spglist free;
3340 	vm_paddr_t phys;
3341 	pd_entry_t *l2;
3342 	pt_entry_t *l3, newl3;
3343 
3344 	KASSERT(!VA_IS_CLEANMAP(va) ||
3345 	    (m->oflags & VPO_UNMANAGED) != 0,
3346 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
3347 	rw_assert(&pvh_global_lock, RA_LOCKED);
3348 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3349 
3350 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
3351 	/*
3352 	 * In the case that a page table page is not
3353 	 * resident, we are creating it here.
3354 	 */
3355 	if (va < VM_MAXUSER_ADDRESS) {
3356 		vm_pindex_t l2pindex;
3357 
3358 		/*
3359 		 * Calculate the page table page index.
3360 		 */
3361 		l2pindex = pmap_l2_pindex(va);
3362 		if (mpte && (mpte->pindex == l2pindex)) {
3363 			mpte->ref_count++;
3364 		} else {
3365 			/*
3366 			 * Get the l2 entry
3367 			 */
3368 			l2 = pmap_l2(pmap, va);
3369 
3370 			/*
3371 			 * If the page table page is mapped, we just increment
3372 			 * the hold count, and activate it.  Otherwise, we
3373 			 * attempt to allocate a page table page.  If this
3374 			 * attempt fails, we don't retry.  Instead, we give up.
3375 			 */
3376 			if (l2 != NULL && pmap_load(l2) != 0) {
3377 				phys = PTE_TO_PHYS(pmap_load(l2));
3378 				mpte = PHYS_TO_VM_PAGE(phys);
3379 				mpte->ref_count++;
3380 			} else {
3381 				/*
3382 				 * Pass NULL instead of the PV list lock
3383 				 * pointer, because we don't intend to sleep.
3384 				 */
3385 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
3386 				if (mpte == NULL)
3387 					return (mpte);
3388 			}
3389 		}
3390 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
3391 		l3 = &l3[pmap_l3_index(va)];
3392 	} else {
3393 		mpte = NULL;
3394 		l3 = pmap_l3(kernel_pmap, va);
3395 	}
3396 	if (l3 == NULL)
3397 		panic("pmap_enter_quick_locked: No l3");
3398 	if (pmap_load(l3) != 0) {
3399 		if (mpte != NULL) {
3400 			mpte->ref_count--;
3401 			mpte = NULL;
3402 		}
3403 		return (mpte);
3404 	}
3405 
3406 	/*
3407 	 * Enter on the PV list if part of our managed memory.
3408 	 */
3409 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
3410 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
3411 		if (mpte != NULL) {
3412 			SLIST_INIT(&free);
3413 			if (pmap_unwire_ptp(pmap, va, mpte, &free)) {
3414 				pmap_invalidate_page(pmap, va);
3415 				vm_page_free_pages_toq(&free, false);
3416 			}
3417 			mpte = NULL;
3418 		}
3419 		return (mpte);
3420 	}
3421 
3422 	/*
3423 	 * Increment counters
3424 	 */
3425 	pmap_resident_count_inc(pmap, 1);
3426 
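	/*
	 * The quick path only grants read (and optionally execute) access;
	 * PTE_W is never set here.
	 */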
3427 	newl3 = ((VM_PAGE_TO_PHYS(m) / PAGE_SIZE) << PTE_PPN0_S) |
3428 	    PTE_V | PTE_R;
3429 	if ((prot & VM_PROT_EXECUTE) != 0)
3430 		newl3 |= PTE_X;
3431 	if ((m->oflags & VPO_UNMANAGED) == 0)
3432 		newl3 |= PTE_SW_MANAGED;
3433 	if (va < VM_MAX_USER_ADDRESS)
3434 		newl3 |= PTE_U;
3435 
3436 	/*
3437 	 * Sync the i-cache on all harts before updating the PTE
3438 	 * if the new PTE is executable.
3439 	 */
3440 	if (prot & VM_PROT_EXECUTE)
3441 		pmap_sync_icache(pmap, va, PAGE_SIZE);
3442 
3443 	pmap_store(l3, newl3);
3444 
3445 	pmap_invalidate_page(pmap, va);
3446 	return (mpte);
3447 }
3448 
3449 /*
3450  * This code would map large physical mmap regions into the processor
3451  * address space.  On riscv it is currently a no-op beyond asserting
3452  * that the backing object is a device or SG object.
3453  */
3454 void
3455 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
3456     vm_pindex_t pindex, vm_size_t size)
3457 {
3458 
3459 	VM_OBJECT_ASSERT_WLOCKED(object);
3460 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
3461 	    ("pmap_object_init_pt: non-device object"));
3462 }
3463 
3464 /*
3465  *	Clear the wired attribute from the mappings for the specified range of
3466  *	addresses in the given pmap.  Every valid mapping within that range
3467  *	must have the wired attribute set.  In contrast, invalid mappings
3468  *	cannot have the wired attribute set, so they are ignored.
3469  *
3470  *	The wired attribute of the page table entry is not a hardware feature,
3471  *	so there is no need to invalidate any TLB entries.
3472  */
3473 void
3474 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3475 {
3476 	vm_offset_t va_next;
3477 	pd_entry_t *l0, *l1, *l2, l2e;
3478 	pt_entry_t *l3, l3e;
3479 	bool pv_lists_locked;
3480 
3481 	pv_lists_locked = false;
3482 retry:
3483 	PMAP_LOCK(pmap);
3484 	for (; sva < eva; sva = va_next) {
3485 		if (pmap_mode == PMAP_MODE_SV48) {
3486 			l0 = pmap_l0(pmap, sva);
3487 			if (pmap_load(l0) == 0) {
3488 				va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3489 				if (va_next < sva)
3490 					va_next = eva;
3491 				continue;
3492 			}
3493 			l1 = pmap_l0_to_l1(l0, sva);
3494 		} else {
3495 			l1 = pmap_l1(pmap, sva);
3496 		}
3497 
3498 		if (pmap_load(l1) == 0) {
3499 			va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3500 			if (va_next < sva)
3501 				va_next = eva;
3502 			continue;
3503 		}
3504 
3505 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3506 		if (va_next < sva)
3507 			va_next = eva;
3508 
3509 		l2 = pmap_l1_to_l2(l1, sva);
3510 		if ((l2e = pmap_load(l2)) == 0)
3511 			continue;
3512 		if ((l2e & PTE_RWX) != 0) {
3513 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3514 				if ((l2e & PTE_SW_WIRED) == 0)
3515 					panic("pmap_unwire: l2 %#jx is missing "
3516 					    "PTE_SW_WIRED", (uintmax_t)l2e);
3517 				pmap_clear_bits(l2, PTE_SW_WIRED);
3518 				continue;
3519 			} else {
3520 				if (!pv_lists_locked) {
3521 					pv_lists_locked = true;
3522 					if (!rw_try_rlock(&pvh_global_lock)) {
3523 						PMAP_UNLOCK(pmap);
3524 						rw_rlock(&pvh_global_lock);
3525 						/* Repeat sva. */
3526 						goto retry;
3527 					}
3528 				}
3529 				if (!pmap_demote_l2(pmap, l2, sva))
3530 					panic("pmap_unwire: demotion failed");
3531 			}
3532 		}
3533 
3534 		if (va_next > eva)
3535 			va_next = eva;
3536 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
3537 		    sva += L3_SIZE) {
3538 			if ((l3e = pmap_load(l3)) == 0)
3539 				continue;
3540 			if ((l3e & PTE_SW_WIRED) == 0)
3541 				panic("pmap_unwire: l3 %#jx is missing "
3542 				    "PTE_SW_WIRED", (uintmax_t)l3e);
3543 
3544 			/*
3545 			 * PTE_SW_WIRED must be cleared atomically.  Although
3546 			 * the pmap lock synchronizes access to it, another
3547 			 * hart could set PTE_D and/or PTE_A concurrently.
3548 			 */
3549 			pmap_clear_bits(l3, PTE_SW_WIRED);
3550 			pmap->pm_stats.wired_count--;
3551 		}
3552 	}
3553 	if (pv_lists_locked)
3554 		rw_runlock(&pvh_global_lock);
3555 	PMAP_UNLOCK(pmap);
3556 }
3557 
3558 /*
3559  *	Copy the range specified by src_addr/len
3560  *	from the source map to the range dst_addr/len
3561  *	in the destination map.
3562  *
3563  *	This routine is only advisory and need not do anything.
3564  */
3565 
3566 void
3567 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
3568     vm_offset_t src_addr)
3569 {
3570 
3571 }
3572 
3573 /*
3574  *	pmap_zero_page zeros the specified hardware page through its
3575  *	direct map address.
3576  */
3577 void
3578 pmap_zero_page(vm_page_t m)
3579 {
3580 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3581 
3582 	pagezero((void *)va);
3583 }
3584 
3585 /*
3586  *	pmap_zero_page_area zeros the specified portion of the given
3587  *	hardware page through its direct map address.
3588  *
3589  *	off and size may not cover an area beyond a single hardware page.
3590  */
3591 void
3592 pmap_zero_page_area(vm_page_t m, int off, int size)
3593 {
3594 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
3595 
3596 	if (off == 0 && size == PAGE_SIZE)
3597 		pagezero((void *)va);
3598 	else
3599 		bzero((char *)va + off, size);
3600 }
3601 
3602 /*
3603  *	pmap_copy_page copies the contents of the specified (machine
3604  *	independent) source page to the specified destination page.
3605  *	Both pages are addressed through the direct map, so no
3606  *	transient mappings are required.
3607  */
3608 void
3609 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
3610 {
3611 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
3612 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
3613 
3614 	pagecopy((void *)src, (void *)dst);
3615 }
3616 
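/*
 * Unmapped buffers are supported: I/O can be performed on buffer pages
 * that have no kernel virtual address mapping.
 */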
3617 int unmapped_buf_allowed = 1;
3618 
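/*
 * Copy "xfersize" bytes from the pages "ma", starting at byte offset
 * "a_offset", into the pages "mb", starting at byte offset "b_offset".
 * The copy proceeds in chunks that do not cross a page boundary and is
 * performed through the direct map; a page outside the direct map causes
 * a panic.
 */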
3619 void
3620 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
3621     vm_offset_t b_offset, int xfersize)
3622 {
3623 	void *a_cp, *b_cp;
3624 	vm_page_t m_a, m_b;
3625 	vm_paddr_t p_a, p_b;
3626 	vm_offset_t a_pg_offset, b_pg_offset;
3627 	int cnt;
3628 
3629 	while (xfersize > 0) {
3630 		a_pg_offset = a_offset & PAGE_MASK;
3631 		m_a = ma[a_offset >> PAGE_SHIFT];
3632 		p_a = m_a->phys_addr;
3633 		b_pg_offset = b_offset & PAGE_MASK;
3634 		m_b = mb[b_offset >> PAGE_SHIFT];
3635 		p_b = m_b->phys_addr;
3636 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
3637 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
3638 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
3639 			panic("!DMAP a %lx", p_a);
3640 		} else {
3641 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
3642 		}
3643 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
3644 			panic("!DMAP b %lx", p_b);
3645 		} else {
3646 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
3647 		}
3648 		bcopy(a_cp, b_cp, cnt);
3649 		a_offset += cnt;
3650 		b_offset += cnt;
3651 		xfersize -= cnt;
3652 	}
3653 }
3654 
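/*
 * pmap_quick_enter_page() and pmap_quick_remove_page() provide transient
 * access to a physical page.  The page is expected to be covered by the
 * direct map, so entering it simply returns its direct map address and
 * removing it requires no work.
 */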
3655 vm_offset_t
3656 pmap_quick_enter_page(vm_page_t m)
3657 {
3658 
3659 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
3660 }
3661 
3662 void
3663 pmap_quick_remove_page(vm_offset_t addr)
3664 {
3665 }
3666 
3667 /*
3668  * Returns true if the pmap's pv is one of the first
3669  * 16 pvs linked to from this page.  This count may
3670  * be changed upwards or downwards in the future; it
3671  * is only necessary that true be returned for a small
3672  * subset of pmaps for proper page aging.
3673  */
3674 boolean_t
3675 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
3676 {
3677 	struct md_page *pvh;
3678 	struct rwlock *lock;
3679 	pv_entry_t pv;
3680 	int loops = 0;
3681 	boolean_t rv;
3682 
3683 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3684 	    ("pmap_page_exists_quick: page %p is not managed", m));
3685 	rv = FALSE;
3686 	rw_rlock(&pvh_global_lock);
3687 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3688 	rw_rlock(lock);
3689 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3690 		if (PV_PMAP(pv) == pmap) {
3691 			rv = TRUE;
3692 			break;
3693 		}
3694 		loops++;
3695 		if (loops >= 16)
3696 			break;
3697 	}
3698 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
3699 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3700 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3701 			if (PV_PMAP(pv) == pmap) {
3702 				rv = TRUE;
3703 				break;
3704 			}
3705 			loops++;
3706 			if (loops >= 16)
3707 				break;
3708 		}
3709 	}
3710 	rw_runlock(lock);
3711 	rw_runlock(&pvh_global_lock);
3712 	return (rv);
3713 }
3714 
3715 /*
3716  *	pmap_page_wired_mappings:
3717  *
3718  *	Return the number of managed mappings to the given physical page
3719  *	that are wired.
3720  */
3721 int
3722 pmap_page_wired_mappings(vm_page_t m)
3723 {
3724 	struct md_page *pvh;
3725 	struct rwlock *lock;
3726 	pmap_t pmap;
3727 	pd_entry_t *l2;
3728 	pt_entry_t *l3;
3729 	pv_entry_t pv;
3730 	int count, md_gen, pvh_gen;
3731 
3732 	if ((m->oflags & VPO_UNMANAGED) != 0)
3733 		return (0);
3734 	rw_rlock(&pvh_global_lock);
3735 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3736 	rw_rlock(lock);
3737 restart:
3738 	count = 0;
3739 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3740 		pmap = PV_PMAP(pv);
3741 		if (!PMAP_TRYLOCK(pmap)) {
3742 			md_gen = m->md.pv_gen;
3743 			rw_runlock(lock);
3744 			PMAP_LOCK(pmap);
3745 			rw_rlock(lock);
3746 			if (md_gen != m->md.pv_gen) {
3747 				PMAP_UNLOCK(pmap);
3748 				goto restart;
3749 			}
3750 		}
3751 		l2 = pmap_l2(pmap, pv->pv_va);
3752 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
3753 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
3754 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
3755 		if ((pmap_load(l3) & PTE_SW_WIRED) != 0)
3756 			count++;
3757 		PMAP_UNLOCK(pmap);
3758 	}
3759 	if ((m->flags & PG_FICTITIOUS) == 0) {
3760 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
3761 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3762 			pmap = PV_PMAP(pv);
3763 			if (!PMAP_TRYLOCK(pmap)) {
3764 				md_gen = m->md.pv_gen;
3765 				pvh_gen = pvh->pv_gen;
3766 				rw_runlock(lock);
3767 				PMAP_LOCK(pmap);
3768 				rw_rlock(lock);
3769 				if (md_gen != m->md.pv_gen ||
3770 				    pvh_gen != pvh->pv_gen) {
3771 					PMAP_UNLOCK(pmap);
3772 					goto restart;
3773 				}
3774 			}
3775 			l2 = pmap_l2(pmap, pv->pv_va);
3776 			if ((pmap_load(l2) & PTE_SW_WIRED) != 0)
3777 				count++;
3778 			PMAP_UNLOCK(pmap);
3779 		}
3780 	}
3781 	rw_runlock(lock);
3782 	rw_runlock(&pvh_global_lock);
3783 	return (count);
3784 }
3785 
3786 /*
3787  * Returns true if the given page is mapped individually or as part of
3788  * a 2mpage.  Otherwise, returns false.
3789  */
3790 bool
3791 pmap_page_is_mapped(vm_page_t m)
3792 {
3793 	struct rwlock *lock;
3794 	bool rv;
3795 
3796 	if ((m->oflags & VPO_UNMANAGED) != 0)
3797 		return (false);
3798 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3799 	rw_rlock(lock);
3800 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
3801 	    ((m->flags & PG_FICTITIOUS) == 0 &&
3802 	    !TAILQ_EMPTY(&pa_to_pvh(VM_PAGE_TO_PHYS(m))->pv_list));
3803 	rw_runlock(lock);
3804 	return (rv);
3805 }
3806 
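/*
 * Helper for pmap_remove_pages(): release the PV entry for a mapping that
 * is being destroyed, updating the PV list and PGA_WRITEABLE state of the
 * underlying page(s).  For a 2MB superpage mapping, the page table page
 * that was saved at promotion time is also freed.
 */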
3807 static void
3808 pmap_remove_pages_pv(pmap_t pmap, vm_page_t m, pv_entry_t pv,
3809     struct spglist *free, bool superpage)
3810 {
3811 	struct md_page *pvh;
3812 	vm_page_t mpte, mt;
3813 
3814 	if (superpage) {
3815 		pmap_resident_count_dec(pmap, Ln_ENTRIES);
3816 		pvh = pa_to_pvh(m->phys_addr);
3817 		TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3818 		pvh->pv_gen++;
3819 		if (TAILQ_EMPTY(&pvh->pv_list)) {
3820 			for (mt = m; mt < &m[Ln_ENTRIES]; mt++)
3821 				if (TAILQ_EMPTY(&mt->md.pv_list) &&
3822 				    (mt->a.flags & PGA_WRITEABLE) != 0)
3823 					vm_page_aflag_clear(mt, PGA_WRITEABLE);
3824 		}
3825 		mpte = pmap_remove_pt_page(pmap, pv->pv_va);
3826 		if (mpte != NULL) {
3827 			KASSERT(mpte->valid == VM_PAGE_BITS_ALL,
3828 			    ("pmap_remove_pages: pte page not promoted"));
3829 			pmap_resident_count_dec(pmap, 1);
3830 			KASSERT(mpte->ref_count == Ln_ENTRIES,
3831 			    ("pmap_remove_pages: pte page ref count error"));
3832 			mpte->ref_count = 0;
3833 			pmap_add_delayed_free_list(mpte, free, FALSE);
3834 		}
3835 	} else {
3836 		pmap_resident_count_dec(pmap, 1);
3837 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3838 		m->md.pv_gen++;
3839 		if (TAILQ_EMPTY(&m->md.pv_list) &&
3840 		    (m->a.flags & PGA_WRITEABLE) != 0) {
3841 			pvh = pa_to_pvh(m->phys_addr);
3842 			if (TAILQ_EMPTY(&pvh->pv_list))
3843 				vm_page_aflag_clear(m, PGA_WRITEABLE);
3844 		}
3845 	}
3846 }
3847 
3848 /*
3849  * Destroy all managed, non-wired mappings in the given user-space
3850  * pmap.  This pmap cannot be active on any processor besides the
3851  * caller.
3852  *
3853  * This function cannot be applied to the kernel pmap.  Moreover, it
3854  * is not intended for general use.  It is only to be used during
3855  * process termination.  Consequently, it can be implemented in ways
3856  * that make it faster than pmap_remove().  First, it can more quickly
3857  * destroy mappings by iterating over the pmap's collection of PV
3858  * entries, rather than searching the page table.  Second, it doesn't
3859  * have to test and clear the page table entries atomically, because
3860  * no processor is currently accessing the user address space.  In
3861  * particular, a page table entry's dirty bit won't change state once
3862  * this function starts.
3863  */
3864 void
3865 pmap_remove_pages(pmap_t pmap)
3866 {
3867 	struct spglist free;
3868 	pd_entry_t ptepde;
3869 	pt_entry_t *pte, tpte;
3870 	vm_page_t m, mt;
3871 	pv_entry_t pv;
3872 	struct pv_chunk *pc, *npc;
3873 	struct rwlock *lock;
3874 	int64_t bit;
3875 	uint64_t inuse, bitmask;
3876 	int allfree, field, freed, idx;
3877 	bool superpage;
3878 
3879 	lock = NULL;
3880 
3881 	SLIST_INIT(&free);
3882 	rw_rlock(&pvh_global_lock);
3883 	PMAP_LOCK(pmap);
3884 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
3885 		allfree = 1;
3886 		freed = 0;
3887 		for (field = 0; field < _NPCM; field++) {
3888 			inuse = ~pc->pc_map[field] & pc_freemask[field];
3889 			while (inuse != 0) {
3890 				bit = ffsl(inuse) - 1;
3891 				bitmask = 1UL << bit;
3892 				idx = field * 64 + bit;
3893 				pv = &pc->pc_pventry[idx];
3894 				inuse &= ~bitmask;
3895 
3896 				pte = pmap_l1(pmap, pv->pv_va);
3897 				ptepde = pmap_load(pte);
3898 				pte = pmap_l1_to_l2(pte, pv->pv_va);
3899 				tpte = pmap_load(pte);
3900 				if ((tpte & PTE_RWX) != 0) {
3901 					superpage = true;
3902 				} else {
3903 					ptepde = tpte;
3904 					pte = pmap_l2_to_l3(pte, pv->pv_va);
3905 					tpte = pmap_load(pte);
3906 					superpage = false;
3907 				}
3908 
3909 				/*
3910 				 * We cannot remove wired pages from a
3911 				 * process' mapping at this time.
3912 				 */
3913 				if (tpte & PTE_SW_WIRED) {
3914 					allfree = 0;
3915 					continue;
3916 				}
3917 
3918 				m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte));
3919 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
3920 				    m < &vm_page_array[vm_page_array_size],
3921 				    ("pmap_remove_pages: bad pte %#jx",
3922 				    (uintmax_t)tpte));
3923 
3924 				pmap_clear(pte);
3925 
3926 				/*
3927 				 * Update the vm_page_t clean/reference bits.
3928 				 */
3929 				if ((tpte & (PTE_D | PTE_W)) ==
3930 				    (PTE_D | PTE_W)) {
3931 					if (superpage)
3932 						for (mt = m;
3933 						    mt < &m[Ln_ENTRIES]; mt++)
3934 							vm_page_dirty(mt);
3935 					else
3936 						vm_page_dirty(m);
3937 				}
3938 
3939 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
3940 
3941 				/* Mark free */
3942 				pc->pc_map[field] |= bitmask;
3943 
3944 				pmap_remove_pages_pv(pmap, m, pv, &free,
3945 				    superpage);
3946 				pmap_unuse_pt(pmap, pv->pv_va, ptepde, &free);
3947 				freed++;
3948 			}
3949 		}
3950 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3951 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3952 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3953 		if (allfree) {
3954 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3955 			free_pv_chunk(pc);
3956 		}
3957 	}
3958 	if (lock != NULL)
3959 		rw_wunlock(lock);
3960 	pmap_invalidate_all(pmap);
3961 	rw_runlock(&pvh_global_lock);
3962 	PMAP_UNLOCK(pmap);
3963 	vm_page_free_pages_toq(&free, false);
3964 }
3965 
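/*
 * Test whether any mapping of the given page has the accessed and/or
 * dirty bits set, as selected by the "accessed" and "modified" arguments.
 * Both 4KB and 2MB mappings are checked.
 */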
3966 static bool
3967 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
3968 {
3969 	struct md_page *pvh;
3970 	struct rwlock *lock;
3971 	pd_entry_t *l2;
3972 	pt_entry_t *l3, mask;
3973 	pv_entry_t pv;
3974 	pmap_t pmap;
3975 	int md_gen, pvh_gen;
3976 	bool rv;
3977 
3978 	mask = 0;
3979 	if (modified)
3980 		mask |= PTE_D;
3981 	if (accessed)
3982 		mask |= PTE_A;
3983 
3984 	rv = FALSE;
3985 	rw_rlock(&pvh_global_lock);
3986 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3987 	rw_rlock(lock);
3988 restart:
3989 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
3990 		pmap = PV_PMAP(pv);
3991 		if (!PMAP_TRYLOCK(pmap)) {
3992 			md_gen = m->md.pv_gen;
3993 			rw_runlock(lock);
3994 			PMAP_LOCK(pmap);
3995 			rw_rlock(lock);
3996 			if (md_gen != m->md.pv_gen) {
3997 				PMAP_UNLOCK(pmap);
3998 				goto restart;
3999 			}
4000 		}
4001 		l2 = pmap_l2(pmap, pv->pv_va);
4002 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4003 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4004 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4005 		rv = (pmap_load(l3) & mask) == mask;
4006 		PMAP_UNLOCK(pmap);
4007 		if (rv)
4008 			goto out;
4009 	}
4010 	if ((m->flags & PG_FICTITIOUS) == 0) {
4011 		pvh = pa_to_pvh(VM_PAGE_TO_PHYS(m));
4012 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4013 			pmap = PV_PMAP(pv);
4014 			if (!PMAP_TRYLOCK(pmap)) {
4015 				md_gen = m->md.pv_gen;
4016 				pvh_gen = pvh->pv_gen;
4017 				rw_runlock(lock);
4018 				PMAP_LOCK(pmap);
4019 				rw_rlock(lock);
4020 				if (md_gen != m->md.pv_gen ||
4021 				    pvh_gen != pvh->pv_gen) {
4022 					PMAP_UNLOCK(pmap);
4023 					goto restart;
4024 				}
4025 			}
4026 			l2 = pmap_l2(pmap, pv->pv_va);
4027 			rv = (pmap_load(l2) & mask) == mask;
4028 			PMAP_UNLOCK(pmap);
4029 			if (rv)
4030 				goto out;
4031 		}
4032 	}
4033 out:
4034 	rw_runlock(lock);
4035 	rw_runlock(&pvh_global_lock);
4036 	return (rv);
4037 }
4038 
4039 /*
4040  *	pmap_is_modified:
4041  *
4042  *	Return whether or not the specified physical page was modified
4043  *	in any physical maps.
4044  */
4045 boolean_t
4046 pmap_is_modified(vm_page_t m)
4047 {
4048 
4049 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4050 	    ("pmap_is_modified: page %p is not managed", m));
4051 
4052 	/*
4053 	 * If the page is not busied then this check is racy.
4054 	 */
4055 	if (!pmap_page_is_write_mapped(m))
4056 		return (FALSE);
4057 	return (pmap_page_test_mappings(m, FALSE, TRUE));
4058 }
4059 
4060 /*
4061  *	pmap_is_prefaultable:
4062  *
4063  *	Return whether or not the specified virtual address is eligible
4064  *	for prefault.
4065  */
4066 boolean_t
4067 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
4068 {
4069 	pt_entry_t *l3;
4070 	boolean_t rv;
4071 
4072 	/*
4073 	 * Return TRUE if and only if the L3 entry for the specified virtual
4074 	 * address is allocated but invalid.
4075 	 */
4076 	rv = FALSE;
4077 	PMAP_LOCK(pmap);
4078 	l3 = pmap_l3(pmap, addr);
4079 	if (l3 != NULL && pmap_load(l3) == 0) {
4080 		rv = TRUE;
4081 	}
4082 	PMAP_UNLOCK(pmap);
4083 	return (rv);
4084 }
4085 
4086 /*
4087  *	pmap_is_referenced:
4088  *
4089  *	Return whether or not the specified physical page was referenced
4090  *	in any physical maps.
4091  */
4092 boolean_t
4093 pmap_is_referenced(vm_page_t m)
4094 {
4095 
4096 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4097 	    ("pmap_is_referenced: page %p is not managed", m));
4098 	return (pmap_page_test_mappings(m, TRUE, FALSE));
4099 }
4100 
4101 /*
4102  * Clear the write and modified bits in each of the given page's mappings.
4103  */
4104 void
4105 pmap_remove_write(vm_page_t m)
4106 {
4107 	struct md_page *pvh;
4108 	struct rwlock *lock;
4109 	pmap_t pmap;
4110 	pd_entry_t *l2;
4111 	pt_entry_t *l3, oldl3, newl3;
4112 	pv_entry_t next_pv, pv;
4113 	vm_offset_t va;
4114 	int md_gen, pvh_gen;
4115 
4116 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4117 	    ("pmap_remove_write: page %p is not managed", m));
4118 	vm_page_assert_busied(m);
4119 
4120 	if (!pmap_page_is_write_mapped(m))
4121 		return;
4122 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4123 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4124 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4125 	rw_rlock(&pvh_global_lock);
4126 retry_pv_loop:
4127 	rw_wlock(lock);
4128 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4129 		pmap = PV_PMAP(pv);
4130 		if (!PMAP_TRYLOCK(pmap)) {
4131 			pvh_gen = pvh->pv_gen;
4132 			rw_wunlock(lock);
4133 			PMAP_LOCK(pmap);
4134 			rw_wlock(lock);
4135 			if (pvh_gen != pvh->pv_gen) {
4136 				PMAP_UNLOCK(pmap);
4137 				rw_wunlock(lock);
4138 				goto retry_pv_loop;
4139 			}
4140 		}
4141 		va = pv->pv_va;
4142 		l2 = pmap_l2(pmap, va);
4143 		if ((pmap_load(l2) & PTE_W) != 0)
4144 			(void)pmap_demote_l2_locked(pmap, l2, va, &lock);
4145 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
4146 		    ("inconsistent pv lock %p %p for page %p",
4147 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
4148 		PMAP_UNLOCK(pmap);
4149 	}
4150 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4151 		pmap = PV_PMAP(pv);
4152 		if (!PMAP_TRYLOCK(pmap)) {
4153 			pvh_gen = pvh->pv_gen;
4154 			md_gen = m->md.pv_gen;
4155 			rw_wunlock(lock);
4156 			PMAP_LOCK(pmap);
4157 			rw_wlock(lock);
4158 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4159 				PMAP_UNLOCK(pmap);
4160 				rw_wunlock(lock);
4161 				goto retry_pv_loop;
4162 			}
4163 		}
4164 		l2 = pmap_l2(pmap, pv->pv_va);
4165 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4166 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4167 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4168 		oldl3 = pmap_load(l3);
4169 retry:
4170 		if ((oldl3 & PTE_W) != 0) {
4171 			newl3 = oldl3 & ~(PTE_D | PTE_W);
4172 			if (!atomic_fcmpset_long(l3, &oldl3, newl3))
4173 				goto retry;
4174 			if ((oldl3 & PTE_D) != 0)
4175 				vm_page_dirty(m);
4176 			pmap_invalidate_page(pmap, pv->pv_va);
4177 		}
4178 		PMAP_UNLOCK(pmap);
4179 	}
4180 	rw_wunlock(lock);
4181 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4182 	rw_runlock(&pvh_global_lock);
4183 }
4184 
4185 /*
4186  *	pmap_ts_referenced:
4187  *
4188  *	Return a count of reference bits for a page, clearing those bits.
4189  *	It is not necessary for every reference bit to be cleared, but it
4190  *	is necessary that 0 only be returned when there are truly no
4191  *	reference bits set.
4192  *
4193  *	As an optimization, update the page's dirty field if a modified bit is
4194  *	found while counting reference bits.  This opportunistic update can be
4195  *	performed at low cost and can eliminate the need for some future calls
4196  *	to pmap_is_modified().  However, since this function stops after
4197  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
4198  *	dirty pages.  Those dirty pages will only be detected by a future call
4199  *	to pmap_is_modified().
4200  */
4201 int
4202 pmap_ts_referenced(vm_page_t m)
4203 {
4204 	struct spglist free;
4205 	struct md_page *pvh;
4206 	struct rwlock *lock;
4207 	pv_entry_t pv, pvf;
4208 	pmap_t pmap;
4209 	pd_entry_t *l2, l2e;
4210 	pt_entry_t *l3, l3e;
4211 	vm_paddr_t pa;
4212 	vm_offset_t va;
4213 	int cleared, md_gen, not_cleared, pvh_gen;
4214 
4215 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4216 	    ("pmap_ts_referenced: page %p is not managed", m));
4217 	SLIST_INIT(&free);
4218 	cleared = 0;
4219 	pa = VM_PAGE_TO_PHYS(m);
4220 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : pa_to_pvh(pa);
4221 
4222 	lock = PHYS_TO_PV_LIST_LOCK(pa);
4223 	rw_rlock(&pvh_global_lock);
4224 	rw_wlock(lock);
4225 retry:
4226 	not_cleared = 0;
4227 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
4228 		goto small_mappings;
4229 	pv = pvf;
4230 	do {
4231 		pmap = PV_PMAP(pv);
4232 		if (!PMAP_TRYLOCK(pmap)) {
4233 			pvh_gen = pvh->pv_gen;
4234 			rw_wunlock(lock);
4235 			PMAP_LOCK(pmap);
4236 			rw_wlock(lock);
4237 			if (pvh_gen != pvh->pv_gen) {
4238 				PMAP_UNLOCK(pmap);
4239 				goto retry;
4240 			}
4241 		}
4242 		va = pv->pv_va;
4243 		l2 = pmap_l2(pmap, va);
4244 		l2e = pmap_load(l2);
4245 		if ((l2e & (PTE_W | PTE_D)) == (PTE_W | PTE_D)) {
4246 			/*
4247 			 * Although l2e is mapping a 2MB page, because
4248 			 * this function is called at a 4KB page granularity,
4249 			 * we only update the 4KB page under test.
4250 			 */
4251 			vm_page_dirty(m);
4252 		}
4253 		if ((l2e & PTE_A) != 0) {
4254 			/*
4255 			 * Since this reference bit is shared by 512 4KB
4256 			 * pages, it should not be cleared every time it is
4257 			 * tested.  Apply a simple "hash" function on the
4258 			 * physical page number, the virtual superpage number,
4259 			 * and the pmap address to select one 4KB page out of
4260 			 * the 512 on which testing the reference bit will
4261 			 * result in clearing that reference bit.  This
4262 			 * function is designed to avoid the selection of the
4263 			 * same 4KB page for every 2MB page mapping.
4264 			 *
4265 			 * On demotion, a mapping that hasn't been referenced
4266 			 * is simply destroyed.  To avoid the possibility of a
4267 			 * subsequent page fault on a demoted wired mapping,
4268 			 * always leave its reference bit set.  Moreover,
4269 			 * since the superpage is wired, the current state of
4270 			 * its reference bit won't affect page replacement.
4271 			 */
4272 			if ((((pa >> PAGE_SHIFT) ^ (pv->pv_va >> L2_SHIFT) ^
4273 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
4274 			    (l2e & PTE_SW_WIRED) == 0) {
4275 				pmap_clear_bits(l2, PTE_A);
4276 				pmap_invalidate_page(pmap, va);
4277 				cleared++;
4278 			} else
4279 				not_cleared++;
4280 		}
4281 		PMAP_UNLOCK(pmap);
4282 		/* Rotate the PV list if it has more than one entry. */
4283 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4284 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
4285 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4286 			pvh->pv_gen++;
4287 		}
4288 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
4289 			goto out;
4290 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
4291 small_mappings:
4292 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
4293 		goto out;
4294 	pv = pvf;
4295 	do {
4296 		pmap = PV_PMAP(pv);
4297 		if (!PMAP_TRYLOCK(pmap)) {
4298 			pvh_gen = pvh->pv_gen;
4299 			md_gen = m->md.pv_gen;
4300 			rw_wunlock(lock);
4301 			PMAP_LOCK(pmap);
4302 			rw_wlock(lock);
4303 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4304 				PMAP_UNLOCK(pmap);
4305 				goto retry;
4306 			}
4307 		}
4308 		l2 = pmap_l2(pmap, pv->pv_va);
4309 
4310 		KASSERT((pmap_load(l2) & PTE_RX) == 0,
4311 		    ("pmap_ts_referenced: found an l2 superpage mapping"));
4312 
4313 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4314 		l3e = pmap_load(l3);
4315 		if ((l3e & PTE_D) != 0)
4316 			vm_page_dirty(m);
4317 		if ((l3e & PTE_A) != 0) {
4318 			if ((l3e & PTE_SW_WIRED) == 0) {
4319 				/*
4320 				 * Wired pages cannot be paged out so
4321 				 * doing accessed bit emulation for
4322 				 * them is wasted effort. We do the
4323 				 * hard work for unwired pages only.
4324 				 */
4325 				pmap_clear_bits(l3, PTE_A);
4326 				pmap_invalidate_page(pmap, pv->pv_va);
4327 				cleared++;
4328 			} else
4329 				not_cleared++;
4330 		}
4331 		PMAP_UNLOCK(pmap);
4332 		/* Rotate the PV list if it has more than one entry. */
4333 		if (pv != NULL && TAILQ_NEXT(pv, pv_next) != NULL) {
4334 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4335 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4336 			m->md.pv_gen++;
4337 		}
4338 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
4339 	    not_cleared < PMAP_TS_REFERENCED_MAX);
4340 out:
4341 	rw_wunlock(lock);
4342 	rw_runlock(&pvh_global_lock);
4343 	vm_page_free_pages_toq(&free, false);
4344 	return (cleared + not_cleared);
4345 }
4346 
4347 /*
4348  *	Apply the given advice to the specified range of addresses within the
4349  *	given pmap.  Depending on the advice, clear the referenced and/or
4350  *	modified flags in each mapping and set the mapped page's dirty field.
4351  */
4352 void
4353 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
4354 {
4355 }
4356 
4357 /*
4358  *	Clear the modify bits on the specified physical page.
4359  */
4360 void
4361 pmap_clear_modify(vm_page_t m)
4362 {
4363 	struct md_page *pvh;
4364 	struct rwlock *lock;
4365 	pmap_t pmap;
4366 	pv_entry_t next_pv, pv;
4367 	pd_entry_t *l2, oldl2;
4368 	pt_entry_t *l3;
4369 	vm_offset_t va;
4370 	int md_gen, pvh_gen;
4371 
4372 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4373 	    ("pmap_clear_modify: page %p is not managed", m));
4374 	vm_page_assert_busied(m);
4375 
4376 	if (!pmap_page_is_write_mapped(m))
4377 		return;
4378 
4379 	/*
4380 	 * If the page is not PGA_WRITEABLE, then no PTEs can have PTE_D set.
4381 	 * If the object containing the page is locked and the page is not
4382 	 * exclusive busied, then PGA_WRITEABLE cannot be concurrently set.
4383 	 */
4384 	if ((m->a.flags & PGA_WRITEABLE) == 0)
4385 		return;
4386 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy :
4387 	    pa_to_pvh(VM_PAGE_TO_PHYS(m));
4388 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4389 	rw_rlock(&pvh_global_lock);
4390 	rw_wlock(lock);
4391 restart:
4392 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
4393 		pmap = PV_PMAP(pv);
4394 		if (!PMAP_TRYLOCK(pmap)) {
4395 			pvh_gen = pvh->pv_gen;
4396 			rw_wunlock(lock);
4397 			PMAP_LOCK(pmap);
4398 			rw_wlock(lock);
4399 			if (pvh_gen != pvh->pv_gen) {
4400 				PMAP_UNLOCK(pmap);
4401 				goto restart;
4402 			}
4403 		}
4404 		va = pv->pv_va;
4405 		l2 = pmap_l2(pmap, va);
4406 		oldl2 = pmap_load(l2);
4407 		/* If oldl2 has PTE_W set, then it also has PTE_D set. */
4408 		if ((oldl2 & PTE_W) != 0 &&
4409 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
4410 		    (oldl2 & PTE_SW_WIRED) == 0) {
4411 			/*
4412 			 * Write protect the mapping to a single page so that
4413 			 * a subsequent write access may repromote.
4414 			 */
4415 			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
4416 			l3 = pmap_l2_to_l3(l2, va);
4417 			pmap_clear_bits(l3, PTE_D | PTE_W);
4418 			vm_page_dirty(m);
4419 			pmap_invalidate_page(pmap, va);
4420 		}
4421 		PMAP_UNLOCK(pmap);
4422 	}
4423 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4424 		pmap = PV_PMAP(pv);
4425 		if (!PMAP_TRYLOCK(pmap)) {
4426 			md_gen = m->md.pv_gen;
4427 			pvh_gen = pvh->pv_gen;
4428 			rw_wunlock(lock);
4429 			PMAP_LOCK(pmap);
4430 			rw_wlock(lock);
4431 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4432 				PMAP_UNLOCK(pmap);
4433 				goto restart;
4434 			}
4435 		}
4436 		l2 = pmap_l2(pmap, pv->pv_va);
4437 		KASSERT((pmap_load(l2) & PTE_RWX) == 0,
4438 		    ("%s: found a 2mpage in page %p's pv list", __func__, m));
4439 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
4440 		if ((pmap_load(l3) & (PTE_D | PTE_W)) == (PTE_D | PTE_W)) {
4441 			pmap_clear_bits(l3, PTE_D | PTE_W);
4442 			pmap_invalidate_page(pmap, pv->pv_va);
4443 		}
4444 		PMAP_UNLOCK(pmap);
4445 	}
4446 	rw_wunlock(lock);
4447 	rw_runlock(&pvh_global_lock);
4448 }
4449 
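/*
 * Map a range of physical addresses for firmware access.  The range is
 * assumed to be covered by the direct map, so no new mapping is created
 * and pmap_unmapbios() has nothing to undo.
 */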
4450 void *
4451 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
4452 {
4453 
4454 	return ((void *)PHYS_TO_DMAP(pa));
4455 }
4456 
4457 void
4458 pmap_unmapbios(vm_paddr_t pa, vm_size_t size)
4459 {
4460 }
4461 
4462 /*
4463  * Sets the memory attribute for the specified page.
4464  */
4465 void
4466 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
4467 {
4468 
4469 	m->md.pv_memattr = ma;
4470 
4471 	/*
4472 	 * If "m" is a normal page, update its direct mapping.  This update
4473 	 * can be relied upon to perform any cache operations that are
4474 	 * required for data coherence.
4475 	 */
4476 	if ((m->flags & PG_FICTITIOUS) == 0 &&
4477 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
4478 	    m->md.pv_memattr) != 0)
4479 		panic("memory attribute change on the direct map failed");
4480 }
4481 
4482 /*
4483  * Changes the specified virtual address range's memory type to that given by
4484  * the parameter "mode".  The specified virtual address range must be
4485  * completely contained within either the direct map or the kernel map.
4486  *
4487  * Returns zero if the change completed successfully, and either EINVAL or
4488  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
4489  * of the virtual address range was not mapped, and ENOMEM is returned if
4490  * there was insufficient memory available to complete the change.  In the
4491  * latter case, the memory type may have been changed on some part of the
4492  * virtual address range.
4493  */
4494 int
4495 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
4496 {
4497 	int error;
4498 
4499 	PMAP_LOCK(kernel_pmap);
4500 	error = pmap_change_attr_locked(va, size, mode);
4501 	PMAP_UNLOCK(kernel_pmap);
4502 	return (error);
4503 }
4504 
4505 static int
4506 pmap_change_attr_locked(vm_offset_t va, vm_size_t size, int mode)
4507 {
4508 	vm_offset_t base, offset, tmpva;
4509 	pd_entry_t *l1, l1e;
4510 	pd_entry_t *l2, l2e;
4511 	pt_entry_t *l3, l3e;
4512 
4513 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
4514 	base = trunc_page(va);
4515 	offset = va & PAGE_MASK;
4516 	size = round_page(offset + size);
4517 
4518 	if (!VIRT_IN_DMAP(base) &&
4519 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
4520 		return (EINVAL);
4521 
4522 	for (tmpva = base; tmpva < base + size; ) {
4523 		l1 = pmap_l1(kernel_pmap, tmpva);
4524 		if (l1 == NULL || ((l1e = pmap_load(l1)) & PTE_V) == 0)
4525 			return (EINVAL);
4526 		if ((l1e & PTE_RWX) != 0) {
4527 			/*
4528 			 * TODO: Demote if attributes don't match and there
4529 			 * isn't an L1 page left in the range, and update the
4530 			 * L1 entry if the attributes don't match but there is
4531 			 * an L1 page left in the range, once we support the
4532 			 * upcoming Svpbmt extension.
4533 			 */
4534 			tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
4535 			continue;
4536 		}
4537 		l2 = pmap_l1_to_l2(l1, tmpva);
4538 		if (l2 == NULL || ((l2e = pmap_load(l2)) & PTE_V) == 0)
4539 			return (EINVAL);
4540 		if ((l2e & PTE_RWX) != 0) {
4541 			/*
4542 			 * TODO: Demote if attributes don't match and there
4543 			 * isn't an L2 page left in the range, and update the
4544 			 * L2 entry if the attributes don't match but there is
4545 			 * an L2 page left in the range, once we support the
4546 			 * upcoming Svpbmt extension.
4547 			 */
4548 			tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
4549 			continue;
4550 		}
4551 		l3 = pmap_l2_to_l3(l2, tmpva);
4552 		if (l3 == NULL || ((l3e = pmap_load(l3)) & PTE_V) == 0)
4553 			return (EINVAL);
4554 		/*
4555 		 * TODO: Update the L3 entry if the attributes don't match once
4556 		 * we support the upcoming Svpbmt extension.
4557 		 */
4558 		tmpva += PAGE_SIZE;
4559 	}
4560 
4561 	return (0);
4562 }
4563 
4564 /*
4565  * Perform the pmap work for mincore(2).  If the page is not both referenced and
4566  * modified by this pmap, returns its physical address so that the caller can
4567  * find other mappings.
4568  */
4569 int
4570 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
4571 {
4572 	pt_entry_t *l2, *l3, tpte;
4573 	vm_paddr_t pa;
4574 	int val;
4575 	bool managed;
4576 
4577 	PMAP_LOCK(pmap);
4578 	l2 = pmap_l2(pmap, addr);
4579 	if (l2 != NULL && ((tpte = pmap_load(l2)) & PTE_V) != 0) {
4580 		if ((tpte & PTE_RWX) != 0) {
4581 			pa = PTE_TO_PHYS(tpte) | (addr & L2_OFFSET);
4582 			val = MINCORE_INCORE | MINCORE_PSIND(1);
4583 		} else {
4584 			l3 = pmap_l2_to_l3(l2, addr);
4585 			tpte = pmap_load(l3);
4586 			if ((tpte & PTE_V) == 0) {
4587 				PMAP_UNLOCK(pmap);
4588 				return (0);
4589 			}
4590 			pa = PTE_TO_PHYS(tpte) | (addr & L3_OFFSET);
4591 			val = MINCORE_INCORE;
4592 		}
4593 
4594 		if ((tpte & PTE_D) != 0)
4595 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
4596 		if ((tpte & PTE_A) != 0)
4597 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
4598 		managed = (tpte & PTE_SW_MANAGED) == PTE_SW_MANAGED;
4599 	} else {
4600 		managed = false;
4601 		val = 0;
4602 	}
4603 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
4604 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
4605 		*pap = pa;
4606 	}
4607 	PMAP_UNLOCK(pmap);
4608 	return (val);
4609 }
4610 
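/*
 * Switch the current hart to the given thread's pmap: install its SATP
 * value, update the old and new pmaps' active hart sets, and flush the
 * local TLB.  The caller is expected to prevent preemption, as
 * pmap_activate() does with a critical section.
 */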
4611 void
4612 pmap_activate_sw(struct thread *td)
4613 {
4614 	pmap_t oldpmap, pmap;
4615 	u_int hart;
4616 
4617 	oldpmap = PCPU_GET(curpmap);
4618 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
4619 	if (pmap == oldpmap)
4620 		return;
4621 	csr_write(satp, pmap->pm_satp);
4622 
4623 	hart = PCPU_GET(hart);
4624 #ifdef SMP
4625 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
4626 	CPU_CLR_ATOMIC(hart, &oldpmap->pm_active);
4627 #else
4628 	CPU_SET(hart, &pmap->pm_active);
4629 	CPU_CLR(hart, &oldpmap->pm_active);
4630 #endif
4631 	PCPU_SET(curpmap, pmap);
4632 
4633 	sfence_vma();
4634 }
4635 
4636 void
4637 pmap_activate(struct thread *td)
4638 {
4639 
4640 	critical_enter();
4641 	pmap_activate_sw(td);
4642 	critical_exit();
4643 }
4644 
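/*
 * Mark the given pmap active on the current hart during boot.  The satp
 * register is assumed to have been loaded already (e.g., by locore), so
 * only the pmap's active hart set and the per-CPU curpmap are updated.
 */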
4645 void
4646 pmap_activate_boot(pmap_t pmap)
4647 {
4648 	u_int hart;
4649 
4650 	hart = PCPU_GET(hart);
4651 #ifdef SMP
4652 	CPU_SET_ATOMIC(hart, &pmap->pm_active);
4653 #else
4654 	CPU_SET(hart, &pmap->pm_active);
4655 #endif
4656 	PCPU_SET(curpmap, pmap);
4657 }
4658 
4659 void
4660 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
4661 {
4662 	cpuset_t mask;
4663 
4664 	/*
4665 	 * From the RISC-V User-Level ISA V2.2:
4666 	 *
4667 	 * "To make a store to instruction memory visible to all
4668 	 * RISC-V harts, the writing hart has to execute a data FENCE
4669 	 * before requesting that all remote RISC-V harts execute a
4670 	 * FENCE.I."
4671 	 *
4672 	 * However, this is slightly misleading; we still need to
4673 	 * perform a FENCE.I for the local hart, as FENCE does nothing
4674 	 * for its icache. FENCE.I alone is also sufficient for the
4675 	 * local hart.
4676 	 */
4677 	sched_pin();
4678 	mask = all_harts;
4679 	CPU_CLR(PCPU_GET(hart), &mask);
4680 	fence_i();
4681 	if (!CPU_EMPTY(&mask) && smp_started) {
4682 		fence();
4683 		sbi_remote_fence_i(mask.__bits);
4684 	}
4685 	sched_unpin();
4686 }
4687 
4688 /*
4689  *	Increase the starting virtual address of the given mapping if a
4690  *	different alignment might result in more superpage mappings.
4691  */
4692 void
4693 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
4694     vm_offset_t *addr, vm_size_t size)
4695 {
4696 	vm_offset_t superpage_offset;
4697 
4698 	if (size < L2_SIZE)
4699 		return;
4700 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
4701 		offset += ptoa(object->pg_color);
4702 	superpage_offset = offset & L2_OFFSET;
4703 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
4704 	    (*addr & L2_OFFSET) == superpage_offset)
4705 		return;
4706 	if ((*addr & L2_OFFSET) < superpage_offset)
4707 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
4708 	else
4709 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
4710 }
4711 
4712 /**
4713  * Get the kernel virtual address of a set of physical pages. If there are
4714  * physical addresses not covered by the DMAP, perform a transient mapping
4715  * that will be removed when calling pmap_unmap_io_transient().
4716  *
4717  * \param page        The pages the caller wishes to obtain the virtual
4718  *                    address on the kernel memory map.
4719  * \param vaddr       On return contains the kernel virtual memory address
4720  *                    of the pages passed in the page parameter.
4721  * \param count       Number of pages passed in.
4722  * \param can_fault   TRUE if the thread using the mapped pages can take
4723  *                    page faults, FALSE otherwise.
4724  *
4725  * \returns TRUE if the caller must call pmap_unmap_io_transient when
4726  *          finished or FALSE otherwise.
4727  *
4728  */
4729 boolean_t
4730 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
4731     boolean_t can_fault)
4732 {
4733 	vm_paddr_t paddr;
4734 	boolean_t needs_mapping;
4735 	int error __diagused, i;
4736 
4737 	/*
4738 	 * Allocate any KVA space that we need, this is done in a separate
4739 	 * loop to prevent calling vmem_alloc while pinned.
4740 	 */
4741 	needs_mapping = FALSE;
4742 	for (i = 0; i < count; i++) {
4743 		paddr = VM_PAGE_TO_PHYS(page[i]);
4744 		if (__predict_false(paddr >= DMAP_MAX_PHYSADDR)) {
4745 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
4746 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
4747 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
4748 			needs_mapping = TRUE;
4749 		} else {
4750 			vaddr[i] = PHYS_TO_DMAP(paddr);
4751 		}
4752 	}
4753 
4754 	/* Exit early if everything is covered by the DMAP */
4755 	if (!needs_mapping)
4756 		return (FALSE);
4757 
4758 	if (!can_fault)
4759 		sched_pin();
4760 	for (i = 0; i < count; i++) {
4761 		paddr = VM_PAGE_TO_PHYS(page[i]);
4762 		if (paddr >= DMAP_MAX_PHYSADDR) {
4763 			panic(
4764 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
4765 		}
4766 	}
4767 
4768 	return (needs_mapping);
4769 }
4770 
4771 void
4772 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
4773     boolean_t can_fault)
4774 {
4775 	vm_paddr_t paddr;
4776 	int i;
4777 
4778 	if (!can_fault)
4779 		sched_unpin();
4780 	for (i = 0; i < count; i++) {
4781 		paddr = VM_PAGE_TO_PHYS(page[i]);
4782 		if (paddr >= DMAP_MAX_PHYSADDR) {
4783 			panic("RISCVTODO: pmap_unmap_io_transient: Unmap data");
4784 		}
4785 	}
4786 }
4787 
4788 boolean_t
4789 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
4790 {
4791 
4792 	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_BACK);
4793 }
4794 
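/*
 * Look up the page table entries mapping "va" in the given pmap, returning
 * pointers to the L1, L2, and L3 entries through the output parameters.
 * The walk stops early at a leaf (superpage) entry, in which case the
 * lower-level pointers are set to NULL.  Returns false if no valid entry
 * exists at the L1 or L2 level.
 */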
4795 bool
4796 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l1, pd_entry_t **l2,
4797     pt_entry_t **l3)
4798 {
4799 	pd_entry_t *l1p, *l2p;
4800 
4801 	/* Get l1 directory entry. */
4802 	l1p = pmap_l1(pmap, va);
4803 	*l1 = l1p;
4804 
4805 	if (l1p == NULL || (pmap_load(l1p) & PTE_V) == 0)
4806 		return (false);
4807 
4808 	if ((pmap_load(l1p) & PTE_RX) != 0) {
4809 		*l2 = NULL;
4810 		*l3 = NULL;
4811 		return (true);
4812 	}
4813 
4814 	/* Get l2 directory entry. */
4815 	l2p = pmap_l1_to_l2(l1p, va);
4816 	*l2 = l2p;
4817 
4818 	if (l2p == NULL || (pmap_load(l2p) & PTE_V) == 0)
4819 		return (false);
4820 
4821 	if ((pmap_load(l2p) & PTE_RX) != 0) {
4822 		*l3 = NULL;
4823 		return (true);
4824 	}
4825 
4826 	/* Get l3 page table entry. */
4827 	*l3 = pmap_l2_to_l3(l2p, va);
4828 
4829 	return (true);
4830 }
4831 
4832 /*
4833  * Track a range of the kernel's virtual address space that is contiguous
4834  * in various mapping attributes.
4835  */
4836 struct pmap_kernel_map_range {
4837 	vm_offset_t sva;
4838 	pt_entry_t attrs;
4839 	int l3pages;
4840 	int l2pages;
4841 	int l1pages;
4842 };
4843 
4844 static void
4845 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
4846     vm_offset_t eva)
4847 {
4848 
4849 	if (eva <= range->sva)
4850 		return;
4851 
4852 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %d %d %d\n",
4853 	    range->sva, eva,
4854 	    (range->attrs & PTE_W) == PTE_W ? 'w' : '-',
4855 	    (range->attrs & PTE_X) == PTE_X ? 'x' : '-',
4856 	    (range->attrs & PTE_U) == PTE_U ? 'u' : 's',
4857 	    (range->attrs & PTE_G) == PTE_G ? 'g' : '-',
4858 	    range->l1pages, range->l2pages, range->l3pages);
4859 
4860 	/* Reset to sentinel value. */
4861 	range->sva = 0xfffffffffffffffful;
4862 }
4863 
4864 /*
4865  * Determine whether the attributes specified by a page table entry match those
4866  * being tracked by the current range.
4867  */
4868 static bool
4869 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
4870 {
4871 
4872 	return (range->attrs == attrs);
4873 }
4874 
4875 static void
4876 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
4877     pt_entry_t attrs)
4878 {
4879 
4880 	memset(range, 0, sizeof(*range));
4881 	range->sva = va;
4882 	range->attrs = attrs;
4883 }
4884 
4885 /*
4886  * Given a leaf PTE, derive the mapping's attributes. If they do not match
4887  * those of the current run, dump the address range and its attributes, and
4888  * begin a new run.
4889  */
4890 static void
4891 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
4892     vm_offset_t va, pd_entry_t l1e, pd_entry_t l2e, pt_entry_t l3e)
4893 {
4894 	pt_entry_t attrs;
4895 
4896 	/* The PTE global bit is inherited by lower levels. */
4897 	attrs = l1e & PTE_G;
4898 	if ((l1e & PTE_RWX) != 0)
4899 		attrs |= l1e & (PTE_RWX | PTE_U);
4900 	else if (l2e != 0)
4901 		attrs |= l2e & PTE_G;
4902 	if ((l2e & PTE_RWX) != 0)
4903 		attrs |= l2e & (PTE_RWX | PTE_U);
4904 	else if (l3e != 0)
4905 		attrs |= l3e & (PTE_RWX | PTE_U | PTE_G);
4906 
4907 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
4908 		sysctl_kmaps_dump(sb, range, va);
4909 		sysctl_kmaps_reinit(range, va, attrs);
4910 	}
4911 }
4912 
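/*
 * Handler for the vm.pmap.kernel_maps sysctl: walk the kernel page tables
 * and emit one line per contiguous range of identical mapping attributes,
 * together with the number of 1GB, 2MB, and 4KB pages backing the range.
 */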
4913 static int
4914 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
4915 {
4916 	struct pmap_kernel_map_range range;
4917 	struct sbuf sbuf, *sb;
4918 	pd_entry_t l1e, *l2, l2e;
4919 	pt_entry_t *l3, l3e;
4920 	vm_offset_t sva;
4921 	vm_paddr_t pa;
4922 	int error, i, j, k;
4923 
4924 	error = sysctl_wire_old_buffer(req, 0);
4925 	if (error != 0)
4926 		return (error);
4927 	sb = &sbuf;
4928 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
4929 
4930 	/* Sentinel value. */
4931 	range.sva = 0xfffffffffffffffful;
4932 
4933 	/*
4934 	 * Iterate over the kernel page tables without holding the kernel pmap
4935 	 * lock. Kernel page table pages are never freed, so at worst we will
4936 	 * observe inconsistencies in the output.
4937 	 */
4938 	sva = VM_MIN_KERNEL_ADDRESS;
4939 	for (i = pmap_l1_index(sva); i < Ln_ENTRIES; i++) {
4940 		if (i == pmap_l1_index(DMAP_MIN_ADDRESS))
4941 			sbuf_printf(sb, "\nDirect map:\n");
4942 		else if (i == pmap_l1_index(VM_MIN_KERNEL_ADDRESS))
4943 			sbuf_printf(sb, "\nKernel map:\n");
4944 
4945 		l1e = kernel_pmap->pm_top[i];
4946 		if ((l1e & PTE_V) == 0) {
4947 			sysctl_kmaps_dump(sb, &range, sva);
4948 			sva += L1_SIZE;
4949 			continue;
4950 		}
4951 		if ((l1e & PTE_RWX) != 0) {
4952 			sysctl_kmaps_check(sb, &range, sva, l1e, 0, 0);
4953 			range.l1pages++;
4954 			sva += L1_SIZE;
4955 			continue;
4956 		}
4957 		pa = PTE_TO_PHYS(l1e);
4958 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
4959 
4960 		for (j = pmap_l2_index(sva); j < Ln_ENTRIES; j++) {
4961 			l2e = l2[j];
4962 			if ((l2e & PTE_V) == 0) {
4963 				sysctl_kmaps_dump(sb, &range, sva);
4964 				sva += L2_SIZE;
4965 				continue;
4966 			}
4967 			if ((l2e & PTE_RWX) != 0) {
4968 				sysctl_kmaps_check(sb, &range, sva, l1e, l2e, 0);
4969 				range.l2pages++;
4970 				sva += L2_SIZE;
4971 				continue;
4972 			}
4973 			pa = PTE_TO_PHYS(l2e);
4974 			l3 = (pd_entry_t *)PHYS_TO_DMAP(pa);
4975 
4976 			for (k = pmap_l3_index(sva); k < Ln_ENTRIES; k++,
4977 			    sva += L3_SIZE) {
4978 				l3e = l3[k];
4979 				if ((l3e & PTE_V) == 0) {
4980 					sysctl_kmaps_dump(sb, &range, sva);
4981 					continue;
4982 				}
4983 				sysctl_kmaps_check(sb, &range, sva,
4984 				    l1e, l2e, l3e);
4985 				range.l3pages++;
4986 			}
4987 		}
4988 	}
4989 
4990 	error = sbuf_finish(sb);
4991 	sbuf_delete(sb);
4992 	return (error);
4993 }
4994 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
4995     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
4996     NULL, 0, sysctl_kmaps, "A",
4997     "Dump kernel address layout");
4998