xref: /freebsd/sys/arm64/arm64/pmap.c (revision 2b833162)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2003 Peter Wemm
9  * All rights reserved.
10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11  * All rights reserved.
12  * Copyright (c) 2014 Andrew Turner
13  * All rights reserved.
14  * Copyright (c) 2014-2016 The FreeBSD Foundation
15  * All rights reserved.
16  *
17  * This code is derived from software contributed to Berkeley by
18  * the Systems Programming Group of the University of Utah Computer
19  * Science Department and William Jolitz of UUNET Technologies Inc.
20  *
21  * This software was developed by Andrew Turner under sponsorship from
22  * the FreeBSD Foundation.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  * 3. All advertising materials mentioning features or use of this software
33  *    must display the following acknowledgement:
34  *	This product includes software developed by the University of
35  *	California, Berkeley and its contributors.
36  * 4. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
53  */
54 /*-
55  * Copyright (c) 2003 Networks Associates Technology, Inc.
56  * All rights reserved.
57  *
58  * This software was developed for the FreeBSD Project by Jake Burkholder,
59  * Safeport Network Services, and Network Associates Laboratories, the
60  * Security Research Division of Network Associates, Inc. under
61  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62  * CHATS research program.
63  *
64  * Redistribution and use in source and binary forms, with or without
65  * modification, are permitted provided that the following conditions
66  * are met:
67  * 1. Redistributions of source code must retain the above copyright
68  *    notice, this list of conditions and the following disclaimer.
69  * 2. Redistributions in binary form must reproduce the above copyright
70  *    notice, this list of conditions and the following disclaimer in the
71  *    documentation and/or other materials provided with the distribution.
72  *
73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83  * SUCH DAMAGE.
84  */
85 
86 #include <sys/cdefs.h>
87 __FBSDID("$FreeBSD$");
88 
89 /*
90  *	Manages physical address maps.
91  *
92  *	Since the information managed by this module is
93  *	also stored by the logical address mapping module,
94  *	this module may throw away valid virtual-to-physical
95  *	mappings at almost any time.  However, invalidations
96  *	of virtual-to-physical mappings must be done as
97  *	requested.
98  *
99  *	In order to cope with hardware architectures which
100  *	make virtual-to-physical map invalidates expensive,
101  *	this module may delay invalidate or reduced protection
102  *	operations until such time as they are actually
103  *	necessary.  This module is given full information as
104  *	to which processors are currently using which maps,
105  *	and to when physical maps must be made correct.
106  */
107 
108 #include "opt_vm.h"
109 
110 #include <sys/param.h>
111 #include <sys/asan.h>
112 #include <sys/bitstring.h>
113 #include <sys/bus.h>
114 #include <sys/systm.h>
115 #include <sys/kernel.h>
116 #include <sys/ktr.h>
117 #include <sys/limits.h>
118 #include <sys/lock.h>
119 #include <sys/malloc.h>
120 #include <sys/mman.h>
121 #include <sys/msgbuf.h>
122 #include <sys/mutex.h>
123 #include <sys/physmem.h>
124 #include <sys/proc.h>
125 #include <sys/rwlock.h>
126 #include <sys/sbuf.h>
127 #include <sys/sx.h>
128 #include <sys/vmem.h>
129 #include <sys/vmmeter.h>
130 #include <sys/sched.h>
131 #include <sys/sysctl.h>
132 #include <sys/_unrhdr.h>
133 #include <sys/smp.h>
134 
135 #include <vm/vm.h>
136 #include <vm/vm_param.h>
137 #include <vm/vm_kern.h>
138 #include <vm/vm_page.h>
139 #include <vm/vm_map.h>
140 #include <vm/vm_object.h>
141 #include <vm/vm_extern.h>
142 #include <vm/vm_pageout.h>
143 #include <vm/vm_pager.h>
144 #include <vm/vm_phys.h>
145 #include <vm/vm_radix.h>
146 #include <vm/vm_reserv.h>
147 #include <vm/vm_dumpset.h>
148 #include <vm/uma.h>
149 
150 #include <machine/asan.h>
151 #include <machine/machdep.h>
152 #include <machine/md_var.h>
153 #include <machine/pcb.h>
154 
155 #ifdef NUMA
156 #define	PMAP_MEMDOM	MAXMEMDOM
157 #else
158 #define	PMAP_MEMDOM	1
159 #endif
160 
161 #define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
162 #define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)
163 
164 #define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
165 #define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
166 #define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
167 #define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
168 
169 #define	NUL0E		L0_ENTRIES
170 #define	NUL1E		(NUL0E * NL1PG)
171 #define	NUL2E		(NUL1E * NL2PG)
172 
173 #if !defined(DIAGNOSTIC)
174 #ifdef __GNUC_GNU_INLINE__
175 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
176 #else
177 #define PMAP_INLINE	extern inline
178 #endif
179 #else
180 #define PMAP_INLINE
181 #endif
182 
183 #ifdef PV_STATS
184 #define PV_STAT(x)	do { x ; } while (0)
185 #define __pvused
186 #else
187 #define PV_STAT(x)	do { } while (0)
188 #define __pvused	__unused
189 #endif
190 
191 #define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
192 #define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
193 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
194 
195 #define	PMAP_SAN_PTE_BITS	(ATTR_DEFAULT | ATTR_S1_XN |	\
196 	ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
197 
198 struct pmap_large_md_page {
199 	struct rwlock   pv_lock;
200 	struct md_page  pv_page;
201 	/* Pad to a power of 2, see pmap_init_pv_table(). */
202 	int		pv_pad[2];
203 };
204 
205 static struct pmap_large_md_page *
206 _pa_to_pmdp(vm_paddr_t pa)
207 {
208 	struct vm_phys_seg *seg;
209 	int segind;
210 
211 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
212 		seg = &vm_phys_segs[segind];
213 		if (pa >= seg->start && pa < seg->end)
214 			return ((struct pmap_large_md_page *)seg->md_first +
215 			    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
216 	}
217 	return (NULL);
218 }
219 
220 static struct pmap_large_md_page *
221 pa_to_pmdp(vm_paddr_t pa)
222 {
223 	struct pmap_large_md_page *pvd;
224 
225 	pvd = _pa_to_pmdp(pa);
226 	if (pvd == NULL)
227 		panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
228 	return (pvd);
229 }
230 
231 static struct pmap_large_md_page *
232 page_to_pmdp(vm_page_t m)
233 {
234 	struct vm_phys_seg *seg;
235 
236 	seg = &vm_phys_segs[m->segind];
237 	return ((struct pmap_large_md_page *)seg->md_first +
238 	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
239 }
240 
241 #define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
242 #define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))
243 
244 #define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
245 	struct pmap_large_md_page *_pvd;			\
246 	struct rwlock *_lock;					\
247 	_pvd = _pa_to_pmdp(pa);					\
248 	if (__predict_false(_pvd == NULL))			\
249 		_lock = &pv_dummy_large.pv_lock;		\
250 	else							\
251 		_lock = &(_pvd->pv_lock);			\
252 	_lock;							\
253 })
254 
255 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
256 	struct rwlock **_lockp = (lockp);		\
257 	struct rwlock *_new_lock;			\
258 							\
259 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
260 	if (_new_lock != *_lockp) {			\
261 		if (*_lockp != NULL)			\
262 			rw_wunlock(*_lockp);		\
263 		*_lockp = _new_lock;			\
264 		rw_wlock(*_lockp);			\
265 	}						\
266 } while (0)
267 
268 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
269 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
270 
271 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
272 	struct rwlock **_lockp = (lockp);		\
273 							\
274 	if (*_lockp != NULL) {				\
275 		rw_wunlock(*_lockp);			\
276 		*_lockp = NULL;				\
277 	}						\
278 } while (0)
279 
280 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
281 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
282 
283 /*
284  * The presence of this flag indicates that the mapping is writeable.
285  * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
286  * it is dirty.  This flag may only be set on managed mappings.
287  *
288  * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
289  * as a software managed bit.
290  */
291 #define	ATTR_SW_DBM	ATTR_DBM
292 
293 struct pmap kernel_pmap_store;
294 
295 /* Used for mapping ACPI memory before VM is initialized */
296 #define	PMAP_PREINIT_MAPPING_COUNT	32
297 #define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
298 static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
299 static int vm_initialized = 0;		/* No need to use pre-init maps when set */
300 
301 /*
302  * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
303  * Always map entire L2 block for simplicity.
304  * VA of L2 block = preinit_map_va + i * L2_SIZE
305  */
306 static struct pmap_preinit_mapping {
307 	vm_paddr_t	pa;
308 	vm_offset_t	va;
309 	vm_size_t	size;
310 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
311 
312 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
313 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
314 vm_offset_t kernel_vm_end = 0;
315 
316 /*
317  * Data for the pv entry allocation mechanism.
318  */
319 #ifdef NUMA
320 static __inline int
321 pc_to_domain(struct pv_chunk *pc)
322 {
323 	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
324 }
325 #else
326 static __inline int
327 pc_to_domain(struct pv_chunk *pc __unused)
328 {
329 	return (0);
330 }
331 #endif
332 
333 struct pv_chunks_list {
334 	struct mtx pvc_lock;
335 	TAILQ_HEAD(pch, pv_chunk) pvc_list;
336 	int active_reclaims;
337 } __aligned(CACHE_LINE_SIZE);
338 
339 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
340 
341 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
342 #define pv_dummy pv_dummy_large.pv_page
343 __read_mostly static struct pmap_large_md_page *pv_table;
344 __read_mostly vm_paddr_t pmap_last_pa;
345 
346 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
347 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
348 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
349 
350 extern pt_entry_t pagetable_l0_ttbr1[];
351 
352 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
353 static vm_paddr_t physmap[PHYSMAP_SIZE];
354 static u_int physmap_idx;
355 
356 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
357     "VM/pmap parameters");
358 
359 #if PAGE_SIZE == PAGE_SIZE_4K
360 #define	L1_BLOCKS_SUPPORTED	1
361 #else
362 /* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
363 #define	L1_BLOCKS_SUPPORTED	0
364 #endif
365 
366 #define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)
367 
368 /*
369  * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
370  * that it has currently allocated to a pmap, a cursor ("asid_next") to
371  * optimize its search for a free ASID in the bit vector, and an epoch number
372  * ("asid_epoch") to indicate when it has reclaimed all previously allocated
373  * ASIDs that are not currently active on a processor.
374  *
375  * The current epoch number is always in the range [0, INT_MAX).  Negative
376  * numbers and INT_MAX are reserved for special cases that are described
377  * below.
378  */
379 struct asid_set {
380 	int asid_bits;
381 	bitstr_t *asid_set;
382 	int asid_set_size;
383 	int asid_next;
384 	int asid_epoch;
385 	struct mtx asid_set_mutex;
386 };
387 
388 static struct asid_set asids;
389 static struct asid_set vmids;
390 
391 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
392     "ASID allocator");
393 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
394     "The number of bits in an ASID");
395 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
396     "The last allocated ASID plus one");
397 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
398     "The current epoch number");
399 
400 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
401 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
402     "The number of bits in an VMID");
403 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
404     "The last allocated VMID plus one");
405 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
406     "The current epoch number");
407 
408 void (*pmap_clean_stage2_tlbi)(void);
409 void (*pmap_invalidate_vpipt_icache)(void);
410 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
411 void (*pmap_stage2_invalidate_all)(uint64_t);
412 
413 /*
414  * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
415  * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
416  * dynamically allocated ASIDs have a non-negative epoch number.
417  *
418  * An invalid ASID is represented by -1.
419  *
420  * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
421  * which indicates that an ASID should never be allocated to the pmap, and
422  * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
423  * allocated when the pmap is next activated.
424  */
425 #define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
426 					    ((u_long)(epoch) << 32)))
427 #define	COOKIE_TO_ASID(cookie)		((int)(cookie))
428 #define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
429 
430 #define	TLBI_VA_SHIFT			12
431 #define	TLBI_VA_MASK			((1ul << 44) - 1)
432 #define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
433 #define	TLBI_VA_L3_INCR			(L3_SIZE >> TLBI_VA_SHIFT)
434 
435 static int superpages_enabled = 1;
436 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
437     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
438     "Are large page mappings enabled?");
439 
440 /*
441  * Internal flags for pmap_enter()'s helper functions.
442  */
443 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
444 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
445 
446 TAILQ_HEAD(pv_chunklist, pv_chunk);
447 
448 static void	free_pv_chunk(struct pv_chunk *pc);
449 static void	free_pv_chunk_batch(struct pv_chunklist *batch);
450 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
451 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
452 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
453 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
454 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
455 		    vm_offset_t va);
456 
457 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
458 static bool pmap_activate_int(pmap_t pmap);
459 static void pmap_alloc_asid(pmap_t pmap);
460 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
461     vm_prot_t prot, int mode, bool skip_unmapped);
462 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
463 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
464     vm_offset_t va, struct rwlock **lockp);
465 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
466 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
467     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
468 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
469     u_int flags, vm_page_t m, struct rwlock **lockp);
470 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
471     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
472 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
473     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
474 static void pmap_reset_asid_set(pmap_t pmap);
475 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
476     vm_page_t m, struct rwlock **lockp);
477 
478 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
479 		struct rwlock **lockp);
480 
481 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
482     struct spglist *free);
483 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
484 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
485 
486 /*
487  * These load the old table data and store the new value.
488  * They need to be atomic as the System MMU may write to the table at
489  * the same time as the CPU.
490  */
491 #define	pmap_clear(table)		atomic_store_64(table, 0)
492 #define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
493 #define	pmap_load(table)		(*table)
494 #define	pmap_load_clear(table)		atomic_swap_64(table, 0)
495 #define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
496 #define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
497 #define	pmap_store(table, entry)	atomic_store_64(table, entry)
498 
499 /********************/
500 /* Inline functions */
501 /********************/
502 
503 static __inline void
504 pagecopy(void *s, void *d)
505 {
506 
507 	memcpy(d, s, PAGE_SIZE);
508 }
509 
510 static __inline pd_entry_t *
511 pmap_l0(pmap_t pmap, vm_offset_t va)
512 {
513 
514 	return (&pmap->pm_l0[pmap_l0_index(va)]);
515 }
516 
517 static __inline pd_entry_t *
518 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
519 {
520 	pd_entry_t *l1;
521 
522 	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
523 	return (&l1[pmap_l1_index(va)]);
524 }
525 
526 static __inline pd_entry_t *
527 pmap_l1(pmap_t pmap, vm_offset_t va)
528 {
529 	pd_entry_t *l0;
530 
531 	l0 = pmap_l0(pmap, va);
532 	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
533 		return (NULL);
534 
535 	return (pmap_l0_to_l1(l0, va));
536 }
537 
538 static __inline pd_entry_t *
539 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
540 {
541 	pd_entry_t l1, *l2p;
542 
543 	l1 = pmap_load(l1p);
544 
545 	KASSERT(ADDR_IS_CANONICAL(va),
546 	    ("%s: Address not in canonical form: %lx", __func__, va));
547 	/*
548 	 * The valid bit may be clear if pmap_update_entry() is concurrently
549 	 * modifying the entry, so for KVA only the entry type may be checked.
550 	 */
551 	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
552 	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
553 	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
554 	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
555 	l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK);
556 	return (&l2p[pmap_l2_index(va)]);
557 }
558 
559 static __inline pd_entry_t *
560 pmap_l2(pmap_t pmap, vm_offset_t va)
561 {
562 	pd_entry_t *l1;
563 
564 	l1 = pmap_l1(pmap, va);
565 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
566 		return (NULL);
567 
568 	return (pmap_l1_to_l2(l1, va));
569 }
570 
571 static __inline pt_entry_t *
572 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
573 {
574 	pd_entry_t l2;
575 	pt_entry_t *l3p;
576 
577 	l2 = pmap_load(l2p);
578 
579 	KASSERT(ADDR_IS_CANONICAL(va),
580 	    ("%s: Address not in canonical form: %lx", __func__, va));
581 	/*
582 	 * The valid bit may be clear if pmap_update_entry() is concurrently
583 	 * modifying the entry, so for KVA only the entry type may be checked.
584 	 */
585 	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
586 	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
587 	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
588 	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
589 	l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK);
590 	return (&l3p[pmap_l3_index(va)]);
591 }
592 
593 /*
594  * Returns the lowest valid pde for a given virtual address.
595  * The next level may or may not point to a valid page or block.
596  */
597 static __inline pd_entry_t *
598 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
599 {
600 	pd_entry_t *l0, *l1, *l2, desc;
601 
602 	l0 = pmap_l0(pmap, va);
603 	desc = pmap_load(l0) & ATTR_DESCR_MASK;
604 	if (desc != L0_TABLE) {
605 		*level = -1;
606 		return (NULL);
607 	}
608 
609 	l1 = pmap_l0_to_l1(l0, va);
610 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
611 	if (desc != L1_TABLE) {
612 		*level = 0;
613 		return (l0);
614 	}
615 
616 	l2 = pmap_l1_to_l2(l1, va);
617 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
618 	if (desc != L2_TABLE) {
619 		*level = 1;
620 		return (l1);
621 	}
622 
623 	*level = 2;
624 	return (l2);
625 }
626 
627 /*
628  * Returns the lowest valid pte block or table entry for a given virtual
629  * address. If there are no valid entries return NULL and set the level to
630  * the first invalid level.
631  */
632 static __inline pt_entry_t *
633 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
634 {
635 	pd_entry_t *l1, *l2, desc;
636 	pt_entry_t *l3;
637 
638 	l1 = pmap_l1(pmap, va);
639 	if (l1 == NULL) {
640 		*level = 0;
641 		return (NULL);
642 	}
643 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
644 	if (desc == L1_BLOCK) {
645 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
646 		*level = 1;
647 		return (l1);
648 	}
649 
650 	if (desc != L1_TABLE) {
651 		*level = 1;
652 		return (NULL);
653 	}
654 
655 	l2 = pmap_l1_to_l2(l1, va);
656 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
657 	if (desc == L2_BLOCK) {
658 		*level = 2;
659 		return (l2);
660 	}
661 
662 	if (desc != L2_TABLE) {
663 		*level = 2;
664 		return (NULL);
665 	}
666 
667 	*level = 3;
668 	l3 = pmap_l2_to_l3(l2, va);
669 	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
670 		return (NULL);
671 
672 	return (l3);
673 }
674 
675 /*
676  * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
677  * level that maps the specified virtual address, then a pointer to that entry
678  * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
679  * and a diagnostic message is provided, in which case this function panics.
680  */
681 static __always_inline pt_entry_t *
682 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
683 {
684 	pd_entry_t *l0p, *l1p, *l2p;
685 	pt_entry_t desc, *l3p;
686 	int walk_level __diagused;
687 
688 	KASSERT(level >= 0 && level < 4,
689 	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
690 	    level));
691 	l0p = pmap_l0(pmap, va);
692 	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
693 	if (desc == L0_TABLE && level > 0) {
694 		l1p = pmap_l0_to_l1(l0p, va);
695 		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
696 		if (desc == L1_BLOCK && level == 1) {
697 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
698 			return (l1p);
699 		}
700 		if (desc == L1_TABLE && level > 1) {
701 			l2p = pmap_l1_to_l2(l1p, va);
702 			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
703 			if (desc == L2_BLOCK && level == 2)
704 				return (l2p);
705 			else if (desc == L2_TABLE && level > 2) {
706 				l3p = pmap_l2_to_l3(l2p, va);
707 				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
708 				if (desc == L3_PAGE && level == 3)
709 					return (l3p);
710 				else
711 					walk_level = 3;
712 			} else
713 				walk_level = 2;
714 		} else
715 			walk_level = 1;
716 	} else
717 		walk_level = 0;
718 	KASSERT(diag == NULL,
719 	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
720 	    diag, va, level, desc, walk_level));
721 	return (NULL);
722 }
723 
724 bool
725 pmap_ps_enabled(pmap_t pmap)
726 {
727 	/*
728 	 * Promotion requires a hypervisor call when the kernel is running
729 	 * in EL1. To stop this disable superpage support on non-stage 1
730 	 * pmaps for now.
731 	 */
732 	if (pmap->pm_stage != PM_STAGE1)
733 		return (false);
734 
735 	return (superpages_enabled != 0);
736 }
737 
738 bool
739 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
740     pd_entry_t **l2, pt_entry_t **l3)
741 {
742 	pd_entry_t *l0p, *l1p, *l2p;
743 
744 	if (pmap->pm_l0 == NULL)
745 		return (false);
746 
747 	l0p = pmap_l0(pmap, va);
748 	*l0 = l0p;
749 
750 	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
751 		return (false);
752 
753 	l1p = pmap_l0_to_l1(l0p, va);
754 	*l1 = l1p;
755 
756 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
757 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
758 		*l2 = NULL;
759 		*l3 = NULL;
760 		return (true);
761 	}
762 
763 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
764 		return (false);
765 
766 	l2p = pmap_l1_to_l2(l1p, va);
767 	*l2 = l2p;
768 
769 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
770 		*l3 = NULL;
771 		return (true);
772 	}
773 
774 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
775 		return (false);
776 
777 	*l3 = pmap_l2_to_l3(l2p, va);
778 
779 	return (true);
780 }
781 
782 static __inline int
783 pmap_l3_valid(pt_entry_t l3)
784 {
785 
786 	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
787 }
788 
789 CTASSERT(L1_BLOCK == L2_BLOCK);
790 
791 static pt_entry_t
792 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
793 {
794 	pt_entry_t val;
795 
796 	if (pmap->pm_stage == PM_STAGE1) {
797 		val = ATTR_S1_IDX(memattr);
798 		if (memattr == VM_MEMATTR_DEVICE)
799 			val |= ATTR_S1_XN;
800 		return (val);
801 	}
802 
803 	val = 0;
804 
805 	switch (memattr) {
806 	case VM_MEMATTR_DEVICE:
807 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
808 		    ATTR_S2_XN(ATTR_S2_XN_ALL));
809 	case VM_MEMATTR_UNCACHEABLE:
810 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
811 	case VM_MEMATTR_WRITE_BACK:
812 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
813 	case VM_MEMATTR_WRITE_THROUGH:
814 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
815 	default:
816 		panic("%s: invalid memory attribute %x", __func__, memattr);
817 	}
818 }
819 
820 static pt_entry_t
821 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
822 {
823 	pt_entry_t val;
824 
825 	val = 0;
826 	if (pmap->pm_stage == PM_STAGE1) {
827 		if ((prot & VM_PROT_EXECUTE) == 0)
828 			val |= ATTR_S1_XN;
829 		if ((prot & VM_PROT_WRITE) == 0)
830 			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
831 	} else {
832 		if ((prot & VM_PROT_WRITE) != 0)
833 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
834 		if ((prot & VM_PROT_READ) != 0)
835 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
836 		if ((prot & VM_PROT_EXECUTE) == 0)
837 			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
838 	}
839 
840 	return (val);
841 }
842 
843 /*
844  * Checks if the PTE is dirty.
845  */
846 static inline int
847 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
848 {
849 
850 	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
851 
852 	if (pmap->pm_stage == PM_STAGE1) {
853 		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
854 		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
855 
856 		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
857 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
858 	}
859 
860 	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
861 	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
862 }
863 
864 static __inline void
865 pmap_resident_count_inc(pmap_t pmap, int count)
866 {
867 
868 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
869 	pmap->pm_stats.resident_count += count;
870 }
871 
872 static __inline void
873 pmap_resident_count_dec(pmap_t pmap, int count)
874 {
875 
876 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
877 	KASSERT(pmap->pm_stats.resident_count >= count,
878 	    ("pmap %p resident count underflow %ld %d", pmap,
879 	    pmap->pm_stats.resident_count, count));
880 	pmap->pm_stats.resident_count -= count;
881 }
882 
883 static vm_paddr_t
884 pmap_early_vtophys(vm_offset_t va)
885 {
886 	vm_paddr_t pa_page;
887 
888 	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
889 	return (pa_page | (va & PAR_LOW_MASK));
890 }
891 
892 /* State of the bootstrapped DMAP page tables */
893 struct pmap_bootstrap_state {
894 	pt_entry_t	*l1;
895 	pt_entry_t	*l2;
896 	pt_entry_t	*l3;
897 	vm_offset_t	freemempos;
898 	vm_offset_t	va;
899 	vm_paddr_t	pa;
900 	pt_entry_t	table_attrs;
901 	u_int		l0_slot;
902 	u_int		l1_slot;
903 	u_int		l2_slot;
904 	bool		dmap_valid;
905 };
906 
907 /* The bootstrap state */
908 static struct pmap_bootstrap_state bs_state = {
909 	.l1 = NULL,
910 	.l2 = NULL,
911 	.l3 = NULL,
912 	.table_attrs = TATTR_PXN_TABLE,
913 	.l0_slot = L0_ENTRIES,
914 	.l1_slot = Ln_ENTRIES,
915 	.l2_slot = Ln_ENTRIES,
916 	.dmap_valid = false,
917 };
918 
919 static void
920 pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
921 {
922 	vm_paddr_t l1_pa;
923 	pd_entry_t l0e;
924 	u_int l0_slot;
925 
926 	/* Link the level 0 table to a level 1 table */
927 	l0_slot = pmap_l0_index(state->va);
928 	if (l0_slot != state->l0_slot) {
929 		/*
930 		 * Make sure we move from a low address to high address
931 		 * before the DMAP region is ready. This ensures we never
932 		 * modify an existing mapping until we can map from a
933 		 * physical address to a virtual address.
934 		 */
935 		MPASS(state->l0_slot < l0_slot ||
936 		    state->l0_slot == L0_ENTRIES ||
937 		    state->dmap_valid);
938 
939 		/* Reset lower levels */
940 		state->l2 = NULL;
941 		state->l3 = NULL;
942 		state->l1_slot = Ln_ENTRIES;
943 		state->l2_slot = Ln_ENTRIES;
944 
945 		/* Check the existing L0 entry */
946 		state->l0_slot = l0_slot;
947 		if (state->dmap_valid) {
948 			l0e = pagetable_l0_ttbr1[l0_slot];
949 			if ((l0e & ATTR_DESCR_VALID) != 0) {
950 				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
951 				l1_pa = l0e & ~ATTR_MASK;
952 				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
953 				return;
954 			}
955 		}
956 
957 		/* Create a new L0 table entry */
958 		state->l1 = (pt_entry_t *)state->freemempos;
959 		memset(state->l1, 0, PAGE_SIZE);
960 		state->freemempos += PAGE_SIZE;
961 
962 		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
963 		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
964 		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
965 		pmap_store(&pagetable_l0_ttbr1[l0_slot], l1_pa |
966 		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
967 	}
968 	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
969 }
970 
971 static void
972 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
973 {
974 	vm_paddr_t l2_pa;
975 	pd_entry_t l1e;
976 	u_int l1_slot;
977 
978 	/* Make sure there is a valid L0 -> L1 table */
979 	pmap_bootstrap_l0_table(state);
980 
981 	/* Link the level 1 table to a level 2 table */
982 	l1_slot = pmap_l1_index(state->va);
983 	if (l1_slot != state->l1_slot) {
984 		/* See pmap_bootstrap_l0_table for a description */
985 		MPASS(state->l1_slot < l1_slot ||
986 		    state->l1_slot == Ln_ENTRIES ||
987 		    state->dmap_valid);
988 
989 		/* Reset lower levels */
990 		state->l3 = NULL;
991 		state->l2_slot = Ln_ENTRIES;
992 
993 		/* Check the existing L1 entry */
994 		state->l1_slot = l1_slot;
995 		if (state->dmap_valid) {
996 			l1e = state->l1[l1_slot];
997 			if ((l1e & ATTR_DESCR_VALID) != 0) {
998 				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
999 				l2_pa = l1e & ~ATTR_MASK;
1000 				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
1001 				return;
1002 			}
1003 		}
1004 
1005 		/* Create a new L1 table entry */
1006 		state->l2 = (pt_entry_t *)state->freemempos;
1007 		memset(state->l2, 0, PAGE_SIZE);
1008 		state->freemempos += PAGE_SIZE;
1009 
1010 		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1011 		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1012 		MPASS(state->l1[l1_slot] == 0);
1013 		pmap_store(&state->l1[l1_slot], l2_pa | state->table_attrs |
1014 		    L1_TABLE);
1015 	}
1016 	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1017 }
1018 
1019 static void
1020 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1021 {
1022 	vm_paddr_t l3_pa;
1023 	pd_entry_t l2e;
1024 	u_int l2_slot;
1025 
1026 	/* Make sure there is a valid L1 -> L2 table */
1027 	pmap_bootstrap_l1_table(state);
1028 
1029 	/* Link the level 2 table to a level 3 table */
1030 	l2_slot = pmap_l2_index(state->va);
1031 	if (l2_slot != state->l2_slot) {
1032 		/* See pmap_bootstrap_l0_table for a description */
1033 		MPASS(state->l2_slot < l2_slot ||
1034 		    state->l2_slot == Ln_ENTRIES ||
1035 		    state->dmap_valid);
1036 
1037 		/* Check the existing L2 entry */
1038 		state->l2_slot = l2_slot;
1039 		if (state->dmap_valid) {
1040 			l2e = state->l2[l2_slot];
1041 			if ((l2e & ATTR_DESCR_VALID) != 0) {
1042 				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1043 				l3_pa = l2e & ~ATTR_MASK;
1044 				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1045 				return;
1046 			}
1047 		}
1048 
1049 		/* Create a new L2 table entry */
1050 		state->l3 = (pt_entry_t *)state->freemempos;
1051 		memset(state->l3, 0, PAGE_SIZE);
1052 		state->freemempos += PAGE_SIZE;
1053 
1054 		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1055 		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1056 		MPASS(state->l2[l2_slot] == 0);
1057 		pmap_store(&state->l2[l2_slot], l3_pa | state->table_attrs |
1058 		    L2_TABLE);
1059 	}
1060 	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1061 }
1062 
1063 static void
1064 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1065 {
1066 	u_int l2_slot;
1067 	bool first;
1068 
1069 	if ((physmap[i + 1] - state->pa) < L2_SIZE)
1070 		return;
1071 
1072 	/* Make sure there is a valid L1 table */
1073 	pmap_bootstrap_l1_table(state);
1074 
1075 	MPASS((state->va & L2_OFFSET) == 0);
1076 	for (first = true;
1077 	    state->va < DMAP_MAX_ADDRESS &&
1078 	    (physmap[i + 1] - state->pa) >= L2_SIZE;
1079 	    state->va += L2_SIZE, state->pa += L2_SIZE) {
1080 		/*
1081 		 * Stop if we are about to walk off the end of what the
1082 		 * current L1 slot can address.
1083 		 */
1084 		if (!first && (state->pa & L1_OFFSET) == 0)
1085 			break;
1086 
1087 		first = false;
1088 		l2_slot = pmap_l2_index(state->va);
1089 		MPASS((state->pa & L2_OFFSET) == 0);
1090 		MPASS(state->l2[l2_slot] == 0);
1091 		pmap_store(&state->l2[l2_slot], state->pa | ATTR_DEFAULT |
1092 		    ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1093 		    L2_BLOCK);
1094 	}
1095 	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1096 }
1097 
1098 static void
1099 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1100 {
1101 	u_int l3_slot;
1102 	bool first;
1103 
1104 	if ((physmap[i + 1] - state->pa) < L3_SIZE)
1105 		return;
1106 
1107 	/* Make sure there is a valid L2 table */
1108 	pmap_bootstrap_l2_table(state);
1109 
1110 	MPASS((state->va & L3_OFFSET) == 0);
1111 	for (first = true;
1112 	    state->va < DMAP_MAX_ADDRESS &&
1113 	    (physmap[i + 1] - state->pa) >= L3_SIZE;
1114 	    state->va += L3_SIZE, state->pa += L3_SIZE) {
1115 		/*
1116 		 * Stop if we are about to walk off the end of what the
1117 		 * current L2 slot can address.
1118 		 */
1119 		if (!first && (state->pa & L2_OFFSET) == 0)
1120 			break;
1121 
1122 		first = false;
1123 		l3_slot = pmap_l3_index(state->va);
1124 		MPASS((state->pa & L3_OFFSET) == 0);
1125 		MPASS(state->l3[l3_slot] == 0);
1126 		pmap_store(&state->l3[l3_slot], state->pa | ATTR_DEFAULT |
1127 		    ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1128 		    L3_PAGE);
1129 	}
1130 	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1131 }
1132 
1133 static void
1134 pmap_bootstrap_dmap(vm_paddr_t min_pa)
1135 {
1136 	int i;
1137 
1138 	dmap_phys_base = min_pa & ~L1_OFFSET;
1139 	dmap_phys_max = 0;
1140 	dmap_max_addr = 0;
1141 
1142 	for (i = 0; i < (physmap_idx * 2); i += 2) {
1143 		bs_state.pa = physmap[i] & ~L3_OFFSET;
1144 		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1145 
1146 		/* Create L3 mappings at the start of the region */
1147 		if ((bs_state.pa & L2_OFFSET) != 0)
1148 			pmap_bootstrap_l3_page(&bs_state, i);
1149 		MPASS(bs_state.pa <= physmap[i + 1]);
1150 
1151 		if (L1_BLOCKS_SUPPORTED) {
1152 			/* Create L2 mappings at the start of the region */
1153 			if ((bs_state.pa & L1_OFFSET) != 0)
1154 				pmap_bootstrap_l2_block(&bs_state, i);
1155 			MPASS(bs_state.pa <= physmap[i + 1]);
1156 
1157 			/* Create the main L1 block mappings */
1158 			for (; bs_state.va < DMAP_MAX_ADDRESS &&
1159 			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1160 			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1161 				/* Make sure there is a valid L1 table */
1162 				pmap_bootstrap_l0_table(&bs_state);
1163 				MPASS((bs_state.pa & L1_OFFSET) == 0);
1164 				pmap_store(
1165 				    &bs_state.l1[pmap_l1_index(bs_state.va)],
1166 				    bs_state.pa | ATTR_DEFAULT | ATTR_S1_XN |
1167 				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1168 				    L1_BLOCK);
1169 			}
1170 			MPASS(bs_state.pa <= physmap[i + 1]);
1171 
1172 			/* Create L2 mappings at the end of the region */
1173 			pmap_bootstrap_l2_block(&bs_state, i);
1174 		} else {
1175 			while (bs_state.va < DMAP_MAX_ADDRESS &&
1176 			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1177 				pmap_bootstrap_l2_block(&bs_state, i);
1178 			}
1179 		}
1180 		MPASS(bs_state.pa <= physmap[i + 1]);
1181 
1182 		/* Create L3 mappings at the end of the region */
1183 		pmap_bootstrap_l3_page(&bs_state, i);
1184 		MPASS(bs_state.pa == physmap[i + 1]);
1185 
1186 		if (bs_state.pa > dmap_phys_max) {
1187 			dmap_phys_max = bs_state.pa;
1188 			dmap_max_addr = bs_state.va;
1189 		}
1190 	}
1191 
1192 	cpu_tlb_flushID();
1193 }
1194 
1195 static void
1196 pmap_bootstrap_l2(vm_offset_t va)
1197 {
1198 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1199 
1200 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1201 	bs_state.va = va;
1202 
1203 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1204 		pmap_bootstrap_l1_table(&bs_state);
1205 }
1206 
1207 static void
1208 pmap_bootstrap_l3(vm_offset_t va)
1209 {
1210 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1211 
1212 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1213 	bs_state.va = va;
1214 
1215 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1216 		pmap_bootstrap_l2_table(&bs_state);
1217 }
1218 
1219 #ifdef KASAN
1220 static void
1221 pmap_bootstrap_allocate_kasan_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1222     vm_offset_t *start_va, int *nkasan_l2)
1223 {
1224 	int i;
1225 	vm_paddr_t pa;
1226 	vm_offset_t va;
1227 	pd_entry_t *l2;
1228 
1229 	va = *start_va;
1230 	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1231 	l2 = pmap_l2(kernel_pmap, va);
1232 
1233 	for (i = 0; pa >= start_pa && i < *nkasan_l2;
1234 	    i++, va += L2_SIZE, pa -= L2_SIZE, l2++) {
1235 		/*
1236 		 * KASAN stack checking results in us having already allocated
1237 		 * part of our shadow map, so we can just skip those segments.
1238 		 */
1239 		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1240 			pa += L2_SIZE;
1241 			continue;
1242 		}
1243 
1244 		pmap_store(l2, (pa & ~Ln_TABLE_MASK) | PMAP_SAN_PTE_BITS |
1245 		    L2_BLOCK);
1246 	}
1247 
1248 	/*
1249 	 * Ended the allocation due to start_pa constraint, rather than because
1250 	 * we allocated everything.  Adjust back up to the start_pa and remove
1251 	 * the invalid L2 block from our accounting.
1252 	 */
1253 	if (pa < start_pa) {
1254 		va += L2_SIZE;
1255 		i--;
1256 		pa = start_pa;
1257 	}
1258 
1259 	bzero((void *)PHYS_TO_DMAP(pa), i * L2_SIZE);
1260 	physmem_exclude_region(pa, i * L2_SIZE, EXFLAG_NOALLOC);
1261 
1262 	*nkasan_l2 -= i;
1263 	*start_va = va;
1264 }
1265 #endif
1266 
1267 /*
1268  *	Bootstrap the system enough to run with virtual memory.
1269  */
1270 void
1271 pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen)
1272 {
1273 	vm_offset_t dpcpu, msgbufpv;
1274 	vm_paddr_t start_pa, pa, min_pa;
1275 	uint64_t kern_delta;
1276 	int i;
1277 
1278 	/* Verify that the ASID is set through TTBR0. */
1279 	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
1280 	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1281 
1282 	kern_delta = KERNBASE - kernstart;
1283 
1284 	printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen);
1285 	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
1286 
1287 	/* Set this early so we can use the pagetable walking functions */
1288 	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1289 	PMAP_LOCK_INIT(kernel_pmap);
1290 	kernel_pmap->pm_l0_paddr =
1291 	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1292 	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1293 	kernel_pmap->pm_stage = PM_STAGE1;
1294 	kernel_pmap->pm_levels = 4;
1295 	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1296 	kernel_pmap->pm_asid_set = &asids;
1297 
1298 	/* Assume the address we were loaded to is a valid physical address */
1299 	min_pa = KERNBASE - kern_delta;
1300 
1301 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1302 	physmap_idx /= 2;
1303 
1304 	/*
1305 	 * Find the minimum physical address. physmap is sorted,
1306 	 * but may contain empty ranges.
1307 	 */
1308 	for (i = 0; i < physmap_idx * 2; i += 2) {
1309 		if (physmap[i] == physmap[i + 1])
1310 			continue;
1311 		if (physmap[i] <= min_pa)
1312 			min_pa = physmap[i];
1313 	}
1314 
1315 	bs_state.freemempos = KERNBASE + kernlen;
1316 	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1317 
1318 	/* Create a direct map region early so we can use it for pa -> va */
1319 	pmap_bootstrap_dmap(min_pa);
1320 	bs_state.dmap_valid = true;
1321 	/*
1322 	 * We only use PXN when we know nothing will be executed from it, e.g.
1323 	 * the DMAP region.
1324 	 */
1325 	bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1326 
1327 	start_pa = pa = KERNBASE - kern_delta;
1328 
1329 	/*
1330 	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
1331 	 * loader allocated the first and only l2 page table page used to map
1332 	 * the kernel, preloaded files and module metadata.
1333 	 */
1334 	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1335 	/* And the l3 tables for the early devmap */
1336 	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1337 
1338 	cpu_tlb_flushID();
1339 
1340 #define alloc_pages(var, np)						\
1341 	(var) = bs_state.freemempos;					\
1342 	bs_state.freemempos += (np * PAGE_SIZE);			\
1343 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
1344 
1345 	/* Allocate dynamic per-cpu area. */
1346 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1347 	dpcpu_init((void *)dpcpu, 0);
1348 
1349 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1350 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1351 	msgbufp = (void *)msgbufpv;
1352 
1353 	/* Reserve some VA space for early BIOS/ACPI mapping */
1354 	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1355 
1356 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1357 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
1358 	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1359 	kernel_vm_end = virtual_avail;
1360 
1361 	pa = pmap_early_vtophys(bs_state.freemempos);
1362 
1363 	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1364 
1365 	cpu_tlb_flushID();
1366 }
1367 
1368 #if defined(KASAN)
1369 /*
1370  * Finish constructing the initial shadow map:
1371  * - Count how many pages from KERNBASE to virtual_avail (scaled for
1372  *   shadow map)
1373  * - Map that entire range using L2 superpages.
1374  */
1375 void
1376 pmap_bootstrap_san(vm_paddr_t kernstart)
1377 {
1378 	vm_offset_t va;
1379 	int i, shadow_npages, nkasan_l2;
1380 
1381 	/*
1382 	 * Rebuild physmap one more time, we may have excluded more regions from
1383 	 * allocation since pmap_bootstrap().
1384 	 */
1385 	bzero(physmap, sizeof(physmap));
1386 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1387 	physmap_idx /= 2;
1388 
1389 	shadow_npages = (virtual_avail - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE;
1390 	shadow_npages = howmany(shadow_npages, KASAN_SHADOW_SCALE);
1391 	nkasan_l2 = howmany(shadow_npages, Ln_ENTRIES);
1392 
1393 	/* Map the valid KVA up to this point. */
1394 	va = KASAN_MIN_ADDRESS;
1395 
1396 	/*
1397 	 * Find a slot in the physmap large enough for what we needed.  We try to put
1398 	 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1399 	 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1400 	 */
1401 	for (i = (physmap_idx * 2) - 2; i >= 0 && nkasan_l2 > 0; i -= 2) {
1402 		vm_paddr_t plow, phigh;
1403 
1404 		/* L2 mappings must be backed by memory that is L2-aligned */
1405 		plow = roundup2(physmap[i], L2_SIZE);
1406 		phigh = physmap[i + 1];
1407 		if (plow >= phigh)
1408 			continue;
1409 		if (kernstart >= plow && kernstart < phigh)
1410 			phigh = kernstart;
1411 		if (phigh - plow >= L2_SIZE)
1412 			pmap_bootstrap_allocate_kasan_l2(plow, phigh, &va,
1413 			    &nkasan_l2);
1414 	}
1415 
1416 	if (nkasan_l2 != 0)
1417 		panic("Could not find phys region for shadow map");
1418 
1419 	/*
1420 	 * Done. We should now have a valid shadow address mapped for all KVA
1421 	 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1422 	 * shadow accesses by the kasan(9) runtime will succeed for this range.
1423 	 * When the kernel virtual address range is later expanded, as will
1424 	 * happen in vm_mem_init(), the shadow map will be grown as well. This
1425 	 * is handled by pmap_san_enter().
1426 	 */
1427 }
1428 #endif
1429 
1430 /*
1431  *	Initialize a vm_page's machine-dependent fields.
1432  */
1433 void
1434 pmap_page_init(vm_page_t m)
1435 {
1436 
1437 	TAILQ_INIT(&m->md.pv_list);
1438 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1439 }
1440 
1441 static void
1442 pmap_init_asids(struct asid_set *set, int bits)
1443 {
1444 	int i;
1445 
1446 	set->asid_bits = bits;
1447 
1448 	/*
1449 	 * We may be too early in the overall initialization process to use
1450 	 * bit_alloc().
1451 	 */
1452 	set->asid_set_size = 1 << set->asid_bits;
1453 	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1454 	    M_WAITOK | M_ZERO);
1455 	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1456 		bit_set(set->asid_set, i);
1457 	set->asid_next = ASID_FIRST_AVAILABLE;
1458 	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1459 }
1460 
1461 static void
1462 pmap_init_pv_table(void)
1463 {
1464 	struct vm_phys_seg *seg, *next_seg;
1465 	struct pmap_large_md_page *pvd;
1466 	vm_size_t s;
1467 	int domain, i, j, pages;
1468 
1469 	/*
1470 	 * We strongly depend on the size being a power of two, so the assert
1471 	 * is overzealous. However, should the struct be resized to a
1472 	 * different power of two, the code below needs to be revisited.
1473 	 */
1474 	CTASSERT((sizeof(*pvd) == 64));
1475 
1476 	/*
1477 	 * Calculate the size of the array.
1478 	 */
1479 	s = 0;
1480 	for (i = 0; i < vm_phys_nsegs; i++) {
1481 		seg = &vm_phys_segs[i];
1482 		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1483 		    pmap_l2_pindex(seg->start);
1484 		s += round_page(pages * sizeof(*pvd));
1485 	}
1486 	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1487 	if (pv_table == NULL)
1488 		panic("%s: kva_alloc failed\n", __func__);
1489 
1490 	/*
1491 	 * Iterate physical segments to allocate domain-local memory for PV
1492 	 * list headers.
1493 	 */
1494 	pvd = pv_table;
1495 	for (i = 0; i < vm_phys_nsegs; i++) {
1496 		seg = &vm_phys_segs[i];
1497 		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1498 		    pmap_l2_pindex(seg->start);
1499 		domain = seg->domain;
1500 
1501 		s = round_page(pages * sizeof(*pvd));
1502 
1503 		for (j = 0; j < s; j += PAGE_SIZE) {
1504 			vm_page_t m = vm_page_alloc_noobj_domain(domain,
1505 			    VM_ALLOC_ZERO);
1506 			if (m == NULL)
1507 				panic("failed to allocate PV table page");
1508 			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1509 		}
1510 
1511 		for (j = 0; j < s / sizeof(*pvd); j++) {
1512 			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1513 			TAILQ_INIT(&pvd->pv_page.pv_list);
1514 			pvd++;
1515 		}
1516 	}
1517 	pvd = &pv_dummy_large;
1518 	memset(pvd, 0, sizeof(*pvd));
1519 	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1520 	TAILQ_INIT(&pvd->pv_page.pv_list);
1521 
1522 	/*
1523 	 * Set pointers from vm_phys_segs to pv_table.
1524 	 */
1525 	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1526 		seg = &vm_phys_segs[i];
1527 		seg->md_first = pvd;
1528 		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1529 		    pmap_l2_pindex(seg->start);
1530 
1531 		/*
1532 		 * If there is a following segment, and the final
1533 		 * superpage of this segment and the initial superpage
1534 		 * of the next segment are the same then adjust the
1535 		 * pv_table entry for that next segment down by one so
1536 		 * that the pv_table entries will be shared.
1537 		 */
1538 		if (i + 1 < vm_phys_nsegs) {
1539 			next_seg = &vm_phys_segs[i + 1];
1540 			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1541 			    pmap_l2_pindex(next_seg->start)) {
1542 				pvd--;
1543 			}
1544 		}
1545 	}
1546 }
1547 
1548 /*
1549  *	Initialize the pmap module.
1550  *	Called by vm_init, to initialize any structures that the pmap
1551  *	system needs to map virtual memory.
1552  */
1553 void
1554 pmap_init(void)
1555 {
1556 	uint64_t mmfr1;
1557 	int i, vmid_bits;
1558 
1559 	/*
1560 	 * Are large page mappings enabled?
1561 	 */
1562 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1563 	if (superpages_enabled) {
1564 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1565 		    ("pmap_init: can't assign to pagesizes[1]"));
1566 		pagesizes[1] = L2_SIZE;
1567 		if (L1_BLOCKS_SUPPORTED) {
1568 			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1569 			    ("pmap_init: can't assign to pagesizes[2]"));
1570 			pagesizes[2] = L1_SIZE;
1571 		}
1572 	}
1573 
1574 	/*
1575 	 * Initialize the ASID allocator.
1576 	 */
1577 	pmap_init_asids(&asids,
1578 	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1579 
1580 	if (has_hyp()) {
1581 		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1582 		vmid_bits = 8;
1583 
1584 		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1585 		    ID_AA64MMFR1_VMIDBits_16)
1586 			vmid_bits = 16;
1587 		pmap_init_asids(&vmids, vmid_bits);
1588 	}
1589 
1590 	/*
1591 	 * Initialize pv chunk lists.
1592 	 */
1593 	for (i = 0; i < PMAP_MEMDOM; i++) {
1594 		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1595 		    MTX_DEF);
1596 		TAILQ_INIT(&pv_chunks[i].pvc_list);
1597 	}
1598 	pmap_init_pv_table();
1599 
1600 	vm_initialized = 1;
1601 }
1602 
1603 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1604     "2MB page mapping counters");
1605 
1606 static u_long pmap_l2_demotions;
1607 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1608     &pmap_l2_demotions, 0, "2MB page demotions");
1609 
1610 static u_long pmap_l2_mappings;
1611 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1612     &pmap_l2_mappings, 0, "2MB page mappings");
1613 
1614 static u_long pmap_l2_p_failures;
1615 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1616     &pmap_l2_p_failures, 0, "2MB page promotion failures");
1617 
1618 static u_long pmap_l2_promotions;
1619 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1620     &pmap_l2_promotions, 0, "2MB page promotions");
1621 
1622 /*
1623  * If the given value for "final_only" is false, then any cached intermediate-
1624  * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1625  * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1626  * Otherwise, just the cached final-level entry is invalidated.
1627  */
1628 static __inline void
1629 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1630 {
1631 	if (final_only)
1632 		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1633 	else
1634 		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1635 }
1636 
1637 static __inline void
1638 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1639 {
1640 	if (final_only)
1641 		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
1642 	else
1643 		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
1644 }
1645 
1646 /*
1647  * Invalidates any cached final- and optionally intermediate-level TLB entries
1648  * for the specified virtual address in the given virtual address space.
1649  */
1650 static __inline void
1651 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1652 {
1653 	uint64_t r;
1654 
1655 	PMAP_ASSERT_STAGE1(pmap);
1656 
1657 	dsb(ishst);
1658 	r = TLBI_VA(va);
1659 	if (pmap == kernel_pmap) {
1660 		pmap_s1_invalidate_kernel(r, final_only);
1661 	} else {
1662 		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1663 		pmap_s1_invalidate_user(r, final_only);
1664 	}
1665 	dsb(ish);
1666 	isb();
1667 }
1668 
1669 static __inline void
1670 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1671 {
1672 	PMAP_ASSERT_STAGE2(pmap);
1673 	MPASS(pmap_stage2_invalidate_range != NULL);
1674 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1675 	    final_only);
1676 }
1677 
1678 static __inline void
1679 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1680 {
1681 	if (pmap->pm_stage == PM_STAGE1)
1682 		pmap_s1_invalidate_page(pmap, va, final_only);
1683 	else
1684 		pmap_s2_invalidate_page(pmap, va, final_only);
1685 }
1686 
1687 /*
1688  * Invalidates any cached final- and optionally intermediate-level TLB entries
1689  * for the specified virtual address range in the given virtual address space.
1690  */
1691 static __inline void
1692 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1693     bool final_only)
1694 {
1695 	uint64_t end, r, start;
1696 
1697 	PMAP_ASSERT_STAGE1(pmap);
1698 
1699 	dsb(ishst);
1700 	if (pmap == kernel_pmap) {
1701 		start = TLBI_VA(sva);
1702 		end = TLBI_VA(eva);
1703 		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1704 			pmap_s1_invalidate_kernel(r, final_only);
1705 	} else {
1706 		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1707 		start |= TLBI_VA(sva);
1708 		end |= TLBI_VA(eva);
1709 		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1710 			pmap_s1_invalidate_user(r, final_only);
1711 	}
1712 	dsb(ish);
1713 	isb();
1714 }
1715 
1716 static __inline void
1717 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1718     bool final_only)
1719 {
1720 	PMAP_ASSERT_STAGE2(pmap);
1721 	MPASS(pmap_stage2_invalidate_range != NULL);
1722 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1723 }
1724 
1725 static __inline void
1726 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1727     bool final_only)
1728 {
1729 	if (pmap->pm_stage == PM_STAGE1)
1730 		pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1731 	else
1732 		pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1733 }
1734 
1735 /*
1736  * Invalidates all cached intermediate- and final-level TLB entries for the
1737  * given virtual address space.
1738  */
1739 static __inline void
1740 pmap_s1_invalidate_all(pmap_t pmap)
1741 {
1742 	uint64_t r;
1743 
1744 	PMAP_ASSERT_STAGE1(pmap);
1745 
1746 	dsb(ishst);
1747 	if (pmap == kernel_pmap) {
1748 		__asm __volatile("tlbi vmalle1is");
1749 	} else {
1750 		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1751 		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
1752 	}
1753 	dsb(ish);
1754 	isb();
1755 }
1756 
1757 static __inline void
1758 pmap_s2_invalidate_all(pmap_t pmap)
1759 {
1760 	PMAP_ASSERT_STAGE2(pmap);
1761 	MPASS(pmap_stage2_invalidate_all != NULL);
1762 	pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1763 }
1764 
1765 static __inline void
1766 pmap_invalidate_all(pmap_t pmap)
1767 {
1768 	if (pmap->pm_stage == PM_STAGE1)
1769 		pmap_s1_invalidate_all(pmap);
1770 	else
1771 		pmap_s2_invalidate_all(pmap);
1772 }
1773 
1774 /*
1775  *	Routine:	pmap_extract
1776  *	Function:
1777  *		Extract the physical page address associated
1778  *		with the given map/virtual_address pair.
1779  */
1780 vm_paddr_t
1781 pmap_extract(pmap_t pmap, vm_offset_t va)
1782 {
1783 	pt_entry_t *pte, tpte;
1784 	vm_paddr_t pa;
1785 	int lvl;
1786 
1787 	pa = 0;
1788 	PMAP_LOCK(pmap);
1789 	/*
1790 	 * Find the block or page map for this virtual address. pmap_pte
1791 	 * will return either a valid block/page entry, or NULL.
1792 	 */
1793 	pte = pmap_pte(pmap, va, &lvl);
1794 	if (pte != NULL) {
1795 		tpte = pmap_load(pte);
1796 		pa = tpte & ~ATTR_MASK;
1797 		switch(lvl) {
1798 		case 1:
1799 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
1800 			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1801 			    ("pmap_extract: Invalid L1 pte found: %lx",
1802 			    tpte & ATTR_DESCR_MASK));
1803 			pa |= (va & L1_OFFSET);
1804 			break;
1805 		case 2:
1806 			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1807 			    ("pmap_extract: Invalid L2 pte found: %lx",
1808 			    tpte & ATTR_DESCR_MASK));
1809 			pa |= (va & L2_OFFSET);
1810 			break;
1811 		case 3:
1812 			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1813 			    ("pmap_extract: Invalid L3 pte found: %lx",
1814 			    tpte & ATTR_DESCR_MASK));
1815 			pa |= (va & L3_OFFSET);
1816 			break;
1817 		}
1818 	}
1819 	PMAP_UNLOCK(pmap);
1820 	return (pa);
1821 }
1822 
1823 /*
1824  *	Routine:	pmap_extract_and_hold
1825  *	Function:
1826  *		Atomically extract and hold the physical page
1827  *		with the given pmap and virtual address pair
1828  *		if that mapping permits the given protection.
1829  */
1830 vm_page_t
1831 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1832 {
1833 	pt_entry_t *pte, tpte;
1834 	vm_offset_t off;
1835 	vm_page_t m;
1836 	int lvl;
1837 	bool use;
1838 
1839 	m = NULL;
1840 	PMAP_LOCK(pmap);
1841 	pte = pmap_pte(pmap, va, &lvl);
1842 	if (pte != NULL) {
1843 		tpte = pmap_load(pte);
1844 
1845 		KASSERT(lvl > 0 && lvl <= 3,
1846 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
1847 		/*
1848 		 * Check that the pte is either a L3 page, or a L1 or L2 block
1849 		 * entry. We can assume L1_BLOCK == L2_BLOCK.
1850 		 */
1851 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1852 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1853 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1854 		     tpte & ATTR_DESCR_MASK));
1855 
1856 		use = false;
1857 		if ((prot & VM_PROT_WRITE) == 0)
1858 			use = true;
1859 		else if (pmap->pm_stage == PM_STAGE1 &&
1860 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1861 			use = true;
1862 		else if (pmap->pm_stage == PM_STAGE2 &&
1863 		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1864 		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1865 			use = true;
1866 
1867 		if (use) {
1868 			switch (lvl) {
1869 			case 1:
1870 				off = va & L1_OFFSET;
1871 				break;
1872 			case 2:
1873 				off = va & L2_OFFSET;
1874 				break;
1875 			case 3:
1876 			default:
1877 				off = 0;
1878 			}
1879 			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
1880 			if (m != NULL && !vm_page_wire_mapped(m))
1881 				m = NULL;
1882 		}
1883 	}
1884 	PMAP_UNLOCK(pmap);
1885 	return (m);
1886 }
1887 
1888 /*
1889  * Walks the page tables to translate a kernel virtual address to a
1890  * physical address. Returns true if the kva is valid and stores the
1891  * physical address in pa if it is not NULL.
1892  *
1893  * See the comment above data_abort() for the rationale for specifying
1894  * NO_PERTHREAD_SSP here.
1895  */
1896 bool NO_PERTHREAD_SSP
1897 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
1898 {
1899 	pt_entry_t *pte, tpte;
1900 	register_t intr;
1901 	uint64_t par;
1902 
1903 	/*
1904 	 * Disable interrupts so we don't get interrupted between asking
1905 	 * for address translation, and getting the result back.
1906 	 */
1907 	intr = intr_disable();
1908 	par = arm64_address_translate_s1e1r(va);
1909 	intr_restore(intr);
1910 
1911 	if (PAR_SUCCESS(par)) {
1912 		if (pa != NULL)
1913 			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
1914 		return (true);
1915 	}
1916 
1917 	/*
1918 	 * Fall back to walking the page table. The address translation
1919 	 * instruction may fail when the page is in a break-before-make
1920 	 * sequence. As we only clear the valid bit in said sequence we
1921 	 * can walk the page table to find the physical address.
1922 	 */
1923 
1924 	pte = pmap_l1(kernel_pmap, va);
1925 	if (pte == NULL)
1926 		return (false);
1927 
1928 	/*
1929 	 * A concurrent pmap_update_entry() will clear the entry's valid bit
1930 	 * but leave the rest of the entry unchanged.  Therefore, we treat a
1931 	 * non-zero entry as being valid, and we ignore the valid bit when
1932 	 * determining whether the entry maps a block, page, or table.
1933 	 */
1934 	tpte = pmap_load(pte);
1935 	if (tpte == 0)
1936 		return (false);
1937 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1938 		if (pa != NULL)
1939 			*pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET);
1940 		return (true);
1941 	}
1942 	pte = pmap_l1_to_l2(&tpte, va);
1943 	tpte = pmap_load(pte);
1944 	if (tpte == 0)
1945 		return (false);
1946 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1947 		if (pa != NULL)
1948 			*pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET);
1949 		return (true);
1950 	}
1951 	pte = pmap_l2_to_l3(&tpte, va);
1952 	tpte = pmap_load(pte);
1953 	if (tpte == 0)
1954 		return (false);
1955 	if (pa != NULL)
1956 		*pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET);
1957 	return (true);
1958 }
1959 
1960 vm_paddr_t
1961 pmap_kextract(vm_offset_t va)
1962 {
1963 	vm_paddr_t pa;
1964 
1965 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
1966 		return (DMAP_TO_PHYS(va));
1967 
1968 	if (pmap_klookup(va, &pa) == false)
1969 		return (0);
1970 	return (pa);
1971 }
1972 
1973 /***************************************************
1974  * Low level mapping routines.....
1975  ***************************************************/
1976 
1977 void
1978 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1979 {
1980 	pd_entry_t *pde;
1981 	pt_entry_t *pte, attr;
1982 	vm_offset_t va;
1983 	int lvl;
1984 
1985 	KASSERT((pa & L3_OFFSET) == 0,
1986 	   ("pmap_kenter: Invalid physical address"));
1987 	KASSERT((sva & L3_OFFSET) == 0,
1988 	   ("pmap_kenter: Invalid virtual address"));
1989 	KASSERT((size & PAGE_MASK) == 0,
1990 	    ("pmap_kenter: Mapping is not page-sized"));
1991 
1992 	attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
1993 	    ATTR_S1_IDX(mode) | L3_PAGE;
1994 	va = sva;
1995 	while (size != 0) {
1996 		pde = pmap_pde(kernel_pmap, va, &lvl);
1997 		KASSERT(pde != NULL,
1998 		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1999 		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2000 
2001 		pte = pmap_l2_to_l3(pde, va);
2002 		pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
2003 
2004 		va += PAGE_SIZE;
2005 		pa += PAGE_SIZE;
2006 		size -= PAGE_SIZE;
2007 	}
2008 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2009 }
2010 
2011 void
2012 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2013 {
2014 
2015 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2016 }
2017 
2018 /*
2019  * Remove a page from the kernel pagetables.
2020  */
2021 PMAP_INLINE void
2022 pmap_kremove(vm_offset_t va)
2023 {
2024 	pt_entry_t *pte;
2025 
2026 	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2027 	pmap_clear(pte);
2028 	pmap_s1_invalidate_page(kernel_pmap, va, true);
2029 }
2030 
2031 void
2032 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2033 {
2034 	pt_entry_t *pte;
2035 	vm_offset_t va;
2036 
2037 	KASSERT((sva & L3_OFFSET) == 0,
2038 	   ("pmap_kremove_device: Invalid virtual address"));
2039 	KASSERT((size & PAGE_MASK) == 0,
2040 	    ("pmap_kremove_device: Mapping is not page-sized"));
2041 
2042 	va = sva;
2043 	while (size != 0) {
2044 		pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2045 		pmap_clear(pte);
2046 
2047 		va += PAGE_SIZE;
2048 		size -= PAGE_SIZE;
2049 	}
2050 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2051 }
2052 
2053 /*
2054  *	Used to map a range of physical addresses into kernel
2055  *	virtual address space.
2056  *
2057  *	The value passed in '*virt' is a suggested virtual address for
2058  *	the mapping. Architectures which can support a direct-mapped
2059  *	physical to virtual region can return the appropriate address
2060  *	within that region, leaving '*virt' unchanged. Other
2061  *	architectures should map the pages starting at '*virt' and
2062  *	update '*virt' with the first usable address after the mapped
2063  *	region.
2064  */
2065 vm_offset_t
2066 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2067 {
2068 	return PHYS_TO_DMAP(start);
2069 }
2070 
2071 /*
2072  * Add a list of wired pages to the kva
2073  * this routine is only used for temporary
2074  * kernel mappings that do not need to have
2075  * page modification or references recorded.
2076  * Note that old mappings are simply written
2077  * over.  The page *must* be wired.
2078  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2079  */
2080 void
2081 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2082 {
2083 	pd_entry_t *pde;
2084 	pt_entry_t *pte, pa;
2085 	vm_offset_t va;
2086 	vm_page_t m;
2087 	int i, lvl;
2088 
2089 	va = sva;
2090 	for (i = 0; i < count; i++) {
2091 		pde = pmap_pde(kernel_pmap, va, &lvl);
2092 		KASSERT(pde != NULL,
2093 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2094 		KASSERT(lvl == 2,
2095 		    ("pmap_qenter: Invalid level %d", lvl));
2096 
2097 		m = ma[i];
2098 		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
2099 		    ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2100 		    ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2101 		pte = pmap_l2_to_l3(pde, va);
2102 		pmap_load_store(pte, pa);
2103 
2104 		va += L3_SIZE;
2105 	}
2106 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2107 }
2108 
2109 /*
2110  * This routine tears out page mappings from the
2111  * kernel -- it is meant only for temporary mappings.
2112  */
2113 void
2114 pmap_qremove(vm_offset_t sva, int count)
2115 {
2116 	pt_entry_t *pte;
2117 	vm_offset_t va;
2118 
2119 	KASSERT(ADDR_IS_CANONICAL(sva),
2120 	    ("%s: Address not in canonical form: %lx", __func__, sva));
2121 	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2122 
2123 	va = sva;
2124 	while (count-- > 0) {
2125 		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2126 		if (pte != NULL) {
2127 			pmap_clear(pte);
2128 		}
2129 
2130 		va += PAGE_SIZE;
2131 	}
2132 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2133 }
2134 
2135 /***************************************************
2136  * Page table page management routines.....
2137  ***************************************************/
2138 /*
2139  * Schedule the specified unused page table page to be freed.  Specifically,
2140  * add the page to the specified list of pages that will be released to the
2141  * physical memory manager after the TLB has been updated.
2142  */
2143 static __inline void
2144 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2145     boolean_t set_PG_ZERO)
2146 {
2147 
2148 	if (set_PG_ZERO)
2149 		m->flags |= PG_ZERO;
2150 	else
2151 		m->flags &= ~PG_ZERO;
2152 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2153 }
2154 
2155 /*
2156  * Decrements a page table page's reference count, which is used to record the
2157  * number of valid page table entries within the page.  If the reference count
2158  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2159  * page table page was unmapped and FALSE otherwise.
2160  */
2161 static inline boolean_t
2162 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2163 {
2164 
2165 	--m->ref_count;
2166 	if (m->ref_count == 0) {
2167 		_pmap_unwire_l3(pmap, va, m, free);
2168 		return (TRUE);
2169 	} else
2170 		return (FALSE);
2171 }
2172 
2173 static void
2174 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2175 {
2176 
2177 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2178 	/*
2179 	 * unmap the page table page
2180 	 */
2181 	if (m->pindex >= (NUL2E + NUL1E)) {
2182 		/* l1 page */
2183 		pd_entry_t *l0;
2184 
2185 		l0 = pmap_l0(pmap, va);
2186 		pmap_clear(l0);
2187 	} else if (m->pindex >= NUL2E) {
2188 		/* l2 page */
2189 		pd_entry_t *l1;
2190 
2191 		l1 = pmap_l1(pmap, va);
2192 		pmap_clear(l1);
2193 	} else {
2194 		/* l3 page */
2195 		pd_entry_t *l2;
2196 
2197 		l2 = pmap_l2(pmap, va);
2198 		pmap_clear(l2);
2199 	}
2200 	pmap_resident_count_dec(pmap, 1);
2201 	if (m->pindex < NUL2E) {
2202 		/* We just released an l3, unhold the matching l2 */
2203 		pd_entry_t *l1, tl1;
2204 		vm_page_t l2pg;
2205 
2206 		l1 = pmap_l1(pmap, va);
2207 		tl1 = pmap_load(l1);
2208 		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
2209 		pmap_unwire_l3(pmap, va, l2pg, free);
2210 	} else if (m->pindex < (NUL2E + NUL1E)) {
2211 		/* We just released an l2, unhold the matching l1 */
2212 		pd_entry_t *l0, tl0;
2213 		vm_page_t l1pg;
2214 
2215 		l0 = pmap_l0(pmap, va);
2216 		tl0 = pmap_load(l0);
2217 		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
2218 		pmap_unwire_l3(pmap, va, l1pg, free);
2219 	}
2220 	pmap_invalidate_page(pmap, va, false);
2221 
2222 	/*
2223 	 * Put page on a list so that it is released after
2224 	 * *ALL* TLB shootdown is done
2225 	 */
2226 	pmap_add_delayed_free_list(m, free, TRUE);
2227 }
2228 
2229 /*
2230  * After removing a page table entry, this routine is used to
2231  * conditionally free the page, and manage the reference count.
2232  */
2233 static int
2234 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2235     struct spglist *free)
2236 {
2237 	vm_page_t mpte;
2238 
2239 	KASSERT(ADDR_IS_CANONICAL(va),
2240 	    ("%s: Address not in canonical form: %lx", __func__, va));
2241 	if (ADDR_IS_KERNEL(va))
2242 		return (0);
2243 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2244 	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
2245 	return (pmap_unwire_l3(pmap, va, mpte, free));
2246 }
2247 
2248 /*
2249  * Release a page table page reference after a failed attempt to create a
2250  * mapping.
2251  */
2252 static void
2253 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2254 {
2255 	struct spglist free;
2256 
2257 	SLIST_INIT(&free);
2258 	if (pmap_unwire_l3(pmap, va, mpte, &free))
2259 		vm_page_free_pages_toq(&free, true);
2260 }
2261 
2262 void
2263 pmap_pinit0(pmap_t pmap)
2264 {
2265 
2266 	PMAP_LOCK_INIT(pmap);
2267 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2268 	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2269 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2270 	vm_radix_init(&pmap->pm_root);
2271 	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2272 	pmap->pm_stage = PM_STAGE1;
2273 	pmap->pm_levels = 4;
2274 	pmap->pm_ttbr = pmap->pm_l0_paddr;
2275 	pmap->pm_asid_set = &asids;
2276 
2277 	PCPU_SET(curpmap, pmap);
2278 }
2279 
2280 int
2281 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2282 {
2283 	vm_page_t m;
2284 
2285 	/*
2286 	 * allocate the l0 page
2287 	 */
2288 	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2289 	    VM_ALLOC_ZERO);
2290 	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2291 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2292 
2293 	vm_radix_init(&pmap->pm_root);
2294 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2295 	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2296 
2297 	MPASS(levels == 3 || levels == 4);
2298 	pmap->pm_levels = levels;
2299 	pmap->pm_stage = stage;
2300 	switch (stage) {
2301 	case PM_STAGE1:
2302 		pmap->pm_asid_set = &asids;
2303 		break;
2304 	case PM_STAGE2:
2305 		pmap->pm_asid_set = &vmids;
2306 		break;
2307 	default:
2308 		panic("%s: Invalid pmap type %d", __func__, stage);
2309 		break;
2310 	}
2311 
2312 	/* XXX Temporarily disable deferred ASID allocation. */
2313 	pmap_alloc_asid(pmap);
2314 
2315 	/*
2316 	 * Allocate the level 1 entry to use as the root. This will increase
2317 	 * the refcount on the level 1 page so it won't be removed until
2318 	 * pmap_release() is called.
2319 	 */
2320 	if (pmap->pm_levels == 3) {
2321 		PMAP_LOCK(pmap);
2322 		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2323 		PMAP_UNLOCK(pmap);
2324 	}
2325 	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2326 
2327 	return (1);
2328 }
2329 
2330 int
2331 pmap_pinit(pmap_t pmap)
2332 {
2333 
2334 	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2335 }
2336 
2337 /*
2338  * This routine is called if the desired page table page does not exist.
2339  *
2340  * If page table page allocation fails, this routine may sleep before
2341  * returning NULL.  It sleeps only if a lock pointer was given.
2342  *
2343  * Note: If a page allocation fails at page table level two or three,
2344  * one or two pages may be held during the wait, only to be released
2345  * afterwards.  This conservative approach is easily argued to avoid
2346  * race conditions.
2347  */
2348 static vm_page_t
2349 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2350 {
2351 	vm_page_t m, l1pg, l2pg;
2352 
2353 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2354 
2355 	/*
2356 	 * Allocate a page table page.
2357 	 */
2358 	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2359 		if (lockp != NULL) {
2360 			RELEASE_PV_LIST_LOCK(lockp);
2361 			PMAP_UNLOCK(pmap);
2362 			vm_wait(NULL);
2363 			PMAP_LOCK(pmap);
2364 		}
2365 
2366 		/*
2367 		 * Indicate the need to retry.  While waiting, the page table
2368 		 * page may have been allocated.
2369 		 */
2370 		return (NULL);
2371 	}
2372 	m->pindex = ptepindex;
2373 
2374 	/*
2375 	 * Because of AArch64's weak memory consistency model, we must have a
2376 	 * barrier here to ensure that the stores for zeroing "m", whether by
2377 	 * pmap_zero_page() or an earlier function, are visible before adding
2378 	 * "m" to the page table.  Otherwise, a page table walk by another
2379 	 * processor's MMU could see the mapping to "m" and a stale, non-zero
2380 	 * PTE within "m".
2381 	 */
2382 	dmb(ishst);
2383 
2384 	/*
2385 	 * Map the pagetable page into the process address space, if
2386 	 * it isn't already there.
2387 	 */
2388 
2389 	if (ptepindex >= (NUL2E + NUL1E)) {
2390 		pd_entry_t *l0p, l0e;
2391 		vm_pindex_t l0index;
2392 
2393 		l0index = ptepindex - (NUL2E + NUL1E);
2394 		l0p = &pmap->pm_l0[l0index];
2395 		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2396 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2397 		l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE;
2398 
2399 		/*
2400 		 * Mark all kernel memory as not accessible from userspace
2401 		 * and userspace memory as not executable from the kernel.
2402 		 * This has been done for the bootstrap L0 entries in
2403 		 * locore.S.
2404 		 */
2405 		if (pmap == kernel_pmap)
2406 			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2407 		else
2408 			l0e |= TATTR_PXN_TABLE;
2409 		pmap_store(l0p, l0e);
2410 	} else if (ptepindex >= NUL2E) {
2411 		vm_pindex_t l0index, l1index;
2412 		pd_entry_t *l0, *l1;
2413 		pd_entry_t tl0;
2414 
2415 		l1index = ptepindex - NUL2E;
2416 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2417 
2418 		l0 = &pmap->pm_l0[l0index];
2419 		tl0 = pmap_load(l0);
2420 		if (tl0 == 0) {
2421 			/* recurse for allocating page dir */
2422 			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2423 			    lockp) == NULL) {
2424 				vm_page_unwire_noq(m);
2425 				vm_page_free_zero(m);
2426 				return (NULL);
2427 			}
2428 		} else {
2429 			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
2430 			l1pg->ref_count++;
2431 		}
2432 
2433 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
2434 		l1 = &l1[ptepindex & Ln_ADDR_MASK];
2435 		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2436 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2437 		pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
2438 	} else {
2439 		vm_pindex_t l0index, l1index;
2440 		pd_entry_t *l0, *l1, *l2;
2441 		pd_entry_t tl0, tl1;
2442 
2443 		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2444 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2445 
2446 		l0 = &pmap->pm_l0[l0index];
2447 		tl0 = pmap_load(l0);
2448 		if (tl0 == 0) {
2449 			/* recurse for allocating page dir */
2450 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2451 			    lockp) == NULL) {
2452 				vm_page_unwire_noq(m);
2453 				vm_page_free_zero(m);
2454 				return (NULL);
2455 			}
2456 			tl0 = pmap_load(l0);
2457 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
2458 			l1 = &l1[l1index & Ln_ADDR_MASK];
2459 		} else {
2460 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
2461 			l1 = &l1[l1index & Ln_ADDR_MASK];
2462 			tl1 = pmap_load(l1);
2463 			if (tl1 == 0) {
2464 				/* recurse for allocating page dir */
2465 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2466 				    lockp) == NULL) {
2467 					vm_page_unwire_noq(m);
2468 					vm_page_free_zero(m);
2469 					return (NULL);
2470 				}
2471 			} else {
2472 				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
2473 				l2pg->ref_count++;
2474 			}
2475 		}
2476 
2477 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
2478 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
2479 		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2480 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2481 		pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
2482 	}
2483 
2484 	pmap_resident_count_inc(pmap, 1);
2485 
2486 	return (m);
2487 }
2488 
2489 static pd_entry_t *
2490 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2491     struct rwlock **lockp)
2492 {
2493 	pd_entry_t *l1, *l2;
2494 	vm_page_t l2pg;
2495 	vm_pindex_t l2pindex;
2496 
2497 	KASSERT(ADDR_IS_CANONICAL(va),
2498 	    ("%s: Address not in canonical form: %lx", __func__, va));
2499 
2500 retry:
2501 	l1 = pmap_l1(pmap, va);
2502 	if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2503 		l2 = pmap_l1_to_l2(l1, va);
2504 		if (!ADDR_IS_KERNEL(va)) {
2505 			/* Add a reference to the L2 page. */
2506 			l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK);
2507 			l2pg->ref_count++;
2508 		} else
2509 			l2pg = NULL;
2510 	} else if (!ADDR_IS_KERNEL(va)) {
2511 		/* Allocate a L2 page. */
2512 		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2513 		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2514 		if (l2pg == NULL) {
2515 			if (lockp != NULL)
2516 				goto retry;
2517 			else
2518 				return (NULL);
2519 		}
2520 		l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2521 		l2 = &l2[pmap_l2_index(va)];
2522 	} else
2523 		panic("pmap_alloc_l2: missing page table page for va %#lx",
2524 		    va);
2525 	*l2pgp = l2pg;
2526 	return (l2);
2527 }
2528 
2529 static vm_page_t
2530 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2531 {
2532 	vm_pindex_t ptepindex;
2533 	pd_entry_t *pde, tpde;
2534 #ifdef INVARIANTS
2535 	pt_entry_t *pte;
2536 #endif
2537 	vm_page_t m;
2538 	int lvl;
2539 
2540 	/*
2541 	 * Calculate pagetable page index
2542 	 */
2543 	ptepindex = pmap_l2_pindex(va);
2544 retry:
2545 	/*
2546 	 * Get the page directory entry
2547 	 */
2548 	pde = pmap_pde(pmap, va, &lvl);
2549 
2550 	/*
2551 	 * If the page table page is mapped, we just increment the hold count,
2552 	 * and activate it. If we get a level 2 pde it will point to a level 3
2553 	 * table.
2554 	 */
2555 	switch (lvl) {
2556 	case -1:
2557 		break;
2558 	case 0:
2559 #ifdef INVARIANTS
2560 		pte = pmap_l0_to_l1(pde, va);
2561 		KASSERT(pmap_load(pte) == 0,
2562 		    ("pmap_alloc_l3: TODO: l0 superpages"));
2563 #endif
2564 		break;
2565 	case 1:
2566 #ifdef INVARIANTS
2567 		pte = pmap_l1_to_l2(pde, va);
2568 		KASSERT(pmap_load(pte) == 0,
2569 		    ("pmap_alloc_l3: TODO: l1 superpages"));
2570 #endif
2571 		break;
2572 	case 2:
2573 		tpde = pmap_load(pde);
2574 		if (tpde != 0) {
2575 			m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
2576 			m->ref_count++;
2577 			return (m);
2578 		}
2579 		break;
2580 	default:
2581 		panic("pmap_alloc_l3: Invalid level %d", lvl);
2582 	}
2583 
2584 	/*
2585 	 * Here if the pte page isn't mapped, or if it has been deallocated.
2586 	 */
2587 	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2588 	if (m == NULL && lockp != NULL)
2589 		goto retry;
2590 
2591 	return (m);
2592 }
2593 
2594 /***************************************************
2595  * Pmap allocation/deallocation routines.
2596  ***************************************************/
2597 
2598 /*
2599  * Release any resources held by the given physical map.
2600  * Called when a pmap initialized by pmap_pinit is being released.
2601  * Should only be called if the map contains no valid mappings.
2602  */
2603 void
2604 pmap_release(pmap_t pmap)
2605 {
2606 	boolean_t rv __diagused;
2607 	struct spglist free;
2608 	struct asid_set *set;
2609 	vm_page_t m;
2610 	int asid;
2611 
2612 	if (pmap->pm_levels != 4) {
2613 		PMAP_ASSERT_STAGE2(pmap);
2614 		KASSERT(pmap->pm_stats.resident_count == 1,
2615 		    ("pmap_release: pmap resident count %ld != 0",
2616 		    pmap->pm_stats.resident_count));
2617 		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2618 		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2619 
2620 		SLIST_INIT(&free);
2621 		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2622 		PMAP_LOCK(pmap);
2623 		rv = pmap_unwire_l3(pmap, 0, m, &free);
2624 		PMAP_UNLOCK(pmap);
2625 		MPASS(rv == TRUE);
2626 		vm_page_free_pages_toq(&free, true);
2627 	}
2628 
2629 	KASSERT(pmap->pm_stats.resident_count == 0,
2630 	    ("pmap_release: pmap resident count %ld != 0",
2631 	    pmap->pm_stats.resident_count));
2632 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2633 	    ("pmap_release: pmap has reserved page table page(s)"));
2634 
2635 	set = pmap->pm_asid_set;
2636 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2637 
2638 	/*
2639 	 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate
2640 	 * the entries when removing them so rely on a later tlb invalidation.
2641 	 * this will happen when updating the VMID generation. Because of this
2642 	 * we don't reuse VMIDs within a generation.
2643 	 */
2644 	if (pmap->pm_stage == PM_STAGE1) {
2645 		mtx_lock_spin(&set->asid_set_mutex);
2646 		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2647 			asid = COOKIE_TO_ASID(pmap->pm_cookie);
2648 			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2649 			    asid < set->asid_set_size,
2650 			    ("pmap_release: pmap cookie has out-of-range asid"));
2651 			bit_clear(set->asid_set, asid);
2652 		}
2653 		mtx_unlock_spin(&set->asid_set_mutex);
2654 	}
2655 
2656 	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2657 	vm_page_unwire_noq(m);
2658 	vm_page_free_zero(m);
2659 }
2660 
2661 static int
2662 kvm_size(SYSCTL_HANDLER_ARGS)
2663 {
2664 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2665 
2666 	return sysctl_handle_long(oidp, &ksize, 0, req);
2667 }
2668 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2669     0, 0, kvm_size, "LU",
2670     "Size of KVM");
2671 
2672 static int
2673 kvm_free(SYSCTL_HANDLER_ARGS)
2674 {
2675 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2676 
2677 	return sysctl_handle_long(oidp, &kfree, 0, req);
2678 }
2679 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2680     0, 0, kvm_free, "LU",
2681     "Amount of KVM free");
2682 
2683 /*
2684  * grow the number of kernel page table entries, if needed
2685  */
2686 void
2687 pmap_growkernel(vm_offset_t addr)
2688 {
2689 	vm_paddr_t paddr;
2690 	vm_page_t nkpg;
2691 	pd_entry_t *l0, *l1, *l2;
2692 
2693 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2694 
2695 	addr = roundup2(addr, L2_SIZE);
2696 	if (addr - 1 >= vm_map_max(kernel_map))
2697 		addr = vm_map_max(kernel_map);
2698 	if (kernel_vm_end < addr)
2699 		kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2700 	while (kernel_vm_end < addr) {
2701 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2702 		KASSERT(pmap_load(l0) != 0,
2703 		    ("pmap_growkernel: No level 0 kernel entry"));
2704 
2705 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2706 		if (pmap_load(l1) == 0) {
2707 			/* We need a new PDP entry */
2708 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2709 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2710 			if (nkpg == NULL)
2711 				panic("pmap_growkernel: no memory to grow kernel");
2712 			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2713 			/* See the dmb() in _pmap_alloc_l3(). */
2714 			dmb(ishst);
2715 			paddr = VM_PAGE_TO_PHYS(nkpg);
2716 			pmap_store(l1, paddr | L1_TABLE);
2717 			continue; /* try again */
2718 		}
2719 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2720 		if (pmap_load(l2) != 0) {
2721 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2722 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2723 				kernel_vm_end = vm_map_max(kernel_map);
2724 				break;
2725 			}
2726 			continue;
2727 		}
2728 
2729 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2730 		    VM_ALLOC_ZERO);
2731 		if (nkpg == NULL)
2732 			panic("pmap_growkernel: no memory to grow kernel");
2733 		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2734 		/* See the dmb() in _pmap_alloc_l3(). */
2735 		dmb(ishst);
2736 		paddr = VM_PAGE_TO_PHYS(nkpg);
2737 		pmap_store(l2, paddr | L2_TABLE);
2738 
2739 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2740 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2741 			kernel_vm_end = vm_map_max(kernel_map);
2742 			break;
2743 		}
2744 	}
2745 }
2746 
2747 /***************************************************
2748  * page management routines.
2749  ***************************************************/
2750 
2751 static const uint64_t pc_freemask[_NPCM] = {
2752 	[0 ... _NPCM - 2] = PC_FREEN,
2753 	[_NPCM - 1] = PC_FREEL
2754 };
2755 
2756 #ifdef PV_STATS
2757 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2758 
2759 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2760 	"Current number of pv entry chunks");
2761 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2762 	"Current number of pv entry chunks allocated");
2763 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2764 	"Current number of pv entry chunks frees");
2765 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2766 	"Number of times tried to get a chunk page but failed.");
2767 
2768 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2769 static int pv_entry_spare;
2770 
2771 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2772 	"Current number of pv entry frees");
2773 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2774 	"Current number of pv entry allocs");
2775 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2776 	"Current number of pv entries");
2777 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2778 	"Current number of spare pv entries");
2779 #endif
2780 
2781 /*
2782  * We are in a serious low memory condition.  Resort to
2783  * drastic measures to free some pages so we can allocate
2784  * another pv entry chunk.
2785  *
2786  * Returns NULL if PV entries were reclaimed from the specified pmap.
2787  *
2788  * We do not, however, unmap 2mpages because subsequent accesses will
2789  * allocate per-page pv entries until repromotion occurs, thereby
2790  * exacerbating the shortage of free pv entries.
2791  */
2792 static vm_page_t
2793 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
2794 {
2795 	struct pv_chunks_list *pvc;
2796 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
2797 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
2798 	struct md_page *pvh;
2799 	pd_entry_t *pde;
2800 	pmap_t next_pmap, pmap;
2801 	pt_entry_t *pte, tpte;
2802 	pv_entry_t pv;
2803 	vm_offset_t va;
2804 	vm_page_t m, m_pc;
2805 	struct spglist free;
2806 	uint64_t inuse;
2807 	int bit, field, freed, lvl;
2808 
2809 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2810 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2811 
2812 	pmap = NULL;
2813 	m_pc = NULL;
2814 	SLIST_INIT(&free);
2815 	bzero(&pc_marker_b, sizeof(pc_marker_b));
2816 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
2817 	pc_marker = (struct pv_chunk *)&pc_marker_b;
2818 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
2819 
2820 	pvc = &pv_chunks[domain];
2821 	mtx_lock(&pvc->pvc_lock);
2822 	pvc->active_reclaims++;
2823 	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
2824 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
2825 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
2826 	    SLIST_EMPTY(&free)) {
2827 		next_pmap = pc->pc_pmap;
2828 		if (next_pmap == NULL) {
2829 			/*
2830 			 * The next chunk is a marker.  However, it is
2831 			 * not our marker, so active_reclaims must be
2832 			 * > 1.  Consequently, the next_chunk code
2833 			 * will not rotate the pv_chunks list.
2834 			 */
2835 			goto next_chunk;
2836 		}
2837 		mtx_unlock(&pvc->pvc_lock);
2838 
2839 		/*
2840 		 * A pv_chunk can only be removed from the pc_lru list
2841 		 * when both pvc->pvc_lock is owned and the
2842 		 * corresponding pmap is locked.
2843 		 */
2844 		if (pmap != next_pmap) {
2845 			if (pmap != NULL && pmap != locked_pmap)
2846 				PMAP_UNLOCK(pmap);
2847 			pmap = next_pmap;
2848 			/* Avoid deadlock and lock recursion. */
2849 			if (pmap > locked_pmap) {
2850 				RELEASE_PV_LIST_LOCK(lockp);
2851 				PMAP_LOCK(pmap);
2852 				mtx_lock(&pvc->pvc_lock);
2853 				continue;
2854 			} else if (pmap != locked_pmap) {
2855 				if (PMAP_TRYLOCK(pmap)) {
2856 					mtx_lock(&pvc->pvc_lock);
2857 					continue;
2858 				} else {
2859 					pmap = NULL; /* pmap is not locked */
2860 					mtx_lock(&pvc->pvc_lock);
2861 					pc = TAILQ_NEXT(pc_marker, pc_lru);
2862 					if (pc == NULL ||
2863 					    pc->pc_pmap != next_pmap)
2864 						continue;
2865 					goto next_chunk;
2866 				}
2867 			}
2868 		}
2869 
2870 		/*
2871 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2872 		 */
2873 		freed = 0;
2874 		for (field = 0; field < _NPCM; field++) {
2875 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2876 			    inuse != 0; inuse &= ~(1UL << bit)) {
2877 				bit = ffsl(inuse) - 1;
2878 				pv = &pc->pc_pventry[field * 64 + bit];
2879 				va = pv->pv_va;
2880 				pde = pmap_pde(pmap, va, &lvl);
2881 				if (lvl != 2)
2882 					continue;
2883 				pte = pmap_l2_to_l3(pde, va);
2884 				tpte = pmap_load(pte);
2885 				if ((tpte & ATTR_SW_WIRED) != 0)
2886 					continue;
2887 				tpte = pmap_load_clear(pte);
2888 				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
2889 				if (pmap_pte_dirty(pmap, tpte))
2890 					vm_page_dirty(m);
2891 				if ((tpte & ATTR_AF) != 0) {
2892 					pmap_s1_invalidate_page(pmap, va, true);
2893 					vm_page_aflag_set(m, PGA_REFERENCED);
2894 				}
2895 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2896 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2897 				m->md.pv_gen++;
2898 				if (TAILQ_EMPTY(&m->md.pv_list) &&
2899 				    (m->flags & PG_FICTITIOUS) == 0) {
2900 					pvh = page_to_pvh(m);
2901 					if (TAILQ_EMPTY(&pvh->pv_list)) {
2902 						vm_page_aflag_clear(m,
2903 						    PGA_WRITEABLE);
2904 					}
2905 				}
2906 				pc->pc_map[field] |= 1UL << bit;
2907 				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
2908 				freed++;
2909 			}
2910 		}
2911 		if (freed == 0) {
2912 			mtx_lock(&pvc->pvc_lock);
2913 			goto next_chunk;
2914 		}
2915 		/* Every freed mapping is for a 4 KB page. */
2916 		pmap_resident_count_dec(pmap, freed);
2917 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2918 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2919 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2920 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2921 		if (pc_is_free(pc)) {
2922 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2923 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2924 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2925 			/* Entire chunk is free; return it. */
2926 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2927 			dump_drop_page(m_pc->phys_addr);
2928 			mtx_lock(&pvc->pvc_lock);
2929 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
2930 			break;
2931 		}
2932 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2933 		mtx_lock(&pvc->pvc_lock);
2934 		/* One freed pv entry in locked_pmap is sufficient. */
2935 		if (pmap == locked_pmap)
2936 			break;
2937 
2938 next_chunk:
2939 		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
2940 		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
2941 		if (pvc->active_reclaims == 1 && pmap != NULL) {
2942 			/*
2943 			 * Rotate the pv chunks list so that we do not
2944 			 * scan the same pv chunks that could not be
2945 			 * freed (because they contained a wired
2946 			 * and/or superpage mapping) on every
2947 			 * invocation of reclaim_pv_chunk().
2948 			 */
2949 			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
2950 				MPASS(pc->pc_pmap != NULL);
2951 				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
2952 				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
2953 			}
2954 		}
2955 	}
2956 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
2957 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
2958 	pvc->active_reclaims--;
2959 	mtx_unlock(&pvc->pvc_lock);
2960 	if (pmap != NULL && pmap != locked_pmap)
2961 		PMAP_UNLOCK(pmap);
2962 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2963 		m_pc = SLIST_FIRST(&free);
2964 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2965 		/* Recycle a freed page table page. */
2966 		m_pc->ref_count = 1;
2967 	}
2968 	vm_page_free_pages_toq(&free, true);
2969 	return (m_pc);
2970 }
2971 
2972 static vm_page_t
2973 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2974 {
2975 	vm_page_t m;
2976 	int i, domain;
2977 
2978 	domain = PCPU_GET(domain);
2979 	for (i = 0; i < vm_ndomains; i++) {
2980 		m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
2981 		if (m != NULL)
2982 			break;
2983 		domain = (domain + 1) % vm_ndomains;
2984 	}
2985 
2986 	return (m);
2987 }
2988 
2989 /*
2990  * free the pv_entry back to the free list
2991  */
2992 static void
2993 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2994 {
2995 	struct pv_chunk *pc;
2996 	int idx, field, bit;
2997 
2998 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2999 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3000 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3001 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3002 	pc = pv_to_chunk(pv);
3003 	idx = pv - &pc->pc_pventry[0];
3004 	field = idx / 64;
3005 	bit = idx % 64;
3006 	pc->pc_map[field] |= 1ul << bit;
3007 	if (!pc_is_free(pc)) {
3008 		/* 98% of the time, pc is already at the head of the list. */
3009 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3010 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3011 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3012 		}
3013 		return;
3014 	}
3015 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3016 	free_pv_chunk(pc);
3017 }
3018 
3019 static void
3020 free_pv_chunk_dequeued(struct pv_chunk *pc)
3021 {
3022 	vm_page_t m;
3023 
3024 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3025 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3026 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3027 	/* entire chunk is free, return it */
3028 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3029 	dump_drop_page(m->phys_addr);
3030 	vm_page_unwire_noq(m);
3031 	vm_page_free(m);
3032 }
3033 
3034 static void
3035 free_pv_chunk(struct pv_chunk *pc)
3036 {
3037 	struct pv_chunks_list *pvc;
3038 
3039 	pvc = &pv_chunks[pc_to_domain(pc)];
3040 	mtx_lock(&pvc->pvc_lock);
3041 	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3042 	mtx_unlock(&pvc->pvc_lock);
3043 	free_pv_chunk_dequeued(pc);
3044 }
3045 
3046 static void
3047 free_pv_chunk_batch(struct pv_chunklist *batch)
3048 {
3049 	struct pv_chunks_list *pvc;
3050 	struct pv_chunk *pc, *npc;
3051 	int i;
3052 
3053 	for (i = 0; i < vm_ndomains; i++) {
3054 		if (TAILQ_EMPTY(&batch[i]))
3055 			continue;
3056 		pvc = &pv_chunks[i];
3057 		mtx_lock(&pvc->pvc_lock);
3058 		TAILQ_FOREACH(pc, &batch[i], pc_list) {
3059 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3060 		}
3061 		mtx_unlock(&pvc->pvc_lock);
3062 	}
3063 
3064 	for (i = 0; i < vm_ndomains; i++) {
3065 		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3066 			free_pv_chunk_dequeued(pc);
3067 		}
3068 	}
3069 }
3070 
3071 /*
3072  * Returns a new PV entry, allocating a new PV chunk from the system when
3073  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3074  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3075  * returned.
3076  *
3077  * The given PV list lock may be released.
3078  */
3079 static pv_entry_t
3080 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3081 {
3082 	struct pv_chunks_list *pvc;
3083 	int bit, field;
3084 	pv_entry_t pv;
3085 	struct pv_chunk *pc;
3086 	vm_page_t m;
3087 
3088 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3089 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3090 retry:
3091 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3092 	if (pc != NULL) {
3093 		for (field = 0; field < _NPCM; field++) {
3094 			if (pc->pc_map[field]) {
3095 				bit = ffsl(pc->pc_map[field]) - 1;
3096 				break;
3097 			}
3098 		}
3099 		if (field < _NPCM) {
3100 			pv = &pc->pc_pventry[field * 64 + bit];
3101 			pc->pc_map[field] &= ~(1ul << bit);
3102 			/* If this was the last item, move it to tail */
3103 			if (pc_is_full(pc)) {
3104 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3105 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3106 				    pc_list);
3107 			}
3108 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
3109 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3110 			return (pv);
3111 		}
3112 	}
3113 	/* No free items, allocate another chunk */
3114 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3115 	if (m == NULL) {
3116 		if (lockp == NULL) {
3117 			PV_STAT(pc_chunk_tryfail++);
3118 			return (NULL);
3119 		}
3120 		m = reclaim_pv_chunk(pmap, lockp);
3121 		if (m == NULL)
3122 			goto retry;
3123 	}
3124 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3125 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3126 	dump_add_page(m->phys_addr);
3127 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3128 	pc->pc_pmap = pmap;
3129 	memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3130 	pc->pc_map[0] &= ~1ul;		/* preallocated bit 0 */
3131 	pvc = &pv_chunks[vm_page_domain(m)];
3132 	mtx_lock(&pvc->pvc_lock);
3133 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3134 	mtx_unlock(&pvc->pvc_lock);
3135 	pv = &pc->pc_pventry[0];
3136 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3137 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
3138 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3139 	return (pv);
3140 }
3141 
3142 /*
3143  * Ensure that the number of spare PV entries in the specified pmap meets or
3144  * exceeds the given count, "needed".
3145  *
3146  * The given PV list lock may be released.
3147  */
3148 static void
3149 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3150 {
3151 	struct pv_chunks_list *pvc;
3152 	struct pch new_tail[PMAP_MEMDOM];
3153 	struct pv_chunk *pc;
3154 	vm_page_t m;
3155 	int avail, free, i;
3156 	bool reclaimed;
3157 
3158 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3159 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3160 
3161 	/*
3162 	 * Newly allocated PV chunks must be stored in a private list until
3163 	 * the required number of PV chunks have been allocated.  Otherwise,
3164 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
3165 	 * contrast, these chunks must be added to the pmap upon allocation.
3166 	 */
3167 	for (i = 0; i < PMAP_MEMDOM; i++)
3168 		TAILQ_INIT(&new_tail[i]);
3169 retry:
3170 	avail = 0;
3171 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3172 		bit_count((bitstr_t *)pc->pc_map, 0,
3173 		    sizeof(pc->pc_map) * NBBY, &free);
3174 		if (free == 0)
3175 			break;
3176 		avail += free;
3177 		if (avail >= needed)
3178 			break;
3179 	}
3180 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
3181 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3182 		if (m == NULL) {
3183 			m = reclaim_pv_chunk(pmap, lockp);
3184 			if (m == NULL)
3185 				goto retry;
3186 			reclaimed = true;
3187 		}
3188 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3189 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3190 		dump_add_page(m->phys_addr);
3191 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3192 		pc->pc_pmap = pmap;
3193 		memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3194 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3195 		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3196 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3197 
3198 		/*
3199 		 * The reclaim might have freed a chunk from the current pmap.
3200 		 * If that chunk contained available entries, we need to
3201 		 * re-count the number of available entries.
3202 		 */
3203 		if (reclaimed)
3204 			goto retry;
3205 	}
3206 	for (i = 0; i < vm_ndomains; i++) {
3207 		if (TAILQ_EMPTY(&new_tail[i]))
3208 			continue;
3209 		pvc = &pv_chunks[i];
3210 		mtx_lock(&pvc->pvc_lock);
3211 		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3212 		mtx_unlock(&pvc->pvc_lock);
3213 	}
3214 }
3215 
3216 /*
3217  * First find and then remove the pv entry for the specified pmap and virtual
3218  * address from the specified pv list.  Returns the pv entry if found and NULL
3219  * otherwise.  This operation can be performed on pv lists for either 4KB or
3220  * 2MB page mappings.
3221  */
3222 static __inline pv_entry_t
3223 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3224 {
3225 	pv_entry_t pv;
3226 
3227 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3228 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3229 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3230 			pvh->pv_gen++;
3231 			break;
3232 		}
3233 	}
3234 	return (pv);
3235 }
3236 
3237 /*
3238  * After demotion from a 2MB page mapping to 512 4KB page mappings,
3239  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3240  * entries for each of the 4KB page mappings.
3241  */
3242 static void
3243 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3244     struct rwlock **lockp)
3245 {
3246 	struct md_page *pvh;
3247 	struct pv_chunk *pc;
3248 	pv_entry_t pv;
3249 	vm_offset_t va_last;
3250 	vm_page_t m;
3251 	int bit, field;
3252 
3253 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3254 	KASSERT((va & L2_OFFSET) == 0,
3255 	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3256 	KASSERT((pa & L2_OFFSET) == 0,
3257 	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3258 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3259 
3260 	/*
3261 	 * Transfer the 2mpage's pv entry for this mapping to the first
3262 	 * page's pv list.  Once this transfer begins, the pv list lock
3263 	 * must not be released until the last pv entry is reinstantiated.
3264 	 */
3265 	pvh = pa_to_pvh(pa);
3266 	pv = pmap_pvh_remove(pvh, pmap, va);
3267 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3268 	m = PHYS_TO_VM_PAGE(pa);
3269 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3270 	m->md.pv_gen++;
3271 	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3272 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3273 	va_last = va + L2_SIZE - PAGE_SIZE;
3274 	for (;;) {
3275 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3276 		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3277 		for (field = 0; field < _NPCM; field++) {
3278 			while (pc->pc_map[field]) {
3279 				bit = ffsl(pc->pc_map[field]) - 1;
3280 				pc->pc_map[field] &= ~(1ul << bit);
3281 				pv = &pc->pc_pventry[field * 64 + bit];
3282 				va += PAGE_SIZE;
3283 				pv->pv_va = va;
3284 				m++;
3285 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3286 			    ("pmap_pv_demote_l2: page %p is not managed", m));
3287 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3288 				m->md.pv_gen++;
3289 				if (va == va_last)
3290 					goto out;
3291 			}
3292 		}
3293 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3294 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3295 	}
3296 out:
3297 	if (pc_is_full(pc)) {
3298 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3299 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3300 	}
3301 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3302 	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3303 }
3304 
3305 /*
3306  * First find and then destroy the pv entry for the specified pmap and virtual
3307  * address.  This operation can be performed on pv lists for either 4KB or 2MB
3308  * page mappings.
3309  */
3310 static void
3311 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3312 {
3313 	pv_entry_t pv;
3314 
3315 	pv = pmap_pvh_remove(pvh, pmap, va);
3316 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3317 	free_pv_entry(pmap, pv);
3318 }
3319 
3320 /*
3321  * Conditionally create the PV entry for a 4KB page mapping if the required
3322  * memory can be allocated without resorting to reclamation.
3323  */
3324 static boolean_t
3325 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3326     struct rwlock **lockp)
3327 {
3328 	pv_entry_t pv;
3329 
3330 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3331 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3332 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3333 		pv->pv_va = va;
3334 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3335 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3336 		m->md.pv_gen++;
3337 		return (TRUE);
3338 	} else
3339 		return (FALSE);
3340 }
3341 
3342 /*
3343  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
3344  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
3345  * false if the PV entry cannot be allocated without resorting to reclamation.
3346  */
3347 static bool
3348 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3349     struct rwlock **lockp)
3350 {
3351 	struct md_page *pvh;
3352 	pv_entry_t pv;
3353 	vm_paddr_t pa;
3354 
3355 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3356 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3357 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3358 	    NULL : lockp)) == NULL)
3359 		return (false);
3360 	pv->pv_va = va;
3361 	pa = l2e & ~ATTR_MASK;
3362 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3363 	pvh = pa_to_pvh(pa);
3364 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3365 	pvh->pv_gen++;
3366 	return (true);
3367 }
3368 
3369 static void
3370 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3371 {
3372 	pt_entry_t newl2, oldl2 __diagused;
3373 	vm_page_t ml3;
3374 	vm_paddr_t ml3pa;
3375 
3376 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3377 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3378 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3379 
3380 	ml3 = pmap_remove_pt_page(pmap, va);
3381 	if (ml3 == NULL)
3382 		panic("pmap_remove_kernel_l2: Missing pt page");
3383 
3384 	ml3pa = VM_PAGE_TO_PHYS(ml3);
3385 	newl2 = ml3pa | L2_TABLE;
3386 
3387 	/*
3388 	 * If this page table page was unmapped by a promotion, then it
3389 	 * contains valid mappings.  Zero it to invalidate those mappings.
3390 	 */
3391 	if (ml3->valid != 0)
3392 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
3393 
3394 	/*
3395 	 * Demote the mapping.  The caller must have already invalidated the
3396 	 * mapping (i.e., the "break" in break-before-make).
3397 	 */
3398 	oldl2 = pmap_load_store(l2, newl2);
3399 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3400 	    __func__, l2, oldl2));
3401 }
3402 
3403 /*
3404  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
3405  */
3406 static int
3407 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3408     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3409 {
3410 	struct md_page *pvh;
3411 	pt_entry_t old_l2;
3412 	vm_page_t m, ml3, mt;
3413 
3414 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3415 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3416 	old_l2 = pmap_load_clear(l2);
3417 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3418 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3419 
3420 	/*
3421 	 * Since a promotion must break the 4KB page mappings before making
3422 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3423 	 */
3424 	pmap_s1_invalidate_page(pmap, sva, true);
3425 
3426 	if (old_l2 & ATTR_SW_WIRED)
3427 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3428 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3429 	if (old_l2 & ATTR_SW_MANAGED) {
3430 		m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
3431 		pvh = page_to_pvh(m);
3432 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
3433 		pmap_pvh_free(pvh, pmap, sva);
3434 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3435 			if (pmap_pte_dirty(pmap, old_l2))
3436 				vm_page_dirty(mt);
3437 			if (old_l2 & ATTR_AF)
3438 				vm_page_aflag_set(mt, PGA_REFERENCED);
3439 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3440 			    TAILQ_EMPTY(&pvh->pv_list))
3441 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3442 		}
3443 	}
3444 	if (pmap == kernel_pmap) {
3445 		pmap_remove_kernel_l2(pmap, l2, sva);
3446 	} else {
3447 		ml3 = pmap_remove_pt_page(pmap, sva);
3448 		if (ml3 != NULL) {
3449 			KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
3450 			    ("pmap_remove_l2: l3 page not promoted"));
3451 			pmap_resident_count_dec(pmap, 1);
3452 			KASSERT(ml3->ref_count == NL3PG,
3453 			    ("pmap_remove_l2: l3 page ref count error"));
3454 			ml3->ref_count = 0;
3455 			pmap_add_delayed_free_list(ml3, free, FALSE);
3456 		}
3457 	}
3458 	return (pmap_unuse_pt(pmap, sva, l1e, free));
3459 }
3460 
3461 /*
3462  * pmap_remove_l3: do the things to unmap a page in a process
3463  */
3464 static int
3465 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3466     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3467 {
3468 	struct md_page *pvh;
3469 	pt_entry_t old_l3;
3470 	vm_page_t m;
3471 
3472 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3473 	old_l3 = pmap_load_clear(l3);
3474 	pmap_s1_invalidate_page(pmap, va, true);
3475 	if (old_l3 & ATTR_SW_WIRED)
3476 		pmap->pm_stats.wired_count -= 1;
3477 	pmap_resident_count_dec(pmap, 1);
3478 	if (old_l3 & ATTR_SW_MANAGED) {
3479 		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
3480 		if (pmap_pte_dirty(pmap, old_l3))
3481 			vm_page_dirty(m);
3482 		if (old_l3 & ATTR_AF)
3483 			vm_page_aflag_set(m, PGA_REFERENCED);
3484 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3485 		pmap_pvh_free(&m->md, pmap, va);
3486 		if (TAILQ_EMPTY(&m->md.pv_list) &&
3487 		    (m->flags & PG_FICTITIOUS) == 0) {
3488 			pvh = page_to_pvh(m);
3489 			if (TAILQ_EMPTY(&pvh->pv_list))
3490 				vm_page_aflag_clear(m, PGA_WRITEABLE);
3491 		}
3492 	}
3493 	return (pmap_unuse_pt(pmap, va, l2e, free));
3494 }
3495 
3496 /*
3497  * Remove the specified range of addresses from the L3 page table that is
3498  * identified by the given L2 entry.
3499  */
3500 static void
3501 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3502     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3503 {
3504 	struct md_page *pvh;
3505 	struct rwlock *new_lock;
3506 	pt_entry_t *l3, old_l3;
3507 	vm_offset_t va;
3508 	vm_page_t l3pg, m;
3509 
3510 	KASSERT(ADDR_IS_CANONICAL(sva),
3511 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
3512 	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3513 	    ("%s: End address not in canonical form: %lx", __func__, eva));
3514 
3515 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3516 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3517 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3518 	l3pg = !ADDR_IS_KERNEL(sva) ? PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL;
3519 	va = eva;
3520 	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3521 		if (!pmap_l3_valid(pmap_load(l3))) {
3522 			if (va != eva) {
3523 				pmap_invalidate_range(pmap, va, sva, true);
3524 				va = eva;
3525 			}
3526 			continue;
3527 		}
3528 		old_l3 = pmap_load_clear(l3);
3529 		if ((old_l3 & ATTR_SW_WIRED) != 0)
3530 			pmap->pm_stats.wired_count--;
3531 		pmap_resident_count_dec(pmap, 1);
3532 		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
3533 			m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
3534 			if (pmap_pte_dirty(pmap, old_l3))
3535 				vm_page_dirty(m);
3536 			if ((old_l3 & ATTR_AF) != 0)
3537 				vm_page_aflag_set(m, PGA_REFERENCED);
3538 			new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m));
3539 			if (new_lock != *lockp) {
3540 				if (*lockp != NULL) {
3541 					/*
3542 					 * Pending TLB invalidations must be
3543 					 * performed before the PV list lock is
3544 					 * released.  Otherwise, a concurrent
3545 					 * pmap_remove_all() on a physical page
3546 					 * could return while a stale TLB entry
3547 					 * still provides access to that page.
3548 					 */
3549 					if (va != eva) {
3550 						pmap_invalidate_range(pmap, va,
3551 						    sva, true);
3552 						va = eva;
3553 					}
3554 					rw_wunlock(*lockp);
3555 				}
3556 				*lockp = new_lock;
3557 				rw_wlock(*lockp);
3558 			}
3559 			pmap_pvh_free(&m->md, pmap, sva);
3560 			if (TAILQ_EMPTY(&m->md.pv_list) &&
3561 			    (m->flags & PG_FICTITIOUS) == 0) {
3562 				pvh = page_to_pvh(m);
3563 				if (TAILQ_EMPTY(&pvh->pv_list))
3564 					vm_page_aflag_clear(m, PGA_WRITEABLE);
3565 			}
3566 		}
3567 		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3568 			/*
3569 			 * _pmap_unwire_l3() has already invalidated the TLB
3570 			 * entries at all levels for "sva".  So, we need not
3571 			 * perform "sva += L3_SIZE;" here.  Moreover, we need
3572 			 * not perform "va = sva;" if "sva" is at the start
3573 			 * of a new valid range consisting of a single page.
3574 			 */
3575 			break;
3576 		}
3577 		if (va == eva)
3578 			va = sva;
3579 	}
3580 	if (va != eva)
3581 		pmap_invalidate_range(pmap, va, sva, true);
3582 }
3583 
3584 /*
3585  *	Remove the given range of addresses from the specified map.
3586  *
3587  *	It is assumed that the start and end are properly
3588  *	rounded to the page size.
3589  */
3590 void
3591 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3592 {
3593 	struct rwlock *lock;
3594 	vm_offset_t va_next;
3595 	pd_entry_t *l0, *l1, *l2;
3596 	pt_entry_t l3_paddr;
3597 	struct spglist free;
3598 
3599 	/*
3600 	 * Perform an unsynchronized read.  This is, however, safe.
3601 	 */
3602 	if (pmap->pm_stats.resident_count == 0)
3603 		return;
3604 
3605 	SLIST_INIT(&free);
3606 
3607 	PMAP_LOCK(pmap);
3608 
3609 	lock = NULL;
3610 	for (; sva < eva; sva = va_next) {
3611 		if (pmap->pm_stats.resident_count == 0)
3612 			break;
3613 
3614 		l0 = pmap_l0(pmap, sva);
3615 		if (pmap_load(l0) == 0) {
3616 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3617 			if (va_next < sva)
3618 				va_next = eva;
3619 			continue;
3620 		}
3621 
3622 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3623 		if (va_next < sva)
3624 			va_next = eva;
3625 		l1 = pmap_l0_to_l1(l0, sva);
3626 		if (pmap_load(l1) == 0)
3627 			continue;
3628 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
3629 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
3630 			KASSERT(va_next <= eva,
3631 			    ("partial update of non-transparent 1G page "
3632 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
3633 			    pmap_load(l1), sva, eva, va_next));
3634 			MPASS(pmap != kernel_pmap);
3635 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
3636 			pmap_clear(l1);
3637 			pmap_s1_invalidate_page(pmap, sva, true);
3638 			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
3639 			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
3640 			continue;
3641 		}
3642 
3643 		/*
3644 		 * Calculate index for next page table.
3645 		 */
3646 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3647 		if (va_next < sva)
3648 			va_next = eva;
3649 
3650 		l2 = pmap_l1_to_l2(l1, sva);
3651 		if (l2 == NULL)
3652 			continue;
3653 
3654 		l3_paddr = pmap_load(l2);
3655 
3656 		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
3657 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3658 				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
3659 				    &free, &lock);
3660 				continue;
3661 			} else if (pmap_demote_l2_locked(pmap, l2, sva,
3662 			    &lock) == NULL)
3663 				continue;
3664 			l3_paddr = pmap_load(l2);
3665 		}
3666 
3667 		/*
3668 		 * Weed out invalid mappings.
3669 		 */
3670 		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
3671 			continue;
3672 
3673 		/*
3674 		 * Limit our scan to either the end of the va represented
3675 		 * by the current page table page, or to the end of the
3676 		 * range being removed.
3677 		 */
3678 		if (va_next > eva)
3679 			va_next = eva;
3680 
3681 		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
3682 		    &lock);
3683 	}
3684 	if (lock != NULL)
3685 		rw_wunlock(lock);
3686 	PMAP_UNLOCK(pmap);
3687 	vm_page_free_pages_toq(&free, true);
3688 }
3689 
3690 /*
3691  *	Routine:	pmap_remove_all
3692  *	Function:
3693  *		Removes this physical page from
3694  *		all physical maps in which it resides.
3695  *		Reflects back modify bits to the pager.
3696  *
3697  *	Notes:
3698  *		Original versions of this routine were very
3699  *		inefficient because they iteratively called
3700  *		pmap_remove (slow...)
3701  */
3702 
3703 void
3704 pmap_remove_all(vm_page_t m)
3705 {
3706 	struct md_page *pvh;
3707 	pv_entry_t pv;
3708 	pmap_t pmap;
3709 	struct rwlock *lock;
3710 	pd_entry_t *pde, tpde;
3711 	pt_entry_t *pte, tpte;
3712 	vm_offset_t va;
3713 	struct spglist free;
3714 	int lvl, pvh_gen, md_gen;
3715 
3716 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3717 	    ("pmap_remove_all: page %p is not managed", m));
3718 	SLIST_INIT(&free);
3719 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3720 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
3721 	rw_wlock(lock);
3722 retry:
3723 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3724 		pmap = PV_PMAP(pv);
3725 		if (!PMAP_TRYLOCK(pmap)) {
3726 			pvh_gen = pvh->pv_gen;
3727 			rw_wunlock(lock);
3728 			PMAP_LOCK(pmap);
3729 			rw_wlock(lock);
3730 			if (pvh_gen != pvh->pv_gen) {
3731 				PMAP_UNLOCK(pmap);
3732 				goto retry;
3733 			}
3734 		}
3735 		va = pv->pv_va;
3736 		pte = pmap_pte_exists(pmap, va, 2, __func__);
3737 		pmap_demote_l2_locked(pmap, pte, va, &lock);
3738 		PMAP_UNLOCK(pmap);
3739 	}
3740 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3741 		pmap = PV_PMAP(pv);
3742 		if (!PMAP_TRYLOCK(pmap)) {
3743 			pvh_gen = pvh->pv_gen;
3744 			md_gen = m->md.pv_gen;
3745 			rw_wunlock(lock);
3746 			PMAP_LOCK(pmap);
3747 			rw_wlock(lock);
3748 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3749 				PMAP_UNLOCK(pmap);
3750 				goto retry;
3751 			}
3752 		}
3753 		pmap_resident_count_dec(pmap, 1);
3754 
3755 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
3756 		KASSERT(pde != NULL,
3757 		    ("pmap_remove_all: no page directory entry found"));
3758 		KASSERT(lvl == 2,
3759 		    ("pmap_remove_all: invalid pde level %d", lvl));
3760 		tpde = pmap_load(pde);
3761 
3762 		pte = pmap_l2_to_l3(pde, pv->pv_va);
3763 		tpte = pmap_load_clear(pte);
3764 		if (tpte & ATTR_SW_WIRED)
3765 			pmap->pm_stats.wired_count--;
3766 		if ((tpte & ATTR_AF) != 0) {
3767 			pmap_invalidate_page(pmap, pv->pv_va, true);
3768 			vm_page_aflag_set(m, PGA_REFERENCED);
3769 		}
3770 
3771 		/*
3772 		 * Update the vm_page_t clean and reference bits.
3773 		 */
3774 		if (pmap_pte_dirty(pmap, tpte))
3775 			vm_page_dirty(m);
3776 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
3777 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3778 		m->md.pv_gen++;
3779 		free_pv_entry(pmap, pv);
3780 		PMAP_UNLOCK(pmap);
3781 	}
3782 	vm_page_aflag_clear(m, PGA_WRITEABLE);
3783 	rw_wunlock(lock);
3784 	vm_page_free_pages_toq(&free, true);
3785 }
3786 
3787 /*
3788  * Masks and sets bits in a level 2 page table entries in the specified pmap
3789  */
3790 static void
3791 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
3792     pt_entry_t nbits)
3793 {
3794 	pd_entry_t old_l2;
3795 	vm_page_t m, mt;
3796 
3797 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3798 	PMAP_ASSERT_STAGE1(pmap);
3799 	KASSERT((sva & L2_OFFSET) == 0,
3800 	    ("pmap_protect_l2: sva is not 2mpage aligned"));
3801 	old_l2 = pmap_load(l2);
3802 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3803 	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
3804 
3805 	/*
3806 	 * Return if the L2 entry already has the desired access restrictions
3807 	 * in place.
3808 	 */
3809 	if ((old_l2 & mask) == nbits)
3810 		return;
3811 
3812 	while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
3813 		cpu_spinwait();
3814 
3815 	/*
3816 	 * When a dirty read/write superpage mapping is write protected,
3817 	 * update the dirty field of each of the superpage's constituent 4KB
3818 	 * pages.
3819 	 */
3820 	if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
3821 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3822 	    pmap_pte_dirty(pmap, old_l2)) {
3823 		m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
3824 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3825 			vm_page_dirty(mt);
3826 	}
3827 
3828 	/*
3829 	 * Since a promotion must break the 4KB page mappings before making
3830 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3831 	 */
3832 	pmap_s1_invalidate_page(pmap, sva, true);
3833 }
3834 
3835 /*
3836  * Masks and sets bits in last level page table entries in the specified
3837  * pmap and range
3838  */
3839 static void
3840 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
3841     pt_entry_t nbits, bool invalidate)
3842 {
3843 	vm_offset_t va, va_next;
3844 	pd_entry_t *l0, *l1, *l2;
3845 	pt_entry_t *l3p, l3;
3846 
3847 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3848 	for (; sva < eva; sva = va_next) {
3849 		l0 = pmap_l0(pmap, sva);
3850 		if (pmap_load(l0) == 0) {
3851 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3852 			if (va_next < sva)
3853 				va_next = eva;
3854 			continue;
3855 		}
3856 
3857 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3858 		if (va_next < sva)
3859 			va_next = eva;
3860 		l1 = pmap_l0_to_l1(l0, sva);
3861 		if (pmap_load(l1) == 0)
3862 			continue;
3863 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
3864 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
3865 			KASSERT(va_next <= eva,
3866 			    ("partial update of non-transparent 1G page "
3867 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
3868 			    pmap_load(l1), sva, eva, va_next));
3869 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
3870 			if ((pmap_load(l1) & mask) != nbits) {
3871 				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
3872 				if (invalidate)
3873 					pmap_s1_invalidate_page(pmap, sva, true);
3874 			}
3875 			continue;
3876 		}
3877 
3878 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3879 		if (va_next < sva)
3880 			va_next = eva;
3881 
3882 		l2 = pmap_l1_to_l2(l1, sva);
3883 		if (pmap_load(l2) == 0)
3884 			continue;
3885 
3886 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
3887 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3888 				pmap_protect_l2(pmap, l2, sva, mask, nbits);
3889 				continue;
3890 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
3891 				continue;
3892 		}
3893 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
3894 		    ("pmap_protect: Invalid L2 entry after demotion"));
3895 
3896 		if (va_next > eva)
3897 			va_next = eva;
3898 
3899 		va = va_next;
3900 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
3901 		    sva += L3_SIZE) {
3902 			l3 = pmap_load(l3p);
3903 
3904 			/*
3905 			 * Go to the next L3 entry if the current one is
3906 			 * invalid or already has the desired access
3907 			 * restrictions in place.  (The latter case occurs
3908 			 * frequently.  For example, in a "buildworld"
3909 			 * workload, almost 1 out of 4 L3 entries already
3910 			 * have the desired restrictions.)
3911 			 */
3912 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
3913 				if (va != va_next) {
3914 					if (invalidate)
3915 						pmap_s1_invalidate_range(pmap,
3916 						    va, sva, true);
3917 					va = va_next;
3918 				}
3919 				continue;
3920 			}
3921 
3922 			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
3923 			    nbits))
3924 				cpu_spinwait();
3925 
3926 			/*
3927 			 * When a dirty read/write mapping is write protected,
3928 			 * update the page's dirty field.
3929 			 */
3930 			if ((l3 & ATTR_SW_MANAGED) != 0 &&
3931 			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3932 			    pmap_pte_dirty(pmap, l3))
3933 				vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK));
3934 
3935 			if (va == va_next)
3936 				va = sva;
3937 		}
3938 		if (va != va_next && invalidate)
3939 			pmap_s1_invalidate_range(pmap, va, sva, true);
3940 	}
3941 }
3942 
3943 static void
3944 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
3945     pt_entry_t nbits, bool invalidate)
3946 {
3947 	PMAP_LOCK(pmap);
3948 	pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
3949 	PMAP_UNLOCK(pmap);
3950 }
3951 
3952 /*
3953  *	Set the physical protection on the
3954  *	specified range of this map as requested.
3955  */
3956 void
3957 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3958 {
3959 	pt_entry_t mask, nbits;
3960 
3961 	PMAP_ASSERT_STAGE1(pmap);
3962 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3963 	if (prot == VM_PROT_NONE) {
3964 		pmap_remove(pmap, sva, eva);
3965 		return;
3966 	}
3967 
3968 	mask = nbits = 0;
3969 	if ((prot & VM_PROT_WRITE) == 0) {
3970 		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
3971 		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
3972 	}
3973 	if ((prot & VM_PROT_EXECUTE) == 0) {
3974 		mask |= ATTR_S1_XN;
3975 		nbits |= ATTR_S1_XN;
3976 	}
3977 	if (mask == 0)
3978 		return;
3979 
3980 	pmap_mask_set(pmap, sva, eva, mask, nbits, true);
3981 }
3982 
3983 void
3984 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
3985 {
3986 
3987 	MPASS((sva & L3_OFFSET) == 0);
3988 	MPASS(((sva + size) & L3_OFFSET) == 0);
3989 
3990 	pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
3991 	    ATTR_SW_NO_PROMOTE, false);
3992 }
3993 
3994 /*
3995  * Inserts the specified page table page into the specified pmap's collection
3996  * of idle page table pages.  Each of a pmap's page table pages is responsible
3997  * for mapping a distinct range of virtual addresses.  The pmap's collection is
3998  * ordered by this virtual address range.
3999  *
4000  * If "promoted" is false, then the page table page "mpte" must be zero filled.
4001  */
4002 static __inline int
4003 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
4004 {
4005 
4006 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4007 	mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
4008 	return (vm_radix_insert(&pmap->pm_root, mpte));
4009 }
4010 
4011 /*
4012  * Removes the page table page mapping the specified virtual address from the
4013  * specified pmap's collection of idle page table pages, and returns it.
4014  * Otherwise, returns NULL if there is no page table page corresponding to the
4015  * specified virtual address.
4016  */
4017 static __inline vm_page_t
4018 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4019 {
4020 
4021 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4022 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4023 }
4024 
4025 /*
4026  * Performs a break-before-make update of a pmap entry. This is needed when
4027  * either promoting or demoting pages to ensure the TLB doesn't get into an
4028  * inconsistent state.
4029  */
4030 static void
4031 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
4032     vm_offset_t va, vm_size_t size)
4033 {
4034 	register_t intr;
4035 
4036 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4037 
4038 	if ((newpte & ATTR_SW_NO_PROMOTE) != 0)
4039 		panic("%s: Updating non-promote pte", __func__);
4040 
4041 	/*
4042 	 * Ensure we don't get switched out with the page table in an
4043 	 * inconsistent state. We also need to ensure no interrupts fire
4044 	 * as they may make use of an address we are about to invalidate.
4045 	 */
4046 	intr = intr_disable();
4047 
4048 	/*
4049 	 * Clear the old mapping's valid bit, but leave the rest of the entry
4050 	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4051 	 * lookup the physical address.
4052 	 */
4053 	pmap_clear_bits(pte, ATTR_DESCR_VALID);
4054 
4055 	/*
4056 	 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4057 	 * be cached, so we invalidate intermediate entries as well as final
4058 	 * entries.
4059 	 */
4060 	pmap_s1_invalidate_range(pmap, va, va + size, false);
4061 
4062 	/* Create the new mapping */
4063 	pmap_store(pte, newpte);
4064 	dsb(ishst);
4065 
4066 	intr_restore(intr);
4067 }
4068 
4069 #if VM_NRESERVLEVEL > 0
4070 /*
4071  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4072  * replace the many pv entries for the 4KB page mappings by a single pv entry
4073  * for the 2MB page mapping.
4074  */
4075 static void
4076 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4077     struct rwlock **lockp)
4078 {
4079 	struct md_page *pvh;
4080 	pv_entry_t pv;
4081 	vm_offset_t va_last;
4082 	vm_page_t m;
4083 
4084 	KASSERT((pa & L2_OFFSET) == 0,
4085 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4086 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4087 
4088 	/*
4089 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
4090 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
4091 	 * a transfer avoids the possibility that get_pv_entry() calls
4092 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4093 	 * mappings that is being promoted.
4094 	 */
4095 	m = PHYS_TO_VM_PAGE(pa);
4096 	va = va & ~L2_OFFSET;
4097 	pv = pmap_pvh_remove(&m->md, pmap, va);
4098 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4099 	pvh = page_to_pvh(m);
4100 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4101 	pvh->pv_gen++;
4102 	/* Free the remaining NPTEPG - 1 pv entries. */
4103 	va_last = va + L2_SIZE - PAGE_SIZE;
4104 	do {
4105 		m++;
4106 		va += PAGE_SIZE;
4107 		pmap_pvh_free(&m->md, pmap, va);
4108 	} while (va < va_last);
4109 }
4110 
4111 /*
4112  * Tries to promote the 512, contiguous 4KB page mappings that are within a
4113  * single level 2 table entry to a single 2MB page mapping.  For promotion
4114  * to occur, two conditions must be met: (1) the 4KB page mappings must map
4115  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4116  * identical characteristics.
4117  */
4118 static void
4119 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4120     struct rwlock **lockp)
4121 {
4122 	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
4123 
4124 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4125 	PMAP_ASSERT_STAGE1(pmap);
4126 
4127 	/*
4128 	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
4129 	 * ineligible for promotion, invalid, or does not map the first 4KB
4130 	 * physical page within a 2MB page.
4131 	 */
4132 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
4133 	newl2 = pmap_load(firstl3);
4134 	if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4135 		return;
4136 	if ((newl2 & ((~ATTR_MASK & L2_OFFSET) | ATTR_DESCR_MASK)) != L3_PAGE) {
4137 		atomic_add_long(&pmap_l2_p_failures, 1);
4138 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4139 		    " in pmap %p", va, pmap);
4140 		return;
4141 	}
4142 
4143 	/*
4144 	 * Both here and in the below "for" loop, to allow for repromotion
4145 	 * after MADV_FREE, conditionally write protect a clean L3E before
4146 	 * possibly aborting the promotion due to other L3E attributes.  Why?
4147 	 * Suppose that MADV_FREE is applied to a part of a superpage, the
4148 	 * address range [S, E).  pmap_advise() will demote the superpage
4149 	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4150 	 * set AP_RO and clear AF in the L3Es for the rest of [S, E).  Later,
4151 	 * imagine that the memory in [S, E) is recycled, but the last 4KB
4152 	 * page in [S, E) is not the last to be rewritten, or simply accessed.
4153 	 * In other words, there is still a 4KB page in [S, E), call it P,
4154 	 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4155 	 * Unless we write protect P before aborting the promotion, if and
4156 	 * when P is finally rewritten, there won't be a page fault to trigger
4157 	 * repromotion.
4158 	 */
4159 setl2:
4160 	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4161 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4162 		/*
4163 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4164 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4165 		 */
4166 		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4167 			goto setl2;
4168 		newl2 &= ~ATTR_SW_DBM;
4169 	}
4170 	if ((newl2 & ATTR_AF) == 0) {
4171 		atomic_add_long(&pmap_l2_p_failures, 1);
4172 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4173 		    " in pmap %p", va, pmap);
4174 		return;
4175 	}
4176 
4177 	/*
4178 	 * Examine each of the other L3Es in the specified PTP.  Abort if this
4179 	 * L3E maps an unexpected 4KB physical page or does not have identical
4180 	 * characteristics to the first L3E.
4181 	 */
4182 	pa = (newl2 & (~ATTR_MASK | ATTR_DESCR_MASK)) + L2_SIZE - PAGE_SIZE;
4183 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4184 		oldl3 = pmap_load(l3);
4185 		if ((oldl3 & (~ATTR_MASK | ATTR_DESCR_MASK)) != pa) {
4186 			atomic_add_long(&pmap_l2_p_failures, 1);
4187 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4188 			    " in pmap %p", va, pmap);
4189 			return;
4190 		}
4191 setl3:
4192 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4193 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4194 			/*
4195 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4196 			 * set, ATTR_SW_DBM can be cleared without a TLB
4197 			 * invalidation.
4198 			 */
4199 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4200 			    ~ATTR_SW_DBM))
4201 				goto setl3;
4202 			oldl3 &= ~ATTR_SW_DBM;
4203 		}
4204 		if ((oldl3 & ATTR_MASK) != (newl2 & ATTR_MASK)) {
4205 			atomic_add_long(&pmap_l2_p_failures, 1);
4206 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4207 			    " in pmap %p", va, pmap);
4208 			return;
4209 		}
4210 		pa -= PAGE_SIZE;
4211 	}
4212 
4213 	/*
4214 	 * Save the page table page in its current state until the L2
4215 	 * mapping the superpage is demoted by pmap_demote_l2() or
4216 	 * destroyed by pmap_remove_l3().
4217 	 */
4218 	if (mpte == NULL)
4219 		mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
4220 	KASSERT(mpte >= vm_page_array &&
4221 	    mpte < &vm_page_array[vm_page_array_size],
4222 	    ("pmap_promote_l2: page table page is out of range"));
4223 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
4224 	    ("pmap_promote_l2: page table page's pindex is wrong"));
4225 	if (pmap_insert_pt_page(pmap, mpte, true)) {
4226 		atomic_add_long(&pmap_l2_p_failures, 1);
4227 		CTR2(KTR_PMAP,
4228 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
4229 		    pmap);
4230 		return;
4231 	}
4232 
4233 	if ((newl2 & ATTR_SW_MANAGED) != 0)
4234 		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
4235 
4236 	newl2 &= ~ATTR_DESCR_MASK;
4237 	newl2 |= L2_BLOCK;
4238 
4239 	pmap_update_entry(pmap, l2, newl2, va & ~L2_OFFSET, L2_SIZE);
4240 
4241 	atomic_add_long(&pmap_l2_promotions, 1);
4242 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
4243 	    pmap);
4244 }
4245 #endif /* VM_NRESERVLEVEL > 0 */
4246 
4247 static int
4248 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
4249     int psind)
4250 {
4251 	pd_entry_t *l0p, *l1p, *l2p, origpte;
4252 	vm_page_t mp;
4253 
4254 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4255 	KASSERT(psind > 0 && psind < MAXPAGESIZES,
4256 	    ("psind %d unexpected", psind));
4257 	KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0,
4258 	    ("unaligned phys address %#lx newpte %#lx psind %d",
4259 	    (newpte & ~ATTR_MASK), newpte, psind));
4260 
4261 restart:
4262 	if (psind == 2) {
4263 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4264 
4265 		l0p = pmap_l0(pmap, va);
4266 		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
4267 			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
4268 			if (mp == NULL) {
4269 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4270 					return (KERN_RESOURCE_SHORTAGE);
4271 				PMAP_UNLOCK(pmap);
4272 				vm_wait(NULL);
4273 				PMAP_LOCK(pmap);
4274 				goto restart;
4275 			}
4276 			l1p = pmap_l0_to_l1(l0p, va);
4277 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4278 			origpte = pmap_load(l1p);
4279 		} else {
4280 			l1p = pmap_l0_to_l1(l0p, va);
4281 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4282 			origpte = pmap_load(l1p);
4283 			if ((origpte & ATTR_DESCR_VALID) == 0) {
4284 				mp = PHYS_TO_VM_PAGE(pmap_load(l0p) &
4285 				    ~ATTR_MASK);
4286 				mp->ref_count++;
4287 			}
4288 		}
4289 		KASSERT(((origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK) &&
4290 		    (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
4291 		    (origpte & ATTR_DESCR_VALID) == 0,
4292 		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
4293 		    va, origpte, newpte));
4294 		pmap_store(l1p, newpte);
4295 	} else /* (psind == 1) */ {
4296 		l2p = pmap_l2(pmap, va);
4297 		if (l2p == NULL) {
4298 			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
4299 			if (mp == NULL) {
4300 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4301 					return (KERN_RESOURCE_SHORTAGE);
4302 				PMAP_UNLOCK(pmap);
4303 				vm_wait(NULL);
4304 				PMAP_LOCK(pmap);
4305 				goto restart;
4306 			}
4307 			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
4308 			l2p = &l2p[pmap_l2_index(va)];
4309 			origpte = pmap_load(l2p);
4310 		} else {
4311 			l1p = pmap_l1(pmap, va);
4312 			origpte = pmap_load(l2p);
4313 			if ((origpte & ATTR_DESCR_VALID) == 0) {
4314 				mp = PHYS_TO_VM_PAGE(pmap_load(l1p) &
4315 				    ~ATTR_MASK);
4316 				mp->ref_count++;
4317 			}
4318 		}
4319 		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
4320 		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
4321 		     (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
4322 		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
4323 		    va, origpte, newpte));
4324 		pmap_store(l2p, newpte);
4325 	}
4326 	dsb(ishst);
4327 
4328 	if ((origpte & ATTR_DESCR_VALID) == 0)
4329 		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
4330 	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
4331 		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
4332 	else if ((newpte & ATTR_SW_WIRED) == 0 &&
4333 	    (origpte & ATTR_SW_WIRED) != 0)
4334 		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
4335 
4336 	return (KERN_SUCCESS);
4337 }
4338 
4339 /*
4340  *	Insert the given physical page (p) at
4341  *	the specified virtual address (v) in the
4342  *	target physical map with the protection requested.
4343  *
4344  *	If specified, the page will be wired down, meaning
4345  *	that the related pte can not be reclaimed.
4346  *
4347  *	NB:  This is the only routine which MAY NOT lazy-evaluate
4348  *	or lose information.  That is, this routine must actually
4349  *	insert this page into the given map NOW.
4350  */
4351 int
4352 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4353     u_int flags, int8_t psind)
4354 {
4355 	struct rwlock *lock;
4356 	pd_entry_t *pde;
4357 	pt_entry_t new_l3, orig_l3;
4358 	pt_entry_t *l2, *l3;
4359 	pv_entry_t pv;
4360 	vm_paddr_t opa, pa;
4361 	vm_page_t mpte, om;
4362 	boolean_t nosleep;
4363 	int lvl, rv;
4364 
4365 	KASSERT(ADDR_IS_CANONICAL(va),
4366 	    ("%s: Address not in canonical form: %lx", __func__, va));
4367 
4368 	va = trunc_page(va);
4369 	if ((m->oflags & VPO_UNMANAGED) == 0)
4370 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
4371 	pa = VM_PAGE_TO_PHYS(m);
4372 	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE);
4373 	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
4374 	new_l3 |= pmap_pte_prot(pmap, prot);
4375 
4376 	if ((flags & PMAP_ENTER_WIRED) != 0)
4377 		new_l3 |= ATTR_SW_WIRED;
4378 	if (pmap->pm_stage == PM_STAGE1) {
4379 		if (!ADDR_IS_KERNEL(va))
4380 			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4381 		else
4382 			new_l3 |= ATTR_S1_UXN;
4383 		if (pmap != kernel_pmap)
4384 			new_l3 |= ATTR_S1_nG;
4385 	} else {
4386 		/*
4387 		 * Clear the access flag on executable mappings, this will be
4388 		 * set later when the page is accessed. The fault handler is
4389 		 * required to invalidate the I-cache.
4390 		 *
4391 		 * TODO: Switch to the valid flag to allow hardware management
4392 		 * of the access flag. Much of the pmap code assumes the
4393 		 * valid flag is set and fails to destroy the old page tables
4394 		 * correctly if it is clear.
4395 		 */
4396 		if (prot & VM_PROT_EXECUTE)
4397 			new_l3 &= ~ATTR_AF;
4398 	}
4399 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4400 		new_l3 |= ATTR_SW_MANAGED;
4401 		if ((prot & VM_PROT_WRITE) != 0) {
4402 			new_l3 |= ATTR_SW_DBM;
4403 			if ((flags & VM_PROT_WRITE) == 0) {
4404 				if (pmap->pm_stage == PM_STAGE1)
4405 					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
4406 				else
4407 					new_l3 &=
4408 					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
4409 			}
4410 		}
4411 	}
4412 
4413 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
4414 
4415 	lock = NULL;
4416 	PMAP_LOCK(pmap);
4417 	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
4418 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
4419 		    ("managed largepage va %#lx flags %#x", va, flags));
4420 		new_l3 &= ~L3_PAGE;
4421 		if (psind == 2) {
4422 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4423 			new_l3 |= L1_BLOCK;
4424 		} else /* (psind == 1) */
4425 			new_l3 |= L2_BLOCK;
4426 		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
4427 		goto out;
4428 	}
4429 	if (psind == 1) {
4430 		/* Assert the required virtual and physical alignment. */
4431 		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
4432 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
4433 		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
4434 		    flags, m, &lock);
4435 		goto out;
4436 	}
4437 	mpte = NULL;
4438 
4439 	/*
4440 	 * In the case that a page table page is not
4441 	 * resident, we are creating it here.
4442 	 */
4443 retry:
4444 	pde = pmap_pde(pmap, va, &lvl);
4445 	if (pde != NULL && lvl == 2) {
4446 		l3 = pmap_l2_to_l3(pde, va);
4447 		if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
4448 			mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
4449 			mpte->ref_count++;
4450 		}
4451 		goto havel3;
4452 	} else if (pde != NULL && lvl == 1) {
4453 		l2 = pmap_l1_to_l2(pde, va);
4454 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
4455 		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
4456 			l3 = &l3[pmap_l3_index(va)];
4457 			if (!ADDR_IS_KERNEL(va)) {
4458 				mpte = PHYS_TO_VM_PAGE(
4459 				    pmap_load(l2) & ~ATTR_MASK);
4460 				mpte->ref_count++;
4461 			}
4462 			goto havel3;
4463 		}
4464 		/* We need to allocate an L3 table. */
4465 	}
4466 	if (!ADDR_IS_KERNEL(va)) {
4467 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4468 
4469 		/*
4470 		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
4471 		 * to handle the possibility that a superpage mapping for "va"
4472 		 * was created while we slept.
4473 		 */
4474 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
4475 		    nosleep ? NULL : &lock);
4476 		if (mpte == NULL && nosleep) {
4477 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
4478 			rv = KERN_RESOURCE_SHORTAGE;
4479 			goto out;
4480 		}
4481 		goto retry;
4482 	} else
4483 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
4484 
4485 havel3:
4486 	orig_l3 = pmap_load(l3);
4487 	opa = orig_l3 & ~ATTR_MASK;
4488 	pv = NULL;
4489 
4490 	/*
4491 	 * Is the specified virtual address already mapped?
4492 	 */
4493 	if (pmap_l3_valid(orig_l3)) {
4494 		/*
4495 		 * Wiring change, just update stats. We don't worry about
4496 		 * wiring PT pages as they remain resident as long as there
4497 		 * are valid mappings in them. Hence, if a user page is wired,
4498 		 * the PT page will be also.
4499 		 */
4500 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
4501 		    (orig_l3 & ATTR_SW_WIRED) == 0)
4502 			pmap->pm_stats.wired_count++;
4503 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
4504 		    (orig_l3 & ATTR_SW_WIRED) != 0)
4505 			pmap->pm_stats.wired_count--;
4506 
4507 		/*
4508 		 * Remove the extra PT page reference.
4509 		 */
4510 		if (mpte != NULL) {
4511 			mpte->ref_count--;
4512 			KASSERT(mpte->ref_count > 0,
4513 			    ("pmap_enter: missing reference to page table page,"
4514 			     " va: 0x%lx", va));
4515 		}
4516 
4517 		/*
4518 		 * Has the physical page changed?
4519 		 */
4520 		if (opa == pa) {
4521 			/*
4522 			 * No, might be a protection or wiring change.
4523 			 */
4524 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
4525 			    (new_l3 & ATTR_SW_DBM) != 0)
4526 				vm_page_aflag_set(m, PGA_WRITEABLE);
4527 			goto validate;
4528 		}
4529 
4530 		/*
4531 		 * The physical page has changed.  Temporarily invalidate
4532 		 * the mapping.
4533 		 */
4534 		orig_l3 = pmap_load_clear(l3);
4535 		KASSERT((orig_l3 & ~ATTR_MASK) == opa,
4536 		    ("pmap_enter: unexpected pa update for %#lx", va));
4537 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
4538 			om = PHYS_TO_VM_PAGE(opa);
4539 
4540 			/*
4541 			 * The pmap lock is sufficient to synchronize with
4542 			 * concurrent calls to pmap_page_test_mappings() and
4543 			 * pmap_ts_referenced().
4544 			 */
4545 			if (pmap_pte_dirty(pmap, orig_l3))
4546 				vm_page_dirty(om);
4547 			if ((orig_l3 & ATTR_AF) != 0) {
4548 				pmap_invalidate_page(pmap, va, true);
4549 				vm_page_aflag_set(om, PGA_REFERENCED);
4550 			}
4551 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4552 			pv = pmap_pvh_remove(&om->md, pmap, va);
4553 			if ((m->oflags & VPO_UNMANAGED) != 0)
4554 				free_pv_entry(pmap, pv);
4555 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
4556 			    TAILQ_EMPTY(&om->md.pv_list) &&
4557 			    ((om->flags & PG_FICTITIOUS) != 0 ||
4558 			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
4559 				vm_page_aflag_clear(om, PGA_WRITEABLE);
4560 		} else {
4561 			KASSERT((orig_l3 & ATTR_AF) != 0,
4562 			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
4563 			pmap_invalidate_page(pmap, va, true);
4564 		}
4565 		orig_l3 = 0;
4566 	} else {
4567 		/*
4568 		 * Increment the counters.
4569 		 */
4570 		if ((new_l3 & ATTR_SW_WIRED) != 0)
4571 			pmap->pm_stats.wired_count++;
4572 		pmap_resident_count_inc(pmap, 1);
4573 	}
4574 	/*
4575 	 * Enter on the PV list if part of our managed memory.
4576 	 */
4577 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4578 		if (pv == NULL) {
4579 			pv = get_pv_entry(pmap, &lock);
4580 			pv->pv_va = va;
4581 		}
4582 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4583 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4584 		m->md.pv_gen++;
4585 		if ((new_l3 & ATTR_SW_DBM) != 0)
4586 			vm_page_aflag_set(m, PGA_WRITEABLE);
4587 	}
4588 
4589 validate:
4590 	if (pmap->pm_stage == PM_STAGE1) {
4591 		/*
4592 		 * Sync icache if exec permission and attribute
4593 		 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping
4594 		 * is stored and made valid for hardware table walk. If done
4595 		 * later, then other can access this page before caches are
4596 		 * properly synced. Don't do it for kernel memory which is
4597 		 * mapped with exec permission even if the memory isn't going
4598 		 * to hold executable code. The only time when icache sync is
4599 		 * needed is after kernel module is loaded and the relocation
4600 		 * info is processed. And it's done in elf_cpu_load_file().
4601 		*/
4602 		if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
4603 		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
4604 		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
4605 			PMAP_ASSERT_STAGE1(pmap);
4606 			cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4607 		}
4608 	} else {
4609 		cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4610 	}
4611 
4612 	/*
4613 	 * Update the L3 entry
4614 	 */
4615 	if (pmap_l3_valid(orig_l3)) {
4616 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
4617 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
4618 			/* same PA, different attributes */
4619 			orig_l3 = pmap_load_store(l3, new_l3);
4620 			pmap_invalidate_page(pmap, va, true);
4621 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
4622 			    pmap_pte_dirty(pmap, orig_l3))
4623 				vm_page_dirty(m);
4624 		} else {
4625 			/*
4626 			 * orig_l3 == new_l3
4627 			 * This can happens if multiple threads simultaneously
4628 			 * access not yet mapped page. This bad for performance
4629 			 * since this can cause full demotion-NOP-promotion
4630 			 * cycle.
4631 			 * Another possible reasons are:
4632 			 * - VM and pmap memory layout are diverged
4633 			 * - tlb flush is missing somewhere and CPU doesn't see
4634 			 *   actual mapping.
4635 			 */
4636 			CTR4(KTR_PMAP, "%s: already mapped page - "
4637 			    "pmap %p va 0x%#lx pte 0x%lx",
4638 			    __func__, pmap, va, new_l3);
4639 		}
4640 	} else {
4641 		/* New mapping */
4642 		pmap_store(l3, new_l3);
4643 		dsb(ishst);
4644 	}
4645 
4646 #if VM_NRESERVLEVEL > 0
4647 	/*
4648 	 * Try to promote from level 3 pages to a level 2 superpage. This
4649 	 * currently only works on stage 1 pmaps as pmap_promote_l2 looks at
4650 	 * stage 1 specific fields and performs a break-before-make sequence
4651 	 * that is incorrect a stage 2 pmap.
4652 	 */
4653 	if ((mpte == NULL || mpte->ref_count == NL3PG) &&
4654 	    pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 &&
4655 	    (m->flags & PG_FICTITIOUS) == 0 &&
4656 	    vm_reserv_level_iffullpop(m) == 0) {
4657 		pmap_promote_l2(pmap, pde, va, mpte, &lock);
4658 	}
4659 #endif
4660 
4661 	rv = KERN_SUCCESS;
4662 out:
4663 	if (lock != NULL)
4664 		rw_wunlock(lock);
4665 	PMAP_UNLOCK(pmap);
4666 	return (rv);
4667 }
4668 
4669 /*
4670  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns
4671  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
4672  * value.  See pmap_enter_l2() for the possible error values when "no sleep",
4673  * "no replace", and "no reclaim" are specified.
4674  */
4675 static int
4676 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4677     struct rwlock **lockp)
4678 {
4679 	pd_entry_t new_l2;
4680 
4681 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4682 	PMAP_ASSERT_STAGE1(pmap);
4683 	KASSERT(ADDR_IS_CANONICAL(va),
4684 	    ("%s: Address not in canonical form: %lx", __func__, va));
4685 
4686 	new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
4687 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
4688 	    L2_BLOCK);
4689 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4690 		new_l2 |= ATTR_SW_MANAGED;
4691 		new_l2 &= ~ATTR_AF;
4692 	}
4693 	if ((prot & VM_PROT_EXECUTE) == 0 ||
4694 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
4695 		new_l2 |= ATTR_S1_XN;
4696 	if (!ADDR_IS_KERNEL(va))
4697 		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4698 	else
4699 		new_l2 |= ATTR_S1_UXN;
4700 	if (pmap != kernel_pmap)
4701 		new_l2 |= ATTR_S1_nG;
4702 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
4703 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
4704 }
4705 
4706 /*
4707  * Returns true if every page table entry in the specified page table is
4708  * zero.
4709  */
4710 static bool
4711 pmap_every_pte_zero(vm_paddr_t pa)
4712 {
4713 	pt_entry_t *pt_end, *pte;
4714 
4715 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
4716 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
4717 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
4718 		if (*pte != 0)
4719 			return (false);
4720 	}
4721 	return (true);
4722 }
4723 
4724 /*
4725  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
4726  * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
4727  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
4728  * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists
4729  * within the 2MB virtual address range starting at the specified virtual
4730  * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
4731  * 2MB page mapping already exists at the specified virtual address.  Returns
4732  * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
4733  * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
4734  * and a PV entry allocation failed.
4735  */
4736 static int
4737 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
4738     vm_page_t m, struct rwlock **lockp)
4739 {
4740 	struct spglist free;
4741 	pd_entry_t *l2, old_l2;
4742 	vm_page_t l2pg, mt;
4743 
4744 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4745 	KASSERT(ADDR_IS_CANONICAL(va),
4746 	    ("%s: Address not in canonical form: %lx", __func__, va));
4747 
4748 	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
4749 	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
4750 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
4751 		    va, pmap);
4752 		return (KERN_RESOURCE_SHORTAGE);
4753 	}
4754 
4755 	/*
4756 	 * If there are existing mappings, either abort or remove them.
4757 	 */
4758 	if ((old_l2 = pmap_load(l2)) != 0) {
4759 		KASSERT(l2pg == NULL || l2pg->ref_count > 1,
4760 		    ("pmap_enter_l2: l2pg's ref count is too low"));
4761 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
4762 			if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
4763 				if (l2pg != NULL)
4764 					l2pg->ref_count--;
4765 				CTR2(KTR_PMAP,
4766 				    "pmap_enter_l2: no space for va %#lx"
4767 				    " in pmap %p", va, pmap);
4768 				return (KERN_NO_SPACE);
4769 			} else if (!ADDR_IS_KERNEL(va) ||
4770 			    !pmap_every_pte_zero(old_l2 & ~ATTR_MASK)) {
4771 				if (l2pg != NULL)
4772 					l2pg->ref_count--;
4773 				CTR2(KTR_PMAP,
4774 				    "pmap_enter_l2: failure for va %#lx"
4775 				    " in pmap %p", va, pmap);
4776 				return (KERN_FAILURE);
4777 			}
4778 		}
4779 		SLIST_INIT(&free);
4780 		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
4781 			(void)pmap_remove_l2(pmap, l2, va,
4782 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
4783 		else
4784 			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
4785 			    &free, lockp);
4786 		if (!ADDR_IS_KERNEL(va)) {
4787 			vm_page_free_pages_toq(&free, true);
4788 			KASSERT(pmap_load(l2) == 0,
4789 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
4790 		} else {
4791 			KASSERT(SLIST_EMPTY(&free),
4792 			    ("pmap_enter_l2: freed kernel page table page"));
4793 
4794 			/*
4795 			 * Both pmap_remove_l2() and pmap_remove_l3_range()
4796 			 * will leave the kernel page table page zero filled.
4797 			 * Nonetheless, the TLB could have an intermediate
4798 			 * entry for the kernel page table page, so request
4799 			 * an invalidation at all levels after clearing
4800 			 * the L2_TABLE entry.
4801 			 */
4802 			mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
4803 			if (pmap_insert_pt_page(pmap, mt, false))
4804 				panic("pmap_enter_l2: trie insert failed");
4805 			pmap_clear(l2);
4806 			pmap_s1_invalidate_page(pmap, va, false);
4807 		}
4808 	}
4809 
4810 	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
4811 		/*
4812 		 * Abort this mapping if its PV entry could not be created.
4813 		 */
4814 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
4815 			if (l2pg != NULL)
4816 				pmap_abort_ptp(pmap, va, l2pg);
4817 			CTR2(KTR_PMAP,
4818 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
4819 			    va, pmap);
4820 			return (KERN_RESOURCE_SHORTAGE);
4821 		}
4822 		if ((new_l2 & ATTR_SW_DBM) != 0)
4823 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4824 				vm_page_aflag_set(mt, PGA_WRITEABLE);
4825 	}
4826 
4827 	/*
4828 	 * Increment counters.
4829 	 */
4830 	if ((new_l2 & ATTR_SW_WIRED) != 0)
4831 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
4832 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
4833 
4834 	/*
4835 	 * Conditionally sync the icache.  See pmap_enter() for details.
4836 	 */
4837 	if ((new_l2 & ATTR_S1_XN) == 0 && ((new_l2 & ~ATTR_MASK) !=
4838 	    (old_l2 & ~ATTR_MASK) || (old_l2 & ATTR_S1_XN) != 0) &&
4839 	    pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
4840 		cpu_icache_sync_range(PHYS_TO_DMAP(new_l2 & ~ATTR_MASK),
4841 		    L2_SIZE);
4842 	}
4843 
4844 	/*
4845 	 * Map the superpage.
4846 	 */
4847 	pmap_store(l2, new_l2);
4848 	dsb(ishst);
4849 
4850 	atomic_add_long(&pmap_l2_mappings, 1);
4851 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
4852 	    va, pmap);
4853 
4854 	return (KERN_SUCCESS);
4855 }
4856 
4857 /*
4858  * Maps a sequence of resident pages belonging to the same object.
4859  * The sequence begins with the given page m_start.  This page is
4860  * mapped at the given virtual address start.  Each subsequent page is
4861  * mapped at a virtual address that is offset from start by the same
4862  * amount as the page is offset from m_start within the object.  The
4863  * last page in the sequence is the page with the largest offset from
4864  * m_start that can be mapped at a virtual address less than the given
4865  * virtual address end.  Not every virtual page between start and end
4866  * is mapped; only those for which a resident page exists with the
4867  * corresponding offset from m_start are mapped.
4868  */
4869 void
4870 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4871     vm_page_t m_start, vm_prot_t prot)
4872 {
4873 	struct rwlock *lock;
4874 	vm_offset_t va;
4875 	vm_page_t m, mpte;
4876 	vm_pindex_t diff, psize;
4877 	int rv;
4878 
4879 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4880 
4881 	psize = atop(end - start);
4882 	mpte = NULL;
4883 	m = m_start;
4884 	lock = NULL;
4885 	PMAP_LOCK(pmap);
4886 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4887 		va = start + ptoa(diff);
4888 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
4889 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4890 		    ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
4891 		    KERN_SUCCESS || rv == KERN_NO_SPACE))
4892 			m = &m[L2_SIZE / PAGE_SIZE - 1];
4893 		else
4894 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
4895 			    &lock);
4896 		m = TAILQ_NEXT(m, listq);
4897 	}
4898 	if (lock != NULL)
4899 		rw_wunlock(lock);
4900 	PMAP_UNLOCK(pmap);
4901 }
4902 
4903 /*
4904  * this code makes some *MAJOR* assumptions:
4905  * 1. Current pmap & pmap exists.
4906  * 2. Not wired.
4907  * 3. Read access.
4908  * 4. No page table pages.
4909  * but is *MUCH* faster than pmap_enter...
4910  */
4911 
4912 void
4913 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4914 {
4915 	struct rwlock *lock;
4916 
4917 	lock = NULL;
4918 	PMAP_LOCK(pmap);
4919 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4920 	if (lock != NULL)
4921 		rw_wunlock(lock);
4922 	PMAP_UNLOCK(pmap);
4923 }
4924 
4925 static vm_page_t
4926 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4927     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4928 {
4929 	pd_entry_t *pde;
4930 	pt_entry_t *l1, *l2, *l3, l3_val;
4931 	vm_paddr_t pa;
4932 	int lvl;
4933 
4934 	KASSERT(!VA_IS_CLEANMAP(va) ||
4935 	    (m->oflags & VPO_UNMANAGED) != 0,
4936 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4937 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4938 	PMAP_ASSERT_STAGE1(pmap);
4939 	KASSERT(ADDR_IS_CANONICAL(va),
4940 	    ("%s: Address not in canonical form: %lx", __func__, va));
4941 
4942 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
4943 	/*
4944 	 * In the case that a page table page is not
4945 	 * resident, we are creating it here.
4946 	 */
4947 	if (!ADDR_IS_KERNEL(va)) {
4948 		vm_pindex_t l2pindex;
4949 
4950 		/*
4951 		 * Calculate pagetable page index
4952 		 */
4953 		l2pindex = pmap_l2_pindex(va);
4954 		if (mpte && (mpte->pindex == l2pindex)) {
4955 			mpte->ref_count++;
4956 		} else {
4957 			/*
4958 			 * If the page table page is mapped, we just increment
4959 			 * the hold count, and activate it.  Otherwise, we
4960 			 * attempt to allocate a page table page, passing NULL
4961 			 * instead of the PV list lock pointer because we don't
4962 			 * intend to sleep.  If this attempt fails, we don't
4963 			 * retry.  Instead, we give up.
4964 			 */
4965 			l1 = pmap_l1(pmap, va);
4966 			if (l1 != NULL && pmap_load(l1) != 0) {
4967 				if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
4968 				    L1_BLOCK)
4969 					return (NULL);
4970 				l2 = pmap_l1_to_l2(l1, va);
4971 				if (pmap_load(l2) != 0) {
4972 					if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
4973 					    L2_BLOCK)
4974 						return (NULL);
4975 					mpte = PHYS_TO_VM_PAGE(pmap_load(l2) &
4976 					    ~ATTR_MASK);
4977 					mpte->ref_count++;
4978 				} else {
4979 					mpte = _pmap_alloc_l3(pmap, l2pindex,
4980 					    NULL);
4981 					if (mpte == NULL)
4982 						return (mpte);
4983 				}
4984 			} else {
4985 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
4986 				if (mpte == NULL)
4987 					return (mpte);
4988 			}
4989 		}
4990 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4991 		l3 = &l3[pmap_l3_index(va)];
4992 	} else {
4993 		mpte = NULL;
4994 		pde = pmap_pde(kernel_pmap, va, &lvl);
4995 		KASSERT(pde != NULL,
4996 		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
4997 		     va));
4998 		KASSERT(lvl == 2,
4999 		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
5000 		l3 = pmap_l2_to_l3(pde, va);
5001 	}
5002 
5003 	/*
5004 	 * Abort if a mapping already exists.
5005 	 */
5006 	if (pmap_load(l3) != 0) {
5007 		if (mpte != NULL)
5008 			mpte->ref_count--;
5009 		return (NULL);
5010 	}
5011 
5012 	/*
5013 	 * Enter on the PV list if part of our managed memory.
5014 	 */
5015 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
5016 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
5017 		if (mpte != NULL)
5018 			pmap_abort_ptp(pmap, va, mpte);
5019 		return (NULL);
5020 	}
5021 
5022 	/*
5023 	 * Increment counters
5024 	 */
5025 	pmap_resident_count_inc(pmap, 1);
5026 
5027 	pa = VM_PAGE_TO_PHYS(m);
5028 	l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
5029 	    ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
5030 	if ((prot & VM_PROT_EXECUTE) == 0 ||
5031 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5032 		l3_val |= ATTR_S1_XN;
5033 	if (!ADDR_IS_KERNEL(va))
5034 		l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5035 	else
5036 		l3_val |= ATTR_S1_UXN;
5037 	if (pmap != kernel_pmap)
5038 		l3_val |= ATTR_S1_nG;
5039 
5040 	/*
5041 	 * Now validate mapping with RO protection
5042 	 */
5043 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5044 		l3_val |= ATTR_SW_MANAGED;
5045 		l3_val &= ~ATTR_AF;
5046 	}
5047 
5048 	/* Sync icache before the mapping is stored to PTE */
5049 	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5050 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5051 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
5052 
5053 	pmap_store(l3, l3_val);
5054 	dsb(ishst);
5055 
5056 	return (mpte);
5057 }
5058 
5059 /*
5060  * This code maps large physical mmap regions into the
5061  * processor address space.  Note that some shortcuts
5062  * are taken, but the code works.
5063  */
5064 void
5065 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
5066     vm_pindex_t pindex, vm_size_t size)
5067 {
5068 
5069 	VM_OBJECT_ASSERT_WLOCKED(object);
5070 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
5071 	    ("pmap_object_init_pt: non-device object"));
5072 }
5073 
5074 /*
5075  *	Clear the wired attribute from the mappings for the specified range of
5076  *	addresses in the given pmap.  Every valid mapping within that range
5077  *	must have the wired attribute set.  In contrast, invalid mappings
5078  *	cannot have the wired attribute set, so they are ignored.
5079  *
5080  *	The wired attribute of the page table entry is not a hardware feature,
5081  *	so there is no need to invalidate any TLB entries.
5082  */
5083 void
5084 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5085 {
5086 	vm_offset_t va_next;
5087 	pd_entry_t *l0, *l1, *l2;
5088 	pt_entry_t *l3;
5089 
5090 	PMAP_LOCK(pmap);
5091 	for (; sva < eva; sva = va_next) {
5092 		l0 = pmap_l0(pmap, sva);
5093 		if (pmap_load(l0) == 0) {
5094 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
5095 			if (va_next < sva)
5096 				va_next = eva;
5097 			continue;
5098 		}
5099 
5100 		l1 = pmap_l0_to_l1(l0, sva);
5101 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
5102 		if (va_next < sva)
5103 			va_next = eva;
5104 		if (pmap_load(l1) == 0)
5105 			continue;
5106 
5107 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
5108 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5109 			KASSERT(va_next <= eva,
5110 			    ("partial update of non-transparent 1G page "
5111 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
5112 			    pmap_load(l1), sva, eva, va_next));
5113 			MPASS(pmap != kernel_pmap);
5114 			MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
5115 			    ATTR_SW_WIRED)) == ATTR_SW_WIRED);
5116 			pmap_clear_bits(l1, ATTR_SW_WIRED);
5117 			pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
5118 			continue;
5119 		}
5120 
5121 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
5122 		if (va_next < sva)
5123 			va_next = eva;
5124 
5125 		l2 = pmap_l1_to_l2(l1, sva);
5126 		if (pmap_load(l2) == 0)
5127 			continue;
5128 
5129 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
5130 			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
5131 				panic("pmap_unwire: l2 %#jx is missing "
5132 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
5133 
5134 			/*
5135 			 * Are we unwiring the entire large page?  If not,
5136 			 * demote the mapping and fall through.
5137 			 */
5138 			if (sva + L2_SIZE == va_next && eva >= va_next) {
5139 				pmap_clear_bits(l2, ATTR_SW_WIRED);
5140 				pmap->pm_stats.wired_count -= L2_SIZE /
5141 				    PAGE_SIZE;
5142 				continue;
5143 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
5144 				panic("pmap_unwire: demotion failed");
5145 		}
5146 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
5147 		    ("pmap_unwire: Invalid l2 entry after demotion"));
5148 
5149 		if (va_next > eva)
5150 			va_next = eva;
5151 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
5152 		    sva += L3_SIZE) {
5153 			if (pmap_load(l3) == 0)
5154 				continue;
5155 			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
5156 				panic("pmap_unwire: l3 %#jx is missing "
5157 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
5158 
5159 			/*
5160 			 * ATTR_SW_WIRED must be cleared atomically.  Although
5161 			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
5162 			 * the System MMU may write to the entry concurrently.
5163 			 */
5164 			pmap_clear_bits(l3, ATTR_SW_WIRED);
5165 			pmap->pm_stats.wired_count--;
5166 		}
5167 	}
5168 	PMAP_UNLOCK(pmap);
5169 }
5170 
5171 /*
5172  *	Copy the range specified by src_addr/len
5173  *	from the source map to the range dst_addr/len
5174  *	in the destination map.
5175  *
5176  *	This routine is only advisory and need not do anything.
5177  *
5178  *	Because the executable mappings created by this routine are copied,
5179  *	it should not have to flush the instruction cache.
5180  */
5181 void
5182 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
5183     vm_offset_t src_addr)
5184 {
5185 	struct rwlock *lock;
5186 	pd_entry_t *l0, *l1, *l2, srcptepaddr;
5187 	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
5188 	vm_offset_t addr, end_addr, va_next;
5189 	vm_page_t dst_m, dstmpte, srcmpte;
5190 
5191 	PMAP_ASSERT_STAGE1(dst_pmap);
5192 	PMAP_ASSERT_STAGE1(src_pmap);
5193 
5194 	if (dst_addr != src_addr)
5195 		return;
5196 	end_addr = src_addr + len;
5197 	lock = NULL;
5198 	if (dst_pmap < src_pmap) {
5199 		PMAP_LOCK(dst_pmap);
5200 		PMAP_LOCK(src_pmap);
5201 	} else {
5202 		PMAP_LOCK(src_pmap);
5203 		PMAP_LOCK(dst_pmap);
5204 	}
5205 	for (addr = src_addr; addr < end_addr; addr = va_next) {
5206 		l0 = pmap_l0(src_pmap, addr);
5207 		if (pmap_load(l0) == 0) {
5208 			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
5209 			if (va_next < addr)
5210 				va_next = end_addr;
5211 			continue;
5212 		}
5213 
5214 		va_next = (addr + L1_SIZE) & ~L1_OFFSET;
5215 		if (va_next < addr)
5216 			va_next = end_addr;
5217 		l1 = pmap_l0_to_l1(l0, addr);
5218 		if (pmap_load(l1) == 0)
5219 			continue;
5220 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
5221 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5222 			KASSERT(va_next <= end_addr,
5223 			    ("partial update of non-transparent 1G page "
5224 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
5225 			    pmap_load(l1), addr, end_addr, va_next));
5226 			srcptepaddr = pmap_load(l1);
5227 			l1 = pmap_l1(dst_pmap, addr);
5228 			if (l1 == NULL) {
5229 				if (_pmap_alloc_l3(dst_pmap,
5230 				    pmap_l0_pindex(addr), NULL) == NULL)
5231 					break;
5232 				l1 = pmap_l1(dst_pmap, addr);
5233 			} else {
5234 				l0 = pmap_l0(dst_pmap, addr);
5235 				dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) &
5236 				    ~ATTR_MASK);
5237 				dst_m->ref_count++;
5238 			}
5239 			KASSERT(pmap_load(l1) == 0,
5240 			    ("1G mapping present in dst pmap "
5241 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
5242 			    pmap_load(l1), addr, end_addr, va_next));
5243 			pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
5244 			pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
5245 			continue;
5246 		}
5247 
5248 		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
5249 		if (va_next < addr)
5250 			va_next = end_addr;
5251 		l2 = pmap_l1_to_l2(l1, addr);
5252 		srcptepaddr = pmap_load(l2);
5253 		if (srcptepaddr == 0)
5254 			continue;
5255 		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
5256 			/*
5257 			 * We can only virtual copy whole superpages.
5258 			 */
5259 			if ((addr & L2_OFFSET) != 0 ||
5260 			    addr + L2_SIZE > end_addr)
5261 				continue;
5262 			l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
5263 			if (l2 == NULL)
5264 				break;
5265 			if (pmap_load(l2) == 0 &&
5266 			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
5267 			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
5268 			    PMAP_ENTER_NORECLAIM, &lock))) {
5269 				/*
5270 				 * We leave the dirty bit unchanged because
5271 				 * managed read/write superpage mappings are
5272 				 * required to be dirty.  However, managed
5273 				 * superpage mappings are not required to
5274 				 * have their accessed bit set, so we clear
5275 				 * it because we don't know if this mapping
5276 				 * will be used.
5277 				 */
5278 				srcptepaddr &= ~ATTR_SW_WIRED;
5279 				if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
5280 					srcptepaddr &= ~ATTR_AF;
5281 				pmap_store(l2, srcptepaddr);
5282 				pmap_resident_count_inc(dst_pmap, L2_SIZE /
5283 				    PAGE_SIZE);
5284 				atomic_add_long(&pmap_l2_mappings, 1);
5285 			} else
5286 				pmap_abort_ptp(dst_pmap, addr, dst_m);
5287 			continue;
5288 		}
5289 		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
5290 		    ("pmap_copy: invalid L2 entry"));
5291 		srcptepaddr &= ~ATTR_MASK;
5292 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
5293 		KASSERT(srcmpte->ref_count > 0,
5294 		    ("pmap_copy: source page table page is unused"));
5295 		if (va_next > end_addr)
5296 			va_next = end_addr;
5297 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
5298 		src_pte = &src_pte[pmap_l3_index(addr)];
5299 		dstmpte = NULL;
5300 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
5301 			ptetemp = pmap_load(src_pte);
5302 
5303 			/*
5304 			 * We only virtual copy managed pages.
5305 			 */
5306 			if ((ptetemp & ATTR_SW_MANAGED) == 0)
5307 				continue;
5308 
5309 			if (dstmpte != NULL) {
5310 				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
5311 				    ("dstmpte pindex/addr mismatch"));
5312 				dstmpte->ref_count++;
5313 			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
5314 			    NULL)) == NULL)
5315 				goto out;
5316 			dst_pte = (pt_entry_t *)
5317 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
5318 			dst_pte = &dst_pte[pmap_l3_index(addr)];
5319 			if (pmap_load(dst_pte) == 0 &&
5320 			    pmap_try_insert_pv_entry(dst_pmap, addr,
5321 			    PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) {
5322 				/*
5323 				 * Clear the wired, modified, and accessed
5324 				 * (referenced) bits during the copy.
5325 				 */
5326 				mask = ATTR_AF | ATTR_SW_WIRED;
5327 				nbits = 0;
5328 				if ((ptetemp & ATTR_SW_DBM) != 0)
5329 					nbits |= ATTR_S1_AP_RW_BIT;
5330 				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
5331 				pmap_resident_count_inc(dst_pmap, 1);
5332 			} else {
5333 				pmap_abort_ptp(dst_pmap, addr, dstmpte);
5334 				goto out;
5335 			}
5336 			/* Have we copied all of the valid mappings? */
5337 			if (dstmpte->ref_count >= srcmpte->ref_count)
5338 				break;
5339 		}
5340 	}
5341 out:
5342 	/*
5343 	 * XXX This barrier may not be needed because the destination pmap is
5344 	 * not active.
5345 	 */
5346 	dsb(ishst);
5347 
5348 	if (lock != NULL)
5349 		rw_wunlock(lock);
5350 	PMAP_UNLOCK(src_pmap);
5351 	PMAP_UNLOCK(dst_pmap);
5352 }
5353 
5354 /*
5355  *	pmap_zero_page zeros the specified hardware page by mapping
5356  *	the page into KVM and using bzero to clear its contents.
5357  */
5358 void
5359 pmap_zero_page(vm_page_t m)
5360 {
5361 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5362 
5363 	pagezero((void *)va);
5364 }
5365 
5366 /*
5367  *	pmap_zero_page_area zeros the specified hardware page by mapping
5368  *	the page into KVM and using bzero to clear its contents.
5369  *
5370  *	off and size may not cover an area beyond a single hardware page.
5371  */
5372 void
5373 pmap_zero_page_area(vm_page_t m, int off, int size)
5374 {
5375 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5376 
5377 	if (off == 0 && size == PAGE_SIZE)
5378 		pagezero((void *)va);
5379 	else
5380 		bzero((char *)va + off, size);
5381 }
5382 
5383 /*
5384  *	pmap_copy_page copies the specified (machine independent)
5385  *	page by mapping the page into virtual memory and using
5386  *	bcopy to copy the page, one machine dependent page at a
5387  *	time.
5388  */
5389 void
5390 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5391 {
5392 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5393 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5394 
5395 	pagecopy((void *)src, (void *)dst);
5396 }
5397 
5398 int unmapped_buf_allowed = 1;
5399 
5400 void
5401 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5402     vm_offset_t b_offset, int xfersize)
5403 {
5404 	void *a_cp, *b_cp;
5405 	vm_page_t m_a, m_b;
5406 	vm_paddr_t p_a, p_b;
5407 	vm_offset_t a_pg_offset, b_pg_offset;
5408 	int cnt;
5409 
5410 	while (xfersize > 0) {
5411 		a_pg_offset = a_offset & PAGE_MASK;
5412 		m_a = ma[a_offset >> PAGE_SHIFT];
5413 		p_a = m_a->phys_addr;
5414 		b_pg_offset = b_offset & PAGE_MASK;
5415 		m_b = mb[b_offset >> PAGE_SHIFT];
5416 		p_b = m_b->phys_addr;
5417 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5418 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5419 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
5420 			panic("!DMAP a %lx", p_a);
5421 		} else {
5422 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
5423 		}
5424 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
5425 			panic("!DMAP b %lx", p_b);
5426 		} else {
5427 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
5428 		}
5429 		bcopy(a_cp, b_cp, cnt);
5430 		a_offset += cnt;
5431 		b_offset += cnt;
5432 		xfersize -= cnt;
5433 	}
5434 }
5435 
5436 vm_offset_t
5437 pmap_quick_enter_page(vm_page_t m)
5438 {
5439 
5440 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
5441 }
5442 
5443 void
5444 pmap_quick_remove_page(vm_offset_t addr)
5445 {
5446 }
5447 
5448 /*
5449  * Returns true if the pmap's pv is one of the first
5450  * 16 pvs linked to from this page.  This count may
5451  * be changed upwards or downwards in the future; it
5452  * is only necessary that true be returned for a small
5453  * subset of pmaps for proper page aging.
5454  */
5455 boolean_t
5456 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5457 {
5458 	struct md_page *pvh;
5459 	struct rwlock *lock;
5460 	pv_entry_t pv;
5461 	int loops = 0;
5462 	boolean_t rv;
5463 
5464 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5465 	    ("pmap_page_exists_quick: page %p is not managed", m));
5466 	rv = FALSE;
5467 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5468 	rw_rlock(lock);
5469 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5470 		if (PV_PMAP(pv) == pmap) {
5471 			rv = TRUE;
5472 			break;
5473 		}
5474 		loops++;
5475 		if (loops >= 16)
5476 			break;
5477 	}
5478 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5479 		pvh = page_to_pvh(m);
5480 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5481 			if (PV_PMAP(pv) == pmap) {
5482 				rv = TRUE;
5483 				break;
5484 			}
5485 			loops++;
5486 			if (loops >= 16)
5487 				break;
5488 		}
5489 	}
5490 	rw_runlock(lock);
5491 	return (rv);
5492 }
5493 
5494 /*
5495  *	pmap_page_wired_mappings:
5496  *
5497  *	Return the number of managed mappings to the given physical page
5498  *	that are wired.
5499  */
5500 int
5501 pmap_page_wired_mappings(vm_page_t m)
5502 {
5503 	struct rwlock *lock;
5504 	struct md_page *pvh;
5505 	pmap_t pmap;
5506 	pt_entry_t *pte;
5507 	pv_entry_t pv;
5508 	int count, md_gen, pvh_gen;
5509 
5510 	if ((m->oflags & VPO_UNMANAGED) != 0)
5511 		return (0);
5512 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5513 	rw_rlock(lock);
5514 restart:
5515 	count = 0;
5516 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5517 		pmap = PV_PMAP(pv);
5518 		if (!PMAP_TRYLOCK(pmap)) {
5519 			md_gen = m->md.pv_gen;
5520 			rw_runlock(lock);
5521 			PMAP_LOCK(pmap);
5522 			rw_rlock(lock);
5523 			if (md_gen != m->md.pv_gen) {
5524 				PMAP_UNLOCK(pmap);
5525 				goto restart;
5526 			}
5527 		}
5528 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5529 		if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
5530 			count++;
5531 		PMAP_UNLOCK(pmap);
5532 	}
5533 	if ((m->flags & PG_FICTITIOUS) == 0) {
5534 		pvh = page_to_pvh(m);
5535 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5536 			pmap = PV_PMAP(pv);
5537 			if (!PMAP_TRYLOCK(pmap)) {
5538 				md_gen = m->md.pv_gen;
5539 				pvh_gen = pvh->pv_gen;
5540 				rw_runlock(lock);
5541 				PMAP_LOCK(pmap);
5542 				rw_rlock(lock);
5543 				if (md_gen != m->md.pv_gen ||
5544 				    pvh_gen != pvh->pv_gen) {
5545 					PMAP_UNLOCK(pmap);
5546 					goto restart;
5547 				}
5548 			}
5549 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
5550 			if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
5551 				count++;
5552 			PMAP_UNLOCK(pmap);
5553 		}
5554 	}
5555 	rw_runlock(lock);
5556 	return (count);
5557 }
5558 
5559 /*
5560  * Returns true if the given page is mapped individually or as part of
5561  * a 2mpage.  Otherwise, returns false.
5562  */
5563 bool
5564 pmap_page_is_mapped(vm_page_t m)
5565 {
5566 	struct rwlock *lock;
5567 	bool rv;
5568 
5569 	if ((m->oflags & VPO_UNMANAGED) != 0)
5570 		return (false);
5571 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5572 	rw_rlock(lock);
5573 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5574 	    ((m->flags & PG_FICTITIOUS) == 0 &&
5575 	    !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
5576 	rw_runlock(lock);
5577 	return (rv);
5578 }
5579 
5580 /*
5581  * Destroy all managed, non-wired mappings in the given user-space
5582  * pmap.  This pmap cannot be active on any processor besides the
5583  * caller.
5584  *
5585  * This function cannot be applied to the kernel pmap.  Moreover, it
5586  * is not intended for general use.  It is only to be used during
5587  * process termination.  Consequently, it can be implemented in ways
5588  * that make it faster than pmap_remove().  First, it can more quickly
5589  * destroy mappings by iterating over the pmap's collection of PV
5590  * entries, rather than searching the page table.  Second, it doesn't
5591  * have to test and clear the page table entries atomically, because
5592  * no processor is currently accessing the user address space.  In
5593  * particular, a page table entry's dirty bit won't change state once
5594  * this function starts.
5595  */
5596 void
5597 pmap_remove_pages(pmap_t pmap)
5598 {
5599 	pd_entry_t *pde;
5600 	pt_entry_t *pte, tpte;
5601 	struct spglist free;
5602 	struct pv_chunklist free_chunks[PMAP_MEMDOM];
5603 	vm_page_t m, ml3, mt;
5604 	pv_entry_t pv;
5605 	struct md_page *pvh;
5606 	struct pv_chunk *pc, *npc;
5607 	struct rwlock *lock;
5608 	int64_t bit;
5609 	uint64_t inuse, bitmask;
5610 	int allfree, field, i, idx, lvl;
5611 	int freed __pvused;
5612 	vm_paddr_t pa;
5613 
5614 	lock = NULL;
5615 
5616 	for (i = 0; i < PMAP_MEMDOM; i++)
5617 		TAILQ_INIT(&free_chunks[i]);
5618 	SLIST_INIT(&free);
5619 	PMAP_LOCK(pmap);
5620 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5621 		allfree = 1;
5622 		freed = 0;
5623 		for (field = 0; field < _NPCM; field++) {
5624 			inuse = ~pc->pc_map[field] & pc_freemask[field];
5625 			while (inuse != 0) {
5626 				bit = ffsl(inuse) - 1;
5627 				bitmask = 1UL << bit;
5628 				idx = field * 64 + bit;
5629 				pv = &pc->pc_pventry[idx];
5630 				inuse &= ~bitmask;
5631 
5632 				pde = pmap_pde(pmap, pv->pv_va, &lvl);
5633 				KASSERT(pde != NULL,
5634 				    ("Attempting to remove an unmapped page"));
5635 
5636 				switch(lvl) {
5637 				case 1:
5638 					pte = pmap_l1_to_l2(pde, pv->pv_va);
5639 					tpte = pmap_load(pte);
5640 					KASSERT((tpte & ATTR_DESCR_MASK) ==
5641 					    L2_BLOCK,
5642 					    ("Attempting to remove an invalid "
5643 					    "block: %lx", tpte));
5644 					break;
5645 				case 2:
5646 					pte = pmap_l2_to_l3(pde, pv->pv_va);
5647 					tpte = pmap_load(pte);
5648 					KASSERT((tpte & ATTR_DESCR_MASK) ==
5649 					    L3_PAGE,
5650 					    ("Attempting to remove an invalid "
5651 					     "page: %lx", tpte));
5652 					break;
5653 				default:
5654 					panic(
5655 					    "Invalid page directory level: %d",
5656 					    lvl);
5657 				}
5658 
5659 /*
5660  * We cannot remove wired pages from a process' mapping at this time
5661  */
5662 				if (tpte & ATTR_SW_WIRED) {
5663 					allfree = 0;
5664 					continue;
5665 				}
5666 
5667 				/* Mark free */
5668 				pc->pc_map[field] |= bitmask;
5669 
5670 				/*
5671 				 * Because this pmap is not active on other
5672 				 * processors, the dirty bit cannot have
5673 				 * changed state since we last loaded pte.
5674 				 */
5675 				pmap_clear(pte);
5676 
5677 				pa = tpte & ~ATTR_MASK;
5678 
5679 				m = PHYS_TO_VM_PAGE(pa);
5680 				KASSERT(m->phys_addr == pa,
5681 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5682 				    m, (uintmax_t)m->phys_addr,
5683 				    (uintmax_t)tpte));
5684 
5685 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5686 				    m < &vm_page_array[vm_page_array_size],
5687 				    ("pmap_remove_pages: bad pte %#jx",
5688 				    (uintmax_t)tpte));
5689 
5690 				/*
5691 				 * Update the vm_page_t clean/reference bits.
5692 				 */
5693 				if (pmap_pte_dirty(pmap, tpte)) {
5694 					switch (lvl) {
5695 					case 1:
5696 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5697 							vm_page_dirty(mt);
5698 						break;
5699 					case 2:
5700 						vm_page_dirty(m);
5701 						break;
5702 					}
5703 				}
5704 
5705 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5706 
5707 				switch (lvl) {
5708 				case 1:
5709 					pmap_resident_count_dec(pmap,
5710 					    L2_SIZE / PAGE_SIZE);
5711 					pvh = page_to_pvh(m);
5712 					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
5713 					pvh->pv_gen++;
5714 					if (TAILQ_EMPTY(&pvh->pv_list)) {
5715 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5716 							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
5717 							    TAILQ_EMPTY(&mt->md.pv_list))
5718 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5719 					}
5720 					ml3 = pmap_remove_pt_page(pmap,
5721 					    pv->pv_va);
5722 					if (ml3 != NULL) {
5723 						KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
5724 						    ("pmap_remove_pages: l3 page not promoted"));
5725 						pmap_resident_count_dec(pmap,1);
5726 						KASSERT(ml3->ref_count == NL3PG,
5727 						    ("pmap_remove_pages: l3 page ref count error"));
5728 						ml3->ref_count = 0;
5729 						pmap_add_delayed_free_list(ml3,
5730 						    &free, FALSE);
5731 					}
5732 					break;
5733 				case 2:
5734 					pmap_resident_count_dec(pmap, 1);
5735 					TAILQ_REMOVE(&m->md.pv_list, pv,
5736 					    pv_next);
5737 					m->md.pv_gen++;
5738 					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
5739 					    TAILQ_EMPTY(&m->md.pv_list) &&
5740 					    (m->flags & PG_FICTITIOUS) == 0) {
5741 						pvh = page_to_pvh(m);
5742 						if (TAILQ_EMPTY(&pvh->pv_list))
5743 							vm_page_aflag_clear(m,
5744 							    PGA_WRITEABLE);
5745 					}
5746 					break;
5747 				}
5748 				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
5749 				    &free);
5750 				freed++;
5751 			}
5752 		}
5753 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5754 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5755 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5756 		if (allfree) {
5757 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5758 			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
5759 			    pc_list);
5760 		}
5761 	}
5762 	if (lock != NULL)
5763 		rw_wunlock(lock);
5764 	pmap_invalidate_all(pmap);
5765 	free_pv_chunk_batch(free_chunks);
5766 	PMAP_UNLOCK(pmap);
5767 	vm_page_free_pages_toq(&free, true);
5768 }
5769 
5770 /*
5771  * This is used to check if a page has been accessed or modified.
5772  */
5773 static boolean_t
5774 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5775 {
5776 	struct rwlock *lock;
5777 	pv_entry_t pv;
5778 	struct md_page *pvh;
5779 	pt_entry_t *pte, mask, value;
5780 	pmap_t pmap;
5781 	int md_gen, pvh_gen;
5782 	boolean_t rv;
5783 
5784 	rv = FALSE;
5785 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5786 	rw_rlock(lock);
5787 restart:
5788 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5789 		pmap = PV_PMAP(pv);
5790 		PMAP_ASSERT_STAGE1(pmap);
5791 		if (!PMAP_TRYLOCK(pmap)) {
5792 			md_gen = m->md.pv_gen;
5793 			rw_runlock(lock);
5794 			PMAP_LOCK(pmap);
5795 			rw_rlock(lock);
5796 			if (md_gen != m->md.pv_gen) {
5797 				PMAP_UNLOCK(pmap);
5798 				goto restart;
5799 			}
5800 		}
5801 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5802 		mask = 0;
5803 		value = 0;
5804 		if (modified) {
5805 			mask |= ATTR_S1_AP_RW_BIT;
5806 			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5807 		}
5808 		if (accessed) {
5809 			mask |= ATTR_AF | ATTR_DESCR_MASK;
5810 			value |= ATTR_AF | L3_PAGE;
5811 		}
5812 		rv = (pmap_load(pte) & mask) == value;
5813 		PMAP_UNLOCK(pmap);
5814 		if (rv)
5815 			goto out;
5816 	}
5817 	if ((m->flags & PG_FICTITIOUS) == 0) {
5818 		pvh = page_to_pvh(m);
5819 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5820 			pmap = PV_PMAP(pv);
5821 			PMAP_ASSERT_STAGE1(pmap);
5822 			if (!PMAP_TRYLOCK(pmap)) {
5823 				md_gen = m->md.pv_gen;
5824 				pvh_gen = pvh->pv_gen;
5825 				rw_runlock(lock);
5826 				PMAP_LOCK(pmap);
5827 				rw_rlock(lock);
5828 				if (md_gen != m->md.pv_gen ||
5829 				    pvh_gen != pvh->pv_gen) {
5830 					PMAP_UNLOCK(pmap);
5831 					goto restart;
5832 				}
5833 			}
5834 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
5835 			mask = 0;
5836 			value = 0;
5837 			if (modified) {
5838 				mask |= ATTR_S1_AP_RW_BIT;
5839 				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5840 			}
5841 			if (accessed) {
5842 				mask |= ATTR_AF | ATTR_DESCR_MASK;
5843 				value |= ATTR_AF | L2_BLOCK;
5844 			}
5845 			rv = (pmap_load(pte) & mask) == value;
5846 			PMAP_UNLOCK(pmap);
5847 			if (rv)
5848 				goto out;
5849 		}
5850 	}
5851 out:
5852 	rw_runlock(lock);
5853 	return (rv);
5854 }
5855 
5856 /*
5857  *	pmap_is_modified:
5858  *
5859  *	Return whether or not the specified physical page was modified
5860  *	in any physical maps.
5861  */
5862 boolean_t
5863 pmap_is_modified(vm_page_t m)
5864 {
5865 
5866 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5867 	    ("pmap_is_modified: page %p is not managed", m));
5868 
5869 	/*
5870 	 * If the page is not busied then this check is racy.
5871 	 */
5872 	if (!pmap_page_is_write_mapped(m))
5873 		return (FALSE);
5874 	return (pmap_page_test_mappings(m, FALSE, TRUE));
5875 }
5876 
5877 /*
5878  *	pmap_is_prefaultable:
5879  *
5880  *	Return whether or not the specified virtual address is eligible
5881  *	for prefault.
5882  */
5883 boolean_t
5884 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5885 {
5886 	pd_entry_t *pde;
5887 	pt_entry_t *pte;
5888 	boolean_t rv;
5889 	int lvl;
5890 
5891 	/*
5892 	 * Return TRUE if and only if the L3 entry for the specified virtual
5893 	 * address is allocated but invalid.
5894 	 */
5895 	rv = FALSE;
5896 	PMAP_LOCK(pmap);
5897 	pde = pmap_pde(pmap, addr, &lvl);
5898 	if (pde != NULL && lvl == 2) {
5899 		pte = pmap_l2_to_l3(pde, addr);
5900 		rv = pmap_load(pte) == 0;
5901 	}
5902 	PMAP_UNLOCK(pmap);
5903 	return (rv);
5904 }
5905 
5906 /*
5907  *	pmap_is_referenced:
5908  *
5909  *	Return whether or not the specified physical page was referenced
5910  *	in any physical maps.
5911  */
5912 boolean_t
5913 pmap_is_referenced(vm_page_t m)
5914 {
5915 
5916 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5917 	    ("pmap_is_referenced: page %p is not managed", m));
5918 	return (pmap_page_test_mappings(m, TRUE, FALSE));
5919 }
5920 
5921 /*
5922  * Clear the write and modified bits in each of the given page's mappings.
5923  */
5924 void
5925 pmap_remove_write(vm_page_t m)
5926 {
5927 	struct md_page *pvh;
5928 	pmap_t pmap;
5929 	struct rwlock *lock;
5930 	pv_entry_t next_pv, pv;
5931 	pt_entry_t oldpte, *pte, set, clear, mask, val;
5932 	vm_offset_t va;
5933 	int md_gen, pvh_gen;
5934 
5935 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5936 	    ("pmap_remove_write: page %p is not managed", m));
5937 	vm_page_assert_busied(m);
5938 
5939 	if (!pmap_page_is_write_mapped(m))
5940 		return;
5941 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5942 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
5943 	rw_wlock(lock);
5944 retry:
5945 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5946 		pmap = PV_PMAP(pv);
5947 		PMAP_ASSERT_STAGE1(pmap);
5948 		if (!PMAP_TRYLOCK(pmap)) {
5949 			pvh_gen = pvh->pv_gen;
5950 			rw_wunlock(lock);
5951 			PMAP_LOCK(pmap);
5952 			rw_wlock(lock);
5953 			if (pvh_gen != pvh->pv_gen) {
5954 				PMAP_UNLOCK(pmap);
5955 				goto retry;
5956 			}
5957 		}
5958 		va = pv->pv_va;
5959 		pte = pmap_pte_exists(pmap, va, 2, __func__);
5960 		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
5961 			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
5962 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5963 		    ("inconsistent pv lock %p %p for page %p",
5964 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5965 		PMAP_UNLOCK(pmap);
5966 	}
5967 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5968 		pmap = PV_PMAP(pv);
5969 		if (!PMAP_TRYLOCK(pmap)) {
5970 			pvh_gen = pvh->pv_gen;
5971 			md_gen = m->md.pv_gen;
5972 			rw_wunlock(lock);
5973 			PMAP_LOCK(pmap);
5974 			rw_wlock(lock);
5975 			if (pvh_gen != pvh->pv_gen ||
5976 			    md_gen != m->md.pv_gen) {
5977 				PMAP_UNLOCK(pmap);
5978 				goto retry;
5979 			}
5980 		}
5981 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5982 		oldpte = pmap_load(pte);
5983 		if ((oldpte & ATTR_SW_DBM) != 0) {
5984 			if (pmap->pm_stage == PM_STAGE1) {
5985 				set = ATTR_S1_AP_RW_BIT;
5986 				clear = 0;
5987 				mask = ATTR_S1_AP_RW_BIT;
5988 				val = ATTR_S1_AP(ATTR_S1_AP_RW);
5989 			} else {
5990 				set = 0;
5991 				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5992 				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5993 				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5994 			}
5995 			clear |= ATTR_SW_DBM;
5996 			while (!atomic_fcmpset_64(pte, &oldpte,
5997 			    (oldpte | set) & ~clear))
5998 				cpu_spinwait();
5999 
6000 			if ((oldpte & mask) == val)
6001 				vm_page_dirty(m);
6002 			pmap_invalidate_page(pmap, pv->pv_va, true);
6003 		}
6004 		PMAP_UNLOCK(pmap);
6005 	}
6006 	rw_wunlock(lock);
6007 	vm_page_aflag_clear(m, PGA_WRITEABLE);
6008 }
6009 
6010 /*
6011  *	pmap_ts_referenced:
6012  *
6013  *	Return a count of reference bits for a page, clearing those bits.
6014  *	It is not necessary for every reference bit to be cleared, but it
6015  *	is necessary that 0 only be returned when there are truly no
6016  *	reference bits set.
6017  *
6018  *	As an optimization, update the page's dirty field if a modified bit is
6019  *	found while counting reference bits.  This opportunistic update can be
6020  *	performed at low cost and can eliminate the need for some future calls
6021  *	to pmap_is_modified().  However, since this function stops after
6022  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
6023  *	dirty pages.  Those dirty pages will only be detected by a future call
6024  *	to pmap_is_modified().
6025  */
6026 int
6027 pmap_ts_referenced(vm_page_t m)
6028 {
6029 	struct md_page *pvh;
6030 	pv_entry_t pv, pvf;
6031 	pmap_t pmap;
6032 	struct rwlock *lock;
6033 	pt_entry_t *pte, tpte;
6034 	vm_offset_t va;
6035 	vm_paddr_t pa;
6036 	int cleared, md_gen, not_cleared, pvh_gen;
6037 	struct spglist free;
6038 
6039 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6040 	    ("pmap_ts_referenced: page %p is not managed", m));
6041 	SLIST_INIT(&free);
6042 	cleared = 0;
6043 	pa = VM_PAGE_TO_PHYS(m);
6044 	lock = PHYS_TO_PV_LIST_LOCK(pa);
6045 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
6046 	rw_wlock(lock);
6047 retry:
6048 	not_cleared = 0;
6049 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
6050 		goto small_mappings;
6051 	pv = pvf;
6052 	do {
6053 		if (pvf == NULL)
6054 			pvf = pv;
6055 		pmap = PV_PMAP(pv);
6056 		if (!PMAP_TRYLOCK(pmap)) {
6057 			pvh_gen = pvh->pv_gen;
6058 			rw_wunlock(lock);
6059 			PMAP_LOCK(pmap);
6060 			rw_wlock(lock);
6061 			if (pvh_gen != pvh->pv_gen) {
6062 				PMAP_UNLOCK(pmap);
6063 				goto retry;
6064 			}
6065 		}
6066 		va = pv->pv_va;
6067 		pte = pmap_pte_exists(pmap, va, 2, __func__);
6068 		tpte = pmap_load(pte);
6069 		if (pmap_pte_dirty(pmap, tpte)) {
6070 			/*
6071 			 * Although "tpte" is mapping a 2MB page, because
6072 			 * this function is called at a 4KB page granularity,
6073 			 * we only update the 4KB page under test.
6074 			 */
6075 			vm_page_dirty(m);
6076 		}
6077 		if ((tpte & ATTR_AF) != 0) {
6078 			/*
6079 			 * Since this reference bit is shared by 512 4KB pages,
6080 			 * it should not be cleared every time it is tested.
6081 			 * Apply a simple "hash" function on the physical page
6082 			 * number, the virtual superpage number, and the pmap
6083 			 * address to select one 4KB page out of the 512 on
6084 			 * which testing the reference bit will result in
6085 			 * clearing that reference bit.  This function is
6086 			 * designed to avoid the selection of the same 4KB page
6087 			 * for every 2MB page mapping.
6088 			 *
6089 			 * On demotion, a mapping that hasn't been referenced
6090 			 * is simply destroyed.  To avoid the possibility of a
6091 			 * subsequent page fault on a demoted wired mapping,
6092 			 * always leave its reference bit set.  Moreover,
6093 			 * since the superpage is wired, the current state of
6094 			 * its reference bit won't affect page replacement.
6095 			 */
6096 			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
6097 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
6098 			    (tpte & ATTR_SW_WIRED) == 0) {
6099 				pmap_clear_bits(pte, ATTR_AF);
6100 				pmap_invalidate_page(pmap, va, true);
6101 				cleared++;
6102 			} else
6103 				not_cleared++;
6104 		}
6105 		PMAP_UNLOCK(pmap);
6106 		/* Rotate the PV list if it has more than one entry. */
6107 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
6108 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
6109 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
6110 			pvh->pv_gen++;
6111 		}
6112 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
6113 			goto out;
6114 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
6115 small_mappings:
6116 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
6117 		goto out;
6118 	pv = pvf;
6119 	do {
6120 		if (pvf == NULL)
6121 			pvf = pv;
6122 		pmap = PV_PMAP(pv);
6123 		if (!PMAP_TRYLOCK(pmap)) {
6124 			pvh_gen = pvh->pv_gen;
6125 			md_gen = m->md.pv_gen;
6126 			rw_wunlock(lock);
6127 			PMAP_LOCK(pmap);
6128 			rw_wlock(lock);
6129 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6130 				PMAP_UNLOCK(pmap);
6131 				goto retry;
6132 			}
6133 		}
6134 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6135 		tpte = pmap_load(pte);
6136 		if (pmap_pte_dirty(pmap, tpte))
6137 			vm_page_dirty(m);
6138 		if ((tpte & ATTR_AF) != 0) {
6139 			if ((tpte & ATTR_SW_WIRED) == 0) {
6140 				pmap_clear_bits(pte, ATTR_AF);
6141 				pmap_invalidate_page(pmap, pv->pv_va, true);
6142 				cleared++;
6143 			} else
6144 				not_cleared++;
6145 		}
6146 		PMAP_UNLOCK(pmap);
6147 		/* Rotate the PV list if it has more than one entry. */
6148 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
6149 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6150 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
6151 			m->md.pv_gen++;
6152 		}
6153 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
6154 	    not_cleared < PMAP_TS_REFERENCED_MAX);
6155 out:
6156 	rw_wunlock(lock);
6157 	vm_page_free_pages_toq(&free, true);
6158 	return (cleared + not_cleared);
6159 }
6160 
6161 /*
6162  *	Apply the given advice to the specified range of addresses within the
6163  *	given pmap.  Depending on the advice, clear the referenced and/or
6164  *	modified flags in each mapping and set the mapped page's dirty field.
6165  */
6166 void
6167 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
6168 {
6169 	struct rwlock *lock;
6170 	vm_offset_t va, va_next;
6171 	vm_page_t m;
6172 	pd_entry_t *l0, *l1, *l2, oldl2;
6173 	pt_entry_t *l3, oldl3;
6174 
6175 	PMAP_ASSERT_STAGE1(pmap);
6176 
6177 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
6178 		return;
6179 
6180 	PMAP_LOCK(pmap);
6181 	for (; sva < eva; sva = va_next) {
6182 		l0 = pmap_l0(pmap, sva);
6183 		if (pmap_load(l0) == 0) {
6184 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6185 			if (va_next < sva)
6186 				va_next = eva;
6187 			continue;
6188 		}
6189 
6190 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6191 		if (va_next < sva)
6192 			va_next = eva;
6193 		l1 = pmap_l0_to_l1(l0, sva);
6194 		if (pmap_load(l1) == 0)
6195 			continue;
6196 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6197 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6198 			continue;
6199 		}
6200 
6201 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6202 		if (va_next < sva)
6203 			va_next = eva;
6204 		l2 = pmap_l1_to_l2(l1, sva);
6205 		oldl2 = pmap_load(l2);
6206 		if (oldl2 == 0)
6207 			continue;
6208 		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
6209 			if ((oldl2 & ATTR_SW_MANAGED) == 0)
6210 				continue;
6211 			lock = NULL;
6212 			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
6213 				if (lock != NULL)
6214 					rw_wunlock(lock);
6215 
6216 				/*
6217 				 * The 2MB page mapping was destroyed.
6218 				 */
6219 				continue;
6220 			}
6221 
6222 			/*
6223 			 * Unless the page mappings are wired, remove the
6224 			 * mapping to a single page so that a subsequent
6225 			 * access may repromote.  Choosing the last page
6226 			 * within the address range [sva, min(va_next, eva))
6227 			 * generally results in more repromotions.  Since the
6228 			 * underlying page table page is fully populated, this
6229 			 * removal never frees a page table page.
6230 			 */
6231 			if ((oldl2 & ATTR_SW_WIRED) == 0) {
6232 				va = eva;
6233 				if (va > va_next)
6234 					va = va_next;
6235 				va -= PAGE_SIZE;
6236 				KASSERT(va >= sva,
6237 				    ("pmap_advise: no address gap"));
6238 				l3 = pmap_l2_to_l3(l2, va);
6239 				KASSERT(pmap_load(l3) != 0,
6240 				    ("pmap_advise: invalid PTE"));
6241 				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
6242 				    NULL, &lock);
6243 			}
6244 			if (lock != NULL)
6245 				rw_wunlock(lock);
6246 		}
6247 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6248 		    ("pmap_advise: invalid L2 entry after demotion"));
6249 		if (va_next > eva)
6250 			va_next = eva;
6251 		va = va_next;
6252 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
6253 		    sva += L3_SIZE) {
6254 			oldl3 = pmap_load(l3);
6255 			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
6256 			    (ATTR_SW_MANAGED | L3_PAGE))
6257 				goto maybe_invlrng;
6258 			else if (pmap_pte_dirty(pmap, oldl3)) {
6259 				if (advice == MADV_DONTNEED) {
6260 					/*
6261 					 * Future calls to pmap_is_modified()
6262 					 * can be avoided by making the page
6263 					 * dirty now.
6264 					 */
6265 					m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK);
6266 					vm_page_dirty(m);
6267 				}
6268 				while (!atomic_fcmpset_long(l3, &oldl3,
6269 				    (oldl3 & ~ATTR_AF) |
6270 				    ATTR_S1_AP(ATTR_S1_AP_RO)))
6271 					cpu_spinwait();
6272 			} else if ((oldl3 & ATTR_AF) != 0)
6273 				pmap_clear_bits(l3, ATTR_AF);
6274 			else
6275 				goto maybe_invlrng;
6276 			if (va == va_next)
6277 				va = sva;
6278 			continue;
6279 maybe_invlrng:
6280 			if (va != va_next) {
6281 				pmap_s1_invalidate_range(pmap, va, sva, true);
6282 				va = va_next;
6283 			}
6284 		}
6285 		if (va != va_next)
6286 			pmap_s1_invalidate_range(pmap, va, sva, true);
6287 	}
6288 	PMAP_UNLOCK(pmap);
6289 }
6290 
6291 /*
6292  *	Clear the modify bits on the specified physical page.
6293  */
6294 void
6295 pmap_clear_modify(vm_page_t m)
6296 {
6297 	struct md_page *pvh;
6298 	struct rwlock *lock;
6299 	pmap_t pmap;
6300 	pv_entry_t next_pv, pv;
6301 	pd_entry_t *l2, oldl2;
6302 	pt_entry_t *l3, oldl3;
6303 	vm_offset_t va;
6304 	int md_gen, pvh_gen;
6305 
6306 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6307 	    ("pmap_clear_modify: page %p is not managed", m));
6308 	vm_page_assert_busied(m);
6309 
6310 	if (!pmap_page_is_write_mapped(m))
6311 		return;
6312 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
6313 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6314 	rw_wlock(lock);
6315 restart:
6316 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6317 		pmap = PV_PMAP(pv);
6318 		PMAP_ASSERT_STAGE1(pmap);
6319 		if (!PMAP_TRYLOCK(pmap)) {
6320 			pvh_gen = pvh->pv_gen;
6321 			rw_wunlock(lock);
6322 			PMAP_LOCK(pmap);
6323 			rw_wlock(lock);
6324 			if (pvh_gen != pvh->pv_gen) {
6325 				PMAP_UNLOCK(pmap);
6326 				goto restart;
6327 			}
6328 		}
6329 		va = pv->pv_va;
6330 		l2 = pmap_l2(pmap, va);
6331 		oldl2 = pmap_load(l2);
6332 		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
6333 		if ((oldl2 & ATTR_SW_DBM) != 0 &&
6334 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
6335 		    (oldl2 & ATTR_SW_WIRED) == 0) {
6336 			/*
6337 			 * Write protect the mapping to a single page so that
6338 			 * a subsequent write access may repromote.
6339 			 */
6340 			va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK);
6341 			l3 = pmap_l2_to_l3(l2, va);
6342 			oldl3 = pmap_load(l3);
6343 			while (!atomic_fcmpset_long(l3, &oldl3,
6344 			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
6345 				cpu_spinwait();
6346 			vm_page_dirty(m);
6347 			pmap_s1_invalidate_page(pmap, va, true);
6348 		}
6349 		PMAP_UNLOCK(pmap);
6350 	}
6351 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6352 		pmap = PV_PMAP(pv);
6353 		PMAP_ASSERT_STAGE1(pmap);
6354 		if (!PMAP_TRYLOCK(pmap)) {
6355 			md_gen = m->md.pv_gen;
6356 			pvh_gen = pvh->pv_gen;
6357 			rw_wunlock(lock);
6358 			PMAP_LOCK(pmap);
6359 			rw_wlock(lock);
6360 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6361 				PMAP_UNLOCK(pmap);
6362 				goto restart;
6363 			}
6364 		}
6365 		l2 = pmap_l2(pmap, pv->pv_va);
6366 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
6367 		oldl3 = pmap_load(l3);
6368 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){
6369 			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
6370 			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
6371 		}
6372 		PMAP_UNLOCK(pmap);
6373 	}
6374 	rw_wunlock(lock);
6375 }
6376 
6377 void *
6378 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6379 {
6380 	struct pmap_preinit_mapping *ppim;
6381 	vm_offset_t va, offset;
6382 	pd_entry_t *pde;
6383 	pt_entry_t *l2;
6384 	int i, lvl, l2_blocks, free_l2_count, start_idx;
6385 
6386 	if (!vm_initialized) {
6387 		/*
6388 		 * No L3 ptables so map entire L2 blocks where start VA is:
6389 		 * 	preinit_map_va + start_idx * L2_SIZE
6390 		 * There may be duplicate mappings (multiple VA -> same PA) but
6391 		 * ARM64 dcache is always PIPT so that's acceptable.
6392 		 */
6393 		 if (size == 0)
6394 			 return (NULL);
6395 
6396 		 /* Calculate how many L2 blocks are needed for the mapping */
6397 		l2_blocks = (roundup2(pa + size, L2_SIZE) -
6398 		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
6399 
6400 		offset = pa & L2_OFFSET;
6401 
6402 		if (preinit_map_va == 0)
6403 			return (NULL);
6404 
6405 		/* Map 2MiB L2 blocks from reserved VA space */
6406 
6407 		free_l2_count = 0;
6408 		start_idx = -1;
6409 		/* Find enough free contiguous VA space */
6410 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6411 			ppim = pmap_preinit_mapping + i;
6412 			if (free_l2_count > 0 && ppim->pa != 0) {
6413 				/* Not enough space here */
6414 				free_l2_count = 0;
6415 				start_idx = -1;
6416 				continue;
6417 			}
6418 
6419 			if (ppim->pa == 0) {
6420 				/* Free L2 block */
6421 				if (start_idx == -1)
6422 					start_idx = i;
6423 				free_l2_count++;
6424 				if (free_l2_count == l2_blocks)
6425 					break;
6426 			}
6427 		}
6428 		if (free_l2_count != l2_blocks)
6429 			panic("%s: too many preinit mappings", __func__);
6430 
6431 		va = preinit_map_va + (start_idx * L2_SIZE);
6432 		for (i = start_idx; i < start_idx + l2_blocks; i++) {
6433 			/* Mark entries as allocated */
6434 			ppim = pmap_preinit_mapping + i;
6435 			ppim->pa = pa;
6436 			ppim->va = va + offset;
6437 			ppim->size = size;
6438 		}
6439 
6440 		/* Map L2 blocks */
6441 		pa = rounddown2(pa, L2_SIZE);
6442 		for (i = 0; i < l2_blocks; i++) {
6443 			pde = pmap_pde(kernel_pmap, va, &lvl);
6444 			KASSERT(pde != NULL,
6445 			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
6446 			    va));
6447 			KASSERT(lvl == 1,
6448 			    ("pmap_mapbios: Invalid level %d", lvl));
6449 
6450 			/* Insert L2_BLOCK */
6451 			l2 = pmap_l1_to_l2(pde, va);
6452 			pmap_load_store(l2,
6453 			    pa | ATTR_DEFAULT | ATTR_S1_XN |
6454 			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
6455 
6456 			va += L2_SIZE;
6457 			pa += L2_SIZE;
6458 		}
6459 		pmap_s1_invalidate_all(kernel_pmap);
6460 
6461 		va = preinit_map_va + (start_idx * L2_SIZE);
6462 
6463 	} else {
6464 		/* kva_alloc may be used to map the pages */
6465 		offset = pa & PAGE_MASK;
6466 		size = round_page(offset + size);
6467 
6468 		va = kva_alloc(size);
6469 		if (va == 0)
6470 			panic("%s: Couldn't allocate KVA", __func__);
6471 
6472 		pde = pmap_pde(kernel_pmap, va, &lvl);
6473 		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
6474 
6475 		/* L3 table is linked */
6476 		va = trunc_page(va);
6477 		pa = trunc_page(pa);
6478 		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
6479 	}
6480 
6481 	return ((void *)(va + offset));
6482 }
6483 
6484 void
6485 pmap_unmapbios(void *p, vm_size_t size)
6486 {
6487 	struct pmap_preinit_mapping *ppim;
6488 	vm_offset_t offset, tmpsize, va, va_trunc;
6489 	pd_entry_t *pde;
6490 	pt_entry_t *l2;
6491 	int i, lvl, l2_blocks, block;
6492 	bool preinit_map;
6493 
6494 	va = (vm_offset_t)p;
6495 	l2_blocks =
6496 	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
6497 	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
6498 
6499 	/* Remove preinit mapping */
6500 	preinit_map = false;
6501 	block = 0;
6502 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6503 		ppim = pmap_preinit_mapping + i;
6504 		if (ppim->va == va) {
6505 			KASSERT(ppim->size == size,
6506 			    ("pmap_unmapbios: size mismatch"));
6507 			ppim->va = 0;
6508 			ppim->pa = 0;
6509 			ppim->size = 0;
6510 			preinit_map = true;
6511 			offset = block * L2_SIZE;
6512 			va_trunc = rounddown2(va, L2_SIZE) + offset;
6513 
6514 			/* Remove L2_BLOCK */
6515 			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
6516 			KASSERT(pde != NULL,
6517 			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
6518 			    va_trunc));
6519 			l2 = pmap_l1_to_l2(pde, va_trunc);
6520 			pmap_clear(l2);
6521 
6522 			if (block == (l2_blocks - 1))
6523 				break;
6524 			block++;
6525 		}
6526 	}
6527 	if (preinit_map) {
6528 		pmap_s1_invalidate_all(kernel_pmap);
6529 		return;
6530 	}
6531 
6532 	/* Unmap the pages reserved with kva_alloc. */
6533 	if (vm_initialized) {
6534 		offset = va & PAGE_MASK;
6535 		size = round_page(offset + size);
6536 		va = trunc_page(va);
6537 
6538 		pde = pmap_pde(kernel_pmap, va, &lvl);
6539 		KASSERT(pde != NULL,
6540 		    ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va));
6541 		KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl));
6542 
6543 		/* Unmap and invalidate the pages */
6544                 for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6545 			pmap_kremove(va + tmpsize);
6546 
6547 		kva_free(va, size);
6548 	}
6549 }
6550 
6551 /*
6552  * Sets the memory attribute for the specified page.
6553  */
6554 void
6555 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6556 {
6557 
6558 	m->md.pv_memattr = ma;
6559 
6560 	/*
6561 	 * If "m" is a normal page, update its direct mapping.  This update
6562 	 * can be relied upon to perform any cache operations that are
6563 	 * required for data coherence.
6564 	 */
6565 	if ((m->flags & PG_FICTITIOUS) == 0 &&
6566 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6567 	    m->md.pv_memattr) != 0)
6568 		panic("memory attribute change on the direct map failed");
6569 }
6570 
6571 /*
6572  * Changes the specified virtual address range's memory type to that given by
6573  * the parameter "mode".  The specified virtual address range must be
6574  * completely contained within either the direct map or the kernel map.  If
6575  * the virtual address range is contained within the kernel map, then the
6576  * memory type for each of the corresponding ranges of the direct map is also
6577  * changed.  (The corresponding ranges of the direct map are those ranges that
6578  * map the same physical pages as the specified virtual address range.)  These
6579  * changes to the direct map are necessary because Intel describes the
6580  * behavior of their processors as "undefined" if two or more mappings to the
6581  * same physical page have different memory types.
6582  *
6583  * Returns zero if the change completed successfully, and either EINVAL or
6584  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6585  * of the virtual address range was not mapped, and ENOMEM is returned if
6586  * there was insufficient memory available to complete the change.  In the
6587  * latter case, the memory type may have been changed on some part of the
6588  * virtual address range or the direct map.
6589  */
6590 int
6591 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6592 {
6593 	int error;
6594 
6595 	PMAP_LOCK(kernel_pmap);
6596 	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
6597 	PMAP_UNLOCK(kernel_pmap);
6598 	return (error);
6599 }
6600 
6601 /*
6602  * Changes the specified virtual address range's protections to those
6603  * specified by "prot".  Like pmap_change_attr(), protections for aliases
6604  * in the direct map are updated as well.  Protections on aliasing mappings may
6605  * be a subset of the requested protections; for example, mappings in the direct
6606  * map are never executable.
6607  */
6608 int
6609 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
6610 {
6611 	int error;
6612 
6613 	/* Only supported within the kernel map. */
6614 	if (va < VM_MIN_KERNEL_ADDRESS)
6615 		return (EINVAL);
6616 
6617 	PMAP_LOCK(kernel_pmap);
6618 	error = pmap_change_props_locked(va, size, prot, -1, false);
6619 	PMAP_UNLOCK(kernel_pmap);
6620 	return (error);
6621 }
6622 
6623 static int
6624 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
6625     int mode, bool skip_unmapped)
6626 {
6627 	vm_offset_t base, offset, tmpva;
6628 	vm_size_t pte_size;
6629 	vm_paddr_t pa;
6630 	pt_entry_t pte, *ptep, *newpte;
6631 	pt_entry_t bits, mask;
6632 	int lvl, rv;
6633 
6634 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6635 	base = trunc_page(va);
6636 	offset = va & PAGE_MASK;
6637 	size = round_page(offset + size);
6638 
6639 	if (!VIRT_IN_DMAP(base) &&
6640 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
6641 		return (EINVAL);
6642 
6643 	bits = 0;
6644 	mask = 0;
6645 	if (mode != -1) {
6646 		bits = ATTR_S1_IDX(mode);
6647 		mask = ATTR_S1_IDX_MASK;
6648 		if (mode == VM_MEMATTR_DEVICE) {
6649 			mask |= ATTR_S1_XN;
6650 			bits |= ATTR_S1_XN;
6651 		}
6652 	}
6653 	if (prot != VM_PROT_NONE) {
6654 		/* Don't mark the DMAP as executable. It never is on arm64. */
6655 		if (VIRT_IN_DMAP(base)) {
6656 			prot &= ~VM_PROT_EXECUTE;
6657 			/*
6658 			 * XXX Mark the DMAP as writable for now. We rely
6659 			 * on this in ddb & dtrace to insert breakpoint
6660 			 * instructions.
6661 			 */
6662 			prot |= VM_PROT_WRITE;
6663 		}
6664 
6665 		if ((prot & VM_PROT_WRITE) == 0) {
6666 			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
6667 		}
6668 		if ((prot & VM_PROT_EXECUTE) == 0) {
6669 			bits |= ATTR_S1_PXN;
6670 		}
6671 		bits |= ATTR_S1_UXN;
6672 		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
6673 	}
6674 
6675 	for (tmpva = base; tmpva < base + size; ) {
6676 		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
6677 		if (ptep == NULL && !skip_unmapped) {
6678 			return (EINVAL);
6679 		} else if ((ptep == NULL && skip_unmapped) ||
6680 		    (pmap_load(ptep) & mask) == bits) {
6681 			/*
6682 			 * We already have the correct attribute or there
6683 			 * is no memory mapped at this address and we are
6684 			 * skipping unmapped memory.
6685 			 */
6686 			switch (lvl) {
6687 			default:
6688 				panic("Invalid DMAP table level: %d\n", lvl);
6689 			case 1:
6690 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
6691 				break;
6692 			case 2:
6693 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
6694 				break;
6695 			case 3:
6696 				tmpva += PAGE_SIZE;
6697 				break;
6698 			}
6699 		} else {
6700 			/* We can't demote/promote this entry */
6701 			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
6702 
6703 			/*
6704 			 * Split the entry to an level 3 table, then
6705 			 * set the new attribute.
6706 			 */
6707 			switch (lvl) {
6708 			default:
6709 				panic("Invalid DMAP table level: %d\n", lvl);
6710 			case 1:
6711 				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6712 				if ((tmpva & L1_OFFSET) == 0 &&
6713 				    (base + size - tmpva) >= L1_SIZE) {
6714 					pte_size = L1_SIZE;
6715 					break;
6716 				}
6717 				newpte = pmap_demote_l1(kernel_pmap, ptep,
6718 				    tmpva & ~L1_OFFSET);
6719 				if (newpte == NULL)
6720 					return (EINVAL);
6721 				ptep = pmap_l1_to_l2(ptep, tmpva);
6722 				/* FALLTHROUGH */
6723 			case 2:
6724 				if ((tmpva & L2_OFFSET) == 0 &&
6725 				    (base + size - tmpva) >= L2_SIZE) {
6726 					pte_size = L2_SIZE;
6727 					break;
6728 				}
6729 				newpte = pmap_demote_l2(kernel_pmap, ptep,
6730 				    tmpva);
6731 				if (newpte == NULL)
6732 					return (EINVAL);
6733 				ptep = pmap_l2_to_l3(ptep, tmpva);
6734 				/* FALLTHROUGH */
6735 			case 3:
6736 				pte_size = PAGE_SIZE;
6737 				break;
6738 			}
6739 
6740 			/* Update the entry */
6741 			pte = pmap_load(ptep);
6742 			pte &= ~mask;
6743 			pte |= bits;
6744 
6745 			pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
6746 			    pte_size);
6747 
6748 			pa = pte & ~ATTR_MASK;
6749 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
6750 				/*
6751 				 * Keep the DMAP memory in sync.
6752 				 */
6753 				rv = pmap_change_props_locked(
6754 				    PHYS_TO_DMAP(pa), pte_size,
6755 				    prot, mode, true);
6756 				if (rv != 0)
6757 					return (rv);
6758 			}
6759 
6760 			/*
6761 			 * If moving to a non-cacheable entry flush
6762 			 * the cache.
6763 			 */
6764 			if (mode == VM_MEMATTR_UNCACHEABLE)
6765 				cpu_dcache_wbinv_range(tmpva, pte_size);
6766 			tmpva += pte_size;
6767 		}
6768 	}
6769 
6770 	return (0);
6771 }
6772 
6773 /*
6774  * Create an L2 table to map all addresses within an L1 mapping.
6775  */
6776 static pt_entry_t *
6777 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
6778 {
6779 	pt_entry_t *l2, newl2, oldl1;
6780 	vm_offset_t tmpl1;
6781 	vm_paddr_t l2phys, phys;
6782 	vm_page_t ml2;
6783 	int i;
6784 
6785 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6786 	oldl1 = pmap_load(l1);
6787 	PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6788 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
6789 	    ("pmap_demote_l1: Demoting a non-block entry"));
6790 	KASSERT((va & L1_OFFSET) == 0,
6791 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
6792 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
6793 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
6794 	KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
6795 	    ("pmap_demote_l1: Demoting entry with no-demote flag set"));
6796 
6797 	tmpl1 = 0;
6798 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
6799 		tmpl1 = kva_alloc(PAGE_SIZE);
6800 		if (tmpl1 == 0)
6801 			return (NULL);
6802 	}
6803 
6804 	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
6805 	    NULL) {
6806 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
6807 		    " in pmap %p", va, pmap);
6808 		l2 = NULL;
6809 		goto fail;
6810 	}
6811 
6812 	l2phys = VM_PAGE_TO_PHYS(ml2);
6813 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
6814 
6815 	/* Address the range points at */
6816 	phys = oldl1 & ~ATTR_MASK;
6817 	/* The attributed from the old l1 table to be copied */
6818 	newl2 = oldl1 & ATTR_MASK;
6819 
6820 	/* Create the new entries */
6821 	for (i = 0; i < Ln_ENTRIES; i++) {
6822 		l2[i] = newl2 | phys;
6823 		phys += L2_SIZE;
6824 	}
6825 	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
6826 	    ("Invalid l2 page (%lx != %lx)", l2[0],
6827 	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
6828 
6829 	if (tmpl1 != 0) {
6830 		pmap_kenter(tmpl1, PAGE_SIZE,
6831 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
6832 		    VM_MEMATTR_WRITE_BACK);
6833 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
6834 	}
6835 
6836 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
6837 
6838 fail:
6839 	if (tmpl1 != 0) {
6840 		pmap_kremove(tmpl1);
6841 		kva_free(tmpl1, PAGE_SIZE);
6842 	}
6843 
6844 	return (l2);
6845 }
6846 
6847 static void
6848 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
6849 {
6850 	pt_entry_t *l3;
6851 
6852 	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
6853 		*l3 = newl3;
6854 		newl3 += L3_SIZE;
6855 	}
6856 }
6857 
6858 static void
6859 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
6860     struct rwlock **lockp)
6861 {
6862 	struct spglist free;
6863 
6864 	SLIST_INIT(&free);
6865 	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
6866 	    lockp);
6867 	vm_page_free_pages_toq(&free, true);
6868 }
6869 
6870 /*
6871  * Create an L3 table to map all addresses within an L2 mapping.
6872  */
6873 static pt_entry_t *
6874 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
6875     struct rwlock **lockp)
6876 {
6877 	pt_entry_t *l3, newl3, oldl2;
6878 	vm_offset_t tmpl2;
6879 	vm_paddr_t l3phys;
6880 	vm_page_t ml3;
6881 
6882 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6883 	PMAP_ASSERT_STAGE1(pmap);
6884 	KASSERT(ADDR_IS_CANONICAL(va),
6885 	    ("%s: Address not in canonical form: %lx", __func__, va));
6886 
6887 	l3 = NULL;
6888 	oldl2 = pmap_load(l2);
6889 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
6890 	    ("pmap_demote_l2: Demoting a non-block entry"));
6891 	KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
6892 	    ("pmap_demote_l2: Demoting entry with no-demote flag set"));
6893 	va &= ~L2_OFFSET;
6894 
6895 	tmpl2 = 0;
6896 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
6897 		tmpl2 = kva_alloc(PAGE_SIZE);
6898 		if (tmpl2 == 0)
6899 			return (NULL);
6900 	}
6901 
6902 	/*
6903 	 * Invalidate the 2MB page mapping and return "failure" if the
6904 	 * mapping was never accessed.
6905 	 */
6906 	if ((oldl2 & ATTR_AF) == 0) {
6907 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
6908 		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
6909 		pmap_demote_l2_abort(pmap, va, l2, lockp);
6910 		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
6911 		    va, pmap);
6912 		goto fail;
6913 	}
6914 
6915 	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
6916 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
6917 		    ("pmap_demote_l2: page table page for a wired mapping"
6918 		    " is missing"));
6919 
6920 		/*
6921 		 * If the page table page is missing and the mapping
6922 		 * is for a kernel address, the mapping must belong to
6923 		 * either the direct map or the early kernel memory.
6924 		 * Page table pages are preallocated for every other
6925 		 * part of the kernel address space, so the direct map
6926 		 * region and early kernel memory are the only parts of the
6927 		 * kernel address space that must be handled here.
6928 		 */
6929 		KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
6930 		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
6931 		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
6932 
6933 		/*
6934 		 * If the 2MB page mapping belongs to the direct map
6935 		 * region of the kernel's address space, then the page
6936 		 * allocation request specifies the highest possible
6937 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
6938 		 * priority is normal.
6939 		 */
6940 		ml3 = vm_page_alloc_noobj(
6941 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
6942 		    VM_ALLOC_WIRED);
6943 
6944 		/*
6945 		 * If the allocation of the new page table page fails,
6946 		 * invalidate the 2MB page mapping and return "failure".
6947 		 */
6948 		if (ml3 == NULL) {
6949 			pmap_demote_l2_abort(pmap, va, l2, lockp);
6950 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
6951 			    " in pmap %p", va, pmap);
6952 			goto fail;
6953 		}
6954 		ml3->pindex = pmap_l2_pindex(va);
6955 
6956 		if (!ADDR_IS_KERNEL(va)) {
6957 			ml3->ref_count = NL3PG;
6958 			pmap_resident_count_inc(pmap, 1);
6959 		}
6960 	}
6961 	l3phys = VM_PAGE_TO_PHYS(ml3);
6962 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
6963 	newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
6964 	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
6965 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
6966 	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
6967 
6968 	/*
6969 	 * If the page table page is not leftover from an earlier promotion,
6970 	 * or the mapping attributes have changed, (re)initialize the L3 table.
6971 	 *
6972 	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
6973 	 * performs a dsb().  That dsb() ensures that the stores for filling
6974 	 * "l3" are visible before "l3" is added to the page table.
6975 	 */
6976 	if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK))
6977 		pmap_fill_l3(l3, newl3);
6978 
6979 	/*
6980 	 * Map the temporary page so we don't lose access to the l2 table.
6981 	 */
6982 	if (tmpl2 != 0) {
6983 		pmap_kenter(tmpl2, PAGE_SIZE,
6984 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
6985 		    VM_MEMATTR_WRITE_BACK);
6986 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
6987 	}
6988 
6989 	/*
6990 	 * The spare PV entries must be reserved prior to demoting the
6991 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
6992 	 * of the L2 and the PV lists will be inconsistent, which can result
6993 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
6994 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
6995 	 * PV entry for the 2MB page mapping that is being demoted.
6996 	 */
6997 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
6998 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
6999 
7000 	/*
7001 	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
7002 	 * the 2MB page mapping.
7003 	 */
7004 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
7005 
7006 	/*
7007 	 * Demote the PV entry.
7008 	 */
7009 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
7010 		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
7011 
7012 	atomic_add_long(&pmap_l2_demotions, 1);
7013 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
7014 	    " in pmap %p %lx", va, pmap, l3[0]);
7015 
7016 fail:
7017 	if (tmpl2 != 0) {
7018 		pmap_kremove(tmpl2);
7019 		kva_free(tmpl2, PAGE_SIZE);
7020 	}
7021 
7022 	return (l3);
7023 
7024 }
7025 
7026 static pt_entry_t *
7027 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
7028 {
7029 	struct rwlock *lock;
7030 	pt_entry_t *l3;
7031 
7032 	lock = NULL;
7033 	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
7034 	if (lock != NULL)
7035 		rw_wunlock(lock);
7036 	return (l3);
7037 }
7038 
7039 /*
7040  * Perform the pmap work for mincore(2).  If the page is not both referenced and
7041  * modified by this pmap, returns its physical address so that the caller can
7042  * find other mappings.
7043  */
7044 int
7045 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
7046 {
7047 	pt_entry_t *pte, tpte;
7048 	vm_paddr_t mask, pa;
7049 	int lvl, val;
7050 	bool managed;
7051 
7052 	PMAP_ASSERT_STAGE1(pmap);
7053 	PMAP_LOCK(pmap);
7054 	pte = pmap_pte(pmap, addr, &lvl);
7055 	if (pte != NULL) {
7056 		tpte = pmap_load(pte);
7057 
7058 		switch (lvl) {
7059 		case 3:
7060 			mask = L3_OFFSET;
7061 			break;
7062 		case 2:
7063 			mask = L2_OFFSET;
7064 			break;
7065 		case 1:
7066 			mask = L1_OFFSET;
7067 			break;
7068 		default:
7069 			panic("pmap_mincore: invalid level %d", lvl);
7070 		}
7071 
7072 		managed = (tpte & ATTR_SW_MANAGED) != 0;
7073 		val = MINCORE_INCORE;
7074 		if (lvl != 3)
7075 			val |= MINCORE_PSIND(3 - lvl);
7076 		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
7077 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
7078 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
7079 		if ((tpte & ATTR_AF) == ATTR_AF)
7080 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
7081 
7082 		pa = (tpte & ~ATTR_MASK) | (addr & mask);
7083 	} else {
7084 		managed = false;
7085 		val = 0;
7086 	}
7087 
7088 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
7089 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
7090 		*pap = pa;
7091 	}
7092 	PMAP_UNLOCK(pmap);
7093 	return (val);
7094 }
7095 
7096 /*
7097  * Garbage collect every ASID that is neither active on a processor nor
7098  * reserved.
7099  */
7100 static void
7101 pmap_reset_asid_set(pmap_t pmap)
7102 {
7103 	pmap_t curpmap;
7104 	int asid, cpuid, epoch;
7105 	struct asid_set *set;
7106 	enum pmap_stage stage;
7107 
7108 	set = pmap->pm_asid_set;
7109 	stage = pmap->pm_stage;
7110 
7111 	set = pmap->pm_asid_set;
7112 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
7113 	mtx_assert(&set->asid_set_mutex, MA_OWNED);
7114 
7115 	/*
7116 	 * Ensure that the store to asid_epoch is globally visible before the
7117 	 * loads from pc_curpmap are performed.
7118 	 */
7119 	epoch = set->asid_epoch + 1;
7120 	if (epoch == INT_MAX)
7121 		epoch = 0;
7122 	set->asid_epoch = epoch;
7123 	dsb(ishst);
7124 	if (stage == PM_STAGE1) {
7125 		__asm __volatile("tlbi vmalle1is");
7126 	} else {
7127 		KASSERT(pmap_clean_stage2_tlbi != NULL,
7128 		    ("%s: Unset stage 2 tlb invalidation callback\n",
7129 		    __func__));
7130 		pmap_clean_stage2_tlbi();
7131 	}
7132 	dsb(ish);
7133 	bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
7134 	    set->asid_set_size - 1);
7135 	CPU_FOREACH(cpuid) {
7136 		if (cpuid == curcpu)
7137 			continue;
7138 		if (stage == PM_STAGE1) {
7139 			curpmap = pcpu_find(cpuid)->pc_curpmap;
7140 			PMAP_ASSERT_STAGE1(pmap);
7141 		} else {
7142 			curpmap = pcpu_find(cpuid)->pc_curvmpmap;
7143 			if (curpmap == NULL)
7144 				continue;
7145 			PMAP_ASSERT_STAGE2(pmap);
7146 		}
7147 		KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
7148 		asid = COOKIE_TO_ASID(curpmap->pm_cookie);
7149 		if (asid == -1)
7150 			continue;
7151 		bit_set(set->asid_set, asid);
7152 		curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
7153 	}
7154 }
7155 
7156 /*
7157  * Allocate a new ASID for the specified pmap.
7158  */
7159 static void
7160 pmap_alloc_asid(pmap_t pmap)
7161 {
7162 	struct asid_set *set;
7163 	int new_asid;
7164 
7165 	set = pmap->pm_asid_set;
7166 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
7167 
7168 	mtx_lock_spin(&set->asid_set_mutex);
7169 
7170 	/*
7171 	 * While this processor was waiting to acquire the asid set mutex,
7172 	 * pmap_reset_asid_set() running on another processor might have
7173 	 * updated this pmap's cookie to the current epoch.  In which case, we
7174 	 * don't need to allocate a new ASID.
7175 	 */
7176 	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
7177 		goto out;
7178 
7179 	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
7180 	    &new_asid);
7181 	if (new_asid == -1) {
7182 		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
7183 		    set->asid_next, &new_asid);
7184 		if (new_asid == -1) {
7185 			pmap_reset_asid_set(pmap);
7186 			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
7187 			    set->asid_set_size, &new_asid);
7188 			KASSERT(new_asid != -1, ("ASID allocation failure"));
7189 		}
7190 	}
7191 	bit_set(set->asid_set, new_asid);
7192 	set->asid_next = new_asid + 1;
7193 	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
7194 out:
7195 	mtx_unlock_spin(&set->asid_set_mutex);
7196 }
7197 
7198 static uint64_t __read_mostly ttbr_flags;
7199 
7200 /*
7201  * Compute the value that should be stored in ttbr0 to activate the specified
7202  * pmap.  This value may change from time to time.
7203  */
7204 uint64_t
7205 pmap_to_ttbr0(pmap_t pmap)
7206 {
7207 	uint64_t ttbr;
7208 
7209 	ttbr = pmap->pm_ttbr;
7210 	ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
7211 	ttbr |= ttbr_flags;
7212 
7213 	return (ttbr);
7214 }
7215 
7216 static void
7217 pmap_set_cnp(void *arg)
7218 {
7219 	uint64_t ttbr0, ttbr1;
7220 	u_int cpuid;
7221 
7222 	cpuid = *(u_int *)arg;
7223 	if (cpuid == curcpu) {
7224 		/*
7225 		 * Set the flags while all CPUs are handling the
7226 		 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls
7227 		 * to pmap_to_ttbr0 after this will have the CnP flag set.
7228 		 * The dsb after invalidating the TLB will act as a barrier
7229 		 * to ensure all CPUs can observe this change.
7230 		 */
7231 		ttbr_flags |= TTBR_CnP;
7232 	}
7233 
7234 	ttbr0 = READ_SPECIALREG(ttbr0_el1);
7235 	ttbr0 |= TTBR_CnP;
7236 
7237 	ttbr1 = READ_SPECIALREG(ttbr1_el1);
7238 	ttbr1 |= TTBR_CnP;
7239 
7240 	/* Update ttbr{0,1}_el1 with the CnP flag */
7241 	WRITE_SPECIALREG(ttbr0_el1, ttbr0);
7242 	WRITE_SPECIALREG(ttbr1_el1, ttbr1);
7243 	isb();
7244 	__asm __volatile("tlbi vmalle1is");
7245 	dsb(ish);
7246 	isb();
7247 }
7248 
7249 /*
7250  * Defer enabling CnP until we have read the ID registers to know if it's
7251  * supported on all CPUs.
7252  */
7253 static void
7254 pmap_init_cnp(void *dummy __unused)
7255 {
7256 	uint64_t reg;
7257 	u_int cpuid;
7258 
7259 	if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
7260 		return;
7261 
7262 	if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
7263 		if (bootverbose)
7264 			printf("Enabling CnP\n");
7265 		cpuid = curcpu;
7266 		smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
7267 	}
7268 
7269 }
7270 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
7271 
7272 static bool
7273 pmap_activate_int(pmap_t pmap)
7274 {
7275 	struct asid_set *set;
7276 	int epoch;
7277 
7278 	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
7279 	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
7280 
7281 	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
7282 	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
7283 		/*
7284 		 * Handle the possibility that the old thread was preempted
7285 		 * after an "ic" or "tlbi" instruction but before it performed
7286 		 * a "dsb" instruction.  If the old thread migrates to a new
7287 		 * processor, its completion of a "dsb" instruction on that
7288 		 * new processor does not guarantee that the "ic" or "tlbi"
7289 		 * instructions performed on the old processor have completed.
7290 		 */
7291 		dsb(ish);
7292 		return (false);
7293 	}
7294 
7295 	set = pmap->pm_asid_set;
7296 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
7297 
7298 	/*
7299 	 * Ensure that the store to curpmap is globally visible before the
7300 	 * load from asid_epoch is performed.
7301 	 */
7302 	if (pmap->pm_stage == PM_STAGE1)
7303 		PCPU_SET(curpmap, pmap);
7304 	else
7305 		PCPU_SET(curvmpmap, pmap);
7306 	dsb(ish);
7307 	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
7308 	if (epoch >= 0 && epoch != set->asid_epoch)
7309 		pmap_alloc_asid(pmap);
7310 
7311 	if (pmap->pm_stage == PM_STAGE1) {
7312 		set_ttbr0(pmap_to_ttbr0(pmap));
7313 		if (PCPU_GET(bcast_tlbi_workaround) != 0)
7314 			invalidate_local_icache();
7315 	}
7316 	return (true);
7317 }
7318 
7319 void
7320 pmap_activate_vm(pmap_t pmap)
7321 {
7322 
7323 	PMAP_ASSERT_STAGE2(pmap);
7324 
7325 	(void)pmap_activate_int(pmap);
7326 }
7327 
7328 void
7329 pmap_activate(struct thread *td)
7330 {
7331 	pmap_t	pmap;
7332 
7333 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
7334 	PMAP_ASSERT_STAGE1(pmap);
7335 	critical_enter();
7336 	(void)pmap_activate_int(pmap);
7337 	critical_exit();
7338 }
7339 
7340 /*
7341  * Activate the thread we are switching to.
7342  * To simplify the assembly in cpu_throw return the new threads pcb.
7343  */
7344 struct pcb *
7345 pmap_switch(struct thread *new)
7346 {
7347 	pcpu_bp_harden bp_harden;
7348 	struct pcb *pcb;
7349 
7350 	/* Store the new curthread */
7351 	PCPU_SET(curthread, new);
7352 
7353 	/* And the new pcb */
7354 	pcb = new->td_pcb;
7355 	PCPU_SET(curpcb, pcb);
7356 
7357 	/*
7358 	 * TODO: We may need to flush the cache here if switching
7359 	 * to a user process.
7360 	 */
7361 
7362 	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
7363 		/*
7364 		 * Stop userspace from training the branch predictor against
7365 		 * other processes. This will call into a CPU specific
7366 		 * function that clears the branch predictor state.
7367 		 */
7368 		bp_harden = PCPU_GET(bp_harden);
7369 		if (bp_harden != NULL)
7370 			bp_harden();
7371 	}
7372 
7373 	return (pcb);
7374 }
7375 
7376 void
7377 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
7378 {
7379 
7380 	PMAP_ASSERT_STAGE1(pmap);
7381 	KASSERT(ADDR_IS_CANONICAL(va),
7382 	    ("%s: Address not in canonical form: %lx", __func__, va));
7383 
7384 	if (ADDR_IS_KERNEL(va)) {
7385 		cpu_icache_sync_range(va, sz);
7386 	} else {
7387 		u_int len, offset;
7388 		vm_paddr_t pa;
7389 
7390 		/* Find the length of data in this page to flush */
7391 		offset = va & PAGE_MASK;
7392 		len = imin(PAGE_SIZE - offset, sz);
7393 
7394 		while (sz != 0) {
7395 			/* Extract the physical address & find it in the DMAP */
7396 			pa = pmap_extract(pmap, va);
7397 			if (pa != 0)
7398 				cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
7399 
7400 			/* Move to the next page */
7401 			sz -= len;
7402 			va += len;
7403 			/* Set the length for the next iteration */
7404 			len = imin(PAGE_SIZE, sz);
7405 		}
7406 	}
7407 }
7408 
7409 static int
7410 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
7411 {
7412 	pd_entry_t *pdep;
7413 	pt_entry_t *ptep, pte;
7414 	int rv, lvl, dfsc;
7415 
7416 	PMAP_ASSERT_STAGE2(pmap);
7417 	rv = KERN_FAILURE;
7418 
7419 	/* Data and insn aborts use same encoding for FSC field. */
7420 	dfsc = esr & ISS_DATA_DFSC_MASK;
7421 	switch (dfsc) {
7422 	case ISS_DATA_DFSC_TF_L0:
7423 	case ISS_DATA_DFSC_TF_L1:
7424 	case ISS_DATA_DFSC_TF_L2:
7425 	case ISS_DATA_DFSC_TF_L3:
7426 		PMAP_LOCK(pmap);
7427 		pdep = pmap_pde(pmap, far, &lvl);
7428 		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
7429 			PMAP_LOCK(pmap);
7430 			break;
7431 		}
7432 
7433 		switch (lvl) {
7434 		case 0:
7435 			ptep = pmap_l0_to_l1(pdep, far);
7436 			break;
7437 		case 1:
7438 			ptep = pmap_l1_to_l2(pdep, far);
7439 			break;
7440 		case 2:
7441 			ptep = pmap_l2_to_l3(pdep, far);
7442 			break;
7443 		default:
7444 			panic("%s: Invalid pde level %d", __func__,lvl);
7445 		}
7446 		goto fault_exec;
7447 
7448 	case ISS_DATA_DFSC_AFF_L1:
7449 	case ISS_DATA_DFSC_AFF_L2:
7450 	case ISS_DATA_DFSC_AFF_L3:
7451 		PMAP_LOCK(pmap);
7452 		ptep = pmap_pte(pmap, far, &lvl);
7453 fault_exec:
7454 		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
7455 			if (icache_vmid) {
7456 				pmap_invalidate_vpipt_icache();
7457 			} else {
7458 				/*
7459 				 * If accessing an executable page invalidate
7460 				 * the I-cache so it will be valid when we
7461 				 * continue execution in the guest. The D-cache
7462 				 * is assumed to already be clean to the Point
7463 				 * of Coherency.
7464 				 */
7465 				if ((pte & ATTR_S2_XN_MASK) !=
7466 				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
7467 					invalidate_icache();
7468 				}
7469 			}
7470 			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
7471 			rv = KERN_SUCCESS;
7472 		}
7473 		PMAP_UNLOCK(pmap);
7474 		break;
7475 	}
7476 
7477 	return (rv);
7478 }
7479 
7480 int
7481 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
7482 {
7483 	pt_entry_t pte, *ptep;
7484 	register_t intr;
7485 	uint64_t ec, par;
7486 	int lvl, rv;
7487 
7488 	rv = KERN_FAILURE;
7489 
7490 	ec = ESR_ELx_EXCEPTION(esr);
7491 	switch (ec) {
7492 	case EXCP_INSN_ABORT_L:
7493 	case EXCP_INSN_ABORT:
7494 	case EXCP_DATA_ABORT_L:
7495 	case EXCP_DATA_ABORT:
7496 		break;
7497 	default:
7498 		return (rv);
7499 	}
7500 
7501 	if (pmap->pm_stage == PM_STAGE2)
7502 		return (pmap_stage2_fault(pmap, esr, far));
7503 
7504 	/* Data and insn aborts use same encoding for FSC field. */
7505 	switch (esr & ISS_DATA_DFSC_MASK) {
7506 	case ISS_DATA_DFSC_AFF_L1:
7507 	case ISS_DATA_DFSC_AFF_L2:
7508 	case ISS_DATA_DFSC_AFF_L3:
7509 		PMAP_LOCK(pmap);
7510 		ptep = pmap_pte(pmap, far, &lvl);
7511 		if (ptep != NULL) {
7512 			pmap_set_bits(ptep, ATTR_AF);
7513 			rv = KERN_SUCCESS;
7514 			/*
7515 			 * XXXMJ as an optimization we could mark the entry
7516 			 * dirty if this is a write fault.
7517 			 */
7518 		}
7519 		PMAP_UNLOCK(pmap);
7520 		break;
7521 	case ISS_DATA_DFSC_PF_L1:
7522 	case ISS_DATA_DFSC_PF_L2:
7523 	case ISS_DATA_DFSC_PF_L3:
7524 		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
7525 		    (esr & ISS_DATA_WnR) == 0)
7526 			return (rv);
7527 		PMAP_LOCK(pmap);
7528 		ptep = pmap_pte(pmap, far, &lvl);
7529 		if (ptep != NULL &&
7530 		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
7531 			if ((pte & ATTR_S1_AP_RW_BIT) ==
7532 			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
7533 				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
7534 				pmap_s1_invalidate_page(pmap, far, true);
7535 			}
7536 			rv = KERN_SUCCESS;
7537 		}
7538 		PMAP_UNLOCK(pmap);
7539 		break;
7540 	case ISS_DATA_DFSC_TF_L0:
7541 	case ISS_DATA_DFSC_TF_L1:
7542 	case ISS_DATA_DFSC_TF_L2:
7543 	case ISS_DATA_DFSC_TF_L3:
7544 		/*
7545 		 * Retry the translation.  A break-before-make sequence can
7546 		 * produce a transient fault.
7547 		 */
7548 		if (pmap == kernel_pmap) {
7549 			/*
7550 			 * The translation fault may have occurred within a
7551 			 * critical section.  Therefore, we must check the
7552 			 * address without acquiring the kernel pmap's lock.
7553 			 */
7554 			if (pmap_klookup(far, NULL))
7555 				rv = KERN_SUCCESS;
7556 		} else {
7557 			PMAP_LOCK(pmap);
7558 			/* Ask the MMU to check the address. */
7559 			intr = intr_disable();
7560 			par = arm64_address_translate_s1e0r(far);
7561 			intr_restore(intr);
7562 			PMAP_UNLOCK(pmap);
7563 
7564 			/*
7565 			 * If the translation was successful, then we can
7566 			 * return success to the trap handler.
7567 			 */
7568 			if (PAR_SUCCESS(par))
7569 				rv = KERN_SUCCESS;
7570 		}
7571 		break;
7572 	}
7573 
7574 	return (rv);
7575 }
7576 
7577 /*
7578  *	Increase the starting virtual address of the given mapping if a
7579  *	different alignment might result in more superpage mappings.
7580  */
7581 void
7582 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
7583     vm_offset_t *addr, vm_size_t size)
7584 {
7585 	vm_offset_t superpage_offset;
7586 
7587 	if (size < L2_SIZE)
7588 		return;
7589 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
7590 		offset += ptoa(object->pg_color);
7591 	superpage_offset = offset & L2_OFFSET;
7592 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
7593 	    (*addr & L2_OFFSET) == superpage_offset)
7594 		return;
7595 	if ((*addr & L2_OFFSET) < superpage_offset)
7596 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
7597 	else
7598 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
7599 }
7600 
7601 /**
7602  * Get the kernel virtual address of a set of physical pages. If there are
7603  * physical addresses not covered by the DMAP perform a transient mapping
7604  * that will be removed when calling pmap_unmap_io_transient.
7605  *
7606  * \param page        The pages the caller wishes to obtain the virtual
7607  *                    address on the kernel memory map.
7608  * \param vaddr       On return contains the kernel virtual memory address
7609  *                    of the pages passed in the page parameter.
7610  * \param count       Number of pages passed in.
7611  * \param can_fault   TRUE if the thread using the mapped pages can take
7612  *                    page faults, FALSE otherwise.
7613  *
7614  * \returns TRUE if the caller must call pmap_unmap_io_transient when
7615  *          finished or FALSE otherwise.
7616  *
7617  */
7618 boolean_t
7619 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7620     boolean_t can_fault)
7621 {
7622 	vm_paddr_t paddr;
7623 	boolean_t needs_mapping;
7624 	int error __diagused, i;
7625 
7626 	/*
7627 	 * Allocate any KVA space that we need, this is done in a separate
7628 	 * loop to prevent calling vmem_alloc while pinned.
7629 	 */
7630 	needs_mapping = FALSE;
7631 	for (i = 0; i < count; i++) {
7632 		paddr = VM_PAGE_TO_PHYS(page[i]);
7633 		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
7634 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
7635 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
7636 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
7637 			needs_mapping = TRUE;
7638 		} else {
7639 			vaddr[i] = PHYS_TO_DMAP(paddr);
7640 		}
7641 	}
7642 
7643 	/* Exit early if everything is covered by the DMAP */
7644 	if (!needs_mapping)
7645 		return (FALSE);
7646 
7647 	if (!can_fault)
7648 		sched_pin();
7649 	for (i = 0; i < count; i++) {
7650 		paddr = VM_PAGE_TO_PHYS(page[i]);
7651 		if (!PHYS_IN_DMAP(paddr)) {
7652 			panic(
7653 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
7654 		}
7655 	}
7656 
7657 	return (needs_mapping);
7658 }
7659 
7660 void
7661 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7662     boolean_t can_fault)
7663 {
7664 	vm_paddr_t paddr;
7665 	int i;
7666 
7667 	if (!can_fault)
7668 		sched_unpin();
7669 	for (i = 0; i < count; i++) {
7670 		paddr = VM_PAGE_TO_PHYS(page[i]);
7671 		if (!PHYS_IN_DMAP(paddr)) {
7672 			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
7673 		}
7674 	}
7675 }
7676 
7677 boolean_t
7678 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
7679 {
7680 
7681 	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
7682 }
7683 
7684 #if defined(KASAN)
7685 static vm_paddr_t	pmap_san_early_kernstart;
7686 static pd_entry_t	*pmap_san_early_l2;
7687 
7688 void __nosanitizeaddress
7689 pmap_san_bootstrap(struct arm64_bootparams *abp)
7690 {
7691 
7692 	pmap_san_early_kernstart = KERNBASE - abp->kern_delta;
7693 	kasan_init_early(abp->kern_stack, KSTACK_PAGES * PAGE_SIZE);
7694 }
7695 
7696 #define	SAN_BOOTSTRAP_L2_SIZE	(1 * L2_SIZE)
7697 #define	SAN_BOOTSTRAP_SIZE	(2 * PAGE_SIZE)
7698 static vm_offset_t __nosanitizeaddress
7699 pmap_san_enter_bootstrap_alloc_l2(void)
7700 {
7701 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
7702 	static size_t offset = 0;
7703 	vm_offset_t addr;
7704 
7705 	if (offset + L2_SIZE > sizeof(bootstrap_data)) {
7706 		panic("%s: out of memory for the bootstrap shadow map L2 entries",
7707 		    __func__);
7708 	}
7709 
7710 	addr = (uintptr_t)&bootstrap_data[offset];
7711 	offset += L2_SIZE;
7712 	return (addr);
7713 }
7714 
7715 /*
7716  * SAN L1 + L2 pages, maybe L3 entries later?
7717  */
7718 static vm_offset_t __nosanitizeaddress
7719 pmap_san_enter_bootstrap_alloc_pages(int npages)
7720 {
7721 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
7722 	static size_t offset = 0;
7723 	vm_offset_t addr;
7724 
7725 	if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
7726 		panic("%s: out of memory for the bootstrap shadow map",
7727 		    __func__);
7728 	}
7729 
7730 	addr = (uintptr_t)&bootstrap_data[offset];
7731 	offset += (npages * PAGE_SIZE);
7732 	return (addr);
7733 }
7734 
7735 static void __nosanitizeaddress
7736 pmap_san_enter_bootstrap(void)
7737 {
7738 	vm_offset_t freemempos;
7739 
7740 	/* L1, L2 */
7741 	freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
7742 	bs_state.freemempos = freemempos;
7743 	bs_state.va = KASAN_MIN_ADDRESS;
7744 	pmap_bootstrap_l1_table(&bs_state);
7745 	pmap_san_early_l2 = bs_state.l2;
7746 }
7747 
7748 static vm_page_t
7749 pmap_san_enter_alloc_l3(void)
7750 {
7751 	vm_page_t m;
7752 
7753 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
7754 	    VM_ALLOC_ZERO);
7755 	if (m == NULL)
7756 		panic("%s: no memory to grow shadow map", __func__);
7757 	return (m);
7758 }
7759 
7760 static vm_page_t
7761 pmap_san_enter_alloc_l2(void)
7762 {
7763 	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
7764 	    Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
7765 }
7766 
7767 void __nosanitizeaddress
7768 pmap_san_enter(vm_offset_t va)
7769 {
7770 	pd_entry_t *l1, *l2;
7771 	pt_entry_t *l3;
7772 	vm_page_t m;
7773 
7774 	if (virtual_avail == 0) {
7775 		vm_offset_t block;
7776 		int slot;
7777 		bool first;
7778 
7779 		/* Temporary shadow map prior to pmap_bootstrap(). */
7780 		first = pmap_san_early_l2 == NULL;
7781 		if (first)
7782 			pmap_san_enter_bootstrap();
7783 
7784 		l2 = pmap_san_early_l2;
7785 		slot = pmap_l2_index(va);
7786 
7787 		if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
7788 			MPASS(first);
7789 			block = pmap_san_enter_bootstrap_alloc_l2();
7790 			pmap_store(&l2[slot], pmap_early_vtophys(block) |
7791 			    PMAP_SAN_PTE_BITS | L2_BLOCK);
7792 			dmb(ishst);
7793 		}
7794 
7795 		return;
7796 	}
7797 
7798 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
7799 	l1 = pmap_l1(kernel_pmap, va);
7800 	MPASS(l1 != NULL);
7801 	if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
7802 		m = pmap_san_enter_alloc_l3();
7803 		pmap_store(l1, (VM_PAGE_TO_PHYS(m) & ~Ln_TABLE_MASK) |
7804 		    L1_TABLE);
7805 	}
7806 	l2 = pmap_l1_to_l2(l1, va);
7807 	if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
7808 		m = pmap_san_enter_alloc_l2();
7809 		if (m != NULL) {
7810 			pmap_store(l2, VM_PAGE_TO_PHYS(m) | PMAP_SAN_PTE_BITS |
7811 			    L2_BLOCK);
7812 		} else {
7813 			m = pmap_san_enter_alloc_l3();
7814 			pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
7815 		}
7816 		dmb(ishst);
7817 	}
7818 	if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
7819 		return;
7820 	l3 = pmap_l2_to_l3(l2, va);
7821 	if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
7822 		return;
7823 	m = pmap_san_enter_alloc_l3();
7824 	pmap_store(l3, VM_PAGE_TO_PHYS(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
7825 	dmb(ishst);
7826 }
7827 #endif /* KASAN */
7828 
7829 /*
7830  * Track a range of the kernel's virtual address space that is contiguous
7831  * in various mapping attributes.
7832  */
7833 struct pmap_kernel_map_range {
7834 	vm_offset_t sva;
7835 	pt_entry_t attrs;
7836 	int l3pages;
7837 	int l3contig;
7838 	int l2blocks;
7839 	int l1blocks;
7840 };
7841 
7842 static void
7843 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
7844     vm_offset_t eva)
7845 {
7846 	const char *mode;
7847 	int index;
7848 
7849 	if (eva <= range->sva)
7850 		return;
7851 
7852 	index = range->attrs & ATTR_S1_IDX_MASK;
7853 	switch (index) {
7854 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
7855 		mode = "DEV-NP";
7856 		break;
7857 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
7858 		mode = "DEV";
7859 		break;
7860 	case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
7861 		mode = "UC";
7862 		break;
7863 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
7864 		mode = "WB";
7865 		break;
7866 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
7867 		mode = "WT";
7868 		break;
7869 	default:
7870 		printf(
7871 		    "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
7872 		    __func__, index, range->sva, eva);
7873 		mode = "??";
7874 		break;
7875 	}
7876 
7877 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %6s %d %d %d %d\n",
7878 	    range->sva, eva,
7879 	    (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
7880 	    (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
7881 	    (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
7882 	    (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
7883 	    mode, range->l1blocks, range->l2blocks, range->l3contig,
7884 	    range->l3pages);
7885 
7886 	/* Reset to sentinel value. */
7887 	range->sva = 0xfffffffffffffffful;
7888 }
7889 
7890 /*
7891  * Determine whether the attributes specified by a page table entry match those
7892  * being tracked by the current range.
7893  */
7894 static bool
7895 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
7896 {
7897 
7898 	return (range->attrs == attrs);
7899 }
7900 
7901 static void
7902 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
7903     pt_entry_t attrs)
7904 {
7905 
7906 	memset(range, 0, sizeof(*range));
7907 	range->sva = va;
7908 	range->attrs = attrs;
7909 }
7910 
7911 /* Get the block/page attributes that correspond to the table attributes */
7912 static pt_entry_t
7913 sysctl_kmaps_table_attrs(pd_entry_t table)
7914 {
7915 	pt_entry_t attrs;
7916 
7917 	attrs = 0;
7918 	if ((table & TATTR_UXN_TABLE) != 0)
7919 		attrs |= ATTR_S1_UXN;
7920 	if ((table & TATTR_PXN_TABLE) != 0)
7921 		attrs |= ATTR_S1_PXN;
7922 	if ((table & TATTR_AP_TABLE_RO) != 0)
7923 		attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
7924 
7925 	return (attrs);
7926 }
7927 
7928 /* Read the block/page attributes we care about */
7929 static pt_entry_t
7930 sysctl_kmaps_block_attrs(pt_entry_t block)
7931 {
7932 	return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK));
7933 }
7934 
7935 /*
7936  * Given a leaf PTE, derive the mapping's attributes.  If they do not match
7937  * those of the current run, dump the address range and its attributes, and
7938  * begin a new run.
7939  */
7940 static void
7941 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
7942     vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
7943     pt_entry_t l3e)
7944 {
7945 	pt_entry_t attrs;
7946 
7947 	attrs = sysctl_kmaps_table_attrs(l0e);
7948 
7949 	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7950 		attrs |= sysctl_kmaps_block_attrs(l1e);
7951 		goto done;
7952 	}
7953 	attrs |= sysctl_kmaps_table_attrs(l1e);
7954 
7955 	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7956 		attrs |= sysctl_kmaps_block_attrs(l2e);
7957 		goto done;
7958 	}
7959 	attrs |= sysctl_kmaps_table_attrs(l2e);
7960 	attrs |= sysctl_kmaps_block_attrs(l3e);
7961 
7962 done:
7963 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
7964 		sysctl_kmaps_dump(sb, range, va);
7965 		sysctl_kmaps_reinit(range, va, attrs);
7966 	}
7967 }
7968 
7969 static int
7970 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
7971 {
7972 	struct pmap_kernel_map_range range;
7973 	struct sbuf sbuf, *sb;
7974 	pd_entry_t l0e, *l1, l1e, *l2, l2e;
7975 	pt_entry_t *l3, l3e;
7976 	vm_offset_t sva;
7977 	vm_paddr_t pa;
7978 	int error, i, j, k, l;
7979 
7980 	error = sysctl_wire_old_buffer(req, 0);
7981 	if (error != 0)
7982 		return (error);
7983 	sb = &sbuf;
7984 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
7985 
7986 	/* Sentinel value. */
7987 	range.sva = 0xfffffffffffffffful;
7988 
7989 	/*
7990 	 * Iterate over the kernel page tables without holding the kernel pmap
7991 	 * lock.  Kernel page table pages are never freed, so at worst we will
7992 	 * observe inconsistencies in the output.
7993 	 */
7994 	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
7995 	    i++) {
7996 		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
7997 			sbuf_printf(sb, "\nDirect map:\n");
7998 		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
7999 			sbuf_printf(sb, "\nKernel map:\n");
8000 #ifdef KASAN
8001 		else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
8002 			sbuf_printf(sb, "\nKASAN shadow map:\n");
8003 #endif
8004 
8005 		l0e = kernel_pmap->pm_l0[i];
8006 		if ((l0e & ATTR_DESCR_VALID) == 0) {
8007 			sysctl_kmaps_dump(sb, &range, sva);
8008 			sva += L0_SIZE;
8009 			continue;
8010 		}
8011 		pa = l0e & ~ATTR_MASK;
8012 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
8013 
8014 		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
8015 			l1e = l1[j];
8016 			if ((l1e & ATTR_DESCR_VALID) == 0) {
8017 				sysctl_kmaps_dump(sb, &range, sva);
8018 				sva += L1_SIZE;
8019 				continue;
8020 			}
8021 			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
8022 				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8023 				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
8024 				    0, 0);
8025 				range.l1blocks++;
8026 				sva += L1_SIZE;
8027 				continue;
8028 			}
8029 			pa = l1e & ~ATTR_MASK;
8030 			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
8031 
8032 			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
8033 				l2e = l2[k];
8034 				if ((l2e & ATTR_DESCR_VALID) == 0) {
8035 					sysctl_kmaps_dump(sb, &range, sva);
8036 					sva += L2_SIZE;
8037 					continue;
8038 				}
8039 				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
8040 					sysctl_kmaps_check(sb, &range, sva,
8041 					    l0e, l1e, l2e, 0);
8042 					range.l2blocks++;
8043 					sva += L2_SIZE;
8044 					continue;
8045 				}
8046 				pa = l2e & ~ATTR_MASK;
8047 				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
8048 
8049 				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
8050 				    l++, sva += L3_SIZE) {
8051 					l3e = l3[l];
8052 					if ((l3e & ATTR_DESCR_VALID) == 0) {
8053 						sysctl_kmaps_dump(sb, &range,
8054 						    sva);
8055 						continue;
8056 					}
8057 					sysctl_kmaps_check(sb, &range, sva,
8058 					    l0e, l1e, l2e, l3e);
8059 					if ((l3e & ATTR_CONTIGUOUS) != 0)
8060 						range.l3contig += l % 16 == 0 ?
8061 						    1 : 0;
8062 					else
8063 						range.l3pages++;
8064 				}
8065 			}
8066 		}
8067 	}
8068 
8069 	error = sbuf_finish(sb);
8070 	sbuf_delete(sb);
8071 	return (error);
8072 }
8073 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
8074     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
8075     NULL, 0, sysctl_kmaps, "A",
8076     "Dump kernel address layout");
8077