1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2003 Peter Wemm
9  * All rights reserved.
10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11  * All rights reserved.
12  * Copyright (c) 2014 Andrew Turner
13  * All rights reserved.
14  * Copyright (c) 2014-2016 The FreeBSD Foundation
15  * All rights reserved.
16  *
17  * This code is derived from software contributed to Berkeley by
18  * the Systems Programming Group of the University of Utah Computer
19  * Science Department and William Jolitz of UUNET Technologies Inc.
20  *
21  * This software was developed by Andrew Turner under sponsorship from
22  * the FreeBSD Foundation.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  * 3. All advertising materials mentioning features or use of this software
33  *    must display the following acknowledgement:
34  *	This product includes software developed by the University of
35  *	California, Berkeley and its contributors.
36  * 4. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
53  */
54 /*-
55  * Copyright (c) 2003 Networks Associates Technology, Inc.
56  * All rights reserved.
57  *
58  * This software was developed for the FreeBSD Project by Jake Burkholder,
59  * Safeport Network Services, and Network Associates Laboratories, the
60  * Security Research Division of Network Associates, Inc. under
61  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62  * CHATS research program.
63  *
64  * Redistribution and use in source and binary forms, with or without
65  * modification, are permitted provided that the following conditions
66  * are met:
67  * 1. Redistributions of source code must retain the above copyright
68  *    notice, this list of conditions and the following disclaimer.
69  * 2. Redistributions in binary form must reproduce the above copyright
70  *    notice, this list of conditions and the following disclaimer in the
71  *    documentation and/or other materials provided with the distribution.
72  *
73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83  * SUCH DAMAGE.
84  */
85 
86 #include <sys/cdefs.h>
87 __FBSDID("$FreeBSD$");
88 
89 /*
90  *	Manages physical address maps.
91  *
92  *	Since the information managed by this module is
93  *	also stored by the logical address mapping module,
94  *	this module may throw away valid virtual-to-physical
95  *	mappings at almost any time.  However, invalidations
96  *	of virtual-to-physical mappings must be done as
97  *	requested.
98  *
99  *	In order to cope with hardware architectures which
100  *	make virtual-to-physical map invalidates expensive,
101  *	this module may delay invalidation or protection-reduction
102  *	operations until such time as they are actually
103  *	necessary.  This module is given full information as
104  *	to which processors are currently using which maps,
105  *	and as to when physical maps must be made correct.
106  */
107 
108 #include "opt_vm.h"
109 
110 #include <sys/param.h>
111 #include <sys/asan.h>
112 #include <sys/bitstring.h>
113 #include <sys/bus.h>
114 #include <sys/systm.h>
115 #include <sys/kernel.h>
116 #include <sys/ktr.h>
117 #include <sys/limits.h>
118 #include <sys/lock.h>
119 #include <sys/malloc.h>
120 #include <sys/mman.h>
121 #include <sys/msgbuf.h>
122 #include <sys/mutex.h>
123 #include <sys/physmem.h>
124 #include <sys/proc.h>
125 #include <sys/rwlock.h>
126 #include <sys/sbuf.h>
127 #include <sys/sx.h>
128 #include <sys/vmem.h>
129 #include <sys/vmmeter.h>
130 #include <sys/sched.h>
131 #include <sys/sysctl.h>
132 #include <sys/_unrhdr.h>
133 #include <sys/smp.h>
134 
135 #include <vm/vm.h>
136 #include <vm/vm_param.h>
137 #include <vm/vm_kern.h>
138 #include <vm/vm_page.h>
139 #include <vm/vm_map.h>
140 #include <vm/vm_object.h>
141 #include <vm/vm_extern.h>
142 #include <vm/vm_pageout.h>
143 #include <vm/vm_pager.h>
144 #include <vm/vm_phys.h>
145 #include <vm/vm_radix.h>
146 #include <vm/vm_reserv.h>
147 #include <vm/vm_dumpset.h>
148 #include <vm/uma.h>
149 
150 #include <machine/asan.h>
151 #include <machine/machdep.h>
152 #include <machine/md_var.h>
153 #include <machine/pcb.h>
154 
155 #ifdef NUMA
156 #define	PMAP_MEMDOM	MAXMEMDOM
157 #else
158 #define	PMAP_MEMDOM	1
159 #endif
160 
161 #define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
162 #define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)
163 
164 #define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
165 #define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
166 #define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
167 #define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
168 
169 #define	NUL0E		L0_ENTRIES
170 #define	NUL1E		(NUL0E * NL1PG)
171 #define	NUL2E		(NUL1E * NL2PG)
172 
173 #if !defined(DIAGNOSTIC)
174 #ifdef __GNUC_GNU_INLINE__
175 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
176 #else
177 #define PMAP_INLINE	extern inline
178 #endif
179 #else
180 #define PMAP_INLINE
181 #endif
182 
183 #ifdef PV_STATS
184 #define PV_STAT(x)	do { x ; } while (0)
185 #define __pvused
186 #else
187 #define PV_STAT(x)	do { } while (0)
188 #define __pvused	__unused
189 #endif
190 
191 #define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
192 #define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
193 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
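
/*
 * Illustrative sketch only (not compiled): how the pindex macros above place
 * page-table pages of different levels in disjoint index ranges.  The shift
 * values assume a 4K granule (L2_SHIFT == 21, L1_SHIFT == 30, L0_SHIFT == 39).
 *
 *	vm_offset_t va = 4 * 1024 * 1024;	// 4MiB
 *
 *	// L3 page-table pages occupy pindexes [0, NUL2E):
 *	//	pmap_l2_pindex(va) == va >> 21 == 2
 *	// L2 page-table pages follow in [NUL2E, NUL2E + NUL1E):
 *	//	pmap_l1_pindex(va) == NUL2E + (va >> 30) == NUL2E
 *	// L1 page-table pages come last, starting at NUL2E + NUL1E:
 *	//	pmap_l0_pindex(va) == NUL2E + NUL1E + (va >> 39)
 */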
194 
195 #define	PMAP_SAN_PTE_BITS	(ATTR_DEFAULT | ATTR_S1_XN |	\
196 	ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
197 
198 struct pmap_large_md_page {
199 	struct rwlock   pv_lock;
200 	struct md_page  pv_page;
201 	/* Pad to a power of 2, see pmap_init_pv_table(). */
202 	int		pv_pad[2];
203 };
204 
205 static struct pmap_large_md_page *
206 _pa_to_pmdp(vm_paddr_t pa)
207 {
208 	struct vm_phys_seg *seg;
209 	int segind;
210 
211 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
212 		seg = &vm_phys_segs[segind];
213 		if (pa >= seg->start && pa < seg->end)
214 			return ((struct pmap_large_md_page *)seg->md_first +
215 			    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
216 	}
217 	return (NULL);
218 }
219 
220 static struct pmap_large_md_page *
221 pa_to_pmdp(vm_paddr_t pa)
222 {
223 	struct pmap_large_md_page *pvd;
224 
225 	pvd = _pa_to_pmdp(pa);
226 	if (pvd == NULL)
227 		panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
228 	return (pvd);
229 }
230 
231 static struct pmap_large_md_page *
232 page_to_pmdp(vm_page_t m)
233 {
234 	struct vm_phys_seg *seg;
235 
236 	seg = &vm_phys_segs[m->segind];
237 	return ((struct pmap_large_md_page *)seg->md_first +
238 	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
239 }
240 
241 #define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
242 #define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))
243 
244 #define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
245 	struct pmap_large_md_page *_pvd;			\
246 	struct rwlock *_lock;					\
247 	_pvd = _pa_to_pmdp(pa);					\
248 	if (__predict_false(_pvd == NULL))			\
249 		_lock = &pv_dummy_large.pv_lock;		\
250 	else							\
251 		_lock = &(_pvd->pv_lock);			\
252 	_lock;							\
253 })
254 
255 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
256 	struct rwlock **_lockp = (lockp);		\
257 	struct rwlock *_new_lock;			\
258 							\
259 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
260 	if (_new_lock != *_lockp) {			\
261 		if (*_lockp != NULL)			\
262 			rw_wunlock(*_lockp);		\
263 		*_lockp = _new_lock;			\
264 		rw_wlock(*_lockp);			\
265 	}						\
266 } while (0)
267 
268 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
269 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
270 
271 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
272 	struct rwlock **_lockp = (lockp);		\
273 							\
274 	if (*_lockp != NULL) {				\
275 		rw_wunlock(*_lockp);			\
276 		*_lockp = NULL;				\
277 	}						\
278 } while (0)
279 
280 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
281 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
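
/*
 * Illustrative sketch only (not compiled): the usual calling pattern for the
 * PV list lock macros above.  A caller visiting several pages keeps at most
 * one PV list lock write-held and lets CHANGE_PV_LIST_LOCK_TO_VM_PAGE() drop
 * and reacquire it as the backing 2MB region changes.  "some_page_list" is a
 * placeholder, not an existing list.
 *
 *	struct rwlock *lock = NULL;
 *	vm_page_t m;
 *
 *	TAILQ_FOREACH(m, &some_page_list, listq) {
 *		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *		// ... update m's PV list while the lock is held ...
 *	}
 *	RELEASE_PV_LIST_LOCK(&lock);
 */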
282 
283 /*
284  * The presence of this flag indicates that the mapping is writeable.
285  * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean; otherwise
286  * it is dirty.  This flag may only be set on managed mappings.
287  *
288  * The DBM bit is reserved on ARMv8.0, but it seems we can safely treat it
289  * as a software-managed bit.
290  */
291 #define	ATTR_SW_DBM	ATTR_DBM
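
/*
 * Illustrative sketch only (not compiled): the stage 1 encoding implied by
 * the comment above, as pmap_pte_dirty() below tests it.
 *
 *	ATTR_SW_DBM set, ATTR_S1_AP_RO clear	-> writeable and dirty
 *	ATTR_SW_DBM set, ATTR_S1_AP_RO set	-> writeable but still clean
 *	ATTR_SW_DBM clear			-> read-only managed mapping
 *
 *	// Equivalent check, mirroring pmap_pte_dirty():
 *	bool dirty = (pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
 *	    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM);
 */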
292 
293 struct pmap kernel_pmap_store;
294 
295 /* Used for mapping ACPI memory before VM is initialized */
296 #define	PMAP_PREINIT_MAPPING_COUNT	32
297 #define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
298 static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
299 static int vm_initialized = 0;		/* No need to use pre-init maps when set */
300 
301 /*
302  * Reserve a few L2 blocks starting at the 'preinit_map_va' pointer.
303  * Always map an entire L2 block for simplicity.
304  * VA of L2 block = preinit_map_va + i * L2_SIZE
305  */
306 static struct pmap_preinit_mapping {
307 	vm_paddr_t	pa;
308 	vm_offset_t	va;
309 	vm_size_t	size;
310 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
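
/*
 * Illustrative sketch only (not compiled): with the constants above and a
 * 2MB L2_SIZE (4K granule assumed), the pre-init window spans
 * PMAP_PREINIT_MAPPING_COUNT * L2_SIZE == 32 * 2MB == 64MB of KVA, and slot
 * i covers the L2 block at preinit_map_va + i * L2_SIZE.
 *
 *	vm_offset_t slot_va = preinit_map_va + 3 * L2_SIZE;	// 4th slot
 */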
311 
312 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
313 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
314 vm_offset_t kernel_vm_end = 0;
315 
316 /*
317  * Data for the pv entry allocation mechanism.
318  */
319 #ifdef NUMA
320 static __inline int
321 pc_to_domain(struct pv_chunk *pc)
322 {
323 	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
324 }
325 #else
326 static __inline int
327 pc_to_domain(struct pv_chunk *pc __unused)
328 {
329 	return (0);
330 }
331 #endif
332 
333 struct pv_chunks_list {
334 	struct mtx pvc_lock;
335 	TAILQ_HEAD(pch, pv_chunk) pvc_list;
336 	int active_reclaims;
337 } __aligned(CACHE_LINE_SIZE);
338 
339 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
340 
341 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
342 #define pv_dummy pv_dummy_large.pv_page
343 __read_mostly static struct pmap_large_md_page *pv_table;
344 __read_mostly vm_paddr_t pmap_last_pa;
345 
346 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
347 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
348 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
349 
350 extern pt_entry_t pagetable_l0_ttbr1[];
351 
352 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
353 static vm_paddr_t physmap[PHYSMAP_SIZE];
354 static u_int physmap_idx;
355 
356 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
357     "VM/pmap parameters");
358 
359 #if PAGE_SIZE == PAGE_SIZE_4K
360 #define	L1_BLOCKS_SUPPORTED	1
361 #else
362 /* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
363 #define	L1_BLOCKS_SUPPORTED	0
364 #endif
365 
366 #define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)
367 
368 /*
369  * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
370  * it has currently allocated to a pmap, a cursor ("asid_next") to
371  * optimize its search for a free ASID in the bit vector, and an epoch number
372  * ("asid_epoch") to indicate when it has reclaimed all previously allocated
373  * ASIDs that are not currently active on a processor.
374  *
375  * The current epoch number is always in the range [0, INT_MAX).  Negative
376  * numbers and INT_MAX are reserved for special cases that are described
377  * below.
378  */
379 struct asid_set {
380 	int asid_bits;
381 	bitstr_t *asid_set;
382 	int asid_set_size;
383 	int asid_next;
384 	int asid_epoch;
385 	struct mtx asid_set_mutex;
386 };
387 
388 static struct asid_set asids;
389 static struct asid_set vmids;
390 
391 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
392     "ASID allocator");
393 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
394     "The number of bits in an ASID");
395 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
396     "The last allocated ASID plus one");
397 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
398     "The current epoch number");
399 
400 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
401 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
402     "The number of bits in a VMID");
403 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
404     "The last allocated VMID plus one");
405 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
406     "The current epoch number");
407 
408 void (*pmap_clean_stage2_tlbi)(void);
409 void (*pmap_invalidate_vpipt_icache)(void);
410 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
411 void (*pmap_stage2_invalidate_all)(uint64_t);
412 
413 /*
414  * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
415  * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
416  * dynamically allocated ASIDs have a non-negative epoch number.
417  *
418  * An invalid ASID is represented by -1.
419  *
420  * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
421  * which indicates that an ASID should never be allocated to the pmap, and
422  * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
423  * allocated when the pmap is next activated.
424  */
425 #define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
426 					    ((u_long)(epoch) << 32)))
427 #define	COOKIE_TO_ASID(cookie)		((int)(cookie))
428 #define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
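
/*
 * Illustrative sketch only (not compiled): round-tripping an ASID and epoch
 * through the cookie macros above.  The values 42 and 7 are arbitrary.
 *
 *	long cookie = COOKIE_FROM(42, 7);
 *
 *	// COOKIE_TO_ASID(cookie) == 42
 *	// COOKIE_TO_EPOCH(cookie) == 7
 *	// The kernel pmap uses the reserved cookie COOKIE_FROM(-1, INT_MIN),
 *	// for which COOKIE_TO_ASID() returns -1.
 */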
429 
430 #define	TLBI_VA_SHIFT			12
431 #define	TLBI_VA_MASK			((1ul << 44) - 1)
432 #define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
433 #define	TLBI_VA_L3_INCR			(L3_SIZE >> TLBI_VA_SHIFT)
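
/*
 * Illustrative sketch only (not compiled): building the operand of a
 * "tlbi va*" instruction with the macros above.  Bits [43:0] of the operand
 * carry VA[55:12], so with 4K pages adjacent entries differ by
 * TLBI_VA_L3_INCR == 1.
 *
 *	uint64_t r = TLBI_VA(va);	// va >> 12, masked to 44 bits
 *
 *	// For user pmaps the ASID is merged in as well:
 *	r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
 */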
434 
435 static int superpages_enabled = 1;
436 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
437     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
438     "Are large page mappings enabled?");
439 
440 /*
441  * Internal flags for pmap_enter()'s helper functions.
442  */
443 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
444 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
445 
446 TAILQ_HEAD(pv_chunklist, pv_chunk);
447 
448 static void	free_pv_chunk(struct pv_chunk *pc);
449 static void	free_pv_chunk_batch(struct pv_chunklist *batch);
450 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
451 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
452 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
453 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
454 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
455 		    vm_offset_t va);
456 
457 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
458 static bool pmap_activate_int(pmap_t pmap);
459 static void pmap_alloc_asid(pmap_t pmap);
460 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
461     vm_prot_t prot, int mode, bool skip_unmapped);
462 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
463 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
464     vm_offset_t va, struct rwlock **lockp);
465 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
466 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
467     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
468 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
469     u_int flags, vm_page_t m, struct rwlock **lockp);
470 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
471     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
472 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
473     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
474 static void pmap_reset_asid_set(pmap_t pmap);
475 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
476     vm_page_t m, struct rwlock **lockp);
477 
478 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
479 		struct rwlock **lockp);
480 
481 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
482     struct spglist *free);
483 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
484 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
485 
486 /*
487  * These load the old table data and store the new value.
488  * They need to be atomic as the System MMU may write to the table at
489  * the same time as the CPU.
490  */
491 #define	pmap_clear(table)		atomic_store_64(table, 0)
492 #define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
493 #define	pmap_load(table)		(*table)
494 #define	pmap_load_clear(table)		atomic_swap_64(table, 0)
495 #define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
496 #define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
497 #define	pmap_store(table, entry)	atomic_store_64(table, entry)
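
/*
 * Illustrative sketch only (not compiled): the swap-based accessors above
 * return the previous entry, which lets a caller inspect the old attributes
 * after atomically clearing or replacing a descriptor that the System MMU
 * may be walking at the same time.  "l3p" is a placeholder PTE pointer.
 *
 *	pt_entry_t old;
 *
 *	old = pmap_load_clear(l3p);	// read and zero the entry in one step
 *	if ((old & ATTR_AF) != 0) {
 *		// ... the old mapping had been accessed ...
 *	}
 */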
498 
499 /********************/
500 /* Inline functions */
501 /********************/
502 
503 static __inline void
504 pagecopy(void *s, void *d)
505 {
506 
507 	memcpy(d, s, PAGE_SIZE);
508 }
509 
510 static __inline pd_entry_t *
511 pmap_l0(pmap_t pmap, vm_offset_t va)
512 {
513 
514 	return (&pmap->pm_l0[pmap_l0_index(va)]);
515 }
516 
517 static __inline pd_entry_t *
518 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
519 {
520 	pd_entry_t *l1;
521 
522 	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
523 	return (&l1[pmap_l1_index(va)]);
524 }
525 
526 static __inline pd_entry_t *
527 pmap_l1(pmap_t pmap, vm_offset_t va)
528 {
529 	pd_entry_t *l0;
530 
531 	l0 = pmap_l0(pmap, va);
532 	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
533 		return (NULL);
534 
535 	return (pmap_l0_to_l1(l0, va));
536 }
537 
538 static __inline pd_entry_t *
539 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
540 {
541 	pd_entry_t l1, *l2p;
542 
543 	l1 = pmap_load(l1p);
544 
545 	KASSERT(ADDR_IS_CANONICAL(va),
546 	    ("%s: Address not in canonical form: %lx", __func__, va));
547 	/*
548 	 * The valid bit may be clear if pmap_update_entry() is concurrently
549 	 * modifying the entry, so for KVA only the entry type may be checked.
550 	 */
551 	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
552 	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
553 	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
554 	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
555 	l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK);
556 	return (&l2p[pmap_l2_index(va)]);
557 }
558 
559 static __inline pd_entry_t *
560 pmap_l2(pmap_t pmap, vm_offset_t va)
561 {
562 	pd_entry_t *l1;
563 
564 	l1 = pmap_l1(pmap, va);
565 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
566 		return (NULL);
567 
568 	return (pmap_l1_to_l2(l1, va));
569 }
570 
571 static __inline pt_entry_t *
572 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
573 {
574 	pd_entry_t l2;
575 	pt_entry_t *l3p;
576 
577 	l2 = pmap_load(l2p);
578 
579 	KASSERT(ADDR_IS_CANONICAL(va),
580 	    ("%s: Address not in canonical form: %lx", __func__, va));
581 	/*
582 	 * The valid bit may be clear if pmap_update_entry() is concurrently
583 	 * modifying the entry, so for KVA only the entry type may be checked.
584 	 */
585 	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
586 	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
587 	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
588 	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
589 	l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK);
590 	return (&l3p[pmap_l3_index(va)]);
591 }
592 
593 /*
594  * Returns the lowest valid pde for a given virtual address.
595  * The next level may or may not point to a valid page or block.
596  */
597 static __inline pd_entry_t *
598 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
599 {
600 	pd_entry_t *l0, *l1, *l2, desc;
601 
602 	l0 = pmap_l0(pmap, va);
603 	desc = pmap_load(l0) & ATTR_DESCR_MASK;
604 	if (desc != L0_TABLE) {
605 		*level = -1;
606 		return (NULL);
607 	}
608 
609 	l1 = pmap_l0_to_l1(l0, va);
610 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
611 	if (desc != L1_TABLE) {
612 		*level = 0;
613 		return (l0);
614 	}
615 
616 	l2 = pmap_l1_to_l2(l1, va);
617 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
618 	if (desc != L2_TABLE) {
619 		*level = 1;
620 		return (l1);
621 	}
622 
623 	*level = 2;
624 	return (l2);
625 }
626 
627 /*
628  * Returns the lowest valid pte block or table entry for a given virtual
629  * address.  If there are no valid entries, return NULL and set the level to
630  * the first invalid level.
631  */
632 static __inline pt_entry_t *
633 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
634 {
635 	pd_entry_t *l1, *l2, desc;
636 	pt_entry_t *l3;
637 
638 	l1 = pmap_l1(pmap, va);
639 	if (l1 == NULL) {
640 		*level = 0;
641 		return (NULL);
642 	}
643 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
644 	if (desc == L1_BLOCK) {
645 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
646 		*level = 1;
647 		return (l1);
648 	}
649 
650 	if (desc != L1_TABLE) {
651 		*level = 1;
652 		return (NULL);
653 	}
654 
655 	l2 = pmap_l1_to_l2(l1, va);
656 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
657 	if (desc == L2_BLOCK) {
658 		*level = 2;
659 		return (l2);
660 	}
661 
662 	if (desc != L2_TABLE) {
663 		*level = 2;
664 		return (NULL);
665 	}
666 
667 	*level = 3;
668 	l3 = pmap_l2_to_l3(l2, va);
669 	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
670 		return (NULL);
671 
672 	return (l3);
673 }
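
/*
 * Illustrative sketch only (not compiled): interpreting pmap_pte()'s result,
 * following the same convention that pmap_extract() uses below.  Block and
 * page sizes assume a 4K granule.
 *
 *	int lvl;
 *	pt_entry_t *pte = pmap_pte(pmap, va, &lvl);
 *
 *	if (pte != NULL) {
 *		// lvl == 1: 1GB L1_BLOCK (only if L1_BLOCKS_SUPPORTED)
 *		// lvl == 2: 2MB L2_BLOCK
 *		// lvl == 3: 4KB L3_PAGE
 *		pa = (pmap_load(pte) & ~ATTR_MASK) | (va & L3_OFFSET); // lvl 3
 *	}
 */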
674 
675 /*
676  * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
677  * level that maps the specified virtual address, then a pointer to that entry
678  * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
679  * and a diagnostic message is provided, in which case this function panics.
680  */
681 static __always_inline pt_entry_t *
682 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
683 {
684 	pd_entry_t *l0p, *l1p, *l2p;
685 	pt_entry_t desc, *l3p;
686 	int walk_level __diagused;
687 
688 	KASSERT(level >= 0 && level < 4,
689 	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
690 	    level));
691 	l0p = pmap_l0(pmap, va);
692 	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
693 	if (desc == L0_TABLE && level > 0) {
694 		l1p = pmap_l0_to_l1(l0p, va);
695 		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
696 		if (desc == L1_BLOCK && level == 1) {
697 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
698 			return (l1p);
699 		}
700 		if (desc == L1_TABLE && level > 1) {
701 			l2p = pmap_l1_to_l2(l1p, va);
702 			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
703 			if (desc == L2_BLOCK && level == 2)
704 				return (l2p);
705 			else if (desc == L2_TABLE && level > 2) {
706 				l3p = pmap_l2_to_l3(l2p, va);
707 				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
708 				if (desc == L3_PAGE && level == 3)
709 					return (l3p);
710 				else
711 					walk_level = 3;
712 			} else
713 				walk_level = 2;
714 		} else
715 			walk_level = 1;
716 	} else
717 		walk_level = 0;
718 	KASSERT(diag == NULL,
719 	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
720 	    diag, va, level, desc, walk_level));
721 	return (NULL);
722 }
723 
724 bool
725 pmap_ps_enabled(pmap_t pmap)
726 {
727 	/*
728 	 * Promotion requires a hypervisor call when the kernel is running
729 	 * in EL1.  To stop this, disable superpage support on non-stage 1
730 	 * pmaps for now.
731 	 */
732 	if (pmap->pm_stage != PM_STAGE1)
733 		return (false);
734 
735 	return (superpages_enabled != 0);
736 }
737 
738 bool
739 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
740     pd_entry_t **l2, pt_entry_t **l3)
741 {
742 	pd_entry_t *l0p, *l1p, *l2p;
743 
744 	if (pmap->pm_l0 == NULL)
745 		return (false);
746 
747 	l0p = pmap_l0(pmap, va);
748 	*l0 = l0p;
749 
750 	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
751 		return (false);
752 
753 	l1p = pmap_l0_to_l1(l0p, va);
754 	*l1 = l1p;
755 
756 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
757 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
758 		*l2 = NULL;
759 		*l3 = NULL;
760 		return (true);
761 	}
762 
763 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
764 		return (false);
765 
766 	l2p = pmap_l1_to_l2(l1p, va);
767 	*l2 = l2p;
768 
769 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
770 		*l3 = NULL;
771 		return (true);
772 	}
773 
774 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
775 		return (false);
776 
777 	*l3 = pmap_l2_to_l3(l2p, va);
778 
779 	return (true);
780 }
781 
782 static __inline int
783 pmap_l3_valid(pt_entry_t l3)
784 {
785 
786 	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
787 }
788 
789 CTASSERT(L1_BLOCK == L2_BLOCK);
790 
791 static pt_entry_t
792 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
793 {
794 	pt_entry_t val;
795 
796 	if (pmap->pm_stage == PM_STAGE1) {
797 		val = ATTR_S1_IDX(memattr);
798 		if (memattr == VM_MEMATTR_DEVICE)
799 			val |= ATTR_S1_XN;
800 		return (val);
801 	}
802 
803 	val = 0;
804 
805 	switch (memattr) {
806 	case VM_MEMATTR_DEVICE:
807 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
808 		    ATTR_S2_XN(ATTR_S2_XN_ALL));
809 	case VM_MEMATTR_UNCACHEABLE:
810 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
811 	case VM_MEMATTR_WRITE_BACK:
812 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
813 	case VM_MEMATTR_WRITE_THROUGH:
814 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
815 	default:
816 		panic("%s: invalid memory attribute %x", __func__, memattr);
817 	}
818 }
819 
820 static pt_entry_t
821 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
822 {
823 	pt_entry_t val;
824 
825 	val = 0;
826 	if (pmap->pm_stage == PM_STAGE1) {
827 		if ((prot & VM_PROT_EXECUTE) == 0)
828 			val |= ATTR_S1_XN;
829 		if ((prot & VM_PROT_WRITE) == 0)
830 			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
831 	} else {
832 		if ((prot & VM_PROT_WRITE) != 0)
833 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
834 		if ((prot & VM_PROT_READ) != 0)
835 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
836 		if ((prot & VM_PROT_EXECUTE) == 0)
837 			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
838 	}
839 
840 	return (val);
841 }
842 
843 /*
844  * Checks if the PTE is dirty.
845  */
846 static inline int
847 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
848 {
849 
850 	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
851 
852 	if (pmap->pm_stage == PM_STAGE1) {
853 		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
854 		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
855 
856 		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
857 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
858 	}
859 
860 	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
861 	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
862 }
863 
864 static __inline void
865 pmap_resident_count_inc(pmap_t pmap, int count)
866 {
867 
868 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
869 	pmap->pm_stats.resident_count += count;
870 }
871 
872 static __inline void
873 pmap_resident_count_dec(pmap_t pmap, int count)
874 {
875 
876 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
877 	KASSERT(pmap->pm_stats.resident_count >= count,
878 	    ("pmap %p resident count underflow %ld %d", pmap,
879 	    pmap->pm_stats.resident_count, count));
880 	pmap->pm_stats.resident_count -= count;
881 }
882 
883 static vm_paddr_t
884 pmap_early_vtophys(vm_offset_t va)
885 {
886 	vm_paddr_t pa_page;
887 
888 	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
889 	return (pa_page | (va & PAR_LOW_MASK));
890 }
891 
892 /* State of the bootstrapped DMAP page tables */
893 struct pmap_bootstrap_state {
894 	pt_entry_t	*l1;
895 	pt_entry_t	*l2;
896 	pt_entry_t	*l3;
897 	vm_offset_t	freemempos;
898 	vm_offset_t	va;
899 	vm_paddr_t	pa;
900 	pt_entry_t	table_attrs;
901 	u_int		l0_slot;
902 	u_int		l1_slot;
903 	u_int		l2_slot;
904 	bool		dmap_valid;
905 };
906 
907 /* The bootstrap state */
908 static struct pmap_bootstrap_state bs_state = {
909 	.l1 = NULL,
910 	.l2 = NULL,
911 	.l3 = NULL,
912 	.table_attrs = TATTR_PXN_TABLE,
913 	.l0_slot = L0_ENTRIES,
914 	.l1_slot = Ln_ENTRIES,
915 	.l2_slot = Ln_ENTRIES,
916 	.dmap_valid = false,
917 };
918 
919 static void
920 pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
921 {
922 	vm_paddr_t l1_pa;
923 	pd_entry_t l0e;
924 	u_int l0_slot;
925 
926 	/* Link the level 0 table to a level 1 table */
927 	l0_slot = pmap_l0_index(state->va);
928 	if (l0_slot != state->l0_slot) {
929 		/*
930 		 * Make sure we move from a low address to a high address
931 		 * before the DMAP region is ready. This ensures we never
932 		 * modify an existing mapping until we can map from a
933 		 * physical address to a virtual address.
934 		 */
935 		MPASS(state->l0_slot < l0_slot ||
936 		    state->l0_slot == L0_ENTRIES ||
937 		    state->dmap_valid);
938 
939 		/* Reset lower levels */
940 		state->l2 = NULL;
941 		state->l3 = NULL;
942 		state->l1_slot = Ln_ENTRIES;
943 		state->l2_slot = Ln_ENTRIES;
944 
945 		/* Check the existing L0 entry */
946 		state->l0_slot = l0_slot;
947 		if (state->dmap_valid) {
948 			l0e = pagetable_l0_ttbr1[l0_slot];
949 			if ((l0e & ATTR_DESCR_VALID) != 0) {
950 				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
951 				l1_pa = l0e & ~ATTR_MASK;
952 				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
953 				return;
954 			}
955 		}
956 
957 		/* Create a new L0 table entry */
958 		state->l1 = (pt_entry_t *)state->freemempos;
959 		memset(state->l1, 0, PAGE_SIZE);
960 		state->freemempos += PAGE_SIZE;
961 
962 		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
963 		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
964 		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
965 		pmap_store(&pagetable_l0_ttbr1[l0_slot], l1_pa |
966 		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
967 	}
968 	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
969 }
970 
971 static void
972 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
973 {
974 	vm_paddr_t l2_pa;
975 	pd_entry_t l1e;
976 	u_int l1_slot;
977 
978 	/* Make sure there is a valid L0 -> L1 table */
979 	pmap_bootstrap_l0_table(state);
980 
981 	/* Link the level 1 table to a level 2 table */
982 	l1_slot = pmap_l1_index(state->va);
983 	if (l1_slot != state->l1_slot) {
984 		/* See pmap_bootstrap_l0_table for a description */
985 		MPASS(state->l1_slot < l1_slot ||
986 		    state->l1_slot == Ln_ENTRIES ||
987 		    state->dmap_valid);
988 
989 		/* Reset lower levels */
990 		state->l3 = NULL;
991 		state->l2_slot = Ln_ENTRIES;
992 
993 		/* Check the existing L1 entry */
994 		state->l1_slot = l1_slot;
995 		if (state->dmap_valid) {
996 			l1e = state->l1[l1_slot];
997 			if ((l1e & ATTR_DESCR_VALID) != 0) {
998 				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
999 				l2_pa = l1e & ~ATTR_MASK;
1000 				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
1001 				return;
1002 			}
1003 		}
1004 
1005 		/* Create a new L1 table entry */
1006 		state->l2 = (pt_entry_t *)state->freemempos;
1007 		memset(state->l2, 0, PAGE_SIZE);
1008 		state->freemempos += PAGE_SIZE;
1009 
1010 		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1011 		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1012 		MPASS(state->l1[l1_slot] == 0);
1013 		pmap_store(&state->l1[l1_slot], l2_pa | state->table_attrs |
1014 		    L1_TABLE);
1015 	}
1016 	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1017 }
1018 
1019 static void
1020 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1021 {
1022 	vm_paddr_t l3_pa;
1023 	pd_entry_t l2e;
1024 	u_int l2_slot;
1025 
1026 	/* Make sure there is a valid L1 -> L2 table */
1027 	pmap_bootstrap_l1_table(state);
1028 
1029 	/* Link the level 2 table to a level 3 table */
1030 	l2_slot = pmap_l2_index(state->va);
1031 	if (l2_slot != state->l2_slot) {
1032 		/* See pmap_bootstrap_l0_table for a description */
1033 		MPASS(state->l2_slot < l2_slot ||
1034 		    state->l2_slot == Ln_ENTRIES ||
1035 		    state->dmap_valid);
1036 
1037 		/* Check the existing L2 entry */
1038 		state->l2_slot = l2_slot;
1039 		if (state->dmap_valid) {
1040 			l2e = state->l2[l2_slot];
1041 			if ((l2e & ATTR_DESCR_VALID) != 0) {
1042 				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1043 				l3_pa = l2e & ~ATTR_MASK;
1044 				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1045 				return;
1046 			}
1047 		}
1048 
1049 		/* Create a new L2 table entry */
1050 		state->l3 = (pt_entry_t *)state->freemempos;
1051 		memset(state->l3, 0, PAGE_SIZE);
1052 		state->freemempos += PAGE_SIZE;
1053 
1054 		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1055 		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1056 		MPASS(state->l2[l2_slot] == 0);
1057 		pmap_store(&state->l2[l2_slot], l3_pa | state->table_attrs |
1058 		    L2_TABLE);
1059 	}
1060 	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1061 }
1062 
1063 static void
1064 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1065 {
1066 	u_int l2_slot;
1067 	bool first;
1068 
1069 	if ((physmap[i + 1] - state->pa) < L2_SIZE)
1070 		return;
1071 
1072 	/* Make sure there is a valid L1 table */
1073 	pmap_bootstrap_l1_table(state);
1074 
1075 	MPASS((state->va & L2_OFFSET) == 0);
1076 	for (first = true;
1077 	    state->va < DMAP_MAX_ADDRESS &&
1078 	    (physmap[i + 1] - state->pa) >= L2_SIZE;
1079 	    state->va += L2_SIZE, state->pa += L2_SIZE) {
1080 		/*
1081 		 * Stop if we are about to walk off the end of what the
1082 		 * current L1 slot can address.
1083 		 */
1084 		if (!first && (state->pa & L1_OFFSET) == 0)
1085 			break;
1086 
1087 		first = false;
1088 		l2_slot = pmap_l2_index(state->va);
1089 		MPASS((state->pa & L2_OFFSET) == 0);
1090 		MPASS(state->l2[l2_slot] == 0);
1091 		pmap_store(&state->l2[l2_slot], state->pa | ATTR_DEFAULT |
1092 		    ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1093 		    L2_BLOCK);
1094 	}
1095 	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1096 }
1097 
1098 static void
1099 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1100 {
1101 	u_int l3_slot;
1102 	bool first;
1103 
1104 	if ((physmap[i + 1] - state->pa) < L3_SIZE)
1105 		return;
1106 
1107 	/* Make sure there is a valid L2 table */
1108 	pmap_bootstrap_l2_table(state);
1109 
1110 	MPASS((state->va & L3_OFFSET) == 0);
1111 	for (first = true;
1112 	    state->va < DMAP_MAX_ADDRESS &&
1113 	    (physmap[i + 1] - state->pa) >= L3_SIZE;
1114 	    state->va += L3_SIZE, state->pa += L3_SIZE) {
1115 		/*
1116 		 * Stop if we are about to walk off the end of what the
1117 		 * current L2 slot can address.
1118 		 */
1119 		if (!first && (state->pa & L2_OFFSET) == 0)
1120 			break;
1121 
1122 		first = false;
1123 		l3_slot = pmap_l3_index(state->va);
1124 		MPASS((state->pa & L3_OFFSET) == 0);
1125 		MPASS(state->l3[l3_slot] == 0);
1126 		pmap_store(&state->l3[l3_slot], state->pa | ATTR_DEFAULT |
1127 		    ATTR_S1_XN | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1128 		    L3_PAGE);
1129 	}
1130 	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1131 }
1132 
1133 static void
1134 pmap_bootstrap_dmap(vm_paddr_t min_pa)
1135 {
1136 	int i;
1137 
1138 	dmap_phys_base = min_pa & ~L1_OFFSET;
1139 	dmap_phys_max = 0;
1140 	dmap_max_addr = 0;
1141 
1142 	for (i = 0; i < (physmap_idx * 2); i += 2) {
1143 		bs_state.pa = physmap[i] & ~L3_OFFSET;
1144 		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1145 
1146 		/* Create L3 mappings at the start of the region */
1147 		if ((bs_state.pa & L2_OFFSET) != 0)
1148 			pmap_bootstrap_l3_page(&bs_state, i);
1149 		MPASS(bs_state.pa <= physmap[i + 1]);
1150 
1151 		if (L1_BLOCKS_SUPPORTED) {
1152 			/* Create L2 mappings at the start of the region */
1153 			if ((bs_state.pa & L1_OFFSET) != 0)
1154 				pmap_bootstrap_l2_block(&bs_state, i);
1155 			MPASS(bs_state.pa <= physmap[i + 1]);
1156 
1157 			/* Create the main L1 block mappings */
1158 			for (; bs_state.va < DMAP_MAX_ADDRESS &&
1159 			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1160 			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1161 				/* Make sure there is a valid L1 table */
1162 				pmap_bootstrap_l0_table(&bs_state);
1163 				MPASS((bs_state.pa & L1_OFFSET) == 0);
1164 				pmap_store(
1165 				    &bs_state.l1[pmap_l1_index(bs_state.va)],
1166 				    bs_state.pa | ATTR_DEFAULT | ATTR_S1_XN |
1167 				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1168 				    L1_BLOCK);
1169 			}
1170 			MPASS(bs_state.pa <= physmap[i + 1]);
1171 
1172 			/* Create L2 mappings at the end of the region */
1173 			pmap_bootstrap_l2_block(&bs_state, i);
1174 		} else {
1175 			while (bs_state.va < DMAP_MAX_ADDRESS &&
1176 			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1177 				pmap_bootstrap_l2_block(&bs_state, i);
1178 			}
1179 		}
1180 		MPASS(bs_state.pa <= physmap[i + 1]);
1181 
1182 		/* Create L3 mappings at the end of the region */
1183 		pmap_bootstrap_l3_page(&bs_state, i);
1184 		MPASS(bs_state.pa == physmap[i + 1]);
1185 
1186 		if (bs_state.pa > dmap_phys_max) {
1187 			dmap_phys_max = bs_state.pa;
1188 			dmap_max_addr = bs_state.va;
1189 		}
1190 	}
1191 
1192 	cpu_tlb_flushID();
1193 }
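
/*
 * Illustrative sketch only (not compiled): how pmap_bootstrap_dmap() carves
 * up one physmap region, assuming a 4K granule (2MB L2 and 1GB L1 blocks)
 * and L1_BLOCKS_SUPPORTED.  The addresses are made up.
 *
 *	region [0x40201000, 0xc0000000):
 *	  L3 pages   0x40201000 - 0x40400000  (up to the first 2MB boundary)
 *	  L2 blocks  0x40400000 - 0x80000000  (up to the first 1GB boundary)
 *	  L1 block   0x80000000 - 0xc0000000  (the remaining whole 1GB)
 *	  trailing L2 blocks and L3 pages would cover any leftover tail here.
 */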
1194 
1195 static void
1196 pmap_bootstrap_l2(vm_offset_t va)
1197 {
1198 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1199 
1200 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages. */
1201 	bs_state.va = va;
1202 
1203 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1204 		pmap_bootstrap_l1_table(&bs_state);
1205 }
1206 
1207 static void
1208 pmap_bootstrap_l3(vm_offset_t va)
1209 {
1210 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1211 
1212 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages. */
1213 	bs_state.va = va;
1214 
1215 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1216 		pmap_bootstrap_l2_table(&bs_state);
1217 }
1218 
1219 #ifdef KASAN
1220 static void
1221 pmap_bootstrap_allocate_kasan_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1222     vm_offset_t *start_va, int *nkasan_l2)
1223 {
1224 	int i;
1225 	vm_paddr_t pa;
1226 	vm_offset_t va;
1227 	pd_entry_t *l2;
1228 
1229 	va = *start_va;
1230 	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1231 	l2 = pmap_l2(kernel_pmap, va);
1232 
1233 	for (i = 0; pa >= start_pa && i < *nkasan_l2;
1234 	    i++, va += L2_SIZE, pa -= L2_SIZE, l2++) {
1235 		/*
1236 		 * KASAN stack checking results in us having already allocated
1237 		 * part of our shadow map, so we can just skip those segments.
1238 		 */
1239 		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1240 			pa += L2_SIZE;
1241 			continue;
1242 		}
1243 
1244 		pmap_store(l2, pa | PMAP_SAN_PTE_BITS | L2_BLOCK);
1245 	}
1246 
1247 	/*
1248 	 * If we ended the allocation due to the start_pa constraint, rather
1249 	 * than because we allocated everything, adjust back up to start_pa
1250 	 * and remove the invalid L2 block from our accounting.
1251 	 */
1252 	if (pa < start_pa) {
1253 		va += L2_SIZE;
1254 		i--;
1255 		pa = start_pa;
1256 	}
1257 
1258 	bzero((void *)PHYS_TO_DMAP(pa), i * L2_SIZE);
1259 	physmem_exclude_region(pa, i * L2_SIZE, EXFLAG_NOALLOC);
1260 
1261 	*nkasan_l2 -= i;
1262 	*start_va = va;
1263 }
1264 #endif
1265 
1266 /*
1267  *	Bootstrap the system enough to run with virtual memory.
1268  */
1269 void
1270 pmap_bootstrap(vm_paddr_t kernstart, vm_size_t kernlen)
1271 {
1272 	vm_offset_t dpcpu, msgbufpv;
1273 	vm_paddr_t start_pa, pa, min_pa;
1274 	uint64_t kern_delta;
1275 	int i;
1276 
1277 	/* Verify that the ASID is set through TTBR0. */
1278 	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
1279 	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1280 
1281 	kern_delta = KERNBASE - kernstart;
1282 
1283 	printf("pmap_bootstrap %lx %lx\n", kernstart, kernlen);
1284 	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
1285 
1286 	/* Set this early so we can use the pagetable walking functions */
1287 	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1288 	PMAP_LOCK_INIT(kernel_pmap);
1289 	kernel_pmap->pm_l0_paddr =
1290 	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1291 	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1292 	kernel_pmap->pm_stage = PM_STAGE1;
1293 	kernel_pmap->pm_levels = 4;
1294 	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1295 	kernel_pmap->pm_asid_set = &asids;
1296 
1297 	/* Assume the address we were loaded to is a valid physical address */
1298 	min_pa = KERNBASE - kern_delta;
1299 
1300 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1301 	physmap_idx /= 2;
1302 
1303 	/*
1304 	 * Find the minimum physical address. physmap is sorted,
1305 	 * but may contain empty ranges.
1306 	 */
1307 	for (i = 0; i < physmap_idx * 2; i += 2) {
1308 		if (physmap[i] == physmap[i + 1])
1309 			continue;
1310 		if (physmap[i] <= min_pa)
1311 			min_pa = physmap[i];
1312 	}
1313 
1314 	bs_state.freemempos = KERNBASE + kernlen;
1315 	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1316 
1317 	/* Create a direct map region early so we can use it for pa -> va */
1318 	pmap_bootstrap_dmap(min_pa);
1319 	bs_state.dmap_valid = true;
1320 	/*
1321 	 * We only use PXN when we know nothing will be executed from it, e.g.
1322 	 * the DMAP region.
1323 	 */
1324 	bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1325 
1326 	start_pa = pa = KERNBASE - kern_delta;
1327 
1328 	/*
1329 	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
1330 	 * loader allocated the first and only l2 page table page used to map
1331 	 * the kernel, preloaded files and module metadata.
1332 	 */
1333 	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1334 	/* And the l3 tables for the early devmap */
1335 	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1336 
1337 	cpu_tlb_flushID();
1338 
1339 #define alloc_pages(var, np)						\
1340 	(var) = bs_state.freemempos;					\
1341 	bs_state.freemempos += (np * PAGE_SIZE);			\
1342 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
1343 
1344 	/* Allocate dynamic per-cpu area. */
1345 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1346 	dpcpu_init((void *)dpcpu, 0);
1347 
1348 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1349 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1350 	msgbufp = (void *)msgbufpv;
1351 
1352 	/* Reserve some VA space for early BIOS/ACPI mapping */
1353 	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1354 
1355 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1356 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
1357 	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1358 	kernel_vm_end = virtual_avail;
1359 
1360 	pa = pmap_early_vtophys(bs_state.freemempos);
1361 
1362 	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1363 
1364 	cpu_tlb_flushID();
1365 }
1366 
1367 #if defined(KASAN)
1368 /*
1369  * Finish constructing the initial shadow map:
1370  * - Count how many pages from KERNBASE to virtual_avail (scaled for
1371  *   shadow map)
1372  * - Map that entire range using L2 superpages.
1373  */
1374 void
1375 pmap_bootstrap_san(vm_paddr_t kernstart)
1376 {
1377 	vm_offset_t va;
1378 	int i, shadow_npages, nkasan_l2;
1379 
1380 	/*
1381 	 * Rebuild physmap one more time; we may have excluded more regions from
1382 	 * allocation since pmap_bootstrap().
1383 	 */
1384 	bzero(physmap, sizeof(physmap));
1385 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1386 	physmap_idx /= 2;
1387 
1388 	shadow_npages = (virtual_avail - VM_MIN_KERNEL_ADDRESS) / PAGE_SIZE;
1389 	shadow_npages = howmany(shadow_npages, KASAN_SHADOW_SCALE);
1390 	nkasan_l2 = howmany(shadow_npages, Ln_ENTRIES);
1391 
1392 	/* Map the valid KVA up to this point. */
1393 	va = KASAN_MIN_ADDRESS;
1394 
1395 	/*
1396 	 * Find a slot in the physmap large enough for what we needed.  We try to put
1397 	 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1398 	 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1399 	 */
1400 	for (i = (physmap_idx * 2) - 2; i >= 0 && nkasan_l2 > 0; i -= 2) {
1401 		vm_paddr_t plow, phigh;
1402 
1403 		/* L2 mappings must be backed by memory that is L2-aligned */
1404 		plow = roundup2(physmap[i], L2_SIZE);
1405 		phigh = physmap[i + 1];
1406 		if (plow >= phigh)
1407 			continue;
1408 		if (kernstart >= plow && kernstart < phigh)
1409 			phigh = kernstart;
1410 		if (phigh - plow >= L2_SIZE)
1411 			pmap_bootstrap_allocate_kasan_l2(plow, phigh, &va,
1412 			    &nkasan_l2);
1413 	}
1414 
1415 	if (nkasan_l2 != 0)
1416 		panic("Could not find phys region for shadow map");
1417 
1418 	/*
1419 	 * Done. We should now have a valid shadow address mapped for all KVA
1420 	 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1421 	 * shadow accesses by the kasan(9) runtime will succeed for this range.
1422 	 * When the kernel virtual address range is later expanded, as will
1423 	 * happen in vm_mem_init(), the shadow map will be grown as well. This
1424 	 * is handled by pmap_san_enter().
1425 	 */
1426 }
1427 #endif
1428 
1429 /*
1430  *	Initialize a vm_page's machine-dependent fields.
1431  */
1432 void
1433 pmap_page_init(vm_page_t m)
1434 {
1435 
1436 	TAILQ_INIT(&m->md.pv_list);
1437 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1438 }
1439 
1440 static void
1441 pmap_init_asids(struct asid_set *set, int bits)
1442 {
1443 	int i;
1444 
1445 	set->asid_bits = bits;
1446 
1447 	/*
1448 	 * We may be too early in the overall initialization process to use
1449 	 * bit_alloc().
1450 	 */
1451 	set->asid_set_size = 1 << set->asid_bits;
1452 	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1453 	    M_WAITOK | M_ZERO);
1454 	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1455 		bit_set(set->asid_set, i);
1456 	set->asid_next = ASID_FIRST_AVAILABLE;
1457 	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1458 }
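
/*
 * Illustrative sketch only (not compiled): sizing the ASID set.  With 16-bit
 * ASIDs (TCR_EL1.AS set) the call below builds a 65536-bit bitstring and
 * pre-marks ASIDs [0, ASID_FIRST_AVAILABLE) as reserved so the allocator
 * never hands them out.
 *
 *	pmap_init_asids(&asids, 16);	// asid_set_size == 1 << 16 == 65536
 */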
1459 
1460 static void
1461 pmap_init_pv_table(void)
1462 {
1463 	struct vm_phys_seg *seg, *next_seg;
1464 	struct pmap_large_md_page *pvd;
1465 	vm_size_t s;
1466 	int domain, i, j, pages;
1467 
1468 	/*
1469 	 * We strongly depend on the size being a power of two, so the assert
1470 	 * is overzealous. However, should the struct be resized to a
1471 	 * different power of two, the code below needs to be revisited.
1472 	 */
1473 	CTASSERT((sizeof(*pvd) == 64));
1474 
1475 	/*
1476 	 * Calculate the size of the array.
1477 	 */
1478 	s = 0;
1479 	for (i = 0; i < vm_phys_nsegs; i++) {
1480 		seg = &vm_phys_segs[i];
1481 		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1482 		    pmap_l2_pindex(seg->start);
1483 		s += round_page(pages * sizeof(*pvd));
1484 	}
1485 	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1486 	if (pv_table == NULL)
1487 		panic("%s: kva_alloc failed\n", __func__);
1488 
1489 	/*
1490 	 * Iterate physical segments to allocate domain-local memory for PV
1491 	 * list headers.
1492 	 */
1493 	pvd = pv_table;
1494 	for (i = 0; i < vm_phys_nsegs; i++) {
1495 		seg = &vm_phys_segs[i];
1496 		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1497 		    pmap_l2_pindex(seg->start);
1498 		domain = seg->domain;
1499 
1500 		s = round_page(pages * sizeof(*pvd));
1501 
1502 		for (j = 0; j < s; j += PAGE_SIZE) {
1503 			vm_page_t m = vm_page_alloc_noobj_domain(domain,
1504 			    VM_ALLOC_ZERO);
1505 			if (m == NULL)
1506 				panic("failed to allocate PV table page");
1507 			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1508 		}
1509 
1510 		for (j = 0; j < s / sizeof(*pvd); j++) {
1511 			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1512 			TAILQ_INIT(&pvd->pv_page.pv_list);
1513 			pvd++;
1514 		}
1515 	}
1516 	pvd = &pv_dummy_large;
1517 	memset(pvd, 0, sizeof(*pvd));
1518 	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1519 	TAILQ_INIT(&pvd->pv_page.pv_list);
1520 
1521 	/*
1522 	 * Set pointers from vm_phys_segs to pv_table.
1523 	 */
1524 	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1525 		seg = &vm_phys_segs[i];
1526 		seg->md_first = pvd;
1527 		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1528 		    pmap_l2_pindex(seg->start);
1529 
1530 		/*
1531 		 * If there is a following segment, and the final
1532 		 * superpage of this segment and the initial superpage
1533 		 * of the next segment are the same, then adjust the
1534 		 * pv_table entry for that next segment down by one so
1535 		 * that the pv_table entries will be shared.
1536 		 */
1537 		if (i + 1 < vm_phys_nsegs) {
1538 			next_seg = &vm_phys_segs[i + 1];
1539 			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1540 			    pmap_l2_pindex(next_seg->start)) {
1541 				pvd--;
1542 			}
1543 		}
1544 	}
1545 }
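
/*
 * Illustrative sketch only (not compiled): the sharing adjustment at the end
 * of pmap_init_pv_table().  If segment A ends at 0x40260000 and segment B
 * starts at 0x40280000, both fall in the same 2MB superpage (assuming a 4K
 * granule), so B's md_first is moved back one entry and the two segments
 * share that pv_table slot:
 *
 *	pmap_l2_pindex(roundup2(0x40260000, L2_SIZE)) - 1 == 0x201
 *	pmap_l2_pindex(0x40280000) == 0x201
 */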
1546 
1547 /*
1548  *	Initialize the pmap module.
1549  *	Called by vm_init, to initialize any structures that the pmap
1550  *	system needs to map virtual memory.
1551  */
1552 void
1553 pmap_init(void)
1554 {
1555 	uint64_t mmfr1;
1556 	int i, vmid_bits;
1557 
1558 	/*
1559 	 * Are large page mappings enabled?
1560 	 */
1561 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1562 	if (superpages_enabled) {
1563 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1564 		    ("pmap_init: can't assign to pagesizes[1]"));
1565 		pagesizes[1] = L2_SIZE;
1566 		if (L1_BLOCKS_SUPPORTED) {
1567 			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1568 			    ("pmap_init: can't assign to pagesizes[2]"));
1569 			pagesizes[2] = L1_SIZE;
1570 		}
1571 	}
1572 
1573 	/*
1574 	 * Initialize the ASID allocator.
1575 	 */
1576 	pmap_init_asids(&asids,
1577 	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1578 
1579 	if (has_hyp()) {
1580 		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1581 		vmid_bits = 8;
1582 
1583 		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1584 		    ID_AA64MMFR1_VMIDBits_16)
1585 			vmid_bits = 16;
1586 		pmap_init_asids(&vmids, vmid_bits);
1587 	}
1588 
1589 	/*
1590 	 * Initialize pv chunk lists.
1591 	 */
1592 	for (i = 0; i < PMAP_MEMDOM; i++) {
1593 		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1594 		    MTX_DEF);
1595 		TAILQ_INIT(&pv_chunks[i].pvc_list);
1596 	}
1597 	pmap_init_pv_table();
1598 
1599 	vm_initialized = 1;
1600 }
1601 
1602 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1603     "2MB page mapping counters");
1604 
1605 static u_long pmap_l2_demotions;
1606 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1607     &pmap_l2_demotions, 0, "2MB page demotions");
1608 
1609 static u_long pmap_l2_mappings;
1610 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1611     &pmap_l2_mappings, 0, "2MB page mappings");
1612 
1613 static u_long pmap_l2_p_failures;
1614 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1615     &pmap_l2_p_failures, 0, "2MB page promotion failures");
1616 
1617 static u_long pmap_l2_promotions;
1618 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1619     &pmap_l2_promotions, 0, "2MB page promotions");
1620 
1621 /*
1622  * If the given value for "final_only" is false, then any cached intermediate-
1623  * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1624  * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1625  * Otherwise, just the cached final-level entry is invalidated.
1626  */
1627 static __inline void
1628 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1629 {
1630 	if (final_only)
1631 		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1632 	else
1633 		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1634 }
1635 
1636 static __inline void
1637 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1638 {
1639 	if (final_only)
1640 		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
1641 	else
1642 		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
1643 }
1644 
1645 /*
1646  * Invalidates any cached final- and optionally intermediate-level TLB entries
1647  * for the specified virtual address in the given virtual address space.
1648  */
1649 static __inline void
1650 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1651 {
1652 	uint64_t r;
1653 
1654 	PMAP_ASSERT_STAGE1(pmap);
1655 
1656 	dsb(ishst);
1657 	r = TLBI_VA(va);
1658 	if (pmap == kernel_pmap) {
1659 		pmap_s1_invalidate_kernel(r, final_only);
1660 	} else {
1661 		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1662 		pmap_s1_invalidate_user(r, final_only);
1663 	}
1664 	dsb(ish);
1665 	isb();
1666 }
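
/*
 * Illustrative sketch only (not compiled): pairing a leaf page-table update
 * with a single-page invalidation for a stage 1 pmap.  "l3p" and "va" are
 * placeholders, and the caller is assumed to hold the pmap lock.
 *
 *	pmap_clear(l3p);			// remove the L3 PTE
 *	pmap_s1_invalidate_page(pmap, va, true);
 *
 *	// final_only == true suffices because no L0/L1/L2 table entries
 *	// changed; pass false after removing or demoting a table entry.
 */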
1667 
1668 static __inline void
1669 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1670 {
1671 	PMAP_ASSERT_STAGE2(pmap);
1672 	MPASS(pmap_stage2_invalidate_range != NULL);
1673 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1674 	    final_only);
1675 }
1676 
1677 static __inline void
1678 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1679 {
1680 	if (pmap->pm_stage == PM_STAGE1)
1681 		pmap_s1_invalidate_page(pmap, va, final_only);
1682 	else
1683 		pmap_s2_invalidate_page(pmap, va, final_only);
1684 }
1685 
1686 /*
1687  * Invalidates any cached final- and optionally intermediate-level TLB entries
1688  * for the specified virtual address range in the given virtual address space.
1689  */
1690 static __inline void
1691 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1692     bool final_only)
1693 {
1694 	uint64_t end, r, start;
1695 
1696 	PMAP_ASSERT_STAGE1(pmap);
1697 
1698 	dsb(ishst);
1699 	if (pmap == kernel_pmap) {
1700 		start = TLBI_VA(sva);
1701 		end = TLBI_VA(eva);
1702 		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1703 			pmap_s1_invalidate_kernel(r, final_only);
1704 	} else {
1705 		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1706 		start |= TLBI_VA(sva);
1707 		end |= TLBI_VA(eva);
1708 		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1709 			pmap_s1_invalidate_user(r, final_only);
1710 	}
1711 	dsb(ish);
1712 	isb();
1713 }
1714 
1715 static __inline void
1716 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1717     bool final_only)
1718 {
1719 	PMAP_ASSERT_STAGE2(pmap);
1720 	MPASS(pmap_stage2_invalidate_range != NULL);
1721 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1722 }
1723 
1724 static __inline void
1725 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1726     bool final_only)
1727 {
1728 	if (pmap->pm_stage == PM_STAGE1)
1729 		pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1730 	else
1731 		pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1732 }
1733 
1734 /*
1735  * Invalidates all cached intermediate- and final-level TLB entries for the
1736  * given virtual address space.
1737  */
1738 static __inline void
1739 pmap_s1_invalidate_all(pmap_t pmap)
1740 {
1741 	uint64_t r;
1742 
1743 	PMAP_ASSERT_STAGE1(pmap);
1744 
1745 	dsb(ishst);
1746 	if (pmap == kernel_pmap) {
1747 		__asm __volatile("tlbi vmalle1is");
1748 	} else {
1749 		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1750 		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
1751 	}
1752 	dsb(ish);
1753 	isb();
1754 }
1755 
1756 static __inline void
1757 pmap_s2_invalidate_all(pmap_t pmap)
1758 {
1759 	PMAP_ASSERT_STAGE2(pmap);
1760 	MPASS(pmap_stage2_invalidate_all != NULL);
1761 	pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1762 }
1763 
1764 static __inline void
1765 pmap_invalidate_all(pmap_t pmap)
1766 {
1767 	if (pmap->pm_stage == PM_STAGE1)
1768 		pmap_s1_invalidate_all(pmap);
1769 	else
1770 		pmap_s2_invalidate_all(pmap);
1771 }
1772 
1773 /*
1774  *	Routine:	pmap_extract
1775  *	Function:
1776  *		Extract the physical page address associated
1777  *		with the given map/virtual_address pair.
1778  */
1779 vm_paddr_t
1780 pmap_extract(pmap_t pmap, vm_offset_t va)
1781 {
1782 	pt_entry_t *pte, tpte;
1783 	vm_paddr_t pa;
1784 	int lvl;
1785 
1786 	pa = 0;
1787 	PMAP_LOCK(pmap);
1788 	/*
1789 	 * Find the block or page map for this virtual address. pmap_pte
1790 	 * will return either a valid block/page entry, or NULL.
1791 	 */
1792 	pte = pmap_pte(pmap, va, &lvl);
1793 	if (pte != NULL) {
1794 		tpte = pmap_load(pte);
1795 		pa = tpte & ~ATTR_MASK;
1796 		switch (lvl) {
1797 		case 1:
1798 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
1799 			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1800 			    ("pmap_extract: Invalid L1 pte found: %lx",
1801 			    tpte & ATTR_DESCR_MASK));
1802 			pa |= (va & L1_OFFSET);
1803 			break;
1804 		case 2:
1805 			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1806 			    ("pmap_extract: Invalid L2 pte found: %lx",
1807 			    tpte & ATTR_DESCR_MASK));
1808 			pa |= (va & L2_OFFSET);
1809 			break;
1810 		case 3:
1811 			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1812 			    ("pmap_extract: Invalid L3 pte found: %lx",
1813 			    tpte & ATTR_DESCR_MASK));
1814 			pa |= (va & L3_OFFSET);
1815 			break;
1816 		}
1817 	}
1818 	PMAP_UNLOCK(pmap);
1819 	return (pa);
1820 }
1821 
1822 /*
1823  *	Routine:	pmap_extract_and_hold
1824  *	Function:
1825  *		Atomically extract and hold the physical page
1826  *		with the given pmap and virtual address pair
1827  *		if that mapping permits the given protection.
1828  */
1829 vm_page_t
1830 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1831 {
1832 	pt_entry_t *pte, tpte;
1833 	vm_offset_t off;
1834 	vm_page_t m;
1835 	int lvl;
1836 	bool use;
1837 
1838 	m = NULL;
1839 	PMAP_LOCK(pmap);
1840 	pte = pmap_pte(pmap, va, &lvl);
1841 	if (pte != NULL) {
1842 		tpte = pmap_load(pte);
1843 
1844 		KASSERT(lvl > 0 && lvl <= 3,
1845 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
1846 		/*
1847 		 * Check that the pte is either an L3 page, or an L1 or L2 block
1848 		 * entry.  We can assume L1_BLOCK == L2_BLOCK.
1849 		 */
1850 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1851 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1852 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1853 		     tpte & ATTR_DESCR_MASK));
1854 
1855 		use = false;
1856 		if ((prot & VM_PROT_WRITE) == 0)
1857 			use = true;
1858 		else if (pmap->pm_stage == PM_STAGE1 &&
1859 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1860 			use = true;
1861 		else if (pmap->pm_stage == PM_STAGE2 &&
1862 		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1863 		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1864 			use = true;
1865 
1866 		if (use) {
1867 			switch (lvl) {
1868 			case 1:
1869 				off = va & L1_OFFSET;
1870 				break;
1871 			case 2:
1872 				off = va & L2_OFFSET;
1873 				break;
1874 			case 3:
1875 			default:
1876 				off = 0;
1877 			}
1878 			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
1879 			if (m != NULL && !vm_page_wire_mapped(m))
1880 				m = NULL;
1881 		}
1882 	}
1883 	PMAP_UNLOCK(pmap);
1884 	return (m);
1885 }
1886 
1887 /*
1888  * Walks the page tables to translate a kernel virtual address to a
1889  * physical address. Returns true if the kva is valid and stores the
1890  * physical address in pa if it is not NULL.
1891  *
1892  * See the comment above data_abort() for the rationale for specifying
1893  * NO_PERTHREAD_SSP here.
1894  */
1895 bool NO_PERTHREAD_SSP
1896 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
1897 {
1898 	pt_entry_t *pte, tpte;
1899 	register_t intr;
1900 	uint64_t par;
1901 
1902 	/*
1903 	 * Disable interrupts so we don't get interrupted between asking
1904 	 * for address translation, and getting the result back.
1905 	 */
1906 	intr = intr_disable();
1907 	par = arm64_address_translate_s1e1r(va);
1908 	intr_restore(intr);
1909 
1910 	if (PAR_SUCCESS(par)) {
1911 		if (pa != NULL)
1912 			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
1913 		return (true);
1914 	}
1915 
1916 	/*
1917 	 * Fall back to walking the page table. The address translation
1918 	 * instruction may fail when the page is in a break-before-make
1919 	 * sequence. As we only clear the valid bit in said sequence we
1920 	 * can walk the page table to find the physical address.
1921 	 */
1922 
1923 	pte = pmap_l1(kernel_pmap, va);
1924 	if (pte == NULL)
1925 		return (false);
1926 
1927 	/*
1928 	 * A concurrent pmap_update_entry() will clear the entry's valid bit
1929 	 * but leave the rest of the entry unchanged.  Therefore, we treat a
1930 	 * non-zero entry as being valid, and we ignore the valid bit when
1931 	 * determining whether the entry maps a block, page, or table.
1932 	 */
1933 	tpte = pmap_load(pte);
1934 	if (tpte == 0)
1935 		return (false);
1936 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1937 		if (pa != NULL)
1938 			*pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET);
1939 		return (true);
1940 	}
1941 	pte = pmap_l1_to_l2(&tpte, va);
1942 	tpte = pmap_load(pte);
1943 	if (tpte == 0)
1944 		return (false);
1945 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1946 		if (pa != NULL)
1947 			*pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET);
1948 		return (true);
1949 	}
1950 	pte = pmap_l2_to_l3(&tpte, va);
1951 	tpte = pmap_load(pte);
1952 	if (tpte == 0)
1953 		return (false);
1954 	if (pa != NULL)
1955 		*pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET);
1956 	return (true);
1957 }
1958 
1959 vm_paddr_t
1960 pmap_kextract(vm_offset_t va)
1961 {
1962 	vm_paddr_t pa;
1963 
1964 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
1965 		return (DMAP_TO_PHYS(va));
1966 
1967 	if (pmap_klookup(va, &pa) == false)
1968 		return (0);
1969 	return (pa);
1970 }
1971 
1972 /***************************************************
1973  * Low level mapping routines.....
1974  ***************************************************/
1975 
1976 void
1977 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1978 {
1979 	pd_entry_t *pde;
1980 	pt_entry_t *pte, attr;
1981 	vm_offset_t va;
1982 	int lvl;
1983 
1984 	KASSERT((pa & L3_OFFSET) == 0,
1985 	   ("pmap_kenter: Invalid physical address"));
1986 	KASSERT((sva & L3_OFFSET) == 0,
1987 	   ("pmap_kenter: Invalid virtual address"));
1988 	KASSERT((size & PAGE_MASK) == 0,
1989 	    ("pmap_kenter: Mapping is not page-sized"));
1990 
1991 	attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
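	/*
	 * Kernel mappings created here are read/write, never executable
	 * (ATTR_S1_XN), and use the caller-supplied memory attribute index
	 * to select the cacheability of the mapping.
	 */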
1992 	    ATTR_S1_IDX(mode) | L3_PAGE;
1993 	va = sva;
1994 	while (size != 0) {
1995 		pde = pmap_pde(kernel_pmap, va, &lvl);
1996 		KASSERT(pde != NULL,
1997 		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1998 		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
1999 
2000 		pte = pmap_l2_to_l3(pde, va);
2001 		pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
2002 
2003 		va += PAGE_SIZE;
2004 		pa += PAGE_SIZE;
2005 		size -= PAGE_SIZE;
2006 	}
2007 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2008 }
2009 
2010 void
2011 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2012 {
2013 
2014 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2015 }
2016 
2017 /*
2018  * Remove a page from the kernel pagetables.
2019  */
2020 PMAP_INLINE void
2021 pmap_kremove(vm_offset_t va)
2022 {
2023 	pt_entry_t *pte;
2024 
2025 	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2026 	pmap_clear(pte);
2027 	pmap_s1_invalidate_page(kernel_pmap, va, true);
2028 }
2029 
2030 void
2031 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2032 {
2033 	pt_entry_t *pte;
2034 	vm_offset_t va;
2035 
2036 	KASSERT((sva & L3_OFFSET) == 0,
2037 	   ("pmap_kremove_device: Invalid virtual address"));
2038 	KASSERT((size & PAGE_MASK) == 0,
2039 	    ("pmap_kremove_device: Mapping is not page-sized"));
2040 
2041 	va = sva;
2042 	while (size != 0) {
2043 		pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2044 		pmap_clear(pte);
2045 
2046 		va += PAGE_SIZE;
2047 		size -= PAGE_SIZE;
2048 	}
2049 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2050 }
2051 
2052 /*
2053  *	Used to map a range of physical addresses into kernel
2054  *	virtual address space.
2055  *
2056  *	The value passed in '*virt' is a suggested virtual address for
2057  *	the mapping. Architectures which can support a direct-mapped
2058  *	physical to virtual region can return the appropriate address
2059  *	within that region, leaving '*virt' unchanged. Other
2060  *	architectures should map the pages starting at '*virt' and
2061  *	update '*virt' with the first usable address after the mapped
2062  *	region.
2063  */
2064 vm_offset_t
2065 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2066 {
2067 	return PHYS_TO_DMAP(start);
2068 }
2069 
2070 /*
2071  * Add a list of wired pages to the kva.
2072  * This routine is only used for temporary
2073  * kernel mappings that do not need to have
2074  * page modification or references recorded.
2075  * Note that old mappings are simply written
2076  * over.  The page *must* be wired.
2077  * Note: SMP coherent.  Uses a ranged broadcast TLB invalidation.
2078  */
2079 void
2080 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2081 {
2082 	pd_entry_t *pde;
2083 	pt_entry_t *pte, pa;
2084 	vm_offset_t va;
2085 	vm_page_t m;
2086 	int i, lvl;
2087 
2088 	va = sva;
2089 	for (i = 0; i < count; i++) {
2090 		pde = pmap_pde(kernel_pmap, va, &lvl);
2091 		KASSERT(pde != NULL,
2092 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2093 		KASSERT(lvl == 2,
2094 		    ("pmap_qenter: Invalid level %d", lvl));
2095 
2096 		m = ma[i];
2097 		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
2098 		    ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2099 		    ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2100 		pte = pmap_l2_to_l3(pde, va);
2101 		pmap_load_store(pte, pa);
2102 
2103 		va += L3_SIZE;
2104 	}
2105 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2106 }
2107 
2108 /*
2109  * This routine tears out page mappings from the
2110  * kernel -- it is meant only for temporary mappings.
2111  */
2112 void
2113 pmap_qremove(vm_offset_t sva, int count)
2114 {
2115 	pt_entry_t *pte;
2116 	vm_offset_t va;
2117 
2118 	KASSERT(ADDR_IS_CANONICAL(sva),
2119 	    ("%s: Address not in canonical form: %lx", __func__, sva));
2120 	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2121 
2122 	va = sva;
2123 	while (count-- > 0) {
2124 		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2125 		if (pte != NULL) {
2126 			pmap_clear(pte);
2127 		}
2128 
2129 		va += PAGE_SIZE;
2130 	}
2131 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2132 }
2133 
2134 /***************************************************
2135  * Page table page management routines.....
2136  ***************************************************/
2137 /*
2138  * Schedule the specified unused page table page to be freed.  Specifically,
2139  * add the page to the specified list of pages that will be released to the
2140  * physical memory manager after the TLB has been updated.
2141  */
2142 static __inline void
2143 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
2144     boolean_t set_PG_ZERO)
2145 {
2146 
2147 	if (set_PG_ZERO)
2148 		m->flags |= PG_ZERO;
2149 	else
2150 		m->flags &= ~PG_ZERO;
2151 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2152 }
2153 
2154 /*
2155  * Decrements a page table page's reference count, which is used to record the
2156  * number of valid page table entries within the page.  If the reference count
2157  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
2158  * page table page was unmapped and FALSE otherwise.
2159  */
2160 static inline boolean_t
2161 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2162 {
2163 
2164 	--m->ref_count;
2165 	if (m->ref_count == 0) {
2166 		_pmap_unwire_l3(pmap, va, m, free);
2167 		return (TRUE);
2168 	} else
2169 		return (FALSE);
2170 }
2171 
2172 static void
2173 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2174 {
2175 
2176 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
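	/*
	 * Page table pages are identified by their pindex: values in
	 * [0, NUL2E) are L3 pages, [NUL2E, NUL2E + NUL1E) are L2 pages, and
	 * [NUL2E + NUL1E, ...) are L1 pages.
	 */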
2177 	/*
2178 	 * unmap the page table page
2179 	 */
2180 	if (m->pindex >= (NUL2E + NUL1E)) {
2181 		/* l1 page */
2182 		pd_entry_t *l0;
2183 
2184 		l0 = pmap_l0(pmap, va);
2185 		pmap_clear(l0);
2186 	} else if (m->pindex >= NUL2E) {
2187 		/* l2 page */
2188 		pd_entry_t *l1;
2189 
2190 		l1 = pmap_l1(pmap, va);
2191 		pmap_clear(l1);
2192 	} else {
2193 		/* l3 page */
2194 		pd_entry_t *l2;
2195 
2196 		l2 = pmap_l2(pmap, va);
2197 		pmap_clear(l2);
2198 	}
2199 	pmap_resident_count_dec(pmap, 1);
2200 	if (m->pindex < NUL2E) {
2201 		/* We just released an l3, unhold the matching l2 */
2202 		pd_entry_t *l1, tl1;
2203 		vm_page_t l2pg;
2204 
2205 		l1 = pmap_l1(pmap, va);
2206 		tl1 = pmap_load(l1);
2207 		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
2208 		pmap_unwire_l3(pmap, va, l2pg, free);
2209 	} else if (m->pindex < (NUL2E + NUL1E)) {
2210 		/* We just released an l2, unhold the matching l1 */
2211 		pd_entry_t *l0, tl0;
2212 		vm_page_t l1pg;
2213 
2214 		l0 = pmap_l0(pmap, va);
2215 		tl0 = pmap_load(l0);
2216 		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
2217 		pmap_unwire_l3(pmap, va, l1pg, free);
2218 	}
2219 	pmap_invalidate_page(pmap, va, false);
2220 
2221 	/*
2222 	 * Put page on a list so that it is released after
2223 	 * *ALL* TLB shootdown is done
2224 	 */
2225 	pmap_add_delayed_free_list(m, free, TRUE);
2226 }
2227 
2228 /*
2229  * After removing a page table entry, this routine is used to
2230  * conditionally free the page table page and manage its reference count.
2231  */
2232 static int
2233 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2234     struct spglist *free)
2235 {
2236 	vm_page_t mpte;
2237 
2238 	KASSERT(ADDR_IS_CANONICAL(va),
2239 	    ("%s: Address not in canonical form: %lx", __func__, va));
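	/* Kernel page table pages are never freed by this path. */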
2240 	if (ADDR_IS_KERNEL(va))
2241 		return (0);
2242 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2243 	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
2244 	return (pmap_unwire_l3(pmap, va, mpte, free));
2245 }
2246 
2247 /*
2248  * Release a page table page reference after a failed attempt to create a
2249  * mapping.
2250  */
2251 static void
2252 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2253 {
2254 	struct spglist free;
2255 
2256 	SLIST_INIT(&free);
2257 	if (pmap_unwire_l3(pmap, va, mpte, &free))
2258 		vm_page_free_pages_toq(&free, true);
2259 }
2260 
2261 void
2262 pmap_pinit0(pmap_t pmap)
2263 {
2264 
2265 	PMAP_LOCK_INIT(pmap);
2266 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2267 	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2268 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2269 	vm_radix_init(&pmap->pm_root);
2270 	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2271 	pmap->pm_stage = PM_STAGE1;
2272 	pmap->pm_levels = 4;
2273 	pmap->pm_ttbr = pmap->pm_l0_paddr;
2274 	pmap->pm_asid_set = &asids;
2275 
2276 	PCPU_SET(curpmap, pmap);
2277 }
2278 
2279 int
2280 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2281 {
2282 	vm_page_t m;
2283 
2284 	/*
2285 	 * allocate the l0 page
2286 	 */
2287 	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2288 	    VM_ALLOC_ZERO);
2289 	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2290 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2291 
2292 	vm_radix_init(&pmap->pm_root);
2293 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2294 	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2295 
2296 	MPASS(levels == 3 || levels == 4);
2297 	pmap->pm_levels = levels;
2298 	pmap->pm_stage = stage;
2299 	switch (stage) {
2300 	case PM_STAGE1:
2301 		pmap->pm_asid_set = &asids;
2302 		break;
2303 	case PM_STAGE2:
2304 		pmap->pm_asid_set = &vmids;
2305 		break;
2306 	default:
2307 		panic("%s: Invalid pmap type %d", __func__, stage);
2308 		break;
2309 	}
2310 
2311 	/* XXX Temporarily disable deferred ASID allocation. */
2312 	pmap_alloc_asid(pmap);
2313 
2314 	/*
2315 	 * Allocate the level 1 entry to use as the root. This will increase
2316 	 * the refcount on the level 1 page so it won't be removed until
2317 	 * pmap_release() is called.
2318 	 */
2319 	if (pmap->pm_levels == 3) {
2320 		PMAP_LOCK(pmap);
2321 		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2322 		PMAP_UNLOCK(pmap);
2323 	}
2324 	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2325 
2326 	return (1);
2327 }
2328 
2329 int
2330 pmap_pinit(pmap_t pmap)
2331 {
2332 
2333 	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2334 }
2335 
2336 /*
2337  * This routine is called if the desired page table page does not exist.
2338  *
2339  * If page table page allocation fails, this routine may sleep before
2340  * returning NULL.  It sleeps only if a lock pointer was given.
2341  *
2342  * Note: If a page allocation fails at page table level two or three,
2343  * one or two pages may be held during the wait, only to be released
2344  * afterwards.  This conservative approach is easily argued to avoid
2345  * race conditions.
2346  */
2347 static vm_page_t
2348 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2349 {
2350 	vm_page_t m, l1pg, l2pg;
2351 
2352 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2353 
2354 	/*
2355 	 * Allocate a page table page.
2356 	 */
2357 	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2358 		if (lockp != NULL) {
2359 			RELEASE_PV_LIST_LOCK(lockp);
2360 			PMAP_UNLOCK(pmap);
2361 			vm_wait(NULL);
2362 			PMAP_LOCK(pmap);
2363 		}
2364 
2365 		/*
2366 		 * Indicate the need to retry.  While waiting, the page table
2367 		 * page may have been allocated.
2368 		 */
2369 		return (NULL);
2370 	}
2371 	m->pindex = ptepindex;
2372 
2373 	/*
2374 	 * Because of AArch64's weak memory consistency model, we must have a
2375 	 * barrier here to ensure that the stores for zeroing "m", whether by
2376 	 * pmap_zero_page() or an earlier function, are visible before adding
2377 	 * "m" to the page table.  Otherwise, a page table walk by another
2378 	 * processor's MMU could see the mapping to "m" and a stale, non-zero
2379 	 * PTE within "m".
2380 	 */
2381 	dmb(ishst);
2382 
2383 	/*
2384 	 * Map the pagetable page into the process address space, if
2385 	 * it isn't already there.
2386 	 */
2387 
2388 	if (ptepindex >= (NUL2E + NUL1E)) {
2389 		pd_entry_t *l0p, l0e;
2390 		vm_pindex_t l0index;
2391 
2392 		l0index = ptepindex - (NUL2E + NUL1E);
2393 		l0p = &pmap->pm_l0[l0index];
2394 		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2395 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2396 		l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE;
2397 
2398 		/*
2399 		 * Mark all kernel memory as not accessible from userspace
2400 		 * and userspace memory as not executable from the kernel.
2401 		 * This has been done for the bootstrap L0 entries in
2402 		 * locore.S.
2403 		 */
2404 		if (pmap == kernel_pmap)
2405 			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2406 		else
2407 			l0e |= TATTR_PXN_TABLE;
2408 		pmap_store(l0p, l0e);
2409 	} else if (ptepindex >= NUL2E) {
2410 		vm_pindex_t l0index, l1index;
2411 		pd_entry_t *l0, *l1;
2412 		pd_entry_t tl0;
2413 
2414 		l1index = ptepindex - NUL2E;
2415 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2416 
2417 		l0 = &pmap->pm_l0[l0index];
2418 		tl0 = pmap_load(l0);
2419 		if (tl0 == 0) {
2420 			/* recurse for allocating page dir */
2421 			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2422 			    lockp) == NULL) {
2423 				vm_page_unwire_noq(m);
2424 				vm_page_free_zero(m);
2425 				return (NULL);
2426 			}
2427 		} else {
2428 			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
2429 			l1pg->ref_count++;
2430 		}
2431 
2432 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
2433 		l1 = &l1[ptepindex & Ln_ADDR_MASK];
2434 		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2435 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2436 		pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
2437 	} else {
2438 		vm_pindex_t l0index, l1index;
2439 		pd_entry_t *l0, *l1, *l2;
2440 		pd_entry_t tl0, tl1;
2441 
2442 		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2443 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2444 
2445 		l0 = &pmap->pm_l0[l0index];
2446 		tl0 = pmap_load(l0);
2447 		if (tl0 == 0) {
2448 			/* recurse for allocating page dir */
2449 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2450 			    lockp) == NULL) {
2451 				vm_page_unwire_noq(m);
2452 				vm_page_free_zero(m);
2453 				return (NULL);
2454 			}
2455 			tl0 = pmap_load(l0);
2456 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
2457 			l1 = &l1[l1index & Ln_ADDR_MASK];
2458 		} else {
2459 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
2460 			l1 = &l1[l1index & Ln_ADDR_MASK];
2461 			tl1 = pmap_load(l1);
2462 			if (tl1 == 0) {
2463 				/* recurse for allocating page dir */
2464 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2465 				    lockp) == NULL) {
2466 					vm_page_unwire_noq(m);
2467 					vm_page_free_zero(m);
2468 					return (NULL);
2469 				}
2470 			} else {
2471 				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
2472 				l2pg->ref_count++;
2473 			}
2474 		}
2475 
2476 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
2477 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
2478 		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2479 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2480 		pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
2481 	}
2482 
2483 	pmap_resident_count_inc(pmap, 1);
2484 
2485 	return (m);
2486 }
2487 
2488 static pd_entry_t *
2489 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2490     struct rwlock **lockp)
2491 {
2492 	pd_entry_t *l1, *l2;
2493 	vm_page_t l2pg;
2494 	vm_pindex_t l2pindex;
2495 
2496 	KASSERT(ADDR_IS_CANONICAL(va),
2497 	    ("%s: Address not in canonical form: %lx", __func__, va));
2498 
2499 retry:
2500 	l1 = pmap_l1(pmap, va);
2501 	if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2502 		l2 = pmap_l1_to_l2(l1, va);
2503 		if (!ADDR_IS_KERNEL(va)) {
2504 			/* Add a reference to the L2 page. */
2505 			l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK);
2506 			l2pg->ref_count++;
2507 		} else
2508 			l2pg = NULL;
2509 	} else if (!ADDR_IS_KERNEL(va)) {
2510 		/* Allocate a L2 page. */
2511 		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2512 		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2513 		if (l2pg == NULL) {
2514 			if (lockp != NULL)
2515 				goto retry;
2516 			else
2517 				return (NULL);
2518 		}
2519 		l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2520 		l2 = &l2[pmap_l2_index(va)];
2521 	} else
2522 		panic("pmap_alloc_l2: missing page table page for va %#lx",
2523 		    va);
2524 	*l2pgp = l2pg;
2525 	return (l2);
2526 }
2527 
2528 static vm_page_t
2529 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2530 {
2531 	vm_pindex_t ptepindex;
2532 	pd_entry_t *pde, tpde;
2533 #ifdef INVARIANTS
2534 	pt_entry_t *pte;
2535 #endif
2536 	vm_page_t m;
2537 	int lvl;
2538 
2539 	/*
2540 	 * Calculate pagetable page index
2541 	 */
2542 	ptepindex = pmap_l2_pindex(va);
2543 retry:
2544 	/*
2545 	 * Get the page directory entry
2546 	 */
2547 	pde = pmap_pde(pmap, va, &lvl);
2548 
2549 	/*
2550 	 * If the page table page is mapped, we just increment the hold count,
2551 	 * and activate it. If we get a level 2 pde it will point to a level 3
2552 	 * table.
2553 	 */
2554 	switch (lvl) {
2555 	case -1:
2556 		break;
2557 	case 0:
2558 #ifdef INVARIANTS
2559 		pte = pmap_l0_to_l1(pde, va);
2560 		KASSERT(pmap_load(pte) == 0,
2561 		    ("pmap_alloc_l3: TODO: l0 superpages"));
2562 #endif
2563 		break;
2564 	case 1:
2565 #ifdef INVARIANTS
2566 		pte = pmap_l1_to_l2(pde, va);
2567 		KASSERT(pmap_load(pte) == 0,
2568 		    ("pmap_alloc_l3: TODO: l1 superpages"));
2569 #endif
2570 		break;
2571 	case 2:
2572 		tpde = pmap_load(pde);
2573 		if (tpde != 0) {
2574 			m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
2575 			m->ref_count++;
2576 			return (m);
2577 		}
2578 		break;
2579 	default:
2580 		panic("pmap_alloc_l3: Invalid level %d", lvl);
2581 	}
2582 
2583 	/*
2584 	 * Here if the pte page isn't mapped, or if it has been deallocated.
2585 	 */
2586 	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2587 	if (m == NULL && lockp != NULL)
2588 		goto retry;
2589 
2590 	return (m);
2591 }
2592 
2593 /***************************************************
2594  * Pmap allocation/deallocation routines.
2595  ***************************************************/
2596 
2597 /*
2598  * Release any resources held by the given physical map.
2599  * Called when a pmap initialized by pmap_pinit is being released.
2600  * Should only be called if the map contains no valid mappings.
2601  */
2602 void
2603 pmap_release(pmap_t pmap)
2604 {
2605 	boolean_t rv __diagused;
2606 	struct spglist free;
2607 	struct asid_set *set;
2608 	vm_page_t m;
2609 	int asid;
2610 
2611 	if (pmap->pm_levels != 4) {
2612 		PMAP_ASSERT_STAGE2(pmap);
2613 		KASSERT(pmap->pm_stats.resident_count == 1,
2614 		    ("pmap_release: pmap resident count %ld != 1",
2615 		    pmap->pm_stats.resident_count));
2616 		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2617 		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2618 
2619 		SLIST_INIT(&free);
2620 		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2621 		PMAP_LOCK(pmap);
2622 		rv = pmap_unwire_l3(pmap, 0, m, &free);
2623 		PMAP_UNLOCK(pmap);
2624 		MPASS(rv == TRUE);
2625 		vm_page_free_pages_toq(&free, true);
2626 	}
2627 
2628 	KASSERT(pmap->pm_stats.resident_count == 0,
2629 	    ("pmap_release: pmap resident count %ld != 0",
2630 	    pmap->pm_stats.resident_count));
2631 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2632 	    ("pmap_release: pmap has reserved page table page(s)"));
2633 
2634 	set = pmap->pm_asid_set;
2635 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2636 
2637 	/*
2638 	 * Allow the ASID to be reused.  For stage 2 pmaps the TLB entries are
2639 	 * not invalidated when mappings are removed, so we rely on a later TLB
2640 	 * invalidation that happens when the VMID generation is updated.
2641 	 * Because of this we don't reuse VMIDs within a generation.
2642 	 */
2643 	if (pmap->pm_stage == PM_STAGE1) {
2644 		mtx_lock_spin(&set->asid_set_mutex);
2645 		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2646 			asid = COOKIE_TO_ASID(pmap->pm_cookie);
2647 			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2648 			    asid < set->asid_set_size,
2649 			    ("pmap_release: pmap cookie has out-of-range asid"));
2650 			bit_clear(set->asid_set, asid);
2651 		}
2652 		mtx_unlock_spin(&set->asid_set_mutex);
2653 	}
2654 
2655 	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2656 	vm_page_unwire_noq(m);
2657 	vm_page_free_zero(m);
2658 }
2659 
2660 static int
2661 kvm_size(SYSCTL_HANDLER_ARGS)
2662 {
2663 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2664 
2665 	return sysctl_handle_long(oidp, &ksize, 0, req);
2666 }
2667 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2668     0, 0, kvm_size, "LU",
2669     "Size of KVM");
2670 
2671 static int
2672 kvm_free(SYSCTL_HANDLER_ARGS)
2673 {
2674 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2675 
2676 	return sysctl_handle_long(oidp, &kfree, 0, req);
2677 }
2678 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2679     0, 0, kvm_free, "LU",
2680     "Amount of KVM free");
2681 
2682 /*
2683  * grow the number of kernel page table entries, if needed
2684  */
2685 void
2686 pmap_growkernel(vm_offset_t addr)
2687 {
2688 	vm_paddr_t paddr;
2689 	vm_page_t nkpg;
2690 	pd_entry_t *l0, *l1, *l2;
2691 
2692 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2693 
2694 	addr = roundup2(addr, L2_SIZE);
2695 	if (addr - 1 >= vm_map_max(kernel_map))
2696 		addr = vm_map_max(kernel_map);
2697 	if (kernel_vm_end < addr)
2698 		kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2699 	while (kernel_vm_end < addr) {
2700 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2701 		KASSERT(pmap_load(l0) != 0,
2702 		    ("pmap_growkernel: No level 0 kernel entry"));
2703 
2704 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2705 		if (pmap_load(l1) == 0) {
2706 			/* We need a new L2 page table page. */
2707 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2708 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2709 			if (nkpg == NULL)
2710 				panic("pmap_growkernel: no memory to grow kernel");
2711 			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2712 			/* See the dmb() in _pmap_alloc_l3(). */
2713 			dmb(ishst);
2714 			paddr = VM_PAGE_TO_PHYS(nkpg);
2715 			pmap_store(l1, paddr | L1_TABLE);
2716 			continue; /* try again */
2717 		}
2718 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2719 		if (pmap_load(l2) != 0) {
2720 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2721 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2722 				kernel_vm_end = vm_map_max(kernel_map);
2723 				break;
2724 			}
2725 			continue;
2726 		}
2727 
2728 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2729 		    VM_ALLOC_ZERO);
2730 		if (nkpg == NULL)
2731 			panic("pmap_growkernel: no memory to grow kernel");
2732 		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2733 		/* See the dmb() in _pmap_alloc_l3(). */
2734 		dmb(ishst);
2735 		paddr = VM_PAGE_TO_PHYS(nkpg);
2736 		pmap_store(l2, paddr | L2_TABLE);
2737 
2738 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2739 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2740 			kernel_vm_end = vm_map_max(kernel_map);
2741 			break;
2742 		}
2743 	}
2744 }
2745 
2746 /***************************************************
2747  * page management routines.
2748  ***************************************************/
2749 
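/*
 * Initial pc_map[] value for a newly allocated pv chunk: all _NPCPV
 * entries are marked free.
 */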
2750 static const uint64_t pc_freemask[_NPCM] = {
2751 	[0 ... _NPCM - 2] = PC_FREEN,
2752 	[_NPCM - 1] = PC_FREEL
2753 };
2754 
2755 #ifdef PV_STATS
2756 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2757 
2758 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2759 	"Current number of pv entry chunks");
2760 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2761 	"Current number of pv entry chunks allocated");
2762 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2763 	"Current number of pv entry chunk frees");
2764 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2765 	"Number of times tried to get a chunk page but failed.");
2766 
2767 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2768 static int pv_entry_spare;
2769 
2770 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2771 	"Current number of pv entry frees");
2772 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2773 	"Current number of pv entry allocs");
2774 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2775 	"Current number of pv entries");
2776 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2777 	"Current number of spare pv entries");
2778 #endif
2779 
2780 /*
2781  * We are in a serious low memory condition.  Resort to
2782  * drastic measures to free some pages so we can allocate
2783  * another pv entry chunk.
2784  *
2785  * Returns NULL if PV entries were reclaimed from the specified pmap.
2786  *
2787  * We do not, however, unmap 2mpages because subsequent accesses will
2788  * allocate per-page pv entries until repromotion occurs, thereby
2789  * exacerbating the shortage of free pv entries.
2790  */
2791 static vm_page_t
2792 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
2793 {
2794 	struct pv_chunks_list *pvc;
2795 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
2796 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
2797 	struct md_page *pvh;
2798 	pd_entry_t *pde;
2799 	pmap_t next_pmap, pmap;
2800 	pt_entry_t *pte, tpte;
2801 	pv_entry_t pv;
2802 	vm_offset_t va;
2803 	vm_page_t m, m_pc;
2804 	struct spglist free;
2805 	uint64_t inuse;
2806 	int bit, field, freed, lvl;
2807 
2808 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2809 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2810 
2811 	pmap = NULL;
2812 	m_pc = NULL;
2813 	SLIST_INIT(&free);
2814 	bzero(&pc_marker_b, sizeof(pc_marker_b));
2815 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
2816 	pc_marker = (struct pv_chunk *)&pc_marker_b;
2817 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
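	/*
	 * The markers are dummy chunk headers inserted into the per-domain
	 * LRU list to bound the scan; they allow pvc_lock to be dropped
	 * while a chunk is processed and the scan resumed at the same spot.
	 */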
2818 
2819 	pvc = &pv_chunks[domain];
2820 	mtx_lock(&pvc->pvc_lock);
2821 	pvc->active_reclaims++;
2822 	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
2823 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
2824 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
2825 	    SLIST_EMPTY(&free)) {
2826 		next_pmap = pc->pc_pmap;
2827 		if (next_pmap == NULL) {
2828 			/*
2829 			 * The next chunk is a marker.  However, it is
2830 			 * not our marker, so active_reclaims must be
2831 			 * > 1.  Consequently, the next_chunk code
2832 			 * will not rotate the pv_chunks list.
2833 			 */
2834 			goto next_chunk;
2835 		}
2836 		mtx_unlock(&pvc->pvc_lock);
2837 
2838 		/*
2839 		 * A pv_chunk can only be removed from the pc_lru list
2840 		 * when both pvc->pvc_lock is owned and the
2841 		 * corresponding pmap is locked.
2842 		 */
2843 		if (pmap != next_pmap) {
2844 			if (pmap != NULL && pmap != locked_pmap)
2845 				PMAP_UNLOCK(pmap);
2846 			pmap = next_pmap;
2847 			/* Avoid deadlock and lock recursion. */
2848 			if (pmap > locked_pmap) {
2849 				RELEASE_PV_LIST_LOCK(lockp);
2850 				PMAP_LOCK(pmap);
2851 				mtx_lock(&pvc->pvc_lock);
2852 				continue;
2853 			} else if (pmap != locked_pmap) {
2854 				if (PMAP_TRYLOCK(pmap)) {
2855 					mtx_lock(&pvc->pvc_lock);
2856 					continue;
2857 				} else {
2858 					pmap = NULL; /* pmap is not locked */
2859 					mtx_lock(&pvc->pvc_lock);
2860 					pc = TAILQ_NEXT(pc_marker, pc_lru);
2861 					if (pc == NULL ||
2862 					    pc->pc_pmap != next_pmap)
2863 						continue;
2864 					goto next_chunk;
2865 				}
2866 			}
2867 		}
2868 
2869 		/*
2870 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2871 		 */
2872 		freed = 0;
2873 		for (field = 0; field < _NPCM; field++) {
2874 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2875 			    inuse != 0; inuse &= ~(1UL << bit)) {
2876 				bit = ffsl(inuse) - 1;
2877 				pv = &pc->pc_pventry[field * 64 + bit];
2878 				va = pv->pv_va;
2879 				pde = pmap_pde(pmap, va, &lvl);
2880 				if (lvl != 2)
2881 					continue;
2882 				pte = pmap_l2_to_l3(pde, va);
2883 				tpte = pmap_load(pte);
2884 				if ((tpte & ATTR_SW_WIRED) != 0)
2885 					continue;
2886 				tpte = pmap_load_clear(pte);
2887 				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
2888 				if (pmap_pte_dirty(pmap, tpte))
2889 					vm_page_dirty(m);
2890 				if ((tpte & ATTR_AF) != 0) {
2891 					pmap_s1_invalidate_page(pmap, va, true);
2892 					vm_page_aflag_set(m, PGA_REFERENCED);
2893 				}
2894 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2895 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2896 				m->md.pv_gen++;
2897 				if (TAILQ_EMPTY(&m->md.pv_list) &&
2898 				    (m->flags & PG_FICTITIOUS) == 0) {
2899 					pvh = page_to_pvh(m);
2900 					if (TAILQ_EMPTY(&pvh->pv_list)) {
2901 						vm_page_aflag_clear(m,
2902 						    PGA_WRITEABLE);
2903 					}
2904 				}
2905 				pc->pc_map[field] |= 1UL << bit;
2906 				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
2907 				freed++;
2908 			}
2909 		}
2910 		if (freed == 0) {
2911 			mtx_lock(&pvc->pvc_lock);
2912 			goto next_chunk;
2913 		}
2914 		/* Every freed mapping is for a 4 KB page. */
2915 		pmap_resident_count_dec(pmap, freed);
2916 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2917 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2918 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2919 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2920 		if (pc_is_free(pc)) {
2921 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2922 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2923 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2924 			/* Entire chunk is free; return it. */
2925 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2926 			dump_drop_page(m_pc->phys_addr);
2927 			mtx_lock(&pvc->pvc_lock);
2928 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
2929 			break;
2930 		}
2931 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2932 		mtx_lock(&pvc->pvc_lock);
2933 		/* One freed pv entry in locked_pmap is sufficient. */
2934 		if (pmap == locked_pmap)
2935 			break;
2936 
2937 next_chunk:
2938 		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
2939 		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
2940 		if (pvc->active_reclaims == 1 && pmap != NULL) {
2941 			/*
2942 			 * Rotate the pv chunks list so that we do not
2943 			 * scan the same pv chunks that could not be
2944 			 * freed (because they contained a wired
2945 			 * and/or superpage mapping) on every
2946 			 * invocation of reclaim_pv_chunk().
2947 			 */
2948 			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
2949 				MPASS(pc->pc_pmap != NULL);
2950 				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
2951 				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
2952 			}
2953 		}
2954 	}
2955 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
2956 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
2957 	pvc->active_reclaims--;
2958 	mtx_unlock(&pvc->pvc_lock);
2959 	if (pmap != NULL && pmap != locked_pmap)
2960 		PMAP_UNLOCK(pmap);
2961 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2962 		m_pc = SLIST_FIRST(&free);
2963 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2964 		/* Recycle a freed page table page. */
2965 		m_pc->ref_count = 1;
2966 	}
2967 	vm_page_free_pages_toq(&free, true);
2968 	return (m_pc);
2969 }
2970 
2971 static vm_page_t
2972 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2973 {
2974 	vm_page_t m;
2975 	int i, domain;
2976 
2977 	domain = PCPU_GET(domain);
2978 	for (i = 0; i < vm_ndomains; i++) {
2979 		m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
2980 		if (m != NULL)
2981 			break;
2982 		domain = (domain + 1) % vm_ndomains;
2983 	}
2984 
2985 	return (m);
2986 }
2987 
2988 /*
2989  * free the pv_entry back to the free list
2990  */
2991 static void
2992 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2993 {
2994 	struct pv_chunk *pc;
2995 	int idx, field, bit;
2996 
2997 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2998 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2999 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3000 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3001 	pc = pv_to_chunk(pv);
3002 	idx = pv - &pc->pc_pventry[0];
3003 	field = idx / 64;
3004 	bit = idx % 64;
3005 	pc->pc_map[field] |= 1ul << bit;
3006 	if (!pc_is_free(pc)) {
3007 		/* 98% of the time, pc is already at the head of the list. */
3008 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3009 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3010 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3011 		}
3012 		return;
3013 	}
3014 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3015 	free_pv_chunk(pc);
3016 }
3017 
3018 static void
3019 free_pv_chunk_dequeued(struct pv_chunk *pc)
3020 {
3021 	vm_page_t m;
3022 
3023 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3024 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3025 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3026 	/* entire chunk is free, return it */
3027 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3028 	dump_drop_page(m->phys_addr);
3029 	vm_page_unwire_noq(m);
3030 	vm_page_free(m);
3031 }
3032 
3033 static void
3034 free_pv_chunk(struct pv_chunk *pc)
3035 {
3036 	struct pv_chunks_list *pvc;
3037 
3038 	pvc = &pv_chunks[pc_to_domain(pc)];
3039 	mtx_lock(&pvc->pvc_lock);
3040 	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3041 	mtx_unlock(&pvc->pvc_lock);
3042 	free_pv_chunk_dequeued(pc);
3043 }
3044 
3045 static void
3046 free_pv_chunk_batch(struct pv_chunklist *batch)
3047 {
3048 	struct pv_chunks_list *pvc;
3049 	struct pv_chunk *pc, *npc;
3050 	int i;
3051 
3052 	for (i = 0; i < vm_ndomains; i++) {
3053 		if (TAILQ_EMPTY(&batch[i]))
3054 			continue;
3055 		pvc = &pv_chunks[i];
3056 		mtx_lock(&pvc->pvc_lock);
3057 		TAILQ_FOREACH(pc, &batch[i], pc_list) {
3058 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3059 		}
3060 		mtx_unlock(&pvc->pvc_lock);
3061 	}
3062 
3063 	for (i = 0; i < vm_ndomains; i++) {
3064 		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3065 			free_pv_chunk_dequeued(pc);
3066 		}
3067 	}
3068 }
3069 
3070 /*
3071  * Returns a new PV entry, allocating a new PV chunk from the system when
3072  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3073  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3074  * returned.
3075  *
3076  * The given PV list lock may be released.
3077  */
3078 static pv_entry_t
3079 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3080 {
3081 	struct pv_chunks_list *pvc;
3082 	int bit, field;
3083 	pv_entry_t pv;
3084 	struct pv_chunk *pc;
3085 	vm_page_t m;
3086 
3087 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3088 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
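	/*
	 * A set bit in pc_map means the corresponding pv entry in the chunk
	 * is free.  Try the pmap's own chunk list first and fall back to
	 * allocating a new chunk page only when no chunk has room.
	 */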
3089 retry:
3090 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3091 	if (pc != NULL) {
3092 		for (field = 0; field < _NPCM; field++) {
3093 			if (pc->pc_map[field]) {
3094 				bit = ffsl(pc->pc_map[field]) - 1;
3095 				break;
3096 			}
3097 		}
3098 		if (field < _NPCM) {
3099 			pv = &pc->pc_pventry[field * 64 + bit];
3100 			pc->pc_map[field] &= ~(1ul << bit);
3101 			/* If this was the last item, move it to tail */
3102 			if (pc_is_full(pc)) {
3103 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3104 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3105 				    pc_list);
3106 			}
3107 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
3108 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3109 			return (pv);
3110 		}
3111 	}
3112 	/* No free items, allocate another chunk */
3113 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3114 	if (m == NULL) {
3115 		if (lockp == NULL) {
3116 			PV_STAT(pc_chunk_tryfail++);
3117 			return (NULL);
3118 		}
3119 		m = reclaim_pv_chunk(pmap, lockp);
3120 		if (m == NULL)
3121 			goto retry;
3122 	}
3123 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3124 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3125 	dump_add_page(m->phys_addr);
3126 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3127 	pc->pc_pmap = pmap;
3128 	memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3129 	pc->pc_map[0] &= ~1ul;		/* preallocated bit 0 */
3130 	pvc = &pv_chunks[vm_page_domain(m)];
3131 	mtx_lock(&pvc->pvc_lock);
3132 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3133 	mtx_unlock(&pvc->pvc_lock);
3134 	pv = &pc->pc_pventry[0];
3135 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3136 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
3137 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3138 	return (pv);
3139 }
3140 
3141 /*
3142  * Ensure that the number of spare PV entries in the specified pmap meets or
3143  * exceeds the given count, "needed".
3144  *
3145  * The given PV list lock may be released.
3146  */
3147 static void
3148 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3149 {
3150 	struct pv_chunks_list *pvc;
3151 	struct pch new_tail[PMAP_MEMDOM];
3152 	struct pv_chunk *pc;
3153 	vm_page_t m;
3154 	int avail, free, i;
3155 	bool reclaimed;
3156 
3157 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3158 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3159 
3160 	/*
3161 	 * Newly allocated PV chunks must be stored in a private list until
3162 	 * the required number of PV chunks have been allocated.  Otherwise,
3163 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
3164 	 * contrast, these chunks must be added to the pmap upon allocation.
3165 	 */
3166 	for (i = 0; i < PMAP_MEMDOM; i++)
3167 		TAILQ_INIT(&new_tail[i]);
3168 retry:
3169 	avail = 0;
3170 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3171 		bit_count((bitstr_t *)pc->pc_map, 0,
3172 		    sizeof(pc->pc_map) * NBBY, &free);
3173 		if (free == 0)
3174 			break;
3175 		avail += free;
3176 		if (avail >= needed)
3177 			break;
3178 	}
3179 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
3180 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3181 		if (m == NULL) {
3182 			m = reclaim_pv_chunk(pmap, lockp);
3183 			if (m == NULL)
3184 				goto retry;
3185 			reclaimed = true;
3186 		}
3187 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3188 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3189 		dump_add_page(m->phys_addr);
3190 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3191 		pc->pc_pmap = pmap;
3192 		memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3193 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3194 		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3195 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3196 
3197 		/*
3198 		 * The reclaim might have freed a chunk from the current pmap.
3199 		 * If that chunk contained available entries, we need to
3200 		 * re-count the number of available entries.
3201 		 */
3202 		if (reclaimed)
3203 			goto retry;
3204 	}
3205 	for (i = 0; i < vm_ndomains; i++) {
3206 		if (TAILQ_EMPTY(&new_tail[i]))
3207 			continue;
3208 		pvc = &pv_chunks[i];
3209 		mtx_lock(&pvc->pvc_lock);
3210 		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3211 		mtx_unlock(&pvc->pvc_lock);
3212 	}
3213 }
3214 
3215 /*
3216  * First find and then remove the pv entry for the specified pmap and virtual
3217  * address from the specified pv list.  Returns the pv entry if found and NULL
3218  * otherwise.  This operation can be performed on pv lists for either 4KB or
3219  * 2MB page mappings.
3220  */
3221 static __inline pv_entry_t
3222 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3223 {
3224 	pv_entry_t pv;
3225 
3226 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3227 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3228 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3229 			pvh->pv_gen++;
3230 			break;
3231 		}
3232 	}
3233 	return (pv);
3234 }
3235 
3236 /*
3237  * After demotion from a 2MB page mapping to 512 4KB page mappings,
3238  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3239  * entries for each of the 4KB page mappings.
3240  */
3241 static void
3242 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3243     struct rwlock **lockp)
3244 {
3245 	struct md_page *pvh;
3246 	struct pv_chunk *pc;
3247 	pv_entry_t pv;
3248 	vm_offset_t va_last;
3249 	vm_page_t m;
3250 	int bit, field;
3251 
3252 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3253 	KASSERT((va & L2_OFFSET) == 0,
3254 	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3255 	KASSERT((pa & L2_OFFSET) == 0,
3256 	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3257 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3258 
3259 	/*
3260 	 * Transfer the 2mpage's pv entry for this mapping to the first
3261 	 * page's pv list.  Once this transfer begins, the pv list lock
3262 	 * must not be released until the last pv entry is reinstantiated.
3263 	 */
3264 	pvh = pa_to_pvh(pa);
3265 	pv = pmap_pvh_remove(pvh, pmap, va);
3266 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3267 	m = PHYS_TO_VM_PAGE(pa);
3268 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3269 	m->md.pv_gen++;
3270 	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3271 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3272 	va_last = va + L2_SIZE - PAGE_SIZE;
3273 	for (;;) {
3274 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3275 		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3276 		for (field = 0; field < _NPCM; field++) {
3277 			while (pc->pc_map[field]) {
3278 				bit = ffsl(pc->pc_map[field]) - 1;
3279 				pc->pc_map[field] &= ~(1ul << bit);
3280 				pv = &pc->pc_pventry[field * 64 + bit];
3281 				va += PAGE_SIZE;
3282 				pv->pv_va = va;
3283 				m++;
3284 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3285 			    ("pmap_pv_demote_l2: page %p is not managed", m));
3286 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3287 				m->md.pv_gen++;
3288 				if (va == va_last)
3289 					goto out;
3290 			}
3291 		}
3292 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3293 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3294 	}
3295 out:
3296 	if (pc_is_full(pc)) {
3297 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3298 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3299 	}
3300 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3301 	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3302 }
3303 
3304 /*
3305  * First find and then destroy the pv entry for the specified pmap and virtual
3306  * address.  This operation can be performed on pv lists for either 4KB or 2MB
3307  * page mappings.
3308  */
3309 static void
3310 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3311 {
3312 	pv_entry_t pv;
3313 
3314 	pv = pmap_pvh_remove(pvh, pmap, va);
3315 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3316 	free_pv_entry(pmap, pv);
3317 }
3318 
3319 /*
3320  * Conditionally create the PV entry for a 4KB page mapping if the required
3321  * memory can be allocated without resorting to reclamation.
3322  */
3323 static boolean_t
3324 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3325     struct rwlock **lockp)
3326 {
3327 	pv_entry_t pv;
3328 
3329 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3330 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3331 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3332 		pv->pv_va = va;
3333 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3334 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3335 		m->md.pv_gen++;
3336 		return (TRUE);
3337 	} else
3338 		return (FALSE);
3339 }
3340 
3341 /*
3342  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
3343  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
3344  * false if the PV entry cannot be allocated without resorting to reclamation.
3345  */
3346 static bool
3347 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3348     struct rwlock **lockp)
3349 {
3350 	struct md_page *pvh;
3351 	pv_entry_t pv;
3352 	vm_paddr_t pa;
3353 
3354 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3355 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3356 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3357 	    NULL : lockp)) == NULL)
3358 		return (false);
3359 	pv->pv_va = va;
3360 	pa = l2e & ~ATTR_MASK;
3361 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3362 	pvh = pa_to_pvh(pa);
3363 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3364 	pvh->pv_gen++;
3365 	return (true);
3366 }
3367 
3368 static void
3369 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3370 {
3371 	pt_entry_t newl2, oldl2 __diagused;
3372 	vm_page_t ml3;
3373 	vm_paddr_t ml3pa;
3374 
3375 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3376 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3377 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3378 
3379 	ml3 = pmap_remove_pt_page(pmap, va);
3380 	if (ml3 == NULL)
3381 		panic("pmap_remove_kernel_l2: Missing pt page");
3382 
3383 	ml3pa = VM_PAGE_TO_PHYS(ml3);
3384 	newl2 = ml3pa | L2_TABLE;
3385 
3386 	/*
3387 	 * If this page table page was unmapped by a promotion, then it
3388 	 * contains valid mappings.  Zero it to invalidate those mappings.
3389 	 */
3390 	if (ml3->valid != 0)
3391 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
3392 
3393 	/*
3394 	 * Demote the mapping.  The caller must have already invalidated the
3395 	 * mapping (i.e., the "break" in break-before-make).
3396 	 */
3397 	oldl2 = pmap_load_store(l2, newl2);
3398 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3399 	    __func__, l2, oldl2));
3400 }
3401 
3402 /*
3403  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
3404  */
3405 static int
3406 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3407     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3408 {
3409 	struct md_page *pvh;
3410 	pt_entry_t old_l2;
3411 	vm_page_t m, ml3, mt;
3412 
3413 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3414 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3415 	old_l2 = pmap_load_clear(l2);
3416 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3417 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3418 
3419 	/*
3420 	 * Since a promotion must break the 4KB page mappings before making
3421 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3422 	 */
3423 	pmap_s1_invalidate_page(pmap, sva, true);
3424 
3425 	if (old_l2 & ATTR_SW_WIRED)
3426 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3427 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3428 	if (old_l2 & ATTR_SW_MANAGED) {
3429 		m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
3430 		pvh = page_to_pvh(m);
3431 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
3432 		pmap_pvh_free(pvh, pmap, sva);
3433 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3434 			if (pmap_pte_dirty(pmap, old_l2))
3435 				vm_page_dirty(mt);
3436 			if (old_l2 & ATTR_AF)
3437 				vm_page_aflag_set(mt, PGA_REFERENCED);
3438 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3439 			    TAILQ_EMPTY(&pvh->pv_list))
3440 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3441 		}
3442 	}
3443 	if (pmap == kernel_pmap) {
3444 		pmap_remove_kernel_l2(pmap, l2, sva);
3445 	} else {
3446 		ml3 = pmap_remove_pt_page(pmap, sva);
3447 		if (ml3 != NULL) {
3448 			KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
3449 			    ("pmap_remove_l2: l3 page not promoted"));
3450 			pmap_resident_count_dec(pmap, 1);
3451 			KASSERT(ml3->ref_count == NL3PG,
3452 			    ("pmap_remove_l2: l3 page ref count error"));
3453 			ml3->ref_count = 0;
3454 			pmap_add_delayed_free_list(ml3, free, FALSE);
3455 		}
3456 	}
3457 	return (pmap_unuse_pt(pmap, sva, l1e, free));
3458 }
3459 
3460 /*
3461  * pmap_remove_l3: do the things to unmap a page in a process
3462  */
3463 static int
3464 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3465     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3466 {
3467 	struct md_page *pvh;
3468 	pt_entry_t old_l3;
3469 	vm_page_t m;
3470 
3471 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3472 	old_l3 = pmap_load_clear(l3);
3473 	pmap_s1_invalidate_page(pmap, va, true);
3474 	if (old_l3 & ATTR_SW_WIRED)
3475 		pmap->pm_stats.wired_count -= 1;
3476 	pmap_resident_count_dec(pmap, 1);
3477 	if (old_l3 & ATTR_SW_MANAGED) {
3478 		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
3479 		if (pmap_pte_dirty(pmap, old_l3))
3480 			vm_page_dirty(m);
3481 		if (old_l3 & ATTR_AF)
3482 			vm_page_aflag_set(m, PGA_REFERENCED);
3483 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3484 		pmap_pvh_free(&m->md, pmap, va);
3485 		if (TAILQ_EMPTY(&m->md.pv_list) &&
3486 		    (m->flags & PG_FICTITIOUS) == 0) {
3487 			pvh = page_to_pvh(m);
3488 			if (TAILQ_EMPTY(&pvh->pv_list))
3489 				vm_page_aflag_clear(m, PGA_WRITEABLE);
3490 		}
3491 	}
3492 	return (pmap_unuse_pt(pmap, va, l2e, free));
3493 }
3494 
3495 /*
3496  * Remove the specified range of addresses from the L3 page table that is
3497  * identified by the given L2 entry.
3498  */
3499 static void
3500 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3501     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3502 {
3503 	struct md_page *pvh;
3504 	struct rwlock *new_lock;
3505 	pt_entry_t *l3, old_l3;
3506 	vm_offset_t va;
3507 	vm_page_t l3pg, m;
3508 
3509 	KASSERT(ADDR_IS_CANONICAL(sva),
3510 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
3511 	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3512 	    ("%s: End address not in canonical form: %lx", __func__, eva));
3513 
3514 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3515 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3516 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3517 	l3pg = !ADDR_IS_KERNEL(sva) ? PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL;
3518 	va = eva;
3519 	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3520 		if (!pmap_l3_valid(pmap_load(l3))) {
3521 			if (va != eva) {
3522 				pmap_invalidate_range(pmap, va, sva, true);
3523 				va = eva;
3524 			}
3525 			continue;
3526 		}
3527 		old_l3 = pmap_load_clear(l3);
3528 		if ((old_l3 & ATTR_SW_WIRED) != 0)
3529 			pmap->pm_stats.wired_count--;
3530 		pmap_resident_count_dec(pmap, 1);
3531 		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
3532 			m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
3533 			if (pmap_pte_dirty(pmap, old_l3))
3534 				vm_page_dirty(m);
3535 			if ((old_l3 & ATTR_AF) != 0)
3536 				vm_page_aflag_set(m, PGA_REFERENCED);
3537 			new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m));
3538 			if (new_lock != *lockp) {
3539 				if (*lockp != NULL) {
3540 					/*
3541 					 * Pending TLB invalidations must be
3542 					 * performed before the PV list lock is
3543 					 * released.  Otherwise, a concurrent
3544 					 * pmap_remove_all() on a physical page
3545 					 * could return while a stale TLB entry
3546 					 * still provides access to that page.
3547 					 */
3548 					if (va != eva) {
3549 						pmap_invalidate_range(pmap, va,
3550 						    sva, true);
3551 						va = eva;
3552 					}
3553 					rw_wunlock(*lockp);
3554 				}
3555 				*lockp = new_lock;
3556 				rw_wlock(*lockp);
3557 			}
3558 			pmap_pvh_free(&m->md, pmap, sva);
3559 			if (TAILQ_EMPTY(&m->md.pv_list) &&
3560 			    (m->flags & PG_FICTITIOUS) == 0) {
3561 				pvh = page_to_pvh(m);
3562 				if (TAILQ_EMPTY(&pvh->pv_list))
3563 					vm_page_aflag_clear(m, PGA_WRITEABLE);
3564 			}
3565 		}
3566 		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3567 			/*
3568 			 * _pmap_unwire_l3() has already invalidated the TLB
3569 			 * entries at all levels for "sva".  So, we need not
3570 			 * perform "sva += L3_SIZE;" here.  Moreover, we need
3571 			 * not perform "va = sva;" if "sva" is at the start
3572 			 * of a new valid range consisting of a single page.
3573 			 */
3574 			break;
3575 		}
3576 		if (va == eva)
3577 			va = sva;
3578 	}
3579 	if (va != eva)
3580 		pmap_invalidate_range(pmap, va, sva, true);
3581 }
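
/*
 * Illustrative pseudocode (a restatement of the loop above for readability,
 * not a separate helper): TLB invalidations in pmap_remove_l3_range() are
 * batched over maximal runs of removed L3 entries by tracking the start of
 * the pending run in "va":
 *
 *	va = eva;				no run pending
 *	for each L3E in [sva, eva):
 *		if the L3E is invalid, flush any pending run and reset va;
 *		otherwise clear the L3E and, if no run is pending, va = sva;
 *	if (va != eva)
 *		pmap_invalidate_range(pmap, va, sva, true);
 */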
3582 
3583 /*
3584  *	Remove the given range of addresses from the specified map.
3585  *
3586  *	It is assumed that the start and end are properly
3587  *	rounded to the page size.
3588  */
3589 void
3590 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3591 {
3592 	struct rwlock *lock;
3593 	vm_offset_t va_next;
3594 	pd_entry_t *l0, *l1, *l2;
3595 	pt_entry_t l3_paddr;
3596 	struct spglist free;
3597 
3598 	/*
3599 	 * Perform an unsynchronized read.  This is, however, safe.
3600 	 */
3601 	if (pmap->pm_stats.resident_count == 0)
3602 		return;
3603 
3604 	SLIST_INIT(&free);
3605 
3606 	PMAP_LOCK(pmap);
3607 
3608 	lock = NULL;
3609 	for (; sva < eva; sva = va_next) {
3610 		if (pmap->pm_stats.resident_count == 0)
3611 			break;
3612 
3613 		l0 = pmap_l0(pmap, sva);
3614 		if (pmap_load(l0) == 0) {
3615 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3616 			if (va_next < sva)
3617 				va_next = eva;
3618 			continue;
3619 		}
3620 
3621 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3622 		if (va_next < sva)
3623 			va_next = eva;
3624 		l1 = pmap_l0_to_l1(l0, sva);
3625 		if (pmap_load(l1) == 0)
3626 			continue;
3627 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
3628 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
3629 			KASSERT(va_next <= eva,
3630 			    ("partial update of non-transparent 1G page "
3631 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
3632 			    pmap_load(l1), sva, eva, va_next));
3633 			MPASS(pmap != kernel_pmap);
3634 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
3635 			pmap_clear(l1);
3636 			pmap_s1_invalidate_page(pmap, sva, true);
3637 			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
3638 			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
3639 			continue;
3640 		}
3641 
3642 		/*
3643 		 * Calculate index for next page table.
3644 		 */
3645 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3646 		if (va_next < sva)
3647 			va_next = eva;
3648 
3649 		l2 = pmap_l1_to_l2(l1, sva);
3650 		if (l2 == NULL)
3651 			continue;
3652 
3653 		l3_paddr = pmap_load(l2);
3654 
3655 		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
3656 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3657 				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
3658 				    &free, &lock);
3659 				continue;
3660 			} else if (pmap_demote_l2_locked(pmap, l2, sva,
3661 			    &lock) == NULL)
3662 				continue;
3663 			l3_paddr = pmap_load(l2);
3664 		}
3665 
3666 		/*
3667 		 * Weed out invalid mappings.
3668 		 */
3669 		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
3670 			continue;
3671 
3672 		/*
3673 		 * Limit our scan to either the end of the va represented
3674 		 * by the current page table page, or to the end of the
3675 		 * range being removed.
3676 		 */
3677 		if (va_next > eva)
3678 			va_next = eva;
3679 
3680 		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
3681 		    &lock);
3682 	}
3683 	if (lock != NULL)
3684 		rw_wunlock(lock);
3685 	PMAP_UNLOCK(pmap);
3686 	vm_page_free_pages_toq(&free, true);
3687 }
3688 
3689 /*
3690  *	Remove the given range of addresses as part of a logical unmap
3691  *	operation. This has the effect of calling pmap_remove(), but
3692  *	also clears any metadata that should persist for the lifetime
3693  *	of a logical mapping.
3694  */
3695 void
3696 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3697 {
3698 	pmap_remove(pmap, sva, eva);
3699 }
3700 
3701 /*
3702  *	Routine:	pmap_remove_all
3703  *	Function:
3704  *		Removes this physical page from
3705  *		all physical maps in which it resides.
3706  *		Reflects back modify bits to the pager.
3707  *
3708  *	Notes:
3709  *		Original versions of this routine were very
3710  *		inefficient because they iteratively called
3711  *		pmap_remove (slow...)
3712  */
3713 
3714 void
3715 pmap_remove_all(vm_page_t m)
3716 {
3717 	struct md_page *pvh;
3718 	pv_entry_t pv;
3719 	pmap_t pmap;
3720 	struct rwlock *lock;
3721 	pd_entry_t *pde, tpde;
3722 	pt_entry_t *pte, tpte;
3723 	vm_offset_t va;
3724 	struct spglist free;
3725 	int lvl, pvh_gen, md_gen;
3726 
3727 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3728 	    ("pmap_remove_all: page %p is not managed", m));
3729 	SLIST_INIT(&free);
3730 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3731 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
3732 	rw_wlock(lock);
3733 retry:
3734 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3735 		pmap = PV_PMAP(pv);
3736 		if (!PMAP_TRYLOCK(pmap)) {
3737 			pvh_gen = pvh->pv_gen;
3738 			rw_wunlock(lock);
3739 			PMAP_LOCK(pmap);
3740 			rw_wlock(lock);
3741 			if (pvh_gen != pvh->pv_gen) {
3742 				PMAP_UNLOCK(pmap);
3743 				goto retry;
3744 			}
3745 		}
3746 		va = pv->pv_va;
3747 		pte = pmap_pte_exists(pmap, va, 2, __func__);
3748 		pmap_demote_l2_locked(pmap, pte, va, &lock);
3749 		PMAP_UNLOCK(pmap);
3750 	}
3751 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3752 		pmap = PV_PMAP(pv);
3753 		if (!PMAP_TRYLOCK(pmap)) {
3754 			pvh_gen = pvh->pv_gen;
3755 			md_gen = m->md.pv_gen;
3756 			rw_wunlock(lock);
3757 			PMAP_LOCK(pmap);
3758 			rw_wlock(lock);
3759 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3760 				PMAP_UNLOCK(pmap);
3761 				goto retry;
3762 			}
3763 		}
3764 		pmap_resident_count_dec(pmap, 1);
3765 
3766 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
3767 		KASSERT(pde != NULL,
3768 		    ("pmap_remove_all: no page directory entry found"));
3769 		KASSERT(lvl == 2,
3770 		    ("pmap_remove_all: invalid pde level %d", lvl));
3771 		tpde = pmap_load(pde);
3772 
3773 		pte = pmap_l2_to_l3(pde, pv->pv_va);
3774 		tpte = pmap_load_clear(pte);
3775 		if (tpte & ATTR_SW_WIRED)
3776 			pmap->pm_stats.wired_count--;
3777 		if ((tpte & ATTR_AF) != 0) {
3778 			pmap_invalidate_page(pmap, pv->pv_va, true);
3779 			vm_page_aflag_set(m, PGA_REFERENCED);
3780 		}
3781 
3782 		/*
3783 		 * Update the vm_page_t clean and reference bits.
3784 		 */
3785 		if (pmap_pte_dirty(pmap, tpte))
3786 			vm_page_dirty(m);
3787 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
3788 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3789 		m->md.pv_gen++;
3790 		free_pv_entry(pmap, pv);
3791 		PMAP_UNLOCK(pmap);
3792 	}
3793 	vm_page_aflag_clear(m, PGA_WRITEABLE);
3794 	rw_wunlock(lock);
3795 	vm_page_free_pages_toq(&free, true);
3796 }
3797 
3798 /*
3799  * Masks and sets bits in a level 2 page table entry in the specified pmap.
3800  */
3801 static void
3802 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
3803     pt_entry_t nbits)
3804 {
3805 	pd_entry_t old_l2;
3806 	vm_page_t m, mt;
3807 
3808 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3809 	PMAP_ASSERT_STAGE1(pmap);
3810 	KASSERT((sva & L2_OFFSET) == 0,
3811 	    ("pmap_protect_l2: sva is not 2mpage aligned"));
3812 	old_l2 = pmap_load(l2);
3813 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3814 	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
3815 
3816 	/*
3817 	 * Return if the L2 entry already has the desired access restrictions
3818 	 * in place.
3819 	 */
3820 	if ((old_l2 & mask) == nbits)
3821 		return;
3822 
3823 	while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
3824 		cpu_spinwait();
3825 
3826 	/*
3827 	 * When a dirty read/write superpage mapping is write protected,
3828 	 * update the dirty field of each of the superpage's constituent 4KB
3829 	 * pages.
3830 	 */
3831 	if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
3832 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3833 	    pmap_pte_dirty(pmap, old_l2)) {
3834 		m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
3835 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3836 			vm_page_dirty(mt);
3837 	}
3838 
3839 	/*
3840 	 * Since a promotion must break the 4KB page mappings before making
3841 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3842 	 */
3843 	pmap_s1_invalidate_page(pmap, sva, true);
3844 }
3845 
3846 /*
3847  * Masks and sets bits in the last level page table entries in the specified
3848  * pmap and range.
3849  */
3850 static void
3851 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
3852     pt_entry_t mask, pt_entry_t nbits, bool invalidate)
3853 {
3854 	vm_offset_t va, va_next;
3855 	pd_entry_t *l0, *l1, *l2;
3856 	pt_entry_t *l3p, l3;
3857 
3858 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3859 	for (; sva < eva; sva = va_next) {
3860 		l0 = pmap_l0(pmap, sva);
3861 		if (pmap_load(l0) == 0) {
3862 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3863 			if (va_next < sva)
3864 				va_next = eva;
3865 			continue;
3866 		}
3867 
3868 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3869 		if (va_next < sva)
3870 			va_next = eva;
3871 		l1 = pmap_l0_to_l1(l0, sva);
3872 		if (pmap_load(l1) == 0)
3873 			continue;
3874 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
3875 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
3876 			KASSERT(va_next <= eva,
3877 			    ("partial update of non-transparent 1G page "
3878 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
3879 			    pmap_load(l1), sva, eva, va_next));
3880 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
3881 			if ((pmap_load(l1) & mask) != nbits) {
3882 				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
3883 				if (invalidate)
3884 					pmap_s1_invalidate_page(pmap, sva, true);
3885 			}
3886 			continue;
3887 		}
3888 
3889 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3890 		if (va_next < sva)
3891 			va_next = eva;
3892 
3893 		l2 = pmap_l1_to_l2(l1, sva);
3894 		if (pmap_load(l2) == 0)
3895 			continue;
3896 
3897 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
3898 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3899 				pmap_protect_l2(pmap, l2, sva, mask, nbits);
3900 				continue;
3901 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
3902 				continue;
3903 		}
3904 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
3905 		    ("pmap_protect: Invalid L2 entry after demotion"));
3906 
3907 		if (va_next > eva)
3908 			va_next = eva;
3909 
3910 		va = va_next;
3911 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
3912 		    sva += L3_SIZE) {
3913 			l3 = pmap_load(l3p);
3914 
3915 			/*
3916 			 * Go to the next L3 entry if the current one is
3917 			 * invalid or already has the desired access
3918 			 * restrictions in place.  (The latter case occurs
3919 			 * frequently.  For example, in a "buildworld"
3920 			 * workload, almost 1 out of 4 L3 entries already
3921 			 * have the desired restrictions.)
3922 			 */
3923 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
3924 				if (va != va_next) {
3925 					if (invalidate)
3926 						pmap_s1_invalidate_range(pmap,
3927 						    va, sva, true);
3928 					va = va_next;
3929 				}
3930 				continue;
3931 			}
3932 
3933 			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
3934 			    nbits))
3935 				cpu_spinwait();
3936 
3937 			/*
3938 			 * When a dirty read/write mapping is write protected,
3939 			 * update the page's dirty field.
3940 			 */
3941 			if ((l3 & ATTR_SW_MANAGED) != 0 &&
3942 			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3943 			    pmap_pte_dirty(pmap, l3))
3944 				vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK));
3945 
3946 			if (va == va_next)
3947 				va = sva;
3948 		}
3949 		if (va != va_next && invalidate)
3950 			pmap_s1_invalidate_range(pmap, va, sva, true);
3951 	}
3952 }
3953 
3954 static void
3955 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
3956     pt_entry_t nbits, bool invalidate)
3957 {
3958 	PMAP_LOCK(pmap);
3959 	pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
3960 	PMAP_UNLOCK(pmap);
3961 }
3962 
3963 /*
3964  *	Set the physical protection on the
3965  *	specified range of this map as requested.
3966  */
3967 void
3968 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3969 {
3970 	pt_entry_t mask, nbits;
3971 
3972 	PMAP_ASSERT_STAGE1(pmap);
3973 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3974 	if (prot == VM_PROT_NONE) {
3975 		pmap_remove(pmap, sva, eva);
3976 		return;
3977 	}
3978 
3979 	mask = nbits = 0;
3980 	if ((prot & VM_PROT_WRITE) == 0) {
3981 		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
3982 		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
3983 	}
3984 	if ((prot & VM_PROT_EXECUTE) == 0) {
3985 		mask |= ATTR_S1_XN;
3986 		nbits |= ATTR_S1_XN;
3987 	}
3988 	if (mask == 0)
3989 		return;
3990 
3991 	pmap_mask_set(pmap, sva, eva, mask, nbits, true);
3992 }
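
/*
 * A worked example of the mask/nbits computation above (an illustration
 * only, using the same attribute macros as pmap_protect()): a VM_PROT_READ
 * request is equivalent to
 *
 *	pmap_mask_set(pmap, sva, eva,
 *	    ATTR_S1_AP_RW_BIT | ATTR_SW_DBM | ATTR_S1_XN,
 *	    ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_S1_XN, true);
 *
 * that is, write permission and the software DBM bit are cleared and
 * execute-never is set on every valid leaf entry in the range.
 */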
3993 
3994 void
3995 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
3996 {
3997 
3998 	MPASS((sva & L3_OFFSET) == 0);
3999 	MPASS(((sva + size) & L3_OFFSET) == 0);
4000 
4001 	pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4002 	    ATTR_SW_NO_PROMOTE, false);
4003 }
4004 
4005 /*
4006  * Inserts the specified page table page into the specified pmap's collection
4007  * of idle page table pages.  Each of a pmap's page table pages is responsible
4008  * for mapping a distinct range of virtual addresses.  The pmap's collection is
4009  * ordered by this virtual address range.
4010  *
4011  * If "promoted" is false, then the page table page "mpte" must be zero filled.
4012  */
4013 static __inline int
4014 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
4015 {
4016 
4017 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4018 	mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
4019 	return (vm_radix_insert(&pmap->pm_root, mpte));
4020 }
4021 
4022 /*
4023  * Removes the page table page mapping the specified virtual address from the
4024  * specified pmap's collection of idle page table pages, and returns it.
4025  * Otherwise, returns NULL if there is no page table page corresponding to the
4026  * specified virtual address.
4027  */
4028 static __inline vm_page_t
4029 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4030 {
4031 
4032 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4033 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4034 }
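
/*
 * Usage sketch for the two helpers above (drawn from pmap_promote_l2() and
 * pmap_remove_l2() in this file, not a new interface): a page table page
 * saved at promotion time is later looked up by the L2 pindex of the
 * virtual address that it maps, e.g.
 *
 *	(void)pmap_insert_pt_page(pmap, mpte, true);
 *	...
 *	ml3 = pmap_remove_pt_page(pmap, sva);
 */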
4035 
4036 /*
4037  * Performs a break-before-make update of a pmap entry. This is needed when
4038  * either promoting or demoting pages to ensure the TLB doesn't get into an
4039  * inconsistent state.
4040  */
4041 static void
4042 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
4043     vm_offset_t va, vm_size_t size)
4044 {
4045 	register_t intr;
4046 
4047 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4048 
4049 	if ((newpte & ATTR_SW_NO_PROMOTE) != 0)
4050 		panic("%s: Updating non-promote pte", __func__);
4051 
4052 	/*
4053 	 * Ensure we don't get switched out with the page table in an
4054 	 * inconsistent state. We also need to ensure no interrupts fire
4055 	 * as they may make use of an address we are about to invalidate.
4056 	 */
4057 	intr = intr_disable();
4058 
4059 	/*
4060 	 * Clear the old mapping's valid bit, but leave the rest of the entry
4061 	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4062 	 * lookup the physical address.
4063 	 */
4064 	pmap_clear_bits(pte, ATTR_DESCR_VALID);
4065 
4066 	/*
4067 	 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4068 	 * be cached, so we invalidate intermediate entries as well as final
4069 	 * entries.
4070 	 */
4071 	pmap_s1_invalidate_range(pmap, va, va + size, false);
4072 
4073 	/* Create the new mapping */
4074 	pmap_store(pte, newpte);
4075 	dsb(ishst);
4076 
4077 	intr_restore(intr);
4078 }
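
/*
 * Usage sketch (drawn from pmap_promote_l2() below, not an additional
 * interface): promotion replaces the L2_TABLE entry with an L2_BLOCK entry
 * through a single break-before-make update:
 *
 *	newl2 &= ~ATTR_DESCR_MASK;
 *	newl2 |= L2_BLOCK;
 *	pmap_update_entry(pmap, l2, newl2, va & ~L2_OFFSET, L2_SIZE);
 *
 * The demotion path performs the inverse replacement with an L2_TABLE entry.
 */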
4079 
4080 #if VM_NRESERVLEVEL > 0
4081 /*
4082  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4083  * replace the many pv entries for the 4KB page mappings by a single pv entry
4084  * for the 2MB page mapping.
4085  */
4086 static void
4087 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4088     struct rwlock **lockp)
4089 {
4090 	struct md_page *pvh;
4091 	pv_entry_t pv;
4092 	vm_offset_t va_last;
4093 	vm_page_t m;
4094 
4095 	KASSERT((pa & L2_OFFSET) == 0,
4096 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4097 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4098 
4099 	/*
4100 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
4101 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
4102 	 * a transfer avoids the possibility that get_pv_entry() calls
4103 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4104 	 * mappings that is being promoted.
4105 	 */
4106 	m = PHYS_TO_VM_PAGE(pa);
4107 	va = va & ~L2_OFFSET;
4108 	pv = pmap_pvh_remove(&m->md, pmap, va);
4109 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4110 	pvh = page_to_pvh(m);
4111 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4112 	pvh->pv_gen++;
4113 	/* Free the remaining NPTEPG - 1 pv entries. */
4114 	va_last = va + L2_SIZE - PAGE_SIZE;
4115 	do {
4116 		m++;
4117 		va += PAGE_SIZE;
4118 		pmap_pvh_free(&m->md, pmap, va);
4119 	} while (va < va_last);
4120 }
4121 
4122 /*
4123  * Tries to promote the 512, contiguous 4KB page mappings that are within a
4124  * single level 2 table entry to a single 2MB page mapping.  For promotion
4125  * to occur, two conditions must be met: (1) the 4KB page mappings must map
4126  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4127  * identical characteristics.
4128  */
4129 static void
4130 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4131     struct rwlock **lockp)
4132 {
4133 	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
4134 
4135 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4136 	PMAP_ASSERT_STAGE1(pmap);
4137 
4138 	/*
4139 	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
4140 	 * ineligible for promotion, invalid, or does not map the first 4KB
4141 	 * physical page within a 2MB page.
4142 	 */
4143 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(pmap_load(l2) & ~ATTR_MASK);
4144 	newl2 = pmap_load(firstl3);
4145 	if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4146 		return;
4147 	if ((newl2 & ((~ATTR_MASK & L2_OFFSET) | ATTR_DESCR_MASK)) != L3_PAGE) {
4148 		atomic_add_long(&pmap_l2_p_failures, 1);
4149 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4150 		    " in pmap %p", va, pmap);
4151 		return;
4152 	}
4153 
4154 	/*
4155 	 * Both here and in the below "for" loop, to allow for repromotion
4156 	 * after MADV_FREE, conditionally write protect a clean L3E before
4157 	 * possibly aborting the promotion due to other L3E attributes.  Why?
4158 	 * Suppose that MADV_FREE is applied to a part of a superpage, the
4159 	 * address range [S, E).  pmap_advise() will demote the superpage
4160 	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4161 	 * set AP_RO and clear AF in the L3Es for the rest of [S, E).  Later,
4162 	 * imagine that the memory in [S, E) is recycled, but the last 4KB
4163 	 * page in [S, E) is not the last to be rewritten, or simply accessed.
4164 	 * In other words, there is still a 4KB page in [S, E), call it P,
4165 	 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4166 	 * Unless we write protect P before aborting the promotion, if and
4167 	 * when P is finally rewritten, there won't be a page fault to trigger
4168 	 * repromotion.
4169 	 */
4170 setl2:
4171 	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4172 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4173 		/*
4174 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4175 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4176 		 */
4177 		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4178 			goto setl2;
4179 		newl2 &= ~ATTR_SW_DBM;
4180 	}
4181 	if ((newl2 & ATTR_AF) == 0) {
4182 		atomic_add_long(&pmap_l2_p_failures, 1);
4183 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4184 		    " in pmap %p", va, pmap);
4185 		return;
4186 	}
4187 
4188 	/*
4189 	 * Examine each of the other L3Es in the specified PTP.  Abort if this
4190 	 * L3E maps an unexpected 4KB physical page or does not have identical
4191 	 * characteristics to the first L3E.
4192 	 */
4193 	pa = (newl2 & (~ATTR_MASK | ATTR_DESCR_MASK)) + L2_SIZE - PAGE_SIZE;
4194 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4195 		oldl3 = pmap_load(l3);
4196 		if ((oldl3 & (~ATTR_MASK | ATTR_DESCR_MASK)) != pa) {
4197 			atomic_add_long(&pmap_l2_p_failures, 1);
4198 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4199 			    " in pmap %p", va, pmap);
4200 			return;
4201 		}
4202 setl3:
4203 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4204 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4205 			/*
4206 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4207 			 * set, ATTR_SW_DBM can be cleared without a TLB
4208 			 * invalidation.
4209 			 */
4210 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4211 			    ~ATTR_SW_DBM))
4212 				goto setl3;
4213 			oldl3 &= ~ATTR_SW_DBM;
4214 		}
4215 		if ((oldl3 & ATTR_MASK) != (newl2 & ATTR_MASK)) {
4216 			atomic_add_long(&pmap_l2_p_failures, 1);
4217 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4218 			    " in pmap %p", va, pmap);
4219 			return;
4220 		}
4221 		pa -= PAGE_SIZE;
4222 	}
4223 
4224 	/*
4225 	 * Save the page table page in its current state until the L2
4226 	 * mapping the superpage is demoted by pmap_demote_l2() or
4227 	 * destroyed by pmap_remove_l3().
4228 	 */
4229 	if (mpte == NULL)
4230 		mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
4231 	KASSERT(mpte >= vm_page_array &&
4232 	    mpte < &vm_page_array[vm_page_array_size],
4233 	    ("pmap_promote_l2: page table page is out of range"));
4234 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
4235 	    ("pmap_promote_l2: page table page's pindex is wrong"));
4236 	if (pmap_insert_pt_page(pmap, mpte, true)) {
4237 		atomic_add_long(&pmap_l2_p_failures, 1);
4238 		CTR2(KTR_PMAP,
4239 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
4240 		    pmap);
4241 		return;
4242 	}
4243 
4244 	if ((newl2 & ATTR_SW_MANAGED) != 0)
4245 		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
4246 
4247 	newl2 &= ~ATTR_DESCR_MASK;
4248 	newl2 |= L2_BLOCK;
4249 
4250 	pmap_update_entry(pmap, l2, newl2, va & ~L2_OFFSET, L2_SIZE);
4251 
4252 	atomic_add_long(&pmap_l2_promotions, 1);
4253 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
4254 	    pmap);
4255 }
4256 #endif /* VM_NRESERVLEVEL > 0 */
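
/*
 * Informal summary of the checks in pmap_promote_l2() above (a restatement
 * for readability, not additional policy): promotion proceeds only when the
 * first L3E has ATTR_AF set and ATTR_SW_NO_PROMOTE clear and maps the first
 * 4KB page of a 2MB-aligned physical run, and every subsequent L3E maps the
 * next 4KB physical page with attributes identical to the first, modulo the
 * conditional clearing of ATTR_SW_DBM on clean entries.
 */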
4257 
4258 static int
4259 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
4260     int psind)
4261 {
4262 	pd_entry_t *l0p, *l1p, *l2p, origpte;
4263 	vm_page_t mp;
4264 
4265 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4266 	KASSERT(psind > 0 && psind < MAXPAGESIZES,
4267 	    ("psind %d unexpected", psind));
4268 	KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0,
4269 	    ("unaligned phys address %#lx newpte %#lx psind %d",
4270 	    (newpte & ~ATTR_MASK), newpte, psind));
4271 
4272 restart:
4273 	if (psind == 2) {
4274 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4275 
4276 		l0p = pmap_l0(pmap, va);
4277 		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
4278 			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
4279 			if (mp == NULL) {
4280 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4281 					return (KERN_RESOURCE_SHORTAGE);
4282 				PMAP_UNLOCK(pmap);
4283 				vm_wait(NULL);
4284 				PMAP_LOCK(pmap);
4285 				goto restart;
4286 			}
4287 			l1p = pmap_l0_to_l1(l0p, va);
4288 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4289 			origpte = pmap_load(l1p);
4290 		} else {
4291 			l1p = pmap_l0_to_l1(l0p, va);
4292 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4293 			origpte = pmap_load(l1p);
4294 			if ((origpte & ATTR_DESCR_VALID) == 0) {
4295 				mp = PHYS_TO_VM_PAGE(pmap_load(l0p) &
4296 				    ~ATTR_MASK);
4297 				mp->ref_count++;
4298 			}
4299 		}
4300 		KASSERT(((origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK) &&
4301 		    (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
4302 		    (origpte & ATTR_DESCR_VALID) == 0,
4303 		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
4304 		    va, origpte, newpte));
4305 		pmap_store(l1p, newpte);
4306 	} else /* (psind == 1) */ {
4307 		l2p = pmap_l2(pmap, va);
4308 		if (l2p == NULL) {
4309 			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
4310 			if (mp == NULL) {
4311 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4312 					return (KERN_RESOURCE_SHORTAGE);
4313 				PMAP_UNLOCK(pmap);
4314 				vm_wait(NULL);
4315 				PMAP_LOCK(pmap);
4316 				goto restart;
4317 			}
4318 			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
4319 			l2p = &l2p[pmap_l2_index(va)];
4320 			origpte = pmap_load(l2p);
4321 		} else {
4322 			l1p = pmap_l1(pmap, va);
4323 			origpte = pmap_load(l2p);
4324 			if ((origpte & ATTR_DESCR_VALID) == 0) {
4325 				mp = PHYS_TO_VM_PAGE(pmap_load(l1p) &
4326 				    ~ATTR_MASK);
4327 				mp->ref_count++;
4328 			}
4329 		}
4330 		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
4331 		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
4332 		     (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
4333 		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
4334 		    va, origpte, newpte));
4335 		pmap_store(l2p, newpte);
4336 	}
4337 	dsb(ishst);
4338 
4339 	if ((origpte & ATTR_DESCR_VALID) == 0)
4340 		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
4341 	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
4342 		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
4343 	else if ((newpte & ATTR_SW_WIRED) == 0 &&
4344 	    (origpte & ATTR_SW_WIRED) != 0)
4345 		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
4346 
4347 	return (KERN_SUCCESS);
4348 }
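
/*
 * Usage sketch (drawn from pmap_enter() below): a PMAP_ENTER_LARGEPAGE
 * request with psind == 2 reaches pmap_enter_largepage() with an L1_BLOCK
 * entry, e.g.
 *
 *	new_l3 &= ~L3_PAGE;
 *	new_l3 |= L1_BLOCK;
 *	rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
 *
 * while psind == 1 uses L2_BLOCK instead.
 */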
4349 
4350 /*
4351  *	Insert the given physical page (p) at
4352  *	the specified virtual address (v) in the
4353  *	target physical map with the protection requested.
4354  *
4355  *	If specified, the page will be wired down, meaning
4356  *	that the related pte can not be reclaimed.
4357  *
4358  *	NB:  This is the only routine which MAY NOT lazy-evaluate
4359  *	or lose information.  That is, this routine must actually
4360  *	insert this page into the given map NOW.
4361  */
4362 int
4363 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4364     u_int flags, int8_t psind)
4365 {
4366 	struct rwlock *lock;
4367 	pd_entry_t *pde;
4368 	pt_entry_t new_l3, orig_l3;
4369 	pt_entry_t *l2, *l3;
4370 	pv_entry_t pv;
4371 	vm_paddr_t opa, pa;
4372 	vm_page_t mpte, om;
4373 	boolean_t nosleep;
4374 	int lvl, rv;
4375 
4376 	KASSERT(ADDR_IS_CANONICAL(va),
4377 	    ("%s: Address not in canonical form: %lx", __func__, va));
4378 
4379 	va = trunc_page(va);
4380 	if ((m->oflags & VPO_UNMANAGED) == 0)
4381 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
4382 	pa = VM_PAGE_TO_PHYS(m);
4383 	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE);
4384 	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
4385 	new_l3 |= pmap_pte_prot(pmap, prot);
4386 
4387 	if ((flags & PMAP_ENTER_WIRED) != 0)
4388 		new_l3 |= ATTR_SW_WIRED;
4389 	if (pmap->pm_stage == PM_STAGE1) {
4390 		if (!ADDR_IS_KERNEL(va))
4391 			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4392 		else
4393 			new_l3 |= ATTR_S1_UXN;
4394 		if (pmap != kernel_pmap)
4395 			new_l3 |= ATTR_S1_nG;
4396 	} else {
4397 		/*
4398 		 * Clear the access flag on executable mappings; it will be
4399 		 * set later when the page is accessed. The fault handler is
4400 		 * required to invalidate the I-cache.
4401 		 *
4402 		 * TODO: Switch to the valid flag to allow hardware management
4403 		 * of the access flag. Much of the pmap code assumes the
4404 		 * valid flag is set and fails to destroy the old page tables
4405 		 * correctly if it is clear.
4406 		 */
4407 		if (prot & VM_PROT_EXECUTE)
4408 			new_l3 &= ~ATTR_AF;
4409 	}
4410 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4411 		new_l3 |= ATTR_SW_MANAGED;
4412 		if ((prot & VM_PROT_WRITE) != 0) {
4413 			new_l3 |= ATTR_SW_DBM;
4414 			if ((flags & VM_PROT_WRITE) == 0) {
4415 				if (pmap->pm_stage == PM_STAGE1)
4416 					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
4417 				else
4418 					new_l3 &=
4419 					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
4420 			}
4421 		}
4422 	}
4423 
4424 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
4425 
4426 	lock = NULL;
4427 	PMAP_LOCK(pmap);
4428 	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
4429 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
4430 		    ("managed largepage va %#lx flags %#x", va, flags));
4431 		new_l3 &= ~L3_PAGE;
4432 		if (psind == 2) {
4433 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4434 			new_l3 |= L1_BLOCK;
4435 		} else /* (psind == 1) */
4436 			new_l3 |= L2_BLOCK;
4437 		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
4438 		goto out;
4439 	}
4440 	if (psind == 1) {
4441 		/* Assert the required virtual and physical alignment. */
4442 		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
4443 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
4444 		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
4445 		    flags, m, &lock);
4446 		goto out;
4447 	}
4448 	mpte = NULL;
4449 
4450 	/*
4451 	 * In the case that a page table page is not
4452 	 * resident, we are creating it here.
4453 	 */
4454 retry:
4455 	pde = pmap_pde(pmap, va, &lvl);
4456 	if (pde != NULL && lvl == 2) {
4457 		l3 = pmap_l2_to_l3(pde, va);
4458 		if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
4459 			mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
4460 			mpte->ref_count++;
4461 		}
4462 		goto havel3;
4463 	} else if (pde != NULL && lvl == 1) {
4464 		l2 = pmap_l1_to_l2(pde, va);
4465 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
4466 		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
4467 			l3 = &l3[pmap_l3_index(va)];
4468 			if (!ADDR_IS_KERNEL(va)) {
4469 				mpte = PHYS_TO_VM_PAGE(
4470 				    pmap_load(l2) & ~ATTR_MASK);
4471 				mpte->ref_count++;
4472 			}
4473 			goto havel3;
4474 		}
4475 		/* We need to allocate an L3 table. */
4476 	}
4477 	if (!ADDR_IS_KERNEL(va)) {
4478 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
4479 
4480 		/*
4481 		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
4482 		 * to handle the possibility that a superpage mapping for "va"
4483 		 * was created while we slept.
4484 		 */
4485 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
4486 		    nosleep ? NULL : &lock);
4487 		if (mpte == NULL && nosleep) {
4488 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
4489 			rv = KERN_RESOURCE_SHORTAGE;
4490 			goto out;
4491 		}
4492 		goto retry;
4493 	} else
4494 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
4495 
4496 havel3:
4497 	orig_l3 = pmap_load(l3);
4498 	opa = orig_l3 & ~ATTR_MASK;
4499 	pv = NULL;
4500 
4501 	/*
4502 	 * Is the specified virtual address already mapped?
4503 	 */
4504 	if (pmap_l3_valid(orig_l3)) {
4505 		/*
4506 		 * Wiring change, just update stats. We don't worry about
4507 		 * wiring PT pages as they remain resident as long as there
4508 		 * are valid mappings in them. Hence, if a user page is wired,
4509 		 * the PT page will be also.
4510 		 */
4511 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
4512 		    (orig_l3 & ATTR_SW_WIRED) == 0)
4513 			pmap->pm_stats.wired_count++;
4514 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
4515 		    (orig_l3 & ATTR_SW_WIRED) != 0)
4516 			pmap->pm_stats.wired_count--;
4517 
4518 		/*
4519 		 * Remove the extra PT page reference.
4520 		 */
4521 		if (mpte != NULL) {
4522 			mpte->ref_count--;
4523 			KASSERT(mpte->ref_count > 0,
4524 			    ("pmap_enter: missing reference to page table page,"
4525 			     " va: 0x%lx", va));
4526 		}
4527 
4528 		/*
4529 		 * Has the physical page changed?
4530 		 */
4531 		if (opa == pa) {
4532 			/*
4533 			 * No, might be a protection or wiring change.
4534 			 */
4535 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
4536 			    (new_l3 & ATTR_SW_DBM) != 0)
4537 				vm_page_aflag_set(m, PGA_WRITEABLE);
4538 			goto validate;
4539 		}
4540 
4541 		/*
4542 		 * The physical page has changed.  Temporarily invalidate
4543 		 * the mapping.
4544 		 */
4545 		orig_l3 = pmap_load_clear(l3);
4546 		KASSERT((orig_l3 & ~ATTR_MASK) == opa,
4547 		    ("pmap_enter: unexpected pa update for %#lx", va));
4548 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
4549 			om = PHYS_TO_VM_PAGE(opa);
4550 
4551 			/*
4552 			 * The pmap lock is sufficient to synchronize with
4553 			 * concurrent calls to pmap_page_test_mappings() and
4554 			 * pmap_ts_referenced().
4555 			 */
4556 			if (pmap_pte_dirty(pmap, orig_l3))
4557 				vm_page_dirty(om);
4558 			if ((orig_l3 & ATTR_AF) != 0) {
4559 				pmap_invalidate_page(pmap, va, true);
4560 				vm_page_aflag_set(om, PGA_REFERENCED);
4561 			}
4562 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
4563 			pv = pmap_pvh_remove(&om->md, pmap, va);
4564 			if ((m->oflags & VPO_UNMANAGED) != 0)
4565 				free_pv_entry(pmap, pv);
4566 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
4567 			    TAILQ_EMPTY(&om->md.pv_list) &&
4568 			    ((om->flags & PG_FICTITIOUS) != 0 ||
4569 			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
4570 				vm_page_aflag_clear(om, PGA_WRITEABLE);
4571 		} else {
4572 			KASSERT((orig_l3 & ATTR_AF) != 0,
4573 			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
4574 			pmap_invalidate_page(pmap, va, true);
4575 		}
4576 		orig_l3 = 0;
4577 	} else {
4578 		/*
4579 		 * Increment the counters.
4580 		 */
4581 		if ((new_l3 & ATTR_SW_WIRED) != 0)
4582 			pmap->pm_stats.wired_count++;
4583 		pmap_resident_count_inc(pmap, 1);
4584 	}
4585 	/*
4586 	 * Enter on the PV list if part of our managed memory.
4587 	 */
4588 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4589 		if (pv == NULL) {
4590 			pv = get_pv_entry(pmap, &lock);
4591 			pv->pv_va = va;
4592 		}
4593 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4594 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4595 		m->md.pv_gen++;
4596 		if ((new_l3 & ATTR_SW_DBM) != 0)
4597 			vm_page_aflag_set(m, PGA_WRITEABLE);
4598 	}
4599 
4600 validate:
4601 	if (pmap->pm_stage == PM_STAGE1) {
4602 		/*
4603 		 * Sync the icache if exec permission and the attribute
4604 		 * VM_MEMATTR_WRITE_BACK are set. Do it now, before the mapping
4605 		 * is stored and made valid for the hardware table walk. If done
4606 		 * later, others could access this page before the caches are
4607 		 * properly synced. Don't do it for kernel memory, which is
4608 		 * mapped with exec permission even if the memory isn't going
4609 		 * to hold executable code. The only time an icache sync is
4610 		 * needed is after a kernel module is loaded and its relocation
4611 		 * info is processed, and that is done in elf_cpu_load_file().
4612 		 */
4613 		if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
4614 		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
4615 		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
4616 			PMAP_ASSERT_STAGE1(pmap);
4617 			cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4618 		}
4619 	} else {
4620 		cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4621 	}
4622 
4623 	/*
4624 	 * Update the L3 entry
4625 	 */
4626 	if (pmap_l3_valid(orig_l3)) {
4627 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
4628 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
4629 			/* same PA, different attributes */
4630 			orig_l3 = pmap_load_store(l3, new_l3);
4631 			pmap_invalidate_page(pmap, va, true);
4632 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
4633 			    pmap_pte_dirty(pmap, orig_l3))
4634 				vm_page_dirty(m);
4635 		} else {
4636 			/*
4637 			 * orig_l3 == new_l3
4638 			 * This can happen if multiple threads simultaneously
4639 			 * access a not yet mapped page. This is bad for
4640 			 * performance since it can cause a full
4641 			 * demotion-NOP-promotion cycle.
4642 			 * Other possible reasons are:
4643 			 * - the VM and pmap memory layouts have diverged
4644 			 * - a TLB flush is missing somewhere and the CPU doesn't
4645 			 *   see the actual mapping.
4646 			 */
4647 			CTR4(KTR_PMAP, "%s: already mapped page - "
4648 			    "pmap %p va %#lx pte 0x%lx",
4649 			    __func__, pmap, va, new_l3);
4650 		}
4651 	} else {
4652 		/* New mapping */
4653 		pmap_store(l3, new_l3);
4654 		dsb(ishst);
4655 	}
4656 
4657 #if VM_NRESERVLEVEL > 0
4658 	/*
4659 	 * Try to promote from level 3 pages to a level 2 superpage. This
4660 	 * currently only works on stage 1 pmaps as pmap_promote_l2 looks at
4661 	 * stage 1 specific fields and performs a break-before-make sequence
4662 	 * that is incorrect a stage 2 pmap.
4663 	 */
4664 	if ((mpte == NULL || mpte->ref_count == NL3PG) &&
4665 	    pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 &&
4666 	    (m->flags & PG_FICTITIOUS) == 0 &&
4667 	    vm_reserv_level_iffullpop(m) == 0) {
4668 		pmap_promote_l2(pmap, pde, va, mpte, &lock);
4669 	}
4670 #endif
4671 
4672 	rv = KERN_SUCCESS;
4673 out:
4674 	if (lock != NULL)
4675 		rw_wunlock(lock);
4676 	PMAP_UNLOCK(pmap);
4677 	return (rv);
4678 }
4679 
4680 /*
4681  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns
4682  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
4683  * value.  See pmap_enter_l2() for the possible error values when "no sleep",
4684  * "no replace", and "no reclaim" are specified.
4685  */
4686 static int
4687 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4688     struct rwlock **lockp)
4689 {
4690 	pd_entry_t new_l2;
4691 
4692 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4693 	PMAP_ASSERT_STAGE1(pmap);
4694 	KASSERT(ADDR_IS_CANONICAL(va),
4695 	    ("%s: Address not in canonical form: %lx", __func__, va));
4696 
4697 	new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
4698 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
4699 	    L2_BLOCK);
4700 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4701 		new_l2 |= ATTR_SW_MANAGED;
4702 		new_l2 &= ~ATTR_AF;
4703 	}
4704 	if ((prot & VM_PROT_EXECUTE) == 0 ||
4705 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
4706 		new_l2 |= ATTR_S1_XN;
4707 	if (!ADDR_IS_KERNEL(va))
4708 		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4709 	else
4710 		new_l2 |= ATTR_S1_UXN;
4711 	if (pmap != kernel_pmap)
4712 		new_l2 |= ATTR_S1_nG;
4713 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
4714 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
4715 }
4716 
4717 /*
4718  * Returns true if every page table entry in the specified page table is
4719  * zero.
4720  */
4721 static bool
4722 pmap_every_pte_zero(vm_paddr_t pa)
4723 {
4724 	pt_entry_t *pt_end, *pte;
4725 
4726 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
4727 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
4728 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
4729 		if (*pte != 0)
4730 			return (false);
4731 	}
4732 	return (true);
4733 }
4734 
4735 /*
4736  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
4737  * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
4738  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
4739  * PMAP_ENTER_NOREPLACE was specified and a 4KB page mapping already exists
4740  * within the 2MB virtual address range starting at the specified virtual
4741  * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
4742  * 2MB page mapping already exists at the specified virtual address.  Returns
4743  * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
4744  * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
4745  * and a PV entry allocation failed.
4746  */
4747 static int
4748 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
4749     vm_page_t m, struct rwlock **lockp)
4750 {
4751 	struct spglist free;
4752 	pd_entry_t *l2, old_l2;
4753 	vm_page_t l2pg, mt;
4754 
4755 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4756 	KASSERT(ADDR_IS_CANONICAL(va),
4757 	    ("%s: Address not in canonical form: %lx", __func__, va));
4758 
4759 	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
4760 	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
4761 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
4762 		    va, pmap);
4763 		return (KERN_RESOURCE_SHORTAGE);
4764 	}
4765 
4766 	/*
4767 	 * If there are existing mappings, either abort or remove them.
4768 	 */
4769 	if ((old_l2 = pmap_load(l2)) != 0) {
4770 		KASSERT(l2pg == NULL || l2pg->ref_count > 1,
4771 		    ("pmap_enter_l2: l2pg's ref count is too low"));
4772 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
4773 			if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
4774 				if (l2pg != NULL)
4775 					l2pg->ref_count--;
4776 				CTR2(KTR_PMAP,
4777 				    "pmap_enter_l2: no space for va %#lx"
4778 				    " in pmap %p", va, pmap);
4779 				return (KERN_NO_SPACE);
4780 			} else if (!ADDR_IS_KERNEL(va) ||
4781 			    !pmap_every_pte_zero(old_l2 & ~ATTR_MASK)) {
4782 				if (l2pg != NULL)
4783 					l2pg->ref_count--;
4784 				CTR2(KTR_PMAP,
4785 				    "pmap_enter_l2: failure for va %#lx"
4786 				    " in pmap %p", va, pmap);
4787 				return (KERN_FAILURE);
4788 			}
4789 		}
4790 		SLIST_INIT(&free);
4791 		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
4792 			(void)pmap_remove_l2(pmap, l2, va,
4793 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
4794 		else
4795 			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
4796 			    &free, lockp);
4797 		if (!ADDR_IS_KERNEL(va)) {
4798 			vm_page_free_pages_toq(&free, true);
4799 			KASSERT(pmap_load(l2) == 0,
4800 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
4801 		} else {
4802 			KASSERT(SLIST_EMPTY(&free),
4803 			    ("pmap_enter_l2: freed kernel page table page"));
4804 
4805 			/*
4806 			 * Both pmap_remove_l2() and pmap_remove_l3_range()
4807 			 * will leave the kernel page table page zero filled.
4808 			 * Nonetheless, the TLB could have an intermediate
4809 			 * entry for the kernel page table page, so request
4810 			 * an invalidation at all levels after clearing
4811 			 * the L2_TABLE entry.
4812 			 */
4813 			mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
4814 			if (pmap_insert_pt_page(pmap, mt, false))
4815 				panic("pmap_enter_l2: trie insert failed");
4816 			pmap_clear(l2);
4817 			pmap_s1_invalidate_page(pmap, va, false);
4818 		}
4819 	}
4820 
4821 	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
4822 		/*
4823 		 * Abort this mapping if its PV entry could not be created.
4824 		 */
4825 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
4826 			if (l2pg != NULL)
4827 				pmap_abort_ptp(pmap, va, l2pg);
4828 			CTR2(KTR_PMAP,
4829 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
4830 			    va, pmap);
4831 			return (KERN_RESOURCE_SHORTAGE);
4832 		}
4833 		if ((new_l2 & ATTR_SW_DBM) != 0)
4834 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4835 				vm_page_aflag_set(mt, PGA_WRITEABLE);
4836 	}
4837 
4838 	/*
4839 	 * Increment counters.
4840 	 */
4841 	if ((new_l2 & ATTR_SW_WIRED) != 0)
4842 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
4843 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
4844 
4845 	/*
4846 	 * Conditionally sync the icache.  See pmap_enter() for details.
4847 	 */
4848 	if ((new_l2 & ATTR_S1_XN) == 0 && ((new_l2 & ~ATTR_MASK) !=
4849 	    (old_l2 & ~ATTR_MASK) || (old_l2 & ATTR_S1_XN) != 0) &&
4850 	    pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
4851 		cpu_icache_sync_range(PHYS_TO_DMAP(new_l2 & ~ATTR_MASK),
4852 		    L2_SIZE);
4853 	}
4854 
4855 	/*
4856 	 * Map the superpage.
4857 	 */
4858 	pmap_store(l2, new_l2);
4859 	dsb(ishst);
4860 
4861 	atomic_add_long(&pmap_l2_mappings, 1);
4862 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
4863 	    va, pmap);
4864 
4865 	return (KERN_SUCCESS);
4866 }
4867 
4868 /*
4869  * Maps a sequence of resident pages belonging to the same object.
4870  * The sequence begins with the given page m_start.  This page is
4871  * mapped at the given virtual address start.  Each subsequent page is
4872  * mapped at a virtual address that is offset from start by the same
4873  * amount as the page is offset from m_start within the object.  The
4874  * last page in the sequence is the page with the largest offset from
4875  * m_start that can be mapped at a virtual address less than the given
4876  * virtual address end.  Not every virtual page between start and end
4877  * is mapped; only those for which a resident page exists with the
4878  * corresponding offset from m_start are mapped.
4879  */
4880 void
4881 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4882     vm_page_t m_start, vm_prot_t prot)
4883 {
4884 	struct rwlock *lock;
4885 	vm_offset_t va;
4886 	vm_page_t m, mpte;
4887 	vm_pindex_t diff, psize;
4888 	int rv;
4889 
4890 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4891 
4892 	psize = atop(end - start);
4893 	mpte = NULL;
4894 	m = m_start;
4895 	lock = NULL;
4896 	PMAP_LOCK(pmap);
4897 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4898 		va = start + ptoa(diff);
4899 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
4900 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4901 		    ((rv = pmap_enter_2mpage(pmap, va, m, prot, &lock)) ==
4902 		    KERN_SUCCESS || rv == KERN_NO_SPACE))
4903 			m = &m[L2_SIZE / PAGE_SIZE - 1];
4904 		else
4905 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
4906 			    &lock);
4907 		m = TAILQ_NEXT(m, listq);
4908 	}
4909 	if (lock != NULL)
4910 		rw_wunlock(lock);
4911 	PMAP_UNLOCK(pmap);
4912 }
4913 
4914 /*
4915  * this code makes some *MAJOR* assumptions:
4916  * 1. Current pmap & pmap exists.
4917  * 2. Not wired.
4918  * 3. Read access.
4919  * 4. No page table pages.
4920  * but is *MUCH* faster than pmap_enter...
4921  */
4922 
4923 void
4924 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4925 {
4926 	struct rwlock *lock;
4927 
4928 	lock = NULL;
4929 	PMAP_LOCK(pmap);
4930 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4931 	if (lock != NULL)
4932 		rw_wunlock(lock);
4933 	PMAP_UNLOCK(pmap);
4934 }
4935 
4936 static vm_page_t
4937 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4938     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4939 {
4940 	pd_entry_t *pde;
4941 	pt_entry_t *l1, *l2, *l3, l3_val;
4942 	vm_paddr_t pa;
4943 	int lvl;
4944 
4945 	KASSERT(!VA_IS_CLEANMAP(va) ||
4946 	    (m->oflags & VPO_UNMANAGED) != 0,
4947 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4948 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4949 	PMAP_ASSERT_STAGE1(pmap);
4950 	KASSERT(ADDR_IS_CANONICAL(va),
4951 	    ("%s: Address not in canonical form: %lx", __func__, va));
4952 
4953 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
4954 	/*
4955 	 * In the case that a page table page is not
4956 	 * resident, we are creating it here.
4957 	 */
4958 	if (!ADDR_IS_KERNEL(va)) {
4959 		vm_pindex_t l2pindex;
4960 
4961 		/*
4962 		 * Calculate pagetable page index
4963 		 */
4964 		l2pindex = pmap_l2_pindex(va);
4965 		if (mpte && (mpte->pindex == l2pindex)) {
4966 			mpte->ref_count++;
4967 		} else {
4968 			/*
4969 			 * If the page table page is mapped, we just increment
4970 			 * the hold count, and activate it.  Otherwise, we
4971 			 * attempt to allocate a page table page, passing NULL
4972 			 * instead of the PV list lock pointer because we don't
4973 			 * intend to sleep.  If this attempt fails, we don't
4974 			 * retry.  Instead, we give up.
4975 			 */
4976 			l1 = pmap_l1(pmap, va);
4977 			if (l1 != NULL && pmap_load(l1) != 0) {
4978 				if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
4979 				    L1_BLOCK)
4980 					return (NULL);
4981 				l2 = pmap_l1_to_l2(l1, va);
4982 				if (pmap_load(l2) != 0) {
4983 					if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
4984 					    L2_BLOCK)
4985 						return (NULL);
4986 					mpte = PHYS_TO_VM_PAGE(pmap_load(l2) &
4987 					    ~ATTR_MASK);
4988 					mpte->ref_count++;
4989 				} else {
4990 					mpte = _pmap_alloc_l3(pmap, l2pindex,
4991 					    NULL);
4992 					if (mpte == NULL)
4993 						return (mpte);
4994 				}
4995 			} else {
4996 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
4997 				if (mpte == NULL)
4998 					return (mpte);
4999 			}
5000 		}
5001 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
5002 		l3 = &l3[pmap_l3_index(va)];
5003 	} else {
5004 		mpte = NULL;
5005 		pde = pmap_pde(kernel_pmap, va, &lvl);
5006 		KASSERT(pde != NULL,
5007 		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
5008 		     va));
5009 		KASSERT(lvl == 2,
5010 		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
5011 		l3 = pmap_l2_to_l3(pde, va);
5012 	}
5013 
5014 	/*
5015 	 * Abort if a mapping already exists.
5016 	 */
5017 	if (pmap_load(l3) != 0) {
5018 		if (mpte != NULL)
5019 			mpte->ref_count--;
5020 		return (NULL);
5021 	}
5022 
5023 	/*
5024 	 * Enter on the PV list if part of our managed memory.
5025 	 */
5026 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
5027 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
5028 		if (mpte != NULL)
5029 			pmap_abort_ptp(pmap, va, mpte);
5030 		return (NULL);
5031 	}
5032 
5033 	/*
5034 	 * Increment counters
5035 	 */
5036 	pmap_resident_count_inc(pmap, 1);
5037 
5038 	pa = VM_PAGE_TO_PHYS(m);
5039 	l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
5040 	    ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
5041 	if ((prot & VM_PROT_EXECUTE) == 0 ||
5042 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5043 		l3_val |= ATTR_S1_XN;
5044 	if (!ADDR_IS_KERNEL(va))
5045 		l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5046 	else
5047 		l3_val |= ATTR_S1_UXN;
5048 	if (pmap != kernel_pmap)
5049 		l3_val |= ATTR_S1_nG;
5050 
5051 	/*
5052 	 * Now validate mapping with RO protection
5053 	 */
5054 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5055 		l3_val |= ATTR_SW_MANAGED;
5056 		l3_val &= ~ATTR_AF;
5057 	}
5058 
5059 	/* Sync icache before the mapping is stored to PTE */
5060 	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5061 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5062 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
5063 
5064 	pmap_store(l3, l3_val);
5065 	dsb(ishst);
5066 
5067 	return (mpte);
5068 }
5069 
5070 /*
5071  * This code maps large physical mmap regions into the
5072  * processor address space.  Note that some shortcuts
5073  * are taken, but the code works.
5074  */
5075 void
5076 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
5077     vm_pindex_t pindex, vm_size_t size)
5078 {
5079 
5080 	VM_OBJECT_ASSERT_WLOCKED(object);
5081 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
5082 	    ("pmap_object_init_pt: non-device object"));
5083 }
5084 
5085 /*
5086  *	Clear the wired attribute from the mappings for the specified range of
5087  *	addresses in the given pmap.  Every valid mapping within that range
5088  *	must have the wired attribute set.  In contrast, invalid mappings
5089  *	cannot have the wired attribute set, so they are ignored.
5090  *
5091  *	The wired attribute of the page table entry is not a hardware feature,
5092  *	so there is no need to invalidate any TLB entries.
5093  */
5094 void
5095 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
5096 {
5097 	vm_offset_t va_next;
5098 	pd_entry_t *l0, *l1, *l2;
5099 	pt_entry_t *l3;
5100 
5101 	PMAP_LOCK(pmap);
5102 	for (; sva < eva; sva = va_next) {
5103 		l0 = pmap_l0(pmap, sva);
5104 		if (pmap_load(l0) == 0) {
5105 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
5106 			if (va_next < sva)
5107 				va_next = eva;
5108 			continue;
5109 		}
5110 
5111 		l1 = pmap_l0_to_l1(l0, sva);
5112 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
5113 		if (va_next < sva)
5114 			va_next = eva;
5115 		if (pmap_load(l1) == 0)
5116 			continue;
5117 
5118 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
5119 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5120 			KASSERT(va_next <= eva,
5121 			    ("partial update of non-transparent 1G page "
5122 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
5123 			    pmap_load(l1), sva, eva, va_next));
5124 			MPASS(pmap != kernel_pmap);
5125 			MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
5126 			    ATTR_SW_WIRED)) == ATTR_SW_WIRED);
5127 			pmap_clear_bits(l1, ATTR_SW_WIRED);
5128 			pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
5129 			continue;
5130 		}
5131 
5132 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
5133 		if (va_next < sva)
5134 			va_next = eva;
5135 
5136 		l2 = pmap_l1_to_l2(l1, sva);
5137 		if (pmap_load(l2) == 0)
5138 			continue;
5139 
5140 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
5141 			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
5142 				panic("pmap_unwire: l2 %#jx is missing "
5143 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
5144 
5145 			/*
5146 			 * Are we unwiring the entire large page?  If not,
5147 			 * demote the mapping and fall through.
5148 			 */
5149 			if (sva + L2_SIZE == va_next && eva >= va_next) {
5150 				pmap_clear_bits(l2, ATTR_SW_WIRED);
5151 				pmap->pm_stats.wired_count -= L2_SIZE /
5152 				    PAGE_SIZE;
5153 				continue;
5154 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
5155 				panic("pmap_unwire: demotion failed");
5156 		}
5157 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
5158 		    ("pmap_unwire: Invalid l2 entry after demotion"));
5159 
5160 		if (va_next > eva)
5161 			va_next = eva;
5162 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
5163 		    sva += L3_SIZE) {
5164 			if (pmap_load(l3) == 0)
5165 				continue;
5166 			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
5167 				panic("pmap_unwire: l3 %#jx is missing "
5168 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
5169 
5170 			/*
5171 			 * ATTR_SW_WIRED must be cleared atomically.  Although
5172 			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
5173 			 * the System MMU may write to the entry concurrently.
5174 			 */
5175 			pmap_clear_bits(l3, ATTR_SW_WIRED);
5176 			pmap->pm_stats.wired_count--;
5177 		}
5178 	}
5179 	PMAP_UNLOCK(pmap);
5180 }
5181 
5182 /*
5183  *	Copy the range specified by src_addr/len
5184  *	from the source map to the range dst_addr/len
5185  *	in the destination map.
5186  *
5187  *	This routine is only advisory and need not do anything.
5188  *
5189  *	Because the executable mappings created by this routine are copies of
5190  *	existing mappings, it should not have to flush the instruction cache.
5191  */
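/*
 *	As a sketch (the actual call site is in the MI VM code, not in this
 *	file), vmspace_fork() reaches this while copying a map entry, roughly:
 *
 *		pmap_copy(dst_map->pmap, src_map->pmap, dst_entry->start,
 *		    dst_entry->end - dst_entry->start, src_entry->start);
 *
 *	Note the early return below: the copy is skipped entirely unless
 *	dst_addr == src_addr.
 */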
5192 void
5193 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
5194     vm_offset_t src_addr)
5195 {
5196 	struct rwlock *lock;
5197 	pd_entry_t *l0, *l1, *l2, srcptepaddr;
5198 	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
5199 	vm_offset_t addr, end_addr, va_next;
5200 	vm_page_t dst_m, dstmpte, srcmpte;
5201 
5202 	PMAP_ASSERT_STAGE1(dst_pmap);
5203 	PMAP_ASSERT_STAGE1(src_pmap);
5204 
5205 	if (dst_addr != src_addr)
5206 		return;
5207 	end_addr = src_addr + len;
5208 	lock = NULL;
5209 	if (dst_pmap < src_pmap) {
5210 		PMAP_LOCK(dst_pmap);
5211 		PMAP_LOCK(src_pmap);
5212 	} else {
5213 		PMAP_LOCK(src_pmap);
5214 		PMAP_LOCK(dst_pmap);
5215 	}
5216 	for (addr = src_addr; addr < end_addr; addr = va_next) {
5217 		l0 = pmap_l0(src_pmap, addr);
5218 		if (pmap_load(l0) == 0) {
5219 			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
5220 			if (va_next < addr)
5221 				va_next = end_addr;
5222 			continue;
5223 		}
5224 
5225 		va_next = (addr + L1_SIZE) & ~L1_OFFSET;
5226 		if (va_next < addr)
5227 			va_next = end_addr;
5228 		l1 = pmap_l0_to_l1(l0, addr);
5229 		if (pmap_load(l1) == 0)
5230 			continue;
5231 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
5232 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5233 			KASSERT(va_next <= end_addr,
5234 			    ("partial update of non-transparent 1G page "
5235 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
5236 			    pmap_load(l1), addr, end_addr, va_next));
5237 			srcptepaddr = pmap_load(l1);
5238 			l1 = pmap_l1(dst_pmap, addr);
5239 			if (l1 == NULL) {
5240 				if (_pmap_alloc_l3(dst_pmap,
5241 				    pmap_l0_pindex(addr), NULL) == NULL)
5242 					break;
5243 				l1 = pmap_l1(dst_pmap, addr);
5244 			} else {
5245 				l0 = pmap_l0(dst_pmap, addr);
5246 				dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) &
5247 				    ~ATTR_MASK);
5248 				dst_m->ref_count++;
5249 			}
5250 			KASSERT(pmap_load(l1) == 0,
5251 			    ("1G mapping present in dst pmap "
5252 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
5253 			    pmap_load(l1), addr, end_addr, va_next));
5254 			pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
5255 			pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
5256 			continue;
5257 		}
5258 
5259 		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
5260 		if (va_next < addr)
5261 			va_next = end_addr;
5262 		l2 = pmap_l1_to_l2(l1, addr);
5263 		srcptepaddr = pmap_load(l2);
5264 		if (srcptepaddr == 0)
5265 			continue;
5266 		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
5267 			/*
5268 			 * We can only virtual copy whole superpages.
5269 			 */
5270 			if ((addr & L2_OFFSET) != 0 ||
5271 			    addr + L2_SIZE > end_addr)
5272 				continue;
5273 			l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
5274 			if (l2 == NULL)
5275 				break;
5276 			if (pmap_load(l2) == 0 &&
5277 			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
5278 			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
5279 			    PMAP_ENTER_NORECLAIM, &lock))) {
5280 				/*
5281 				 * We leave the dirty bit unchanged because
5282 				 * managed read/write superpage mappings are
5283 				 * required to be dirty.  However, managed
5284 				 * superpage mappings are not required to
5285 				 * have their accessed bit set, so we clear
5286 				 * it because we don't know if this mapping
5287 				 * will be used.
5288 				 */
5289 				srcptepaddr &= ~ATTR_SW_WIRED;
5290 				if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
5291 					srcptepaddr &= ~ATTR_AF;
5292 				pmap_store(l2, srcptepaddr);
5293 				pmap_resident_count_inc(dst_pmap, L2_SIZE /
5294 				    PAGE_SIZE);
5295 				atomic_add_long(&pmap_l2_mappings, 1);
5296 			} else
5297 				pmap_abort_ptp(dst_pmap, addr, dst_m);
5298 			continue;
5299 		}
5300 		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
5301 		    ("pmap_copy: invalid L2 entry"));
5302 		srcptepaddr &= ~ATTR_MASK;
5303 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
5304 		KASSERT(srcmpte->ref_count > 0,
5305 		    ("pmap_copy: source page table page is unused"));
5306 		if (va_next > end_addr)
5307 			va_next = end_addr;
5308 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
5309 		src_pte = &src_pte[pmap_l3_index(addr)];
5310 		dstmpte = NULL;
5311 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
5312 			ptetemp = pmap_load(src_pte);
5313 
5314 			/*
5315 			 * We only virtual copy managed pages.
5316 			 */
5317 			if ((ptetemp & ATTR_SW_MANAGED) == 0)
5318 				continue;
5319 
5320 			if (dstmpte != NULL) {
5321 				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
5322 				    ("dstmpte pindex/addr mismatch"));
5323 				dstmpte->ref_count++;
5324 			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
5325 			    NULL)) == NULL)
5326 				goto out;
5327 			dst_pte = (pt_entry_t *)
5328 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
5329 			dst_pte = &dst_pte[pmap_l3_index(addr)];
5330 			if (pmap_load(dst_pte) == 0 &&
5331 			    pmap_try_insert_pv_entry(dst_pmap, addr,
5332 			    PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) {
5333 				/*
5334 				 * Clear the wired, modified, and accessed
5335 				 * (referenced) bits during the copy.
5336 				 */
5337 				mask = ATTR_AF | ATTR_SW_WIRED;
5338 				nbits = 0;
5339 				if ((ptetemp & ATTR_SW_DBM) != 0)
5340 					nbits |= ATTR_S1_AP_RW_BIT;
5341 				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
5342 				pmap_resident_count_inc(dst_pmap, 1);
5343 			} else {
5344 				pmap_abort_ptp(dst_pmap, addr, dstmpte);
5345 				goto out;
5346 			}
5347 			/* Have we copied all of the valid mappings? */
5348 			if (dstmpte->ref_count >= srcmpte->ref_count)
5349 				break;
5350 		}
5351 	}
5352 out:
5353 	/*
5354 	 * XXX This barrier may not be needed because the destination pmap is
5355 	 * not active.
5356 	 */
5357 	dsb(ishst);
5358 
5359 	if (lock != NULL)
5360 		rw_wunlock(lock);
5361 	PMAP_UNLOCK(src_pmap);
5362 	PMAP_UNLOCK(dst_pmap);
5363 }
5364 
5365 /*
5366  *	pmap_zero_page zeros the specified hardware page by accessing it
5367  *	through its permanent direct map address.
5368  */
5369 void
5370 pmap_zero_page(vm_page_t m)
5371 {
5372 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5373 
5374 	pagezero((void *)va);
5375 }
5376 
5377 /*
5378  *	pmap_zero_page_area zeros the specified portion of a hardware page
5379  *	by accessing it through the page's direct map address.
5380  *
5381  *	off and size may not cover an area beyond a single hardware page.
5382  */
5383 void
5384 pmap_zero_page_area(vm_page_t m, int off, int size)
5385 {
5386 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
5387 
5388 	if (off == 0 && size == PAGE_SIZE)
5389 		pagezero((void *)va);
5390 	else
5391 		bzero((char *)va + off, size);
5392 }
5393 
5394 /*
5395  *	pmap_copy_page copies the specified (machine independent)
5396  *	page by accessing both the source and the destination pages
5397  *	through their permanent direct map addresses, one machine
5398  *	dependent page at a time.
5399  */
5400 void
5401 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
5402 {
5403 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
5404 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
5405 
5406 	pagecopy((void *)src, (void *)dst);
5407 }
5408 
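/*
 * Unmapped buffer I/O is permitted: because physical memory is reachable
 * through the direct map (see the PHYS_IN_DMAP() checks below), routines such
 * as pmap_copy_pages() can access buffer pages without creating transient
 * kernel mappings.
 */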
5409 int unmapped_buf_allowed = 1;
5410 
5411 void
5412 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
5413     vm_offset_t b_offset, int xfersize)
5414 {
5415 	void *a_cp, *b_cp;
5416 	vm_page_t m_a, m_b;
5417 	vm_paddr_t p_a, p_b;
5418 	vm_offset_t a_pg_offset, b_pg_offset;
5419 	int cnt;
5420 
5421 	while (xfersize > 0) {
5422 		a_pg_offset = a_offset & PAGE_MASK;
5423 		m_a = ma[a_offset >> PAGE_SHIFT];
5424 		p_a = m_a->phys_addr;
5425 		b_pg_offset = b_offset & PAGE_MASK;
5426 		m_b = mb[b_offset >> PAGE_SHIFT];
5427 		p_b = m_b->phys_addr;
5428 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
5429 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
5430 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
5431 			panic("!DMAP a %lx", p_a);
5432 		} else {
5433 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
5434 		}
5435 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
5436 			panic("!DMAP b %lx", p_b);
5437 		} else {
5438 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
5439 		}
5440 		bcopy(a_cp, b_cp, cnt);
5441 		a_offset += cnt;
5442 		b_offset += cnt;
5443 		xfersize -= cnt;
5444 	}
5445 }
5446 
5447 vm_offset_t
5448 pmap_quick_enter_page(vm_page_t m)
5449 {
5450 
5451 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
5452 }
5453 
5454 void
5455 pmap_quick_remove_page(vm_offset_t addr)
5456 {
5457 }
5458 
5459 /*
5460  * Returns true if the pmap's pv is one of the first
5461  * 16 pvs linked to from this page.  This count may
5462  * be changed upwards or downwards in the future; it
5463  * is only necessary that true be returned for a small
5464  * subset of pmaps for proper page aging.
5465  */
5466 boolean_t
5467 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
5468 {
5469 	struct md_page *pvh;
5470 	struct rwlock *lock;
5471 	pv_entry_t pv;
5472 	int loops = 0;
5473 	boolean_t rv;
5474 
5475 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5476 	    ("pmap_page_exists_quick: page %p is not managed", m));
5477 	rv = FALSE;
5478 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5479 	rw_rlock(lock);
5480 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5481 		if (PV_PMAP(pv) == pmap) {
5482 			rv = TRUE;
5483 			break;
5484 		}
5485 		loops++;
5486 		if (loops >= 16)
5487 			break;
5488 	}
5489 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
5490 		pvh = page_to_pvh(m);
5491 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5492 			if (PV_PMAP(pv) == pmap) {
5493 				rv = TRUE;
5494 				break;
5495 			}
5496 			loops++;
5497 			if (loops >= 16)
5498 				break;
5499 		}
5500 	}
5501 	rw_runlock(lock);
5502 	return (rv);
5503 }
5504 
5505 /*
5506  *	pmap_page_wired_mappings:
5507  *
5508  *	Return the number of managed mappings to the given physical page
5509  *	that are wired.
5510  */
5511 int
5512 pmap_page_wired_mappings(vm_page_t m)
5513 {
5514 	struct rwlock *lock;
5515 	struct md_page *pvh;
5516 	pmap_t pmap;
5517 	pt_entry_t *pte;
5518 	pv_entry_t pv;
5519 	int count, md_gen, pvh_gen;
5520 
5521 	if ((m->oflags & VPO_UNMANAGED) != 0)
5522 		return (0);
5523 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5524 	rw_rlock(lock);
5525 restart:
5526 	count = 0;
5527 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5528 		pmap = PV_PMAP(pv);
5529 		if (!PMAP_TRYLOCK(pmap)) {
5530 			md_gen = m->md.pv_gen;
5531 			rw_runlock(lock);
5532 			PMAP_LOCK(pmap);
5533 			rw_rlock(lock);
5534 			if (md_gen != m->md.pv_gen) {
5535 				PMAP_UNLOCK(pmap);
5536 				goto restart;
5537 			}
5538 		}
5539 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5540 		if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
5541 			count++;
5542 		PMAP_UNLOCK(pmap);
5543 	}
5544 	if ((m->flags & PG_FICTITIOUS) == 0) {
5545 		pvh = page_to_pvh(m);
5546 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5547 			pmap = PV_PMAP(pv);
5548 			if (!PMAP_TRYLOCK(pmap)) {
5549 				md_gen = m->md.pv_gen;
5550 				pvh_gen = pvh->pv_gen;
5551 				rw_runlock(lock);
5552 				PMAP_LOCK(pmap);
5553 				rw_rlock(lock);
5554 				if (md_gen != m->md.pv_gen ||
5555 				    pvh_gen != pvh->pv_gen) {
5556 					PMAP_UNLOCK(pmap);
5557 					goto restart;
5558 				}
5559 			}
5560 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
5561 			if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
5562 				count++;
5563 			PMAP_UNLOCK(pmap);
5564 		}
5565 	}
5566 	rw_runlock(lock);
5567 	return (count);
5568 }
5569 
5570 /*
5571  * Returns true if the given page is mapped individually or as part of
5572  * a 2mpage.  Otherwise, returns false.
5573  */
5574 bool
5575 pmap_page_is_mapped(vm_page_t m)
5576 {
5577 	struct rwlock *lock;
5578 	bool rv;
5579 
5580 	if ((m->oflags & VPO_UNMANAGED) != 0)
5581 		return (false);
5582 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5583 	rw_rlock(lock);
5584 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
5585 	    ((m->flags & PG_FICTITIOUS) == 0 &&
5586 	    !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
5587 	rw_runlock(lock);
5588 	return (rv);
5589 }
5590 
5591 /*
5592  * Destroy all managed, non-wired mappings in the given user-space
5593  * pmap.  This pmap cannot be active on any processor besides the
5594  * caller.
5595  *
5596  * This function cannot be applied to the kernel pmap.  Moreover, it
5597  * is not intended for general use.  It is only to be used during
5598  * process termination.  Consequently, it can be implemented in ways
5599  * that make it faster than pmap_remove().  First, it can more quickly
5600  * destroy mappings by iterating over the pmap's collection of PV
5601  * entries, rather than searching the page table.  Second, it doesn't
5602  * have to test and clear the page table entries atomically, because
5603  * no processor is currently accessing the user address space.  In
5604  * particular, a page table entry's dirty bit won't change state once
5605  * this function starts.
5606  */
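/*
 * For illustration, the expected call site is the process exit path in the MI
 * VM code, roughly
 *
 *	pmap_remove_pages(vmspace_pmap(vm));
 *
 * at which point no other CPU can be running with this pmap active.
 */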
5607 void
5608 pmap_remove_pages(pmap_t pmap)
5609 {
5610 	pd_entry_t *pde;
5611 	pt_entry_t *pte, tpte;
5612 	struct spglist free;
5613 	struct pv_chunklist free_chunks[PMAP_MEMDOM];
5614 	vm_page_t m, ml3, mt;
5615 	pv_entry_t pv;
5616 	struct md_page *pvh;
5617 	struct pv_chunk *pc, *npc;
5618 	struct rwlock *lock;
5619 	int64_t bit;
5620 	uint64_t inuse, bitmask;
5621 	int allfree, field, i, idx, lvl;
5622 	int freed __pvused;
5623 	vm_paddr_t pa;
5624 
5625 	lock = NULL;
5626 
5627 	for (i = 0; i < PMAP_MEMDOM; i++)
5628 		TAILQ_INIT(&free_chunks[i]);
5629 	SLIST_INIT(&free);
5630 	PMAP_LOCK(pmap);
5631 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5632 		allfree = 1;
5633 		freed = 0;
5634 		for (field = 0; field < _NPCM; field++) {
5635 			inuse = ~pc->pc_map[field] & pc_freemask[field];
5636 			while (inuse != 0) {
5637 				bit = ffsl(inuse) - 1;
5638 				bitmask = 1UL << bit;
5639 				idx = field * 64 + bit;
5640 				pv = &pc->pc_pventry[idx];
5641 				inuse &= ~bitmask;
5642 
5643 				pde = pmap_pde(pmap, pv->pv_va, &lvl);
5644 				KASSERT(pde != NULL,
5645 				    ("Attempting to remove an unmapped page"));
5646 
5647 				switch (lvl) {
5648 				case 1:
5649 					pte = pmap_l1_to_l2(pde, pv->pv_va);
5650 					tpte = pmap_load(pte);
5651 					KASSERT((tpte & ATTR_DESCR_MASK) ==
5652 					    L2_BLOCK,
5653 					    ("Attempting to remove an invalid "
5654 					    "block: %lx", tpte));
5655 					break;
5656 				case 2:
5657 					pte = pmap_l2_to_l3(pde, pv->pv_va);
5658 					tpte = pmap_load(pte);
5659 					KASSERT((tpte & ATTR_DESCR_MASK) ==
5660 					    L3_PAGE,
5661 					    ("Attempting to remove an invalid "
5662 					     "page: %lx", tpte));
5663 					break;
5664 				default:
5665 					panic(
5666 					    "Invalid page directory level: %d",
5667 					    lvl);
5668 				}
5669 
5670 /*
5671  * We cannot remove wired pages from a process's mapping at this time.
5672  */
5673 				if (tpte & ATTR_SW_WIRED) {
5674 					allfree = 0;
5675 					continue;
5676 				}
5677 
5678 				/* Mark free */
5679 				pc->pc_map[field] |= bitmask;
5680 
5681 				/*
5682 				 * Because this pmap is not active on other
5683 				 * processors, the dirty bit cannot have
5684 				 * changed state since we last loaded pte.
5685 				 */
5686 				pmap_clear(pte);
5687 
5688 				pa = tpte & ~ATTR_MASK;
5689 
5690 				m = PHYS_TO_VM_PAGE(pa);
5691 				KASSERT(m->phys_addr == pa,
5692 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5693 				    m, (uintmax_t)m->phys_addr,
5694 				    (uintmax_t)tpte));
5695 
5696 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5697 				    m < &vm_page_array[vm_page_array_size],
5698 				    ("pmap_remove_pages: bad pte %#jx",
5699 				    (uintmax_t)tpte));
5700 
5701 				/*
5702 				 * Update the vm_page_t clean/reference bits.
5703 				 */
5704 				if (pmap_pte_dirty(pmap, tpte)) {
5705 					switch (lvl) {
5706 					case 1:
5707 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5708 							vm_page_dirty(mt);
5709 						break;
5710 					case 2:
5711 						vm_page_dirty(m);
5712 						break;
5713 					}
5714 				}
5715 
5716 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5717 
5718 				switch (lvl) {
5719 				case 1:
5720 					pmap_resident_count_dec(pmap,
5721 					    L2_SIZE / PAGE_SIZE);
5722 					pvh = page_to_pvh(m);
5723 					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
5724 					pvh->pv_gen++;
5725 					if (TAILQ_EMPTY(&pvh->pv_list)) {
5726 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5727 							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
5728 							    TAILQ_EMPTY(&mt->md.pv_list))
5729 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5730 					}
5731 					ml3 = pmap_remove_pt_page(pmap,
5732 					    pv->pv_va);
5733 					if (ml3 != NULL) {
5734 						KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
5735 						    ("pmap_remove_pages: l3 page not promoted"));
5736 						pmap_resident_count_dec(pmap,1);
5737 						KASSERT(ml3->ref_count == NL3PG,
5738 						    ("pmap_remove_pages: l3 page ref count error"));
5739 						ml3->ref_count = 0;
5740 						pmap_add_delayed_free_list(ml3,
5741 						    &free, FALSE);
5742 					}
5743 					break;
5744 				case 2:
5745 					pmap_resident_count_dec(pmap, 1);
5746 					TAILQ_REMOVE(&m->md.pv_list, pv,
5747 					    pv_next);
5748 					m->md.pv_gen++;
5749 					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
5750 					    TAILQ_EMPTY(&m->md.pv_list) &&
5751 					    (m->flags & PG_FICTITIOUS) == 0) {
5752 						pvh = page_to_pvh(m);
5753 						if (TAILQ_EMPTY(&pvh->pv_list))
5754 							vm_page_aflag_clear(m,
5755 							    PGA_WRITEABLE);
5756 					}
5757 					break;
5758 				}
5759 				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
5760 				    &free);
5761 				freed++;
5762 			}
5763 		}
5764 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5765 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5766 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5767 		if (allfree) {
5768 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5769 			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
5770 			    pc_list);
5771 		}
5772 	}
5773 	if (lock != NULL)
5774 		rw_wunlock(lock);
5775 	pmap_invalidate_all(pmap);
5776 	free_pv_chunk_batch(free_chunks);
5777 	PMAP_UNLOCK(pmap);
5778 	vm_page_free_pages_toq(&free, true);
5779 }
5780 
5781 /*
5782  * This is used to check if a page has been accessed or modified.
5783  */
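/*
 * The test is expressed as a (mask, value) pair applied to each PTE.  For
 * example, for a stage 1 L3 mapping, "modified" is encoded below as
 * mask = ATTR_S1_AP_RW_BIT, value = ATTR_S1_AP(ATTR_S1_AP_RW), i.e. the entry
 * counts as modified only while it is writable, and "accessed" additionally
 * requires ATTR_AF to be set in a valid L3_PAGE descriptor.
 */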
5784 static boolean_t
5785 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5786 {
5787 	struct rwlock *lock;
5788 	pv_entry_t pv;
5789 	struct md_page *pvh;
5790 	pt_entry_t *pte, mask, value;
5791 	pmap_t pmap;
5792 	int md_gen, pvh_gen;
5793 	boolean_t rv;
5794 
5795 	rv = FALSE;
5796 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5797 	rw_rlock(lock);
5798 restart:
5799 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5800 		pmap = PV_PMAP(pv);
5801 		PMAP_ASSERT_STAGE1(pmap);
5802 		if (!PMAP_TRYLOCK(pmap)) {
5803 			md_gen = m->md.pv_gen;
5804 			rw_runlock(lock);
5805 			PMAP_LOCK(pmap);
5806 			rw_rlock(lock);
5807 			if (md_gen != m->md.pv_gen) {
5808 				PMAP_UNLOCK(pmap);
5809 				goto restart;
5810 			}
5811 		}
5812 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5813 		mask = 0;
5814 		value = 0;
5815 		if (modified) {
5816 			mask |= ATTR_S1_AP_RW_BIT;
5817 			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5818 		}
5819 		if (accessed) {
5820 			mask |= ATTR_AF | ATTR_DESCR_MASK;
5821 			value |= ATTR_AF | L3_PAGE;
5822 		}
5823 		rv = (pmap_load(pte) & mask) == value;
5824 		PMAP_UNLOCK(pmap);
5825 		if (rv)
5826 			goto out;
5827 	}
5828 	if ((m->flags & PG_FICTITIOUS) == 0) {
5829 		pvh = page_to_pvh(m);
5830 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5831 			pmap = PV_PMAP(pv);
5832 			PMAP_ASSERT_STAGE1(pmap);
5833 			if (!PMAP_TRYLOCK(pmap)) {
5834 				md_gen = m->md.pv_gen;
5835 				pvh_gen = pvh->pv_gen;
5836 				rw_runlock(lock);
5837 				PMAP_LOCK(pmap);
5838 				rw_rlock(lock);
5839 				if (md_gen != m->md.pv_gen ||
5840 				    pvh_gen != pvh->pv_gen) {
5841 					PMAP_UNLOCK(pmap);
5842 					goto restart;
5843 				}
5844 			}
5845 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
5846 			mask = 0;
5847 			value = 0;
5848 			if (modified) {
5849 				mask |= ATTR_S1_AP_RW_BIT;
5850 				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5851 			}
5852 			if (accessed) {
5853 				mask |= ATTR_AF | ATTR_DESCR_MASK;
5854 				value |= ATTR_AF | L2_BLOCK;
5855 			}
5856 			rv = (pmap_load(pte) & mask) == value;
5857 			PMAP_UNLOCK(pmap);
5858 			if (rv)
5859 				goto out;
5860 		}
5861 	}
5862 out:
5863 	rw_runlock(lock);
5864 	return (rv);
5865 }
5866 
5867 /*
5868  *	pmap_is_modified:
5869  *
5870  *	Return whether or not the specified physical page was modified
5871  *	in any physical maps.
5872  */
5873 boolean_t
5874 pmap_is_modified(vm_page_t m)
5875 {
5876 
5877 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5878 	    ("pmap_is_modified: page %p is not managed", m));
5879 
5880 	/*
5881 	 * If the page is not busied then this check is racy.
5882 	 */
5883 	if (!pmap_page_is_write_mapped(m))
5884 		return (FALSE);
5885 	return (pmap_page_test_mappings(m, FALSE, TRUE));
5886 }
5887 
5888 /*
5889  *	pmap_is_prefaultable:
5890  *
5891  *	Return whether or not the specified virtual address is eligible
5892  *	for prefault.
5893  */
5894 boolean_t
5895 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5896 {
5897 	pd_entry_t *pde;
5898 	pt_entry_t *pte;
5899 	boolean_t rv;
5900 	int lvl;
5901 
5902 	/*
5903 	 * Return TRUE if and only if the L3 entry for the specified virtual
5904 	 * address is allocated but invalid.
5905 	 */
5906 	rv = FALSE;
5907 	PMAP_LOCK(pmap);
5908 	pde = pmap_pde(pmap, addr, &lvl);
5909 	if (pde != NULL && lvl == 2) {
5910 		pte = pmap_l2_to_l3(pde, addr);
5911 		rv = pmap_load(pte) == 0;
5912 	}
5913 	PMAP_UNLOCK(pmap);
5914 	return (rv);
5915 }
5916 
5917 /*
5918  *	pmap_is_referenced:
5919  *
5920  *	Return whether or not the specified physical page was referenced
5921  *	in any physical maps.
5922  */
5923 boolean_t
5924 pmap_is_referenced(vm_page_t m)
5925 {
5926 
5927 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5928 	    ("pmap_is_referenced: page %p is not managed", m));
5929 	return (pmap_page_test_mappings(m, TRUE, FALSE));
5930 }
5931 
5932 /*
5933  * Clear the write and modified bits in each of the given page's mappings.
5934  */
5935 void
5936 pmap_remove_write(vm_page_t m)
5937 {
5938 	struct md_page *pvh;
5939 	pmap_t pmap;
5940 	struct rwlock *lock;
5941 	pv_entry_t next_pv, pv;
5942 	pt_entry_t oldpte, *pte, set, clear, mask, val;
5943 	vm_offset_t va;
5944 	int md_gen, pvh_gen;
5945 
5946 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5947 	    ("pmap_remove_write: page %p is not managed", m));
5948 	vm_page_assert_busied(m);
5949 
5950 	if (!pmap_page_is_write_mapped(m))
5951 		return;
5952 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5953 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
5954 	rw_wlock(lock);
5955 retry:
5956 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5957 		pmap = PV_PMAP(pv);
5958 		PMAP_ASSERT_STAGE1(pmap);
5959 		if (!PMAP_TRYLOCK(pmap)) {
5960 			pvh_gen = pvh->pv_gen;
5961 			rw_wunlock(lock);
5962 			PMAP_LOCK(pmap);
5963 			rw_wlock(lock);
5964 			if (pvh_gen != pvh->pv_gen) {
5965 				PMAP_UNLOCK(pmap);
5966 				goto retry;
5967 			}
5968 		}
5969 		va = pv->pv_va;
5970 		pte = pmap_pte_exists(pmap, va, 2, __func__);
5971 		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
5972 			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
5973 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5974 		    ("inconsistent pv lock %p %p for page %p",
5975 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5976 		PMAP_UNLOCK(pmap);
5977 	}
5978 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5979 		pmap = PV_PMAP(pv);
5980 		if (!PMAP_TRYLOCK(pmap)) {
5981 			pvh_gen = pvh->pv_gen;
5982 			md_gen = m->md.pv_gen;
5983 			rw_wunlock(lock);
5984 			PMAP_LOCK(pmap);
5985 			rw_wlock(lock);
5986 			if (pvh_gen != pvh->pv_gen ||
5987 			    md_gen != m->md.pv_gen) {
5988 				PMAP_UNLOCK(pmap);
5989 				goto retry;
5990 			}
5991 		}
5992 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5993 		oldpte = pmap_load(pte);
5994 		if ((oldpte & ATTR_SW_DBM) != 0) {
5995 			if (pmap->pm_stage == PM_STAGE1) {
5996 				set = ATTR_S1_AP_RW_BIT;
5997 				clear = 0;
5998 				mask = ATTR_S1_AP_RW_BIT;
5999 				val = ATTR_S1_AP(ATTR_S1_AP_RW);
6000 			} else {
6001 				set = 0;
6002 				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
6003 				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
6004 				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
6005 			}
6006 			clear |= ATTR_SW_DBM;
6007 			while (!atomic_fcmpset_64(pte, &oldpte,
6008 			    (oldpte | set) & ~clear))
6009 				cpu_spinwait();
6010 
6011 			if ((oldpte & mask) == val)
6012 				vm_page_dirty(m);
6013 			pmap_invalidate_page(pmap, pv->pv_va, true);
6014 		}
6015 		PMAP_UNLOCK(pmap);
6016 	}
6017 	rw_wunlock(lock);
6018 	vm_page_aflag_clear(m, PGA_WRITEABLE);
6019 }
6020 
6021 /*
6022  *	pmap_ts_referenced:
6023  *
6024  *	Return a count of reference bits for a page, clearing those bits.
6025  *	It is not necessary for every reference bit to be cleared, but it
6026  *	is necessary that 0 only be returned when there are truly no
6027  *	reference bits set.
6028  *
6029  *	As an optimization, update the page's dirty field if a modified bit is
6030  *	found while counting reference bits.  This opportunistic update can be
6031  *	performed at low cost and can eliminate the need for some future calls
6032  *	to pmap_is_modified().  However, since this function stops after
6033  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
6034  *	dirty pages.  Those dirty pages will only be detected by a future call
6035  *	to pmap_is_modified().
6036  */
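/*
 *	A sketch of the expected use (the caller is the MI page daemon, not
 *	this file): the page scan ages a page with something like
 *
 *		act_delta = pmap_ts_referenced(m);
 *
 *	and treats a nonzero return as recent activity.
 */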
6037 int
6038 pmap_ts_referenced(vm_page_t m)
6039 {
6040 	struct md_page *pvh;
6041 	pv_entry_t pv, pvf;
6042 	pmap_t pmap;
6043 	struct rwlock *lock;
6044 	pt_entry_t *pte, tpte;
6045 	vm_offset_t va;
6046 	vm_paddr_t pa;
6047 	int cleared, md_gen, not_cleared, pvh_gen;
6048 	struct spglist free;
6049 
6050 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6051 	    ("pmap_ts_referenced: page %p is not managed", m));
6052 	SLIST_INIT(&free);
6053 	cleared = 0;
6054 	pa = VM_PAGE_TO_PHYS(m);
6055 	lock = PHYS_TO_PV_LIST_LOCK(pa);
6056 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
6057 	rw_wlock(lock);
6058 retry:
6059 	not_cleared = 0;
6060 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
6061 		goto small_mappings;
6062 	pv = pvf;
6063 	do {
6064 		if (pvf == NULL)
6065 			pvf = pv;
6066 		pmap = PV_PMAP(pv);
6067 		if (!PMAP_TRYLOCK(pmap)) {
6068 			pvh_gen = pvh->pv_gen;
6069 			rw_wunlock(lock);
6070 			PMAP_LOCK(pmap);
6071 			rw_wlock(lock);
6072 			if (pvh_gen != pvh->pv_gen) {
6073 				PMAP_UNLOCK(pmap);
6074 				goto retry;
6075 			}
6076 		}
6077 		va = pv->pv_va;
6078 		pte = pmap_pte_exists(pmap, va, 2, __func__);
6079 		tpte = pmap_load(pte);
6080 		if (pmap_pte_dirty(pmap, tpte)) {
6081 			/*
6082 			 * Although "tpte" is mapping a 2MB page, because
6083 			 * this function is called at a 4KB page granularity,
6084 			 * we only update the 4KB page under test.
6085 			 */
6086 			vm_page_dirty(m);
6087 		}
6088 		if ((tpte & ATTR_AF) != 0) {
6089 			/*
6090 			 * Since this reference bit is shared by 512 4KB pages,
6091 			 * it should not be cleared every time it is tested.
6092 			 * Apply a simple "hash" function on the physical page
6093 			 * number, the virtual superpage number, and the pmap
6094 			 * address to select one 4KB page out of the 512 on
6095 			 * which testing the reference bit will result in
6096 			 * clearing that reference bit.  This function is
6097 			 * designed to avoid the selection of the same 4KB page
6098 			 * for every 2MB page mapping.
6099 			 *
6100 			 * On demotion, a mapping that hasn't been referenced
6101 			 * is simply destroyed.  To avoid the possibility of a
6102 			 * subsequent page fault on a demoted wired mapping,
6103 			 * always leave its reference bit set.  Moreover,
6104 			 * since the superpage is wired, the current state of
6105 			 * its reference bit won't affect page replacement.
6106 			 */
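			/*
			 * With Ln_ENTRIES == 512, the low nine bits of the
			 * XOR below select one residue out of 512.  Because
			 * the 512 4KB pages backing a 2MB block are
			 * physically contiguous, exactly one of them
			 * satisfies the test for a given (va, pmap) pair,
			 * and which one it is varies across mappings.
			 */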
6107 			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
6108 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
6109 			    (tpte & ATTR_SW_WIRED) == 0) {
6110 				pmap_clear_bits(pte, ATTR_AF);
6111 				pmap_invalidate_page(pmap, va, true);
6112 				cleared++;
6113 			} else
6114 				not_cleared++;
6115 		}
6116 		PMAP_UNLOCK(pmap);
6117 		/* Rotate the PV list if it has more than one entry. */
6118 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
6119 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
6120 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
6121 			pvh->pv_gen++;
6122 		}
6123 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
6124 			goto out;
6125 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
6126 small_mappings:
6127 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
6128 		goto out;
6129 	pv = pvf;
6130 	do {
6131 		if (pvf == NULL)
6132 			pvf = pv;
6133 		pmap = PV_PMAP(pv);
6134 		if (!PMAP_TRYLOCK(pmap)) {
6135 			pvh_gen = pvh->pv_gen;
6136 			md_gen = m->md.pv_gen;
6137 			rw_wunlock(lock);
6138 			PMAP_LOCK(pmap);
6139 			rw_wlock(lock);
6140 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6141 				PMAP_UNLOCK(pmap);
6142 				goto retry;
6143 			}
6144 		}
6145 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6146 		tpte = pmap_load(pte);
6147 		if (pmap_pte_dirty(pmap, tpte))
6148 			vm_page_dirty(m);
6149 		if ((tpte & ATTR_AF) != 0) {
6150 			if ((tpte & ATTR_SW_WIRED) == 0) {
6151 				pmap_clear_bits(pte, ATTR_AF);
6152 				pmap_invalidate_page(pmap, pv->pv_va, true);
6153 				cleared++;
6154 			} else
6155 				not_cleared++;
6156 		}
6157 		PMAP_UNLOCK(pmap);
6158 		/* Rotate the PV list if it has more than one entry. */
6159 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
6160 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
6161 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
6162 			m->md.pv_gen++;
6163 		}
6164 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
6165 	    not_cleared < PMAP_TS_REFERENCED_MAX);
6166 out:
6167 	rw_wunlock(lock);
6168 	vm_page_free_pages_toq(&free, true);
6169 	return (cleared + not_cleared);
6170 }
6171 
6172 /*
6173  *	Apply the given advice to the specified range of addresses within the
6174  *	given pmap.  Depending on the advice, clear the referenced and/or
6175  *	modified flags in each mapping and set the mapped page's dirty field.
6176  */
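/*
 *	For illustration, the MI layer reaches this through madvise(2); a
 *	sketch of that call (made outside this file) is
 *
 *		pmap_advise(map->pmap, start, end, MADV_FREE);
 *
 *	Any advice other than MADV_DONTNEED or MADV_FREE is ignored by the
 *	early return below.
 */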
6177 void
6178 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
6179 {
6180 	struct rwlock *lock;
6181 	vm_offset_t va, va_next;
6182 	vm_page_t m;
6183 	pd_entry_t *l0, *l1, *l2, oldl2;
6184 	pt_entry_t *l3, oldl3;
6185 
6186 	PMAP_ASSERT_STAGE1(pmap);
6187 
6188 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
6189 		return;
6190 
6191 	PMAP_LOCK(pmap);
6192 	for (; sva < eva; sva = va_next) {
6193 		l0 = pmap_l0(pmap, sva);
6194 		if (pmap_load(l0) == 0) {
6195 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6196 			if (va_next < sva)
6197 				va_next = eva;
6198 			continue;
6199 		}
6200 
6201 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6202 		if (va_next < sva)
6203 			va_next = eva;
6204 		l1 = pmap_l0_to_l1(l0, sva);
6205 		if (pmap_load(l1) == 0)
6206 			continue;
6207 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6208 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6209 			continue;
6210 		}
6211 
6212 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6213 		if (va_next < sva)
6214 			va_next = eva;
6215 		l2 = pmap_l1_to_l2(l1, sva);
6216 		oldl2 = pmap_load(l2);
6217 		if (oldl2 == 0)
6218 			continue;
6219 		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
6220 			if ((oldl2 & ATTR_SW_MANAGED) == 0)
6221 				continue;
6222 			lock = NULL;
6223 			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
6224 				if (lock != NULL)
6225 					rw_wunlock(lock);
6226 
6227 				/*
6228 				 * The 2MB page mapping was destroyed.
6229 				 */
6230 				continue;
6231 			}
6232 
6233 			/*
6234 			 * Unless the page mappings are wired, remove the
6235 			 * mapping to a single page so that a subsequent
6236 			 * access may repromote.  Choosing the last page
6237 			 * within the address range [sva, min(va_next, eva))
6238 			 * generally results in more repromotions.  Since the
6239 			 * underlying page table page is fully populated, this
6240 			 * removal never frees a page table page.
6241 			 */
6242 			if ((oldl2 & ATTR_SW_WIRED) == 0) {
6243 				va = eva;
6244 				if (va > va_next)
6245 					va = va_next;
6246 				va -= PAGE_SIZE;
6247 				KASSERT(va >= sva,
6248 				    ("pmap_advise: no address gap"));
6249 				l3 = pmap_l2_to_l3(l2, va);
6250 				KASSERT(pmap_load(l3) != 0,
6251 				    ("pmap_advise: invalid PTE"));
6252 				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
6253 				    NULL, &lock);
6254 			}
6255 			if (lock != NULL)
6256 				rw_wunlock(lock);
6257 		}
6258 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6259 		    ("pmap_advise: invalid L2 entry after demotion"));
6260 		if (va_next > eva)
6261 			va_next = eva;
6262 		va = va_next;
6263 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
6264 		    sva += L3_SIZE) {
6265 			oldl3 = pmap_load(l3);
6266 			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
6267 			    (ATTR_SW_MANAGED | L3_PAGE))
6268 				goto maybe_invlrng;
6269 			else if (pmap_pte_dirty(pmap, oldl3)) {
6270 				if (advice == MADV_DONTNEED) {
6271 					/*
6272 					 * Future calls to pmap_is_modified()
6273 					 * can be avoided by making the page
6274 					 * dirty now.
6275 					 */
6276 					m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK);
6277 					vm_page_dirty(m);
6278 				}
6279 				while (!atomic_fcmpset_long(l3, &oldl3,
6280 				    (oldl3 & ~ATTR_AF) |
6281 				    ATTR_S1_AP(ATTR_S1_AP_RO)))
6282 					cpu_spinwait();
6283 			} else if ((oldl3 & ATTR_AF) != 0)
6284 				pmap_clear_bits(l3, ATTR_AF);
6285 			else
6286 				goto maybe_invlrng;
6287 			if (va == va_next)
6288 				va = sva;
6289 			continue;
6290 maybe_invlrng:
6291 			if (va != va_next) {
6292 				pmap_s1_invalidate_range(pmap, va, sva, true);
6293 				va = va_next;
6294 			}
6295 		}
6296 		if (va != va_next)
6297 			pmap_s1_invalidate_range(pmap, va, sva, true);
6298 	}
6299 	PMAP_UNLOCK(pmap);
6300 }
6301 
6302 /*
6303  *	Clear the modify bits on the specified physical page.
6304  */
6305 void
6306 pmap_clear_modify(vm_page_t m)
6307 {
6308 	struct md_page *pvh;
6309 	struct rwlock *lock;
6310 	pmap_t pmap;
6311 	pv_entry_t next_pv, pv;
6312 	pd_entry_t *l2, oldl2;
6313 	pt_entry_t *l3, oldl3;
6314 	vm_offset_t va;
6315 	int md_gen, pvh_gen;
6316 
6317 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6318 	    ("pmap_clear_modify: page %p is not managed", m));
6319 	vm_page_assert_busied(m);
6320 
6321 	if (!pmap_page_is_write_mapped(m))
6322 		return;
6323 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
6324 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6325 	rw_wlock(lock);
6326 restart:
6327 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
6328 		pmap = PV_PMAP(pv);
6329 		PMAP_ASSERT_STAGE1(pmap);
6330 		if (!PMAP_TRYLOCK(pmap)) {
6331 			pvh_gen = pvh->pv_gen;
6332 			rw_wunlock(lock);
6333 			PMAP_LOCK(pmap);
6334 			rw_wlock(lock);
6335 			if (pvh_gen != pvh->pv_gen) {
6336 				PMAP_UNLOCK(pmap);
6337 				goto restart;
6338 			}
6339 		}
6340 		va = pv->pv_va;
6341 		l2 = pmap_l2(pmap, va);
6342 		oldl2 = pmap_load(l2);
6343 		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
6344 		if ((oldl2 & ATTR_SW_DBM) != 0 &&
6345 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
6346 		    (oldl2 & ATTR_SW_WIRED) == 0) {
6347 			/*
6348 			 * Write protect the mapping to a single page so that
6349 			 * a subsequent write access may repromote.
6350 			 */
6351 			va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK);
6352 			l3 = pmap_l2_to_l3(l2, va);
6353 			oldl3 = pmap_load(l3);
6354 			while (!atomic_fcmpset_long(l3, &oldl3,
6355 			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
6356 				cpu_spinwait();
6357 			vm_page_dirty(m);
6358 			pmap_s1_invalidate_page(pmap, va, true);
6359 		}
6360 		PMAP_UNLOCK(pmap);
6361 	}
6362 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6363 		pmap = PV_PMAP(pv);
6364 		PMAP_ASSERT_STAGE1(pmap);
6365 		if (!PMAP_TRYLOCK(pmap)) {
6366 			md_gen = m->md.pv_gen;
6367 			pvh_gen = pvh->pv_gen;
6368 			rw_wunlock(lock);
6369 			PMAP_LOCK(pmap);
6370 			rw_wlock(lock);
6371 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
6372 				PMAP_UNLOCK(pmap);
6373 				goto restart;
6374 			}
6375 		}
6376 		l2 = pmap_l2(pmap, pv->pv_va);
6377 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
6378 		oldl3 = pmap_load(l3);
6379 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){
6380 			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
6381 			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
6382 		}
6383 		PMAP_UNLOCK(pmap);
6384 	}
6385 	rw_wunlock(lock);
6386 }
6387 
6388 void *
6389 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
6390 {
6391 	struct pmap_preinit_mapping *ppim;
6392 	vm_offset_t va, offset;
6393 	pd_entry_t *pde;
6394 	pt_entry_t *l2;
6395 	int i, lvl, l2_blocks, free_l2_count, start_idx;
6396 
6397 	if (!vm_initialized) {
6398 		/*
6399 		 * No L3 ptables yet, so map entire L2 blocks; the start VA is:
6400 		 * 	preinit_map_va + start_idx * L2_SIZE
6401 		 * There may be duplicate mappings (multiple VA -> same PA) but
6402 		 * ARM64 dcache is always PIPT so that's acceptable.
6403 		 */
6404 		if (size == 0)
6405 			return (NULL);
6406 
6407 		/* Calculate how many L2 blocks are needed for the mapping */
6408 		l2_blocks = (roundup2(pa + size, L2_SIZE) -
6409 		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
6410 
6411 		offset = pa & L2_OFFSET;
6412 
6413 		if (preinit_map_va == 0)
6414 			return (NULL);
6415 
6416 		/* Map 2MiB L2 blocks from reserved VA space */
6417 
6418 		free_l2_count = 0;
6419 		start_idx = -1;
6420 		/* Find enough free contiguous VA space */
6421 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6422 			ppim = pmap_preinit_mapping + i;
6423 			if (free_l2_count > 0 && ppim->pa != 0) {
6424 				/* Not enough space here */
6425 				free_l2_count = 0;
6426 				start_idx = -1;
6427 				continue;
6428 			}
6429 
6430 			if (ppim->pa == 0) {
6431 				/* Free L2 block */
6432 				if (start_idx == -1)
6433 					start_idx = i;
6434 				free_l2_count++;
6435 				if (free_l2_count == l2_blocks)
6436 					break;
6437 			}
6438 		}
6439 		if (free_l2_count != l2_blocks)
6440 			panic("%s: too many preinit mappings", __func__);
6441 
6442 		va = preinit_map_va + (start_idx * L2_SIZE);
6443 		for (i = start_idx; i < start_idx + l2_blocks; i++) {
6444 			/* Mark entries as allocated */
6445 			ppim = pmap_preinit_mapping + i;
6446 			ppim->pa = pa;
6447 			ppim->va = va + offset;
6448 			ppim->size = size;
6449 		}
6450 
6451 		/* Map L2 blocks */
6452 		pa = rounddown2(pa, L2_SIZE);
6453 		for (i = 0; i < l2_blocks; i++) {
6454 			pde = pmap_pde(kernel_pmap, va, &lvl);
6455 			KASSERT(pde != NULL,
6456 			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
6457 			    va));
6458 			KASSERT(lvl == 1,
6459 			    ("pmap_mapbios: Invalid level %d", lvl));
6460 
6461 			/* Insert L2_BLOCK */
6462 			l2 = pmap_l1_to_l2(pde, va);
6463 			pmap_load_store(l2,
6464 			    pa | ATTR_DEFAULT | ATTR_S1_XN |
6465 			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
6466 
6467 			va += L2_SIZE;
6468 			pa += L2_SIZE;
6469 		}
6470 		pmap_s1_invalidate_all(kernel_pmap);
6471 
6472 		va = preinit_map_va + (start_idx * L2_SIZE);
6473 
6474 	} else {
6475 		/* kva_alloc may be used to map the pages */
6476 		offset = pa & PAGE_MASK;
6477 		size = round_page(offset + size);
6478 
6479 		va = kva_alloc(size);
6480 		if (va == 0)
6481 			panic("%s: Couldn't allocate KVA", __func__);
6482 
6483 		pde = pmap_pde(kernel_pmap, va, &lvl);
6484 		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
6485 
6486 		/* L3 table is linked */
6487 		va = trunc_page(va);
6488 		pa = trunc_page(pa);
6489 		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
6490 	}
6491 
6492 	return ((void *)(va + offset));
6493 }
6494 
6495 void
6496 pmap_unmapbios(void *p, vm_size_t size)
6497 {
6498 	struct pmap_preinit_mapping *ppim;
6499 	vm_offset_t offset, tmpsize, va, va_trunc;
6500 	pd_entry_t *pde;
6501 	pt_entry_t *l2;
6502 	int i, lvl, l2_blocks, block;
6503 	bool preinit_map;
6504 
6505 	va = (vm_offset_t)p;
6506 	l2_blocks =
6507 	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
6508 	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
6509 
6510 	/* Remove preinit mapping */
6511 	preinit_map = false;
6512 	block = 0;
6513 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
6514 		ppim = pmap_preinit_mapping + i;
6515 		if (ppim->va == va) {
6516 			KASSERT(ppim->size == size,
6517 			    ("pmap_unmapbios: size mismatch"));
6518 			ppim->va = 0;
6519 			ppim->pa = 0;
6520 			ppim->size = 0;
6521 			preinit_map = true;
6522 			offset = block * L2_SIZE;
6523 			va_trunc = rounddown2(va, L2_SIZE) + offset;
6524 
6525 			/* Remove L2_BLOCK */
6526 			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
6527 			KASSERT(pde != NULL,
6528 			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
6529 			    va_trunc));
6530 			l2 = pmap_l1_to_l2(pde, va_trunc);
6531 			pmap_clear(l2);
6532 
6533 			if (block == (l2_blocks - 1))
6534 				break;
6535 			block++;
6536 		}
6537 	}
6538 	if (preinit_map) {
6539 		pmap_s1_invalidate_all(kernel_pmap);
6540 		return;
6541 	}
6542 
6543 	/* Unmap the pages reserved with kva_alloc. */
6544 	if (vm_initialized) {
6545 		offset = va & PAGE_MASK;
6546 		size = round_page(offset + size);
6547 		va = trunc_page(va);
6548 
6549 		pde = pmap_pde(kernel_pmap, va, &lvl);
6550 		KASSERT(pde != NULL,
6551 		    ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va));
6552 		KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl));
6553 
6554 		/* Unmap and invalidate the pages */
6555 		for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
6556 			pmap_kremove(va + tmpsize);
6557 
6558 		kva_free(va, size);
6559 	}
6560 }
6561 
6562 /*
6563  * Sets the memory attribute for the specified page.
6564  */
6565 void
6566 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
6567 {
6568 
6569 	m->md.pv_memattr = ma;
6570 
6571 	/*
6572 	 * If "m" is a normal page, update its direct mapping.  This update
6573 	 * can be relied upon to perform any cache operations that are
6574 	 * required for data coherence.
6575 	 */
6576 	if ((m->flags & PG_FICTITIOUS) == 0 &&
6577 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
6578 	    m->md.pv_memattr) != 0)
6579 		panic("memory attribute change on the direct map failed");
6580 }
6581 
6582 /*
6583  * Changes the specified virtual address range's memory type to that given by
6584  * the parameter "mode".  The specified virtual address range must be
6585  * completely contained within either the direct map or the kernel map.  If
6586  * the virtual address range is contained within the kernel map, then the
6587  * memory type for each of the corresponding ranges of the direct map is also
6588  * changed.  (The corresponding ranges of the direct map are those ranges that
6589  * map the same physical pages as the specified virtual address range.)  These
6590  * changes to the direct map are necessary because the architecture does not
6591  * guarantee coherent behavior when two or more mappings to the same physical
6592  * page use mismatched memory attributes.
6593  *
6594  * Returns zero if the change completed successfully, and either EINVAL or
6595  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
6596  * of the virtual address range was not mapped, and ENOMEM is returned if
6597  * there was insufficient memory available to complete the change.  In the
6598  * latter case, the memory type may have been changed on some part of the
6599  * virtual address range or the direct map.
6600  */
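/*
 * A sketch of typical use (the caller, "va", and "size" are assumed, not
 * taken from this file): a driver that must access an existing kernel
 * mapping uncached can request
 *
 *	error = pmap_change_attr(va, size, VM_MEMATTR_UNCACHEABLE);
 *
 * where the range lies entirely within the kernel map or the direct map.
 */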
6601 int
6602 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
6603 {
6604 	int error;
6605 
6606 	PMAP_LOCK(kernel_pmap);
6607 	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
6608 	PMAP_UNLOCK(kernel_pmap);
6609 	return (error);
6610 }
6611 
6612 /*
6613  * Changes the specified virtual address range's protections to those
6614  * specified by "prot".  Like pmap_change_attr(), protections for aliases
6615  * in the direct map are updated as well.  Protections on aliasing mappings may
6616  * be a subset of the requested protections; for example, mappings in the direct
6617  * map are never executable.
6618  */
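/*
 * A sketch of typical use (the caller and range are assumed, not taken from
 * this file): making a range of kernel memory read-only after initialization,
 *
 *	error = pmap_change_prot(va, size, VM_PROT_READ);
 *
 * which demotes any covering block mappings as needed.
 */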
6619 int
6620 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
6621 {
6622 	int error;
6623 
6624 	/* Only supported within the kernel map. */
6625 	if (va < VM_MIN_KERNEL_ADDRESS)
6626 		return (EINVAL);
6627 
6628 	PMAP_LOCK(kernel_pmap);
6629 	error = pmap_change_props_locked(va, size, prot, -1, false);
6630 	PMAP_UNLOCK(kernel_pmap);
6631 	return (error);
6632 }
6633 
6634 static int
6635 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
6636     int mode, bool skip_unmapped)
6637 {
6638 	vm_offset_t base, offset, tmpva;
6639 	vm_size_t pte_size;
6640 	vm_paddr_t pa;
6641 	pt_entry_t pte, *ptep, *newpte;
6642 	pt_entry_t bits, mask;
6643 	int lvl, rv;
6644 
6645 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6646 	base = trunc_page(va);
6647 	offset = va & PAGE_MASK;
6648 	size = round_page(offset + size);
6649 
6650 	if (!VIRT_IN_DMAP(base) &&
6651 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
6652 		return (EINVAL);
6653 
6654 	bits = 0;
6655 	mask = 0;
6656 	if (mode != -1) {
6657 		bits = ATTR_S1_IDX(mode);
6658 		mask = ATTR_S1_IDX_MASK;
6659 		if (mode == VM_MEMATTR_DEVICE) {
6660 			mask |= ATTR_S1_XN;
6661 			bits |= ATTR_S1_XN;
6662 		}
6663 	}
6664 	if (prot != VM_PROT_NONE) {
6665 		/* Don't mark the DMAP as executable. It never is on arm64. */
6666 		if (VIRT_IN_DMAP(base)) {
6667 			prot &= ~VM_PROT_EXECUTE;
6668 			/*
6669 			 * XXX Mark the DMAP as writable for now. We rely
6670 			 * on this in ddb & dtrace to insert breakpoint
6671 			 * instructions.
6672 			 */
6673 			prot |= VM_PROT_WRITE;
6674 		}
6675 
6676 		if ((prot & VM_PROT_WRITE) == 0) {
6677 			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
6678 		}
6679 		if ((prot & VM_PROT_EXECUTE) == 0) {
6680 			bits |= ATTR_S1_PXN;
6681 		}
6682 		bits |= ATTR_S1_UXN;
6683 		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
6684 	}
6685 
6686 	for (tmpva = base; tmpva < base + size; ) {
6687 		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
6688 		if (ptep == NULL && !skip_unmapped) {
6689 			return (EINVAL);
6690 		} else if ((ptep == NULL && skip_unmapped) ||
6691 		    (pmap_load(ptep) & mask) == bits) {
6692 			/*
6693 			 * We already have the correct attribute or there
6694 			 * is no memory mapped at this address and we are
6695 			 * skipping unmapped memory.
6696 			 */
6697 			switch (lvl) {
6698 			default:
6699 				panic("Invalid DMAP table level: %d\n", lvl);
6700 			case 1:
6701 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
6702 				break;
6703 			case 2:
6704 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
6705 				break;
6706 			case 3:
6707 				tmpva += PAGE_SIZE;
6708 				break;
6709 			}
6710 		} else {
6711 			/* We can't demote/promote this entry */
6712 			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
6713 
6714 			/*
6715 			 * Split the entry to a level 3 table, then
6716 			 * set the new attribute.
6717 			 */
6718 			switch (lvl) {
6719 			default:
6720 				panic("Invalid DMAP table level: %d\n", lvl);
6721 			case 1:
6722 				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6723 				if ((tmpva & L1_OFFSET) == 0 &&
6724 				    (base + size - tmpva) >= L1_SIZE) {
6725 					pte_size = L1_SIZE;
6726 					break;
6727 				}
6728 				newpte = pmap_demote_l1(kernel_pmap, ptep,
6729 				    tmpva & ~L1_OFFSET);
6730 				if (newpte == NULL)
6731 					return (EINVAL);
6732 				ptep = pmap_l1_to_l2(ptep, tmpva);
6733 				/* FALLTHROUGH */
6734 			case 2:
6735 				if ((tmpva & L2_OFFSET) == 0 &&
6736 				    (base + size - tmpva) >= L2_SIZE) {
6737 					pte_size = L2_SIZE;
6738 					break;
6739 				}
6740 				newpte = pmap_demote_l2(kernel_pmap, ptep,
6741 				    tmpva);
6742 				if (newpte == NULL)
6743 					return (EINVAL);
6744 				ptep = pmap_l2_to_l3(ptep, tmpva);
6745 				/* FALLTHROUGH */
6746 			case 3:
6747 				pte_size = PAGE_SIZE;
6748 				break;
6749 			}
6750 
6751 			/* Update the entry */
6752 			pte = pmap_load(ptep);
6753 			pte &= ~mask;
6754 			pte |= bits;
6755 
6756 			pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
6757 			    pte_size);
6758 
6759 			pa = pte & ~ATTR_MASK;
6760 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
6761 				/*
6762 				 * Keep the DMAP memory in sync.
6763 				 */
6764 				rv = pmap_change_props_locked(
6765 				    PHYS_TO_DMAP(pa), pte_size,
6766 				    prot, mode, true);
6767 				if (rv != 0)
6768 					return (rv);
6769 			}
6770 
6771 			/*
6772 			 * If moving to a non-cacheable entry, flush
6773 			 * the cache.
6774 			 */
6775 			if (mode == VM_MEMATTR_UNCACHEABLE)
6776 				cpu_dcache_wbinv_range(tmpva, pte_size);
6777 			tmpva += pte_size;
6778 		}
6779 	}
6780 
6781 	return (0);
6782 }
6783 
6784 /*
6785  * Create an L2 table to map all addresses within an L1 mapping.
6786  */
6787 static pt_entry_t *
6788 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
6789 {
6790 	pt_entry_t *l2, newl2, oldl1;
6791 	vm_offset_t tmpl1;
6792 	vm_paddr_t l2phys, phys;
6793 	vm_page_t ml2;
6794 	int i;
6795 
6796 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6797 	oldl1 = pmap_load(l1);
6798 	PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6799 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
6800 	    ("pmap_demote_l1: Demoting a non-block entry"));
6801 	KASSERT((va & L1_OFFSET) == 0,
6802 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
6803 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
6804 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
6805 	KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
6806 	    ("pmap_demote_l1: Demoting entry with no-demote flag set"));
6807 
6808 	tmpl1 = 0;
6809 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
6810 		tmpl1 = kva_alloc(PAGE_SIZE);
6811 		if (tmpl1 == 0)
6812 			return (NULL);
6813 	}
6814 
6815 	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
6816 	    NULL) {
6817 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
6818 		    " in pmap %p", va, pmap);
6819 		l2 = NULL;
6820 		goto fail;
6821 	}
6822 
6823 	l2phys = VM_PAGE_TO_PHYS(ml2);
6824 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
6825 
6826 	/* The address the range points at */
6827 	phys = oldl1 & ~ATTR_MASK;
6828 	/* The attributes from the old l1 entry to be copied */
6829 	newl2 = oldl1 & ATTR_MASK;
6830 
6831 	/* Create the new entries */
6832 	for (i = 0; i < Ln_ENTRIES; i++) {
6833 		l2[i] = newl2 | phys;
6834 		phys += L2_SIZE;
6835 	}
6836 	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
6837 	    ("Invalid l2 page (%lx != %lx)", l2[0],
6838 	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
6839 
6840 	if (tmpl1 != 0) {
6841 		pmap_kenter(tmpl1, PAGE_SIZE,
6842 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
6843 		    VM_MEMATTR_WRITE_BACK);
6844 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
6845 	}
6846 
6847 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
6848 
6849 fail:
6850 	if (tmpl1 != 0) {
6851 		pmap_kremove(tmpl1);
6852 		kva_free(tmpl1, PAGE_SIZE);
6853 	}
6854 
6855 	return (l2);
6856 }
6857 
6858 static void
6859 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
6860 {
6861 	pt_entry_t *l3;
6862 
6863 	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
6864 		*l3 = newl3;
6865 		newl3 += L3_SIZE;
6866 	}
6867 }
6868 
6869 static void
6870 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
6871     struct rwlock **lockp)
6872 {
6873 	struct spglist free;
6874 
6875 	SLIST_INIT(&free);
6876 	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
6877 	    lockp);
6878 	vm_page_free_pages_toq(&free, true);
6879 }
6880 
6881 /*
6882  * Create an L3 table to map all addresses within an L2 mapping.
6883  */
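/*
 * In other words, a single 2MB L2 block entry is replaced by an L2_TABLE
 * entry pointing at a freshly filled L3 page table whose 512 4KB entries
 * reproduce the original block's physical range and attributes.
 */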
6884 static pt_entry_t *
6885 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
6886     struct rwlock **lockp)
6887 {
6888 	pt_entry_t *l3, newl3, oldl2;
6889 	vm_offset_t tmpl2;
6890 	vm_paddr_t l3phys;
6891 	vm_page_t ml3;
6892 
6893 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6894 	PMAP_ASSERT_STAGE1(pmap);
6895 	KASSERT(ADDR_IS_CANONICAL(va),
6896 	    ("%s: Address not in canonical form: %lx", __func__, va));
6897 
6898 	l3 = NULL;
6899 	oldl2 = pmap_load(l2);
6900 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
6901 	    ("pmap_demote_l2: Demoting a non-block entry"));
6902 	KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
6903 	    ("pmap_demote_l2: Demoting entry with no-demote flag set"));
6904 	va &= ~L2_OFFSET;
6905 
6906 	tmpl2 = 0;
6907 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
6908 		tmpl2 = kva_alloc(PAGE_SIZE);
6909 		if (tmpl2 == 0)
6910 			return (NULL);
6911 	}
6912 
6913 	/*
6914 	 * Invalidate the 2MB page mapping and return "failure" if the
6915 	 * mapping was never accessed.
6916 	 */
6917 	if ((oldl2 & ATTR_AF) == 0) {
6918 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
6919 		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
6920 		pmap_demote_l2_abort(pmap, va, l2, lockp);
6921 		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
6922 		    va, pmap);
6923 		goto fail;
6924 	}
6925 
6926 	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
6927 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
6928 		    ("pmap_demote_l2: page table page for a wired mapping"
6929 		    " is missing"));
6930 
6931 		/*
6932 		 * If the page table page is missing and the mapping
6933 		 * is for a kernel address, the mapping must belong to
6934 		 * either the direct map or the early kernel memory.
6935 		 * Page table pages are preallocated for every other
6936 		 * part of the kernel address space, so the direct map
6937 		 * region and early kernel memory are the only parts of the
6938 		 * kernel address space that must be handled here.
6939 		 */
6940 		KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
6941 		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
6942 		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
6943 
6944 		/*
6945 		 * If the 2MB page mapping belongs to the direct map
6946 		 * region of the kernel's address space, then the page
6947 		 * allocation request specifies the highest possible
6948 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
6949 		 * priority is normal.
6950 		 */
6951 		ml3 = vm_page_alloc_noobj(
6952 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
6953 		    VM_ALLOC_WIRED);
6954 
6955 		/*
6956 		 * If the allocation of the new page table page fails,
6957 		 * invalidate the 2MB page mapping and return "failure".
6958 		 */
6959 		if (ml3 == NULL) {
6960 			pmap_demote_l2_abort(pmap, va, l2, lockp);
6961 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
6962 			    " in pmap %p", va, pmap);
6963 			goto fail;
6964 		}
6965 		ml3->pindex = pmap_l2_pindex(va);
6966 
6967 		if (!ADDR_IS_KERNEL(va)) {
6968 			ml3->ref_count = NL3PG;
6969 			pmap_resident_count_inc(pmap, 1);
6970 		}
6971 	}
6972 	l3phys = VM_PAGE_TO_PHYS(ml3);
6973 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
6974 	newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
6975 	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
6976 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
6977 	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
6978 
6979 	/*
6980 	 * If the page table page is not leftover from an earlier promotion,
6981 	 * or the mapping attributes have changed, (re)initialize the L3 table.
6982 	 *
6983 	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
6984 	 * performs a dsb().  That dsb() ensures that the stores for filling
6985 	 * "l3" are visible before "l3" is added to the page table.
6986 	 */
6987 	if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK))
6988 		pmap_fill_l3(l3, newl3);
6989 
6990 	/*
6991 	 * Map the temporary page so we don't lose access to the l2 table.
6992 	 */
6993 	if (tmpl2 != 0) {
6994 		pmap_kenter(tmpl2, PAGE_SIZE,
6995 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
6996 		    VM_MEMATTR_WRITE_BACK);
6997 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
6998 	}
6999 
7000 	/*
7001 	 * The spare PV entries must be reserved prior to demoting the
7002 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
7003 	 * of the L2 and the PV lists will be inconsistent, which can result
7004 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
7005 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
7006 	 * PV entry for the 2MB page mapping that is being demoted.
7007 	 */
7008 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
7009 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
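	/*
	 * Ln_ENTRIES - 1 (511) spare entries are enough because the existing
	 * PV entry for the 2MB mapping is reused by pmap_pv_demote_l2() for
	 * the first 4KB page; only the remaining 511 pages need new entries.
	 */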
7010 
7011 	/*
7012 	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
7013 	 * the 2MB page mapping.
7014 	 */
7015 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
7016 
7017 	/*
7018 	 * Demote the PV entry.
7019 	 */
7020 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
7021 		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
7022 
7023 	atomic_add_long(&pmap_l2_demotions, 1);
7024 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
7025 	    " in pmap %p %lx", va, pmap, l3[0]);
7026 
7027 fail:
7028 	if (tmpl2 != 0) {
7029 		pmap_kremove(tmpl2);
7030 		kva_free(tmpl2, PAGE_SIZE);
7031 	}
7032 
7033 	return (l3);
7034 
7035 }
7036 
7037 static pt_entry_t *
7038 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
7039 {
7040 	struct rwlock *lock;
7041 	pt_entry_t *l3;
7042 
7043 	lock = NULL;
7044 	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
7045 	if (lock != NULL)
7046 		rw_wunlock(lock);
7047 	return (l3);
7048 }
7049 
7050 /*
7051  * Perform the pmap work for mincore(2).  If the page is not both referenced and
7052  * modified by this pmap, returns its physical address so that the caller can
7053  * find other mappings.
7054  */
7055 int
7056 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
7057 {
7058 	pt_entry_t *pte, tpte;
7059 	vm_paddr_t mask, pa;
7060 	int lvl, val;
7061 	bool managed;
7062 
7063 	PMAP_ASSERT_STAGE1(pmap);
7064 	PMAP_LOCK(pmap);
7065 	pte = pmap_pte(pmap, addr, &lvl);
7066 	if (pte != NULL) {
7067 		tpte = pmap_load(pte);
7068 
7069 		switch (lvl) {
7070 		case 3:
7071 			mask = L3_OFFSET;
7072 			break;
7073 		case 2:
7074 			mask = L2_OFFSET;
7075 			break;
7076 		case 1:
7077 			mask = L1_OFFSET;
7078 			break;
7079 		default:
7080 			panic("pmap_mincore: invalid level %d", lvl);
7081 		}
7082 
7083 		managed = (tpte & ATTR_SW_MANAGED) != 0;
7084 		val = MINCORE_INCORE;
7085 		if (lvl != 3)
7086 			val |= MINCORE_PSIND(3 - lvl);
7087 		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
7088 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
7089 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
7090 		if ((tpte & ATTR_AF) == ATTR_AF)
7091 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
7092 
7093 		pa = (tpte & ~ATTR_MASK) | (addr & mask);
7094 	} else {
7095 		managed = false;
7096 		val = 0;
7097 	}
7098 
7099 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
7100 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
7101 		*pap = pa;
7102 	}
7103 	PMAP_UNLOCK(pmap);
7104 	return (val);
7105 }
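/*
 * A note on the superpage reporting above: a 2MB block mapping is found at
 * level 2 and reports MINCORE_PSIND(3 - 2) == MINCORE_PSIND(1), a 1GB block
 * at level 1 reports MINCORE_PSIND(2), and an ordinary 4KB page at level 3
 * sets no PSIND bits at all.
 */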
7106 
7107 /*
7108  * Garbage collect every ASID that is neither active on a processor nor
7109  * reserved.
7110  */
7111 static void
7112 pmap_reset_asid_set(pmap_t pmap)
7113 {
7114 	pmap_t curpmap;
7115 	int asid, cpuid, epoch;
7116 	struct asid_set *set;
7117 	enum pmap_stage stage;
7118 
7119 	set = pmap->pm_asid_set;
7120 	stage = pmap->pm_stage;
7121 
7123 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
7124 	mtx_assert(&set->asid_set_mutex, MA_OWNED);
7125 
7126 	/*
7127 	 * Ensure that the store to asid_epoch is globally visible before the
7128 	 * loads from pc_curpmap are performed.
7129 	 */
7130 	epoch = set->asid_epoch + 1;
7131 	if (epoch == INT_MAX)
7132 		epoch = 0;
7133 	set->asid_epoch = epoch;
7134 	dsb(ishst);
7135 	if (stage == PM_STAGE1) {
7136 		__asm __volatile("tlbi vmalle1is");
7137 	} else {
7138 		KASSERT(pmap_clean_stage2_tlbi != NULL,
7139 		    ("%s: Unset stage 2 tlb invalidation callback\n",
7140 		    __func__));
7141 		pmap_clean_stage2_tlbi();
7142 	}
7143 	dsb(ish);
7144 	bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
7145 	    set->asid_set_size - 1);
7146 	CPU_FOREACH(cpuid) {
7147 		if (cpuid == curcpu)
7148 			continue;
7149 		if (stage == PM_STAGE1) {
7150 			curpmap = pcpu_find(cpuid)->pc_curpmap;
7151 			PMAP_ASSERT_STAGE1(pmap);
7152 		} else {
7153 			curpmap = pcpu_find(cpuid)->pc_curvmpmap;
7154 			if (curpmap == NULL)
7155 				continue;
7156 			PMAP_ASSERT_STAGE2(pmap);
7157 		}
7158 		KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
7159 		asid = COOKIE_TO_ASID(curpmap->pm_cookie);
7160 		if (asid == -1)
7161 			continue;
7162 		bit_set(set->asid_set, asid);
7163 		curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
7164 	}
7165 }
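/*
 * A pmap's cookie packs its ASID together with the epoch in which that ASID
 * was assigned (COOKIE_FROM()).  After the reset above, an ASID that was
 * found active on another CPU keeps its number but is re-stamped with the
 * new epoch, so the owning pmap will not allocate a fresh ASID the next time
 * it is activated.  Every other pmap's cookie still carries a stale epoch
 * and will be refreshed by pmap_alloc_asid() on its next activation.
 */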
7166 
7167 /*
7168  * Allocate a new ASID for the specified pmap.
7169  */
7170 static void
7171 pmap_alloc_asid(pmap_t pmap)
7172 {
7173 	struct asid_set *set;
7174 	int new_asid;
7175 
7176 	set = pmap->pm_asid_set;
7177 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
7178 
7179 	mtx_lock_spin(&set->asid_set_mutex);
7180 
7181 	/*
7182 	 * While this processor was waiting to acquire the asid set mutex,
7183 	 * pmap_reset_asid_set() running on another processor might have
7184 	 * updated this pmap's cookie to the current epoch.  In which case, we
7185 	 * don't need to allocate a new ASID.
7186 	 */
7187 	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
7188 		goto out;
7189 
7190 	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
7191 	    &new_asid);
7192 	if (new_asid == -1) {
7193 		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
7194 		    set->asid_next, &new_asid);
7195 		if (new_asid == -1) {
7196 			pmap_reset_asid_set(pmap);
7197 			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
7198 			    set->asid_set_size, &new_asid);
7199 			KASSERT(new_asid != -1, ("ASID allocation failure"));
7200 		}
7201 	}
7202 	bit_set(set->asid_set, new_asid);
7203 	set->asid_next = new_asid + 1;
7204 	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
7205 out:
7206 	mtx_unlock_spin(&set->asid_set_mutex);
7207 }
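/*
 * The allocation above is a simple clock-style search: first look for a
 * clear bit at or after asid_next, then wrap around to ASID_FIRST_AVAILABLE,
 * and only if both passes fail pay for a full pmap_reset_asid_set(), which
 * invalidates the TLB and rebuilds the bitmap from the ASIDs still active on
 * other CPUs.
 */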
7208 
7209 static uint64_t __read_mostly ttbr_flags;
7210 
7211 /*
7212  * Compute the value that should be stored in ttbr0 to activate the specified
7213  * pmap.  This value may change from time to time.
7214  */
7215 uint64_t
7216 pmap_to_ttbr0(pmap_t pmap)
7217 {
7218 	uint64_t ttbr;
7219 
7220 	ttbr = pmap->pm_ttbr;
7221 	ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
7222 	ttbr |= ttbr_flags;
7223 
7224 	return (ttbr);
7225 }
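/*
 * Roughly, the value composed above has the following layout (illustrative,
 * assuming the ASID is placed in TTBR bits 63:48 by ASID_TO_OPERAND()):
 *
 *	ttbr0 = <physical address of the pmap's L0 table>	(pm_ttbr)
 *	      | <ASID> << 48					(bits 63:48)
 *	      | TTBR_CnP					(bit 0, once set)
 *
 * so switching address spaces in pmap_activate_int() reduces to writing this
 * value to ttbr0_el1.
 */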
7226 
7227 static void
7228 pmap_set_cnp(void *arg)
7229 {
7230 	uint64_t ttbr0, ttbr1;
7231 	u_int cpuid;
7232 
7233 	cpuid = *(u_int *)arg;
7234 	if (cpuid == curcpu) {
7235 		/*
7236 		 * Set the flags while all CPUs are handling the
7237 		 * smp_rendezvous, so they will not call pmap_to_ttbr0. Any calls
7238 		 * to pmap_to_ttbr0 after this will have the CnP flag set.
7239 		 * The dsb after invalidating the TLB will act as a barrier
7240 		 * to ensure all CPUs can observe this change.
7241 		 */
7242 		ttbr_flags |= TTBR_CnP;
7243 	}
7244 
7245 	ttbr0 = READ_SPECIALREG(ttbr0_el1);
7246 	ttbr0 |= TTBR_CnP;
7247 
7248 	ttbr1 = READ_SPECIALREG(ttbr1_el1);
7249 	ttbr1 |= TTBR_CnP;
7250 
7251 	/* Update ttbr{0,1}_el1 with the CnP flag */
7252 	WRITE_SPECIALREG(ttbr0_el1, ttbr0);
7253 	WRITE_SPECIALREG(ttbr1_el1, ttbr1);
7254 	isb();
7255 	__asm __volatile("tlbi vmalle1is");
7256 	dsb(ish);
7257 	isb();
7258 }
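/*
 * CnP ("Common not Private") tells the MMU that every CPU setting the bit
 * in a given TTBR uses identical translation tables for that base register,
 * which allows TLB and walk-cache entries to be shared between CPUs.  The
 * flag must be set consistently everywhere, hence the smp_rendezvous()
 * driving this function.
 */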
7259 
7260 /*
7261  * Defer enabling CnP until we have read the ID registers to know if it's
7262  * supported on all CPUs.
7263  */
7264 static void
7265 pmap_init_cnp(void *dummy __unused)
7266 {
7267 	uint64_t reg;
7268 	u_int cpuid;
7269 
7270 	if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
7271 		return;
7272 
7273 	if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
7274 		if (bootverbose)
7275 			printf("Enabling CnP\n");
7276 		cpuid = curcpu;
7277 		smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
7278 	}
7279 
7280 }
7281 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
7282 
7283 static bool
7284 pmap_activate_int(pmap_t pmap)
7285 {
7286 	struct asid_set *set;
7287 	int epoch;
7288 
7289 	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
7290 	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
7291 
7292 	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
7293 	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
7294 		/*
7295 		 * Handle the possibility that the old thread was preempted
7296 		 * after an "ic" or "tlbi" instruction but before it performed
7297 		 * a "dsb" instruction.  If the old thread migrates to a new
7298 		 * processor, its completion of a "dsb" instruction on that
7299 		 * new processor does not guarantee that the "ic" or "tlbi"
7300 		 * instructions performed on the old processor have completed.
7301 		 */
7302 		dsb(ish);
7303 		return (false);
7304 	}
7305 
7306 	set = pmap->pm_asid_set;
7307 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
7308 
7309 	/*
7310 	 * Ensure that the store to curpmap is globally visible before the
7311 	 * load from asid_epoch is performed.
7312 	 */
7313 	if (pmap->pm_stage == PM_STAGE1)
7314 		PCPU_SET(curpmap, pmap);
7315 	else
7316 		PCPU_SET(curvmpmap, pmap);
7317 	dsb(ish);
7318 	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
7319 	if (epoch >= 0 && epoch != set->asid_epoch)
7320 		pmap_alloc_asid(pmap);
7321 
7322 	if (pmap->pm_stage == PM_STAGE1) {
7323 		set_ttbr0(pmap_to_ttbr0(pmap));
7324 		if (PCPU_GET(bcast_tlbi_workaround) != 0)
7325 			invalidate_local_icache();
7326 	}
7327 	return (true);
7328 }
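/*
 * About the epoch check above: a negative cookie epoch marks a permanently
 * reserved ASID that must never be reallocated, so it is skipped by the
 * "epoch >= 0" test.  For everything else, a mismatch with the set's current
 * epoch means the ASID may have been reused since this pmap last ran, and
 * pmap_alloc_asid() is called to fetch a new one.  Because
 * pmap_reset_asid_set() wraps the epoch to 0 before it can reach INT_MAX, an
 * epoch of INT_MAX can never match and always forces an allocation.
 */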
7329 
7330 void
7331 pmap_activate_vm(pmap_t pmap)
7332 {
7333 
7334 	PMAP_ASSERT_STAGE2(pmap);
7335 
7336 	(void)pmap_activate_int(pmap);
7337 }
7338 
7339 void
7340 pmap_activate(struct thread *td)
7341 {
7342 	pmap_t	pmap;
7343 
7344 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
7345 	PMAP_ASSERT_STAGE1(pmap);
7346 	critical_enter();
7347 	(void)pmap_activate_int(pmap);
7348 	critical_exit();
7349 }
7350 
7351 /*
7352  * Activate the thread we are switching to.
7353  * To simplify the assembly in cpu_throw, return the new thread's pcb.
7354  */
7355 struct pcb *
7356 pmap_switch(struct thread *new)
7357 {
7358 	pcpu_bp_harden bp_harden;
7359 	struct pcb *pcb;
7360 
7361 	/* Store the new curthread */
7362 	PCPU_SET(curthread, new);
7363 
7364 	/* And the new pcb */
7365 	pcb = new->td_pcb;
7366 	PCPU_SET(curpcb, pcb);
7367 
7368 	/*
7369 	 * TODO: We may need to flush the cache here if switching
7370 	 * to a user process.
7371 	 */
7372 
7373 	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
7374 		/*
7375 		 * Stop userspace from training the branch predictor against
7376 		 * other processes. This will call into a CPU specific
7377 		 * function that clears the branch predictor state.
7378 		 */
7379 		bp_harden = PCPU_GET(bp_harden);
7380 		if (bp_harden != NULL)
7381 			bp_harden();
7382 	}
7383 
7384 	return (pcb);
7385 }
7386 
7387 void
7388 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
7389 {
7390 
7391 	PMAP_ASSERT_STAGE1(pmap);
7392 	KASSERT(ADDR_IS_CANONICAL(va),
7393 	    ("%s: Address not in canonical form: %lx", __func__, va));
7394 
7395 	if (ADDR_IS_KERNEL(va)) {
7396 		cpu_icache_sync_range(va, sz);
7397 	} else {
7398 		u_int len, offset;
7399 		vm_paddr_t pa;
7400 
7401 		/* Find the length of data in this page to flush */
7402 		offset = va & PAGE_MASK;
7403 		len = imin(PAGE_SIZE - offset, sz);
7404 
7405 		while (sz != 0) {
7406 			/* Extract the physical address & find it in the DMAP */
7407 			pa = pmap_extract(pmap, va);
7408 			if (pa != 0)
7409 				cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
7410 
7411 			/* Move to the next page */
7412 			sz -= len;
7413 			va += len;
7414 			/* Set the length for the next iteration */
7415 			len = imin(PAGE_SIZE, sz);
7416 		}
7417 	}
7418 }
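/*
 * The loop above works a page at a time because the underlying physical
 * pages need not be contiguous.  As a worked example, syncing sz = 0x1800
 * bytes starting at a va whose page offset is 0xf00 performs:
 *
 *	pass 1: len = PAGE_SIZE - 0xf00 = 0x100	(tail of the first page)
 *	pass 2: len = PAGE_SIZE = 0x1000	(one full page)
 *	pass 3: len = 0x700			(remainder)
 *
 * with each pass translating va through pmap_extract() separately.
 */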
7419 
7420 static int
7421 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
7422 {
7423 	pd_entry_t *pdep;
7424 	pt_entry_t *ptep, pte;
7425 	int rv, lvl, dfsc;
7426 
7427 	PMAP_ASSERT_STAGE2(pmap);
7428 	rv = KERN_FAILURE;
7429 
7430 	/* Data and insn aborts use same encoding for FSC field. */
7431 	dfsc = esr & ISS_DATA_DFSC_MASK;
7432 	switch (dfsc) {
7433 	case ISS_DATA_DFSC_TF_L0:
7434 	case ISS_DATA_DFSC_TF_L1:
7435 	case ISS_DATA_DFSC_TF_L2:
7436 	case ISS_DATA_DFSC_TF_L3:
7437 		PMAP_LOCK(pmap);
7438 		pdep = pmap_pde(pmap, far, &lvl);
7439 		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
7440 			PMAP_UNLOCK(pmap);
7441 			break;
7442 		}
7443 
7444 		switch (lvl) {
7445 		case 0:
7446 			ptep = pmap_l0_to_l1(pdep, far);
7447 			break;
7448 		case 1:
7449 			ptep = pmap_l1_to_l2(pdep, far);
7450 			break;
7451 		case 2:
7452 			ptep = pmap_l2_to_l3(pdep, far);
7453 			break;
7454 		default:
7455 			panic("%s: Invalid pde level %d", __func__, lvl);
7456 		}
7457 		goto fault_exec;
7458 
7459 	case ISS_DATA_DFSC_AFF_L1:
7460 	case ISS_DATA_DFSC_AFF_L2:
7461 	case ISS_DATA_DFSC_AFF_L3:
7462 		PMAP_LOCK(pmap);
7463 		ptep = pmap_pte(pmap, far, &lvl);
7464 fault_exec:
7465 		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
7466 			if (icache_vmid) {
7467 				pmap_invalidate_vpipt_icache();
7468 			} else {
7469 				/*
7470 				 * If accessing an executable page invalidate
7471 				 * the I-cache so it will be valid when we
7472 				 * continue execution in the guest. The D-cache
7473 				 * is assumed to already be clean to the Point
7474 				 * of Coherency.
7475 				 */
7476 				if ((pte & ATTR_S2_XN_MASK) ==
7477 				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
7478 					invalidate_icache();
7479 				}
7480 			}
7481 			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
7482 			rv = KERN_SUCCESS;
7483 		}
7484 		PMAP_UNLOCK(pmap);
7485 		break;
7486 	}
7487 
7488 	return (rv);
7489 }
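/*
 * The "lvl != (dfsc - ISS_DATA_DFSC_TF_L1)" test above relies on the
 * translation-fault DFSC codes being consecutive: a fault reported at level
 * N means the walk found no valid entry at that level, so the deepest valid
 * descriptor should be the one at level N - 1 (TF_L1 expects a level 0
 * entry, TF_L2 a level 1 entry, TF_L3 a level 2 entry).  Anything else
 * indicates the tables changed underneath us and the fault is not handled
 * here.
 */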
7490 
7491 int
7492 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
7493 {
7494 	pt_entry_t pte, *ptep;
7495 	register_t intr;
7496 	uint64_t ec, par;
7497 	int lvl, rv;
7498 
7499 	rv = KERN_FAILURE;
7500 
7501 	ec = ESR_ELx_EXCEPTION(esr);
7502 	switch (ec) {
7503 	case EXCP_INSN_ABORT_L:
7504 	case EXCP_INSN_ABORT:
7505 	case EXCP_DATA_ABORT_L:
7506 	case EXCP_DATA_ABORT:
7507 		break;
7508 	default:
7509 		return (rv);
7510 	}
7511 
7512 	if (pmap->pm_stage == PM_STAGE2)
7513 		return (pmap_stage2_fault(pmap, esr, far));
7514 
7515 	/* Data and insn aborts use same encoding for FSC field. */
7516 	switch (esr & ISS_DATA_DFSC_MASK) {
7517 	case ISS_DATA_DFSC_AFF_L1:
7518 	case ISS_DATA_DFSC_AFF_L2:
7519 	case ISS_DATA_DFSC_AFF_L3:
7520 		PMAP_LOCK(pmap);
7521 		ptep = pmap_pte(pmap, far, &lvl);
7522 		if (ptep != NULL) {
7523 			pmap_set_bits(ptep, ATTR_AF);
7524 			rv = KERN_SUCCESS;
7525 			/*
7526 			 * XXXMJ as an optimization we could mark the entry
7527 			 * dirty if this is a write fault.
7528 			 */
7529 		}
7530 		PMAP_UNLOCK(pmap);
7531 		break;
7532 	case ISS_DATA_DFSC_PF_L1:
7533 	case ISS_DATA_DFSC_PF_L2:
7534 	case ISS_DATA_DFSC_PF_L3:
7535 		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
7536 		    (esr & ISS_DATA_WnR) == 0)
7537 			return (rv);
7538 		PMAP_LOCK(pmap);
7539 		ptep = pmap_pte(pmap, far, &lvl);
7540 		if (ptep != NULL &&
7541 		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
7542 			if ((pte & ATTR_S1_AP_RW_BIT) ==
7543 			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
7544 				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
7545 				pmap_s1_invalidate_page(pmap, far, true);
7546 			}
7547 			rv = KERN_SUCCESS;
7548 		}
7549 		PMAP_UNLOCK(pmap);
7550 		break;
7551 	case ISS_DATA_DFSC_TF_L0:
7552 	case ISS_DATA_DFSC_TF_L1:
7553 	case ISS_DATA_DFSC_TF_L2:
7554 	case ISS_DATA_DFSC_TF_L3:
7555 		/*
7556 		 * Retry the translation.  A break-before-make sequence can
7557 		 * produce a transient fault.
7558 		 */
7559 		if (pmap == kernel_pmap) {
7560 			/*
7561 			 * The translation fault may have occurred within a
7562 			 * critical section.  Therefore, we must check the
7563 			 * address without acquiring the kernel pmap's lock.
7564 			 */
7565 			if (pmap_klookup(far, NULL))
7566 				rv = KERN_SUCCESS;
7567 		} else {
7568 			PMAP_LOCK(pmap);
7569 			/* Ask the MMU to check the address. */
7570 			intr = intr_disable();
7571 			par = arm64_address_translate_s1e0r(far);
7572 			intr_restore(intr);
7573 			PMAP_UNLOCK(pmap);
7574 
7575 			/*
7576 			 * If the translation was successful, then we can
7577 			 * return success to the trap handler.
7578 			 */
7579 			if (PAR_SUCCESS(par))
7580 				rv = KERN_SUCCESS;
7581 		}
7582 		break;
7583 	}
7584 
7585 	return (rv);
7586 }
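/*
 * Two classes of stage 1 faults are resolved above without calling into
 * vm_fault():
 *
 *  - Access faults only need ATTR_AF set, since the referenced state is
 *    tracked lazily.
 *
 *  - Permission faults on a write to a page whose PTE carries ATTR_SW_DBM
 *    are the software dirty-bit emulation: such a mapping is deliberately
 *    left read-only so that the first store traps, and the handler upgrades
 *    it to read/write here, which pmap_pte_dirty() later interprets as the
 *    page being dirty.
 *
 * Transient translation faults caused by a concurrent break-before-make
 * sequence are retried by probing the address: pmap_klookup() without the
 * pmap lock for the kernel pmap (the fault may have occurred in a critical
 * section), or an AT S1E0R translation under the lock for user pmaps.
 */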
7587 
7588 /*
7589  *	Increase the starting virtual address of the given mapping if a
7590  *	different alignment might result in more superpage mappings.
7591  */
7592 void
7593 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
7594     vm_offset_t *addr, vm_size_t size)
7595 {
7596 	vm_offset_t superpage_offset;
7597 
7598 	if (size < L2_SIZE)
7599 		return;
7600 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
7601 		offset += ptoa(object->pg_color);
7602 	superpage_offset = offset & L2_OFFSET;
7603 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
7604 	    (*addr & L2_OFFSET) == superpage_offset)
7605 		return;
7606 	if ((*addr & L2_OFFSET) < superpage_offset)
7607 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
7608 	else
7609 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
7610 }
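/*
 * Worked example (hypothetical values): with offset = 0x1234000 the desired
 * alignment is superpage_offset = 0x1234000 & L2_OFFSET = 0x34000.  A hint
 * of *addr = 0x40010000 has (*addr & L2_OFFSET) = 0x10000, which is below
 * the desired value, so the hint becomes
 *
 *	(*addr & ~L2_OFFSET) + superpage_offset = 0x40000000 + 0x34000
 *						= 0x40034000
 *
 * leaving the virtual address and the object offset congruent modulo
 * L2_SIZE, which is what later 2MB superpage promotion requires.
 */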
7611 
7612 /**
7613  * Get the kernel virtual address of a set of physical pages. If there are
7614  * physical addresses not covered by the DMAP, perform a transient mapping
7615  * that will be removed when calling pmap_unmap_io_transient.
7616  *
7617  * \param page        The pages for which the caller wishes to obtain
7618  *                    kernel virtual addresses.
7619  * \param vaddr       On return contains the kernel virtual memory address
7620  *                    of the pages passed in the page parameter.
7621  * \param count       Number of pages passed in.
7622  * \param can_fault   TRUE if the thread using the mapped pages can take
7623  *                    page faults, FALSE otherwise.
7624  *
7625  * \returns TRUE if the caller must call pmap_unmap_io_transient when
7626  *          finished or FALSE otherwise.
7627  *
7628  */
7629 boolean_t
7630 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7631     boolean_t can_fault)
7632 {
7633 	vm_paddr_t paddr;
7634 	boolean_t needs_mapping;
7635 	int error __diagused, i;
7636 
7637 	/*
7638 	 * Allocate any KVA space that we need, this is done in a separate
7639 	 * loop to prevent calling vmem_alloc while pinned.
7640 	 */
7641 	needs_mapping = FALSE;
7642 	for (i = 0; i < count; i++) {
7643 		paddr = VM_PAGE_TO_PHYS(page[i]);
7644 		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
7645 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
7646 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
7647 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
7648 			needs_mapping = TRUE;
7649 		} else {
7650 			vaddr[i] = PHYS_TO_DMAP(paddr);
7651 		}
7652 	}
7653 
7654 	/* Exit early if everything is covered by the DMAP */
7655 	if (!needs_mapping)
7656 		return (FALSE);
7657 
7658 	if (!can_fault)
7659 		sched_pin();
7660 	for (i = 0; i < count; i++) {
7661 		paddr = VM_PAGE_TO_PHYS(page[i]);
7662 		if (!PHYS_IN_DMAP(paddr)) {
7663 			panic(
7664 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
7665 		}
7666 	}
7667 
7668 	return (needs_mapping);
7669 }
7670 
7671 void
7672 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
7673     boolean_t can_fault)
7674 {
7675 	vm_paddr_t paddr;
7676 	int i;
7677 
7678 	if (!can_fault)
7679 		sched_unpin();
7680 	for (i = 0; i < count; i++) {
7681 		paddr = VM_PAGE_TO_PHYS(page[i]);
7682 		if (!PHYS_IN_DMAP(paddr)) {
7683 			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
7684 		}
7685 	}
7686 }
7687 
7688 boolean_t
7689 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
7690 {
7691 
7692 	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
7693 }
7694 
7695 #if defined(KASAN)
7696 static vm_paddr_t	pmap_san_early_kernstart;
7697 static pd_entry_t	*pmap_san_early_l2;
7698 
7699 void __nosanitizeaddress
7700 pmap_san_bootstrap(struct arm64_bootparams *abp)
7701 {
7702 
7703 	pmap_san_early_kernstart = KERNBASE - abp->kern_delta;
7704 	kasan_init_early(abp->kern_stack, KSTACK_PAGES * PAGE_SIZE);
7705 }
7706 
7707 #define	SAN_BOOTSTRAP_L2_SIZE	(1 * L2_SIZE)
7708 #define	SAN_BOOTSTRAP_SIZE	(2 * PAGE_SIZE)
7709 static vm_offset_t __nosanitizeaddress
7710 pmap_san_enter_bootstrap_alloc_l2(void)
7711 {
7712 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
7713 	static size_t offset = 0;
7714 	vm_offset_t addr;
7715 
7716 	if (offset + L2_SIZE > sizeof(bootstrap_data)) {
7717 		panic("%s: out of memory for the bootstrap shadow map L2 entries",
7718 		    __func__);
7719 	}
7720 
7721 	addr = (uintptr_t)&bootstrap_data[offset];
7722 	offset += L2_SIZE;
7723 	return (addr);
7724 }
7725 
7726 /*
7727  * SAN L1 + L2 pages, maybe L3 entries later?
7728  */
7729 static vm_offset_t __nosanitizeaddress
7730 pmap_san_enter_bootstrap_alloc_pages(int npages)
7731 {
7732 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
7733 	static size_t offset = 0;
7734 	vm_offset_t addr;
7735 
7736 	if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
7737 		panic("%s: out of memory for the bootstrap shadow map",
7738 		    __func__);
7739 	}
7740 
7741 	addr = (uintptr_t)&bootstrap_data[offset];
7742 	offset += (npages * PAGE_SIZE);
7743 	return (addr);
7744 }
7745 
7746 static void __nosanitizeaddress
7747 pmap_san_enter_bootstrap(void)
7748 {
7749 	vm_offset_t freemempos;
7750 
7751 	/* L1, L2 */
7752 	freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
7753 	bs_state.freemempos = freemempos;
7754 	bs_state.va = KASAN_MIN_ADDRESS;
7755 	pmap_bootstrap_l1_table(&bs_state);
7756 	pmap_san_early_l2 = bs_state.l2;
7757 }
7758 
7759 static vm_page_t
7760 pmap_san_enter_alloc_l3(void)
7761 {
7762 	vm_page_t m;
7763 
7764 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
7765 	    VM_ALLOC_ZERO);
7766 	if (m == NULL)
7767 		panic("%s: no memory to grow shadow map", __func__);
7768 	return (m);
7769 }
7770 
7771 static vm_page_t
7772 pmap_san_enter_alloc_l2(void)
7773 {
7774 	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
7775 	    Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
7776 }
7777 
7778 void __nosanitizeaddress
7779 pmap_san_enter(vm_offset_t va)
7780 {
7781 	pd_entry_t *l1, *l2;
7782 	pt_entry_t *l3;
7783 	vm_page_t m;
7784 
7785 	if (virtual_avail == 0) {
7786 		vm_offset_t block;
7787 		int slot;
7788 		bool first;
7789 
7790 		/* Temporary shadow map prior to pmap_bootstrap(). */
7791 		first = pmap_san_early_l2 == NULL;
7792 		if (first)
7793 			pmap_san_enter_bootstrap();
7794 
7795 		l2 = pmap_san_early_l2;
7796 		slot = pmap_l2_index(va);
7797 
7798 		if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
7799 			MPASS(first);
7800 			block = pmap_san_enter_bootstrap_alloc_l2();
7801 			pmap_store(&l2[slot], pmap_early_vtophys(block) |
7802 			    PMAP_SAN_PTE_BITS | L2_BLOCK);
7803 			dmb(ishst);
7804 		}
7805 
7806 		return;
7807 	}
7808 
7809 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
7810 	l1 = pmap_l1(kernel_pmap, va);
7811 	MPASS(l1 != NULL);
7812 	if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
7813 		m = pmap_san_enter_alloc_l3();
7814 		pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
7815 	}
7816 	l2 = pmap_l1_to_l2(l1, va);
7817 	if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
7818 		m = pmap_san_enter_alloc_l2();
7819 		if (m != NULL) {
7820 			pmap_store(l2, VM_PAGE_TO_PHYS(m) | PMAP_SAN_PTE_BITS |
7821 			    L2_BLOCK);
7822 		} else {
7823 			m = pmap_san_enter_alloc_l3();
7824 			pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
7825 		}
7826 		dmb(ishst);
7827 	}
7828 	if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
7829 		return;
7830 	l3 = pmap_l2_to_l3(l2, va);
7831 	if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
7832 		return;
7833 	m = pmap_san_enter_alloc_l3();
7834 	pmap_store(l3, VM_PAGE_TO_PHYS(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
7835 	dmb(ishst);
7836 }
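/*
 * pmap_san_enter() thus has two regimes: before pmap_bootstrap() has run
 * (virtual_avail == 0) the shadow is backed by the static bootstrap arrays
 * above and mapped with 2MB blocks, while afterwards it grows using wired
 * pages from the VM system, preferring an L2 block when a physically
 * contiguous 2MB-aligned run can be allocated and falling back to 4KB L3
 * pages otherwise.
 */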
7837 #endif /* KASAN */
7838 
7839 /*
7840  * Track a range of the kernel's virtual address space that is contiguous
7841  * in various mapping attributes.
7842  */
7843 struct pmap_kernel_map_range {
7844 	vm_offset_t sva;
7845 	pt_entry_t attrs;
7846 	int l3pages;
7847 	int l3contig;
7848 	int l2blocks;
7849 	int l1blocks;
7850 };
7851 
7852 static void
7853 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
7854     vm_offset_t eva)
7855 {
7856 	const char *mode;
7857 	int index;
7858 
7859 	if (eva <= range->sva)
7860 		return;
7861 
7862 	index = range->attrs & ATTR_S1_IDX_MASK;
7863 	switch (index) {
7864 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
7865 		mode = "DEV-NP";
7866 		break;
7867 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
7868 		mode = "DEV";
7869 		break;
7870 	case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
7871 		mode = "UC";
7872 		break;
7873 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
7874 		mode = "WB";
7875 		break;
7876 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
7877 		mode = "WT";
7878 		break;
7879 	default:
7880 		printf(
7881 		    "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
7882 		    __func__, index, range->sva, eva);
7883 		mode = "??";
7884 		break;
7885 	}
7886 
7887 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %6s %d %d %d %d\n",
7888 	    range->sva, eva,
7889 	    (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
7890 	    (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
7891 	    (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
7892 	    (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
7893 	    mode, range->l1blocks, range->l2blocks, range->l3contig,
7894 	    range->l3pages);
7895 
7896 	/* Reset to sentinel value. */
7897 	range->sva = 0xfffffffffffffffful;
7898 }
7899 
7900 /*
7901  * Determine whether the attributes specified by a page table entry match those
7902  * being tracked by the current range.
7903  */
7904 static bool
7905 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
7906 {
7907 
7908 	return (range->attrs == attrs);
7909 }
7910 
7911 static void
7912 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
7913     pt_entry_t attrs)
7914 {
7915 
7916 	memset(range, 0, sizeof(*range));
7917 	range->sva = va;
7918 	range->attrs = attrs;
7919 }
7920 
7921 /* Get the block/page attributes that correspond to the table attributes */
7922 static pt_entry_t
7923 sysctl_kmaps_table_attrs(pd_entry_t table)
7924 {
7925 	pt_entry_t attrs;
7926 
7927 	attrs = 0;
7928 	if ((table & TATTR_UXN_TABLE) != 0)
7929 		attrs |= ATTR_S1_UXN;
7930 	if ((table & TATTR_PXN_TABLE) != 0)
7931 		attrs |= ATTR_S1_PXN;
7932 	if ((table & TATTR_AP_TABLE_RO) != 0)
7933 		attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
7934 
7935 	return (attrs);
7936 }
7937 
7938 /* Read the block/page attributes we care about */
7939 static pt_entry_t
7940 sysctl_kmaps_block_attrs(pt_entry_t block)
7941 {
7942 	return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK));
7943 }
7944 
7945 /*
7946  * Given a leaf PTE, derive the mapping's attributes.  If they do not match
7947  * those of the current run, dump the address range and its attributes, and
7948  * begin a new run.
7949  */
7950 static void
7951 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
7952     vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
7953     pt_entry_t l3e)
7954 {
7955 	pt_entry_t attrs;
7956 
7957 	attrs = sysctl_kmaps_table_attrs(l0e);
7958 
7959 	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7960 		attrs |= sysctl_kmaps_block_attrs(l1e);
7961 		goto done;
7962 	}
7963 	attrs |= sysctl_kmaps_table_attrs(l1e);
7964 
7965 	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7966 		attrs |= sysctl_kmaps_block_attrs(l2e);
7967 		goto done;
7968 	}
7969 	attrs |= sysctl_kmaps_table_attrs(l2e);
7970 	attrs |= sysctl_kmaps_block_attrs(l3e);
7971 
7972 done:
7973 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
7974 		sysctl_kmaps_dump(sb, range, va);
7975 		sysctl_kmaps_reinit(range, va, attrs);
7976 	}
7977 }
7978 
7979 static int
7980 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
7981 {
7982 	struct pmap_kernel_map_range range;
7983 	struct sbuf sbuf, *sb;
7984 	pd_entry_t l0e, *l1, l1e, *l2, l2e;
7985 	pt_entry_t *l3, l3e;
7986 	vm_offset_t sva;
7987 	vm_paddr_t pa;
7988 	int error, i, j, k, l;
7989 
7990 	error = sysctl_wire_old_buffer(req, 0);
7991 	if (error != 0)
7992 		return (error);
7993 	sb = &sbuf;
7994 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
7995 
7996 	/* Sentinel value. */
7997 	range.sva = 0xfffffffffffffffful;
7998 
7999 	/*
8000 	 * Iterate over the kernel page tables without holding the kernel pmap
8001 	 * lock.  Kernel page table pages are never freed, so at worst we will
8002 	 * observe inconsistencies in the output.
8003 	 */
8004 	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
8005 	    i++) {
8006 		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
8007 			sbuf_printf(sb, "\nDirect map:\n");
8008 		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
8009 			sbuf_printf(sb, "\nKernel map:\n");
8010 #ifdef KASAN
8011 		else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
8012 			sbuf_printf(sb, "\nKASAN shadow map:\n");
8013 #endif
8014 
8015 		l0e = kernel_pmap->pm_l0[i];
8016 		if ((l0e & ATTR_DESCR_VALID) == 0) {
8017 			sysctl_kmaps_dump(sb, &range, sva);
8018 			sva += L0_SIZE;
8019 			continue;
8020 		}
8021 		pa = l0e & ~ATTR_MASK;
8022 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
8023 
8024 		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
8025 			l1e = l1[j];
8026 			if ((l1e & ATTR_DESCR_VALID) == 0) {
8027 				sysctl_kmaps_dump(sb, &range, sva);
8028 				sva += L1_SIZE;
8029 				continue;
8030 			}
8031 			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
8032 				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8033 				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
8034 				    0, 0);
8035 				range.l1blocks++;
8036 				sva += L1_SIZE;
8037 				continue;
8038 			}
8039 			pa = l1e & ~ATTR_MASK;
8040 			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
8041 
8042 			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
8043 				l2e = l2[k];
8044 				if ((l2e & ATTR_DESCR_VALID) == 0) {
8045 					sysctl_kmaps_dump(sb, &range, sva);
8046 					sva += L2_SIZE;
8047 					continue;
8048 				}
8049 				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
8050 					sysctl_kmaps_check(sb, &range, sva,
8051 					    l0e, l1e, l2e, 0);
8052 					range.l2blocks++;
8053 					sva += L2_SIZE;
8054 					continue;
8055 				}
8056 				pa = l2e & ~ATTR_MASK;
8057 				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
8058 
8059 				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
8060 				    l++, sva += L3_SIZE) {
8061 					l3e = l3[l];
8062 					if ((l3e & ATTR_DESCR_VALID) == 0) {
8063 						sysctl_kmaps_dump(sb, &range,
8064 						    sva);
8065 						continue;
8066 					}
8067 					sysctl_kmaps_check(sb, &range, sva,
8068 					    l0e, l1e, l2e, l3e);
8069 					if ((l3e & ATTR_CONTIGUOUS) != 0)
8070 						range.l3contig += l % 16 == 0 ?
8071 						    1 : 0;
8072 					else
8073 						range.l3pages++;
8074 				}
8075 			}
8076 		}
8077 	}
8078 
8079 	error = sbuf_finish(sb);
8080 	sbuf_delete(sb);
8081 	return (error);
8082 }
8083 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
8084     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
8085     NULL, 0, sysctl_kmaps, "A",
8086     "Dump kernel address layout");
8087