xref: /freebsd/sys/arm64/arm64/pmap.c (revision c03c5b1c)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2003 Peter Wemm
9  * All rights reserved.
10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11  * All rights reserved.
12  * Copyright (c) 2014 Andrew Turner
13  * All rights reserved.
14  * Copyright (c) 2014-2016 The FreeBSD Foundation
15  * All rights reserved.
16  *
17  * This code is derived from software contributed to Berkeley by
18  * the Systems Programming Group of the University of Utah Computer
19  * Science Department and William Jolitz of UUNET Technologies Inc.
20  *
21  * This software was developed by Andrew Turner under sponsorship from
22  * the FreeBSD Foundation.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  * 3. All advertising materials mentioning features or use of this software
33  *    must display the following acknowledgement:
34  *	This product includes software developed by the University of
35  *	California, Berkeley and its contributors.
36  * 4. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  *
52  *	from:	@(#)pmap.c	7.7 (Berkeley)	5/12/91
53  */
54 /*-
55  * Copyright (c) 2003 Networks Associates Technology, Inc.
56  * All rights reserved.
57  *
58  * This software was developed for the FreeBSD Project by Jake Burkholder,
59  * Safeport Network Services, and Network Associates Laboratories, the
60  * Security Research Division of Network Associates, Inc. under
61  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
62  * CHATS research program.
63  *
64  * Redistribution and use in source and binary forms, with or without
65  * modification, are permitted provided that the following conditions
66  * are met:
67  * 1. Redistributions of source code must retain the above copyright
68  *    notice, this list of conditions and the following disclaimer.
69  * 2. Redistributions in binary form must reproduce the above copyright
70  *    notice, this list of conditions and the following disclaimer in the
71  *    documentation and/or other materials provided with the distribution.
72  *
73  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
74  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
75  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
76  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
77  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
78  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
79  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
80  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
81  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
82  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
83  * SUCH DAMAGE.
84  */
85 
86 #include <sys/cdefs.h>
87 __FBSDID("$FreeBSD$");
88 
89 /*
90  *	Manages physical address maps.
91  *
92  *	Since the information managed by this module is
93  *	also stored by the logical address mapping module,
94  *	this module may throw away valid virtual-to-physical
95  *	mappings at almost any time.  However, invalidations
96  *	of virtual-to-physical mappings must be done as
97  *	requested.
98  *
99  *	In order to cope with hardware architectures which
100  *	make virtual-to-physical map invalidates expensive,
 101  *	this module may delay invalidation or reduced-protection
102  *	operations until such time as they are actually
103  *	necessary.  This module is given full information as
104  *	to which processors are currently using which maps,
105  *	and to when physical maps must be made correct.
106  */
107 
108 #include "opt_vm.h"
109 
110 #include <sys/param.h>
111 #include <sys/bitstring.h>
112 #include <sys/bus.h>
113 #include <sys/systm.h>
114 #include <sys/kernel.h>
115 #include <sys/ktr.h>
116 #include <sys/limits.h>
117 #include <sys/lock.h>
118 #include <sys/malloc.h>
119 #include <sys/mman.h>
120 #include <sys/msgbuf.h>
121 #include <sys/mutex.h>
122 #include <sys/physmem.h>
123 #include <sys/proc.h>
124 #include <sys/rwlock.h>
125 #include <sys/sbuf.h>
126 #include <sys/sx.h>
127 #include <sys/vmem.h>
128 #include <sys/vmmeter.h>
129 #include <sys/sched.h>
130 #include <sys/sysctl.h>
131 #include <sys/_unrhdr.h>
132 #include <sys/smp.h>
133 
134 #include <vm/vm.h>
135 #include <vm/vm_param.h>
136 #include <vm/vm_kern.h>
137 #include <vm/vm_page.h>
138 #include <vm/vm_map.h>
139 #include <vm/vm_object.h>
140 #include <vm/vm_extern.h>
141 #include <vm/vm_pageout.h>
142 #include <vm/vm_pager.h>
143 #include <vm/vm_phys.h>
144 #include <vm/vm_radix.h>
145 #include <vm/vm_reserv.h>
146 #include <vm/vm_dumpset.h>
147 #include <vm/uma.h>
148 
149 #include <machine/machdep.h>
150 #include <machine/md_var.h>
151 #include <machine/pcb.h>
152 
153 #define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
154 #define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)
155 
156 #define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
157 #define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
158 #define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
159 #define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
160 
161 #define	NUL0E		L0_ENTRIES
162 #define	NUL1E		(NUL0E * NL1PG)
163 #define	NUL2E		(NUL1E * NL2PG)
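/*
 * With the 4 KiB granule and 48-bit virtual addresses used here, every
 * level holds 512 entries, so NUL0E is 512, NUL1E is 512 * 512 = 262144,
 * and NUL2E is 512 * 512 * 512 = 134217728.  These totals size the page
 * table page pindex space defined below.
 */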
164 
165 #if !defined(DIAGNOSTIC)
166 #ifdef __GNUC_GNU_INLINE__
167 #define PMAP_INLINE	__attribute__((__gnu_inline__)) inline
168 #else
169 #define PMAP_INLINE	extern inline
170 #endif
171 #else
172 #define PMAP_INLINE
173 #endif
174 
175 #ifdef PV_STATS
176 #define PV_STAT(x)	do { x ; } while (0)
177 #else
178 #define PV_STAT(x)	do { } while (0)
179 #endif
180 
181 #define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
182 #define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
183 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
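/*
 * These macros partition the page table page pindex space: pindices
 * [0, NUL2E) name L3 table pages, [NUL2E, NUL2E + NUL1E) name L2 table
 * pages, and [NUL2E + NUL1E, NUL2E + NUL1E + NUL0E) name L1 table pages.
 * _pmap_unwire_l3() below relies on this layout to determine the level of
 * a page table page from its pindex alone.
 */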
184 
185 static struct md_page *
186 pa_to_pvh(vm_paddr_t pa)
187 {
188 	struct vm_phys_seg *seg;
189 	int segind;
190 
191 	for (segind = 0; segind < vm_phys_nsegs; segind++) {
192 		seg = &vm_phys_segs[segind];
193 		if (pa >= seg->start && pa < seg->end)
194 			return ((struct md_page *)seg->md_first +
195 			    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
196 	}
197 	panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
198 }
199 
200 static struct md_page *
201 page_to_pvh(vm_page_t m)
202 {
203 	struct vm_phys_seg *seg;
204 
205 	seg = &vm_phys_segs[m->segind];
206 	return ((struct md_page *)seg->md_first +
207 	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
208 }
209 
210 #define	NPV_LIST_LOCKS	MAXCPU
211 
212 #define	PHYS_TO_PV_LIST_LOCK(pa)	\
213 			(&pv_list_locks[pa_index(pa) % NPV_LIST_LOCKS])
214 
215 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)	do {	\
216 	struct rwlock **_lockp = (lockp);		\
217 	struct rwlock *_new_lock;			\
218 							\
219 	_new_lock = PHYS_TO_PV_LIST_LOCK(pa);		\
220 	if (_new_lock != *_lockp) {			\
221 		if (*_lockp != NULL)			\
222 			rw_wunlock(*_lockp);		\
223 		*_lockp = _new_lock;			\
224 		rw_wlock(*_lockp);			\
225 	}						\
226 } while (0)
227 
228 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
229 			CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, VM_PAGE_TO_PHYS(m))
230 
231 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
232 	struct rwlock **_lockp = (lockp);		\
233 							\
234 	if (*_lockp != NULL) {				\
235 		rw_wunlock(*_lockp);			\
236 		*_lockp = NULL;				\
237 	}						\
238 } while (0)
239 
240 #define	VM_PAGE_TO_PV_LIST_LOCK(m)	\
241 			PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m))
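/*
 * A typical caller declares "struct rwlock *lock = NULL;", calls
 * CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m) before touching a page's pv
 * list (dropping any previously held pv list lock and write-locking the
 * one covering the new page), and ends with RELEASE_PV_LIST_LOCK(&lock),
 * so at most one of the NPV_LIST_LOCKS locks is held at a time.
 */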
242 
243 /*
244  * The presence of this flag indicates that the mapping is writeable.
 245  * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean; otherwise,
246  * it is dirty.  This flag may only be set on managed mappings.
247  *
248  * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
249  * as a software managed bit.
250  */
251 #define	ATTR_SW_DBM	ATTR_DBM
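/*
 * Combined with the stage 1 access permissions, this yields three states
 * for a managed mapping: AP == RW with ATTR_SW_DBM set is writeable and
 * dirty, AP == RO with ATTR_SW_DBM set is writeable but currently clean,
 * and AP == RO without ATTR_SW_DBM is read-only.  pmap_pte_dirty() below
 * applies the same rule.
 */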
252 
253 struct pmap kernel_pmap_store;
254 
255 /* Used for mapping ACPI memory before VM is initialized */
256 #define	PMAP_PREINIT_MAPPING_COUNT	32
257 #define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
258 static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
259 static int vm_initialized = 0;		/* No need to use pre-init maps when set */
260 
261 /*
 262  * Reserve a few L2 blocks starting at the 'preinit_map_va' address.
 263  * Always map an entire L2 block for simplicity.
264  * VA of L2 block = preinit_map_va + i * L2_SIZE
265  */
266 static struct pmap_preinit_mapping {
267 	vm_paddr_t	pa;
268 	vm_offset_t	va;
269 	vm_size_t	size;
270 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
271 
272 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
273 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
274 vm_offset_t kernel_vm_end = 0;
275 
276 /*
277  * Data for the pv entry allocation mechanism.
278  */
279 static TAILQ_HEAD(pch, pv_chunk) pv_chunks = TAILQ_HEAD_INITIALIZER(pv_chunks);
280 static struct mtx pv_chunks_mutex;
281 static struct rwlock pv_list_locks[NPV_LIST_LOCKS];
282 static struct md_page *pv_table;
283 static struct md_page pv_dummy;
284 
285 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
286 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
287 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
288 
289 /* This code assumes all L1 DMAP entries will be used */
290 CTASSERT((DMAP_MIN_ADDRESS  & ~L0_OFFSET) == DMAP_MIN_ADDRESS);
291 CTASSERT((DMAP_MAX_ADDRESS  & ~L0_OFFSET) == DMAP_MAX_ADDRESS);
292 
293 #define	DMAP_TABLES	((DMAP_MAX_ADDRESS - DMAP_MIN_ADDRESS) >> L0_SHIFT)
294 extern pt_entry_t pagetable_dmap[];
295 
296 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
297 static vm_paddr_t physmap[PHYSMAP_SIZE];
298 static u_int physmap_idx;
299 
300 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
301     "VM/pmap parameters");
302 
303 /*
304  * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
 305  * it has currently allocated to a pmap, a cursor ("asid_next") to
306  * optimize its search for a free ASID in the bit vector, and an epoch number
307  * ("asid_epoch") to indicate when it has reclaimed all previously allocated
308  * ASIDs that are not currently active on a processor.
309  *
310  * The current epoch number is always in the range [0, INT_MAX).  Negative
311  * numbers and INT_MAX are reserved for special cases that are described
312  * below.
313  */
314 struct asid_set {
315 	int asid_bits;
316 	bitstr_t *asid_set;
317 	int asid_set_size;
318 	int asid_next;
319 	int asid_epoch;
320 	struct mtx asid_set_mutex;
321 };
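/*
 * In outline, ASIDs are handed out from the bit vector until it fills up;
 * the allocator then advances asid_epoch and clears the vector, and any
 * pmap whose cookie records an older epoch must obtain a fresh ASID the
 * next time it is activated.
 */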
322 
323 static struct asid_set asids;
324 static struct asid_set vmids;
325 
326 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
327     "ASID allocator");
328 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
329     "The number of bits in an ASID");
330 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
331     "The last allocated ASID plus one");
332 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
333     "The current epoch number");
334 
335 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
336 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
 337     "The number of bits in a VMID");
338 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
339     "The last allocated VMID plus one");
340 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
341     "The current epoch number");
342 
343 void (*pmap_clean_stage2_tlbi)(void);
344 void (*pmap_invalidate_vpipt_icache)(void);
345 
346 /*
347  * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
348  * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
349  * dynamically allocated ASIDs have a non-negative epoch number.
350  *
351  * An invalid ASID is represented by -1.
352  *
353  * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
354  * which indicates that an ASID should never be allocated to the pmap, and
355  * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
356  * allocated when the pmap is next activated.
357  */
358 #define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
359 					    ((u_long)(epoch) << 32)))
360 #define	COOKIE_TO_ASID(cookie)		((int)(cookie))
361 #define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
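/*
 * For example, COOKIE_FROM(5, 7) packs ASID 5 into the low 32 bits and
 * epoch 7 into the high 32 bits, yielding 0x0000000700000005;
 * COOKIE_TO_ASID() and COOKIE_TO_EPOCH() then recover 5 and 7.
 */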
362 
363 static int superpages_enabled = 1;
364 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
365     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
366     "Are large page mappings enabled?");
367 
368 /*
369  * Internal flags for pmap_enter()'s helper functions.
370  */
371 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
372 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
373 
374 static void	free_pv_chunk(struct pv_chunk *pc);
375 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
376 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
377 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
378 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
379 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
380 		    vm_offset_t va);
381 
382 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
383 static bool pmap_activate_int(pmap_t pmap);
384 static void pmap_alloc_asid(pmap_t pmap);
385 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
386     vm_prot_t prot, int mode, bool skip_unmapped);
387 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
388 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
389     vm_offset_t va, struct rwlock **lockp);
390 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
391 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
392     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
393 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
394     u_int flags, vm_page_t m, struct rwlock **lockp);
395 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
396     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
397 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
398     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
399 static void pmap_reset_asid_set(pmap_t pmap);
400 static boolean_t pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
401     vm_page_t m, struct rwlock **lockp);
402 
403 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
404 		struct rwlock **lockp);
405 
406 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
407     struct spglist *free);
408 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
409 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
410 
411 /*
412  * These load the old table data and store the new value.
 413  * They need to be atomic because the System MMU may write to the table at
414  * the same time as the CPU.
415  */
416 #define	pmap_clear(table)		atomic_store_64(table, 0)
417 #define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
418 #define	pmap_load(table)		(*table)
419 #define	pmap_load_clear(table)		atomic_swap_64(table, 0)
420 #define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
421 #define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
422 #define	pmap_store(table, entry)	atomic_store_64(table, entry)
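/*
 * For instance, pmap_update_entry() (later in this file) performs a
 * break-before-make update roughly as:
 *
 *	pmap_clear_bits(pte, ATTR_DESCR_VALID);	(break: mark invalid)
 *	pmap_invalidate_range(pmap, va, va + size, false);
 *	pmap_store(pte, newpte);		(make: install new entry)
 *
 * so that neither the CPU nor the System MMU ever observes two
 * conflicting valid entries for the same address.
 */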
423 
424 /********************/
425 /* Inline functions */
426 /********************/
427 
428 static __inline void
429 pagecopy(void *s, void *d)
430 {
431 
432 	memcpy(d, s, PAGE_SIZE);
433 }
434 
435 static __inline pd_entry_t *
436 pmap_l0(pmap_t pmap, vm_offset_t va)
437 {
438 
439 	return (&pmap->pm_l0[pmap_l0_index(va)]);
440 }
441 
442 static __inline pd_entry_t *
443 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
444 {
445 	pd_entry_t *l1;
446 
447 	l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
448 	return (&l1[pmap_l1_index(va)]);
449 }
450 
451 static __inline pd_entry_t *
452 pmap_l1(pmap_t pmap, vm_offset_t va)
453 {
454 	pd_entry_t *l0;
455 
456 	l0 = pmap_l0(pmap, va);
457 	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
458 		return (NULL);
459 
460 	return (pmap_l0_to_l1(l0, va));
461 }
462 
463 static __inline pd_entry_t *
464 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
465 {
466 	pd_entry_t l1, *l2p;
467 
468 	l1 = pmap_load(l1p);
469 
470 	KASSERT(ADDR_IS_CANONICAL(va),
471 	    ("%s: Address not in canonical form: %lx", __func__, va));
472 	/*
473 	 * The valid bit may be clear if pmap_update_entry() is concurrently
474 	 * modifying the entry, so for KVA only the entry type may be checked.
475 	 */
476 	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
477 	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
478 	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
479 	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
480 	l2p = (pd_entry_t *)PHYS_TO_DMAP(l1 & ~ATTR_MASK);
481 	return (&l2p[pmap_l2_index(va)]);
482 }
483 
484 static __inline pd_entry_t *
485 pmap_l2(pmap_t pmap, vm_offset_t va)
486 {
487 	pd_entry_t *l1;
488 
489 	l1 = pmap_l1(pmap, va);
490 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
491 		return (NULL);
492 
493 	return (pmap_l1_to_l2(l1, va));
494 }
495 
496 static __inline pt_entry_t *
497 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
498 {
499 	pd_entry_t l2;
500 	pt_entry_t *l3p;
501 
502 	l2 = pmap_load(l2p);
503 
504 	KASSERT(ADDR_IS_CANONICAL(va),
505 	    ("%s: Address not in canonical form: %lx", __func__, va));
506 	/*
507 	 * The valid bit may be clear if pmap_update_entry() is concurrently
508 	 * modifying the entry, so for KVA only the entry type may be checked.
509 	 */
510 	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
511 	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
512 	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
513 	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
514 	l3p = (pt_entry_t *)PHYS_TO_DMAP(l2 & ~ATTR_MASK);
515 	return (&l3p[pmap_l3_index(va)]);
516 }
517 
518 /*
519  * Returns the lowest valid pde for a given virtual address.
520  * The next level may or may not point to a valid page or block.
521  */
522 static __inline pd_entry_t *
523 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
524 {
525 	pd_entry_t *l0, *l1, *l2, desc;
526 
527 	l0 = pmap_l0(pmap, va);
528 	desc = pmap_load(l0) & ATTR_DESCR_MASK;
529 	if (desc != L0_TABLE) {
530 		*level = -1;
531 		return (NULL);
532 	}
533 
534 	l1 = pmap_l0_to_l1(l0, va);
535 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
536 	if (desc != L1_TABLE) {
537 		*level = 0;
538 		return (l0);
539 	}
540 
541 	l2 = pmap_l1_to_l2(l1, va);
542 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
543 	if (desc != L2_TABLE) {
544 		*level = 1;
545 		return (l1);
546 	}
547 
548 	*level = 2;
549 	return (l2);
550 }
551 
552 /*
553  * Returns the lowest valid pte block or table entry for a given virtual
 554  * address.  If there is no valid entry, NULL is returned and the level is
 555  * set to the first invalid level.
556  */
557 static __inline pt_entry_t *
558 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
559 {
560 	pd_entry_t *l1, *l2, desc;
561 	pt_entry_t *l3;
562 
563 	l1 = pmap_l1(pmap, va);
564 	if (l1 == NULL) {
565 		*level = 0;
566 		return (NULL);
567 	}
568 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
569 	if (desc == L1_BLOCK) {
570 		*level = 1;
571 		return (l1);
572 	}
573 
574 	if (desc != L1_TABLE) {
575 		*level = 1;
576 		return (NULL);
577 	}
578 
579 	l2 = pmap_l1_to_l2(l1, va);
580 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
581 	if (desc == L2_BLOCK) {
582 		*level = 2;
583 		return (l2);
584 	}
585 
586 	if (desc != L2_TABLE) {
587 		*level = 2;
588 		return (NULL);
589 	}
590 
591 	*level = 3;
592 	l3 = pmap_l2_to_l3(l2, va);
593 	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
594 		return (NULL);
595 
596 	return (l3);
597 }
598 
599 /*
600  * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
601  * level that maps the specified virtual address, then a pointer to that entry
602  * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
603  * and a diagnostic message is provided, in which case this function panics.
604  */
605 static __always_inline pt_entry_t *
606 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
607 {
608 	pd_entry_t *l0p, *l1p, *l2p;
609 	pt_entry_t desc, *l3p;
610 	int walk_level __diagused;
611 
612 	KASSERT(level >= 0 && level < 4,
613 	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
614 	    level));
615 	l0p = pmap_l0(pmap, va);
616 	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
617 	if (desc == L0_TABLE && level > 0) {
618 		l1p = pmap_l0_to_l1(l0p, va);
619 		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
620 		if (desc == L1_BLOCK && level == 1)
621 			return (l1p);
622 		else if (desc == L1_TABLE && level > 1) {
623 			l2p = pmap_l1_to_l2(l1p, va);
624 			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
625 			if (desc == L2_BLOCK && level == 2)
626 				return (l2p);
627 			else if (desc == L2_TABLE && level > 2) {
628 				l3p = pmap_l2_to_l3(l2p, va);
629 				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
630 				if (desc == L3_PAGE && level == 3)
631 					return (l3p);
632 				else
633 					walk_level = 3;
634 			} else
635 				walk_level = 2;
636 		} else
637 			walk_level = 1;
638 	} else
639 		walk_level = 0;
640 	KASSERT(diag == NULL,
641 	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
642 	    diag, va, level, desc, walk_level));
643 	return (NULL);
644 }
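/*
 * For example, pmap_kremove() below calls pmap_pte_exists(kernel_pmap, va,
 * 3, __func__), so a missing L3 mapping panics under INVARIANTS, whereas
 * pmap_qremove() passes a NULL diag and simply skips unmapped addresses.
 */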
645 
646 bool
647 pmap_ps_enabled(pmap_t pmap __unused)
648 {
649 
650 	return (superpages_enabled != 0);
651 }
652 
653 bool
654 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
655     pd_entry_t **l2, pt_entry_t **l3)
656 {
657 	pd_entry_t *l0p, *l1p, *l2p;
658 
659 	if (pmap->pm_l0 == NULL)
660 		return (false);
661 
662 	l0p = pmap_l0(pmap, va);
663 	*l0 = l0p;
664 
665 	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
666 		return (false);
667 
668 	l1p = pmap_l0_to_l1(l0p, va);
669 	*l1 = l1p;
670 
671 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
672 		*l2 = NULL;
673 		*l3 = NULL;
674 		return (true);
675 	}
676 
677 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
678 		return (false);
679 
680 	l2p = pmap_l1_to_l2(l1p, va);
681 	*l2 = l2p;
682 
683 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
684 		*l3 = NULL;
685 		return (true);
686 	}
687 
688 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
689 		return (false);
690 
691 	*l3 = pmap_l2_to_l3(l2p, va);
692 
693 	return (true);
694 }
695 
696 static __inline int
697 pmap_l3_valid(pt_entry_t l3)
698 {
699 
700 	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
701 }
702 
703 CTASSERT(L1_BLOCK == L2_BLOCK);
704 
705 static pt_entry_t
706 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
707 {
708 	pt_entry_t val;
709 
710 	if (pmap->pm_stage == PM_STAGE1) {
711 		val = ATTR_S1_IDX(memattr);
712 		if (memattr == VM_MEMATTR_DEVICE)
713 			val |= ATTR_S1_XN;
714 		return (val);
715 	}
716 
717 	val = 0;
718 
719 	switch (memattr) {
720 	case VM_MEMATTR_DEVICE:
721 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
722 		    ATTR_S2_XN(ATTR_S2_XN_ALL));
723 	case VM_MEMATTR_UNCACHEABLE:
724 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
725 	case VM_MEMATTR_WRITE_BACK:
726 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
727 	case VM_MEMATTR_WRITE_THROUGH:
728 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
729 	default:
730 		panic("%s: invalid memory attribute %x", __func__, memattr);
731 	}
732 }
733 
734 static pt_entry_t
735 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
736 {
737 	pt_entry_t val;
738 
739 	val = 0;
740 	if (pmap->pm_stage == PM_STAGE1) {
741 		if ((prot & VM_PROT_EXECUTE) == 0)
742 			val |= ATTR_S1_XN;
743 		if ((prot & VM_PROT_WRITE) == 0)
744 			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
745 	} else {
746 		if ((prot & VM_PROT_WRITE) != 0)
747 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
748 		if ((prot & VM_PROT_READ) != 0)
749 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
750 		if ((prot & VM_PROT_EXECUTE) == 0)
751 			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
752 	}
753 
754 	return (val);
755 }
756 
757 /*
758  * Checks if the PTE is dirty.
759  */
760 static inline int
761 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
762 {
763 
764 	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
765 
766 	if (pmap->pm_stage == PM_STAGE1) {
767 		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
768 		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
769 
770 		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
771 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
772 	}
773 
774 	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
775 	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
776 }
777 
778 static __inline void
779 pmap_resident_count_inc(pmap_t pmap, int count)
780 {
781 
782 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
783 	pmap->pm_stats.resident_count += count;
784 }
785 
786 static __inline void
787 pmap_resident_count_dec(pmap_t pmap, int count)
788 {
789 
790 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
791 	KASSERT(pmap->pm_stats.resident_count >= count,
792 	    ("pmap %p resident count underflow %ld %d", pmap,
793 	    pmap->pm_stats.resident_count, count));
794 	pmap->pm_stats.resident_count -= count;
795 }
796 
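/*
 * Translate a KVA to a physical address early in boot.  This relies on the
 * AT S1E1R address translation instruction rather than walking the page
 * tables, so it works before the direct map exists; the l1pt argument is
 * currently unused.
 */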
797 static vm_paddr_t
798 pmap_early_vtophys(vm_offset_t l1pt, vm_offset_t va)
799 {
800 	vm_paddr_t pa_page;
801 
802 	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
803 	return (pa_page | (va & PAR_LOW_MASK));
804 }
805 
806 static vm_offset_t
807 pmap_bootstrap_dmap(vm_offset_t kern_l1, vm_paddr_t min_pa,
808     vm_offset_t freemempos)
809 {
810 	pt_entry_t *l2;
811 	vm_offset_t va;
812 	vm_paddr_t l2_pa, pa;
813 	u_int l1_slot, l2_slot, prev_l1_slot;
814 	int i;
815 
816 	dmap_phys_base = min_pa & ~L1_OFFSET;
817 	dmap_phys_max = 0;
818 	dmap_max_addr = 0;
819 	l2 = NULL;
820 	prev_l1_slot = -1;
821 
823 	memset(pagetable_dmap, 0, PAGE_SIZE * DMAP_TABLES);
824 
825 	for (i = 0; i < (physmap_idx * 2); i += 2) {
826 		pa = physmap[i] & ~L2_OFFSET;
827 		va = pa - dmap_phys_base + DMAP_MIN_ADDRESS;
828 
829 		/* Create L2 mappings at the start of the region */
830 		if ((pa & L1_OFFSET) != 0) {
831 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
832 			if (l1_slot != prev_l1_slot) {
833 				prev_l1_slot = l1_slot;
834 				l2 = (pt_entry_t *)freemempos;
835 				l2_pa = pmap_early_vtophys(kern_l1,
836 				    (vm_offset_t)l2);
837 				freemempos += PAGE_SIZE;
838 
839 				pmap_store(&pagetable_dmap[l1_slot],
840 				    (l2_pa & ~Ln_TABLE_MASK) |
841 				    TATTR_PXN_TABLE | L1_TABLE);
842 
843 				memset(l2, 0, PAGE_SIZE);
844 			}
845 			KASSERT(l2 != NULL,
846 			    ("pmap_bootstrap_dmap: NULL l2 map"));
847 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
848 			    pa += L2_SIZE, va += L2_SIZE) {
849 				/*
850 				 * We are on a boundary, stop to
851 				 * create a level 1 block
852 				 */
853 				if ((pa & L1_OFFSET) == 0)
854 					break;
855 
856 				l2_slot = pmap_l2_index(va);
857 				KASSERT(l2_slot != 0, ("..."));
858 				pmap_store(&l2[l2_slot],
859 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT |
860 				    ATTR_S1_XN |
861 				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
862 				    L2_BLOCK);
863 			}
864 			KASSERT(va == (pa - dmap_phys_base + DMAP_MIN_ADDRESS),
865 			    ("..."));
866 		}
867 
868 		for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1] &&
869 		    (physmap[i + 1] - pa) >= L1_SIZE;
870 		    pa += L1_SIZE, va += L1_SIZE) {
871 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
872 			pmap_store(&pagetable_dmap[l1_slot],
873 			    (pa & ~L1_OFFSET) | ATTR_DEFAULT | ATTR_S1_XN |
874 			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L1_BLOCK);
875 		}
876 
877 		/* Create L2 mappings at the end of the region */
878 		if (pa < physmap[i + 1]) {
879 			l1_slot = ((va - DMAP_MIN_ADDRESS) >> L1_SHIFT);
880 			if (l1_slot != prev_l1_slot) {
881 				prev_l1_slot = l1_slot;
882 				l2 = (pt_entry_t *)freemempos;
883 				l2_pa = pmap_early_vtophys(kern_l1,
884 				    (vm_offset_t)l2);
885 				freemempos += PAGE_SIZE;
886 
887 				pmap_store(&pagetable_dmap[l1_slot],
888 				    (l2_pa & ~Ln_TABLE_MASK) | L1_TABLE);
889 
890 				memset(l2, 0, PAGE_SIZE);
891 			}
892 			KASSERT(l2 != NULL,
893 			    ("pmap_bootstrap_dmap: NULL l2 map"));
894 			for (; va < DMAP_MAX_ADDRESS && pa < physmap[i + 1];
895 			    pa += L2_SIZE, va += L2_SIZE) {
896 				l2_slot = pmap_l2_index(va);
897 				pmap_store(&l2[l2_slot],
898 				    (pa & ~L2_OFFSET) | ATTR_DEFAULT |
899 				    ATTR_S1_XN |
900 				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
901 				    L2_BLOCK);
902 			}
903 		}
904 
905 		if (pa > dmap_phys_max) {
906 			dmap_phys_max = pa;
907 			dmap_max_addr = va;
908 		}
909 	}
910 
911 	cpu_tlb_flushID();
912 
913 	return (freemempos);
914 }
915 
916 static vm_offset_t
917 pmap_bootstrap_l2(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l2_start)
918 {
919 	vm_offset_t l2pt;
920 	vm_paddr_t pa;
921 	pd_entry_t *l1;
922 	u_int l1_slot;
923 
924 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
925 
926 	l1 = (pd_entry_t *)l1pt;
927 	l1_slot = pmap_l1_index(va);
928 	l2pt = l2_start;
929 
930 	for (; va < VM_MAX_KERNEL_ADDRESS; l1_slot++, va += L1_SIZE) {
931 		KASSERT(l1_slot < Ln_ENTRIES, ("Invalid L1 index"));
932 
933 		pa = pmap_early_vtophys(l1pt, l2pt);
934 		pmap_store(&l1[l1_slot],
935 		    (pa & ~Ln_TABLE_MASK) | L1_TABLE);
936 		l2pt += PAGE_SIZE;
937 	}
938 
939 	/* Clean the L2 page table */
940 	memset((void *)l2_start, 0, l2pt - l2_start);
941 
 942 	return (l2pt);
943 }
944 
945 static vm_offset_t
946 pmap_bootstrap_l3(vm_offset_t l1pt, vm_offset_t va, vm_offset_t l3_start)
947 {
948 	vm_offset_t l3pt;
949 	vm_paddr_t pa;
950 	pd_entry_t *l2;
951 	u_int l2_slot;
952 
953 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
954 
955 	l2 = pmap_l2(kernel_pmap, va);
956 	l2 = (pd_entry_t *)rounddown2((uintptr_t)l2, PAGE_SIZE);
957 	l2_slot = pmap_l2_index(va);
958 	l3pt = l3_start;
959 
960 	for (; va < VM_MAX_KERNEL_ADDRESS; l2_slot++, va += L2_SIZE) {
961 		KASSERT(l2_slot < Ln_ENTRIES, ("Invalid L2 index"));
962 
963 		pa = pmap_early_vtophys(l1pt, l3pt);
964 		pmap_store(&l2[l2_slot],
965 		    (pa & ~Ln_TABLE_MASK) | ATTR_S1_UXN | L2_TABLE);
966 		l3pt += PAGE_SIZE;
967 	}
968 
 969 	/* Clean the L3 page table */
970 	memset((void *)l3_start, 0, l3pt - l3_start);
971 
 972 	return (l3pt);
973 }
974 
975 /*
976  *	Bootstrap the system enough to run with virtual memory.
977  */
978 void
979 pmap_bootstrap(vm_offset_t l0pt, vm_offset_t l1pt, vm_paddr_t kernstart,
980     vm_size_t kernlen)
981 {
982 	vm_offset_t freemempos;
983 	vm_offset_t dpcpu, msgbufpv;
984 	vm_paddr_t start_pa, pa, min_pa;
985 	uint64_t kern_delta;
986 	int i;
987 
988 	/* Verify that the ASID is set through TTBR0. */
989 	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
990 	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));
991 
992 	kern_delta = KERNBASE - kernstart;
993 
994 	printf("pmap_bootstrap %lx %lx %lx\n", l1pt, kernstart, kernlen);
995 	printf("%lx\n", l1pt);
996 	printf("%lx\n", (KERNBASE >> L1_SHIFT) & Ln_ADDR_MASK);
997 
998 	/* Set this early so we can use the pagetable walking functions */
999 	kernel_pmap_store.pm_l0 = (pd_entry_t *)l0pt;
1000 	PMAP_LOCK_INIT(kernel_pmap);
1001 	kernel_pmap->pm_l0_paddr = l0pt - kern_delta;
1002 	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1003 	kernel_pmap->pm_stage = PM_STAGE1;
1004 	kernel_pmap->pm_levels = 4;
1005 	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1006 	kernel_pmap->pm_asid_set = &asids;
1007 
1008 	/* Assume the address we were loaded to is a valid physical address */
1009 	min_pa = KERNBASE - kern_delta;
1010 
1011 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1012 	physmap_idx /= 2;
1013 
1014 	/*
1015 	 * Find the minimum physical address. physmap is sorted,
1016 	 * but may contain empty ranges.
1017 	 */
1018 	for (i = 0; i < physmap_idx * 2; i += 2) {
1019 		if (physmap[i] == physmap[i + 1])
1020 			continue;
1021 		if (physmap[i] <= min_pa)
1022 			min_pa = physmap[i];
1023 	}
1024 
1025 	freemempos = KERNBASE + kernlen;
1026 	freemempos = roundup2(freemempos, PAGE_SIZE);
1027 
1028 	/* Create a direct map region early so we can use it for pa -> va */
1029 	freemempos = pmap_bootstrap_dmap(l1pt, min_pa, freemempos);
1030 
1031 	start_pa = pa = KERNBASE - kern_delta;
1032 
1033 	/*
1034 	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
1035 	 * loader allocated the first and only l2 page table page used to map
1036 	 * the kernel, preloaded files and module metadata.
1037 	 */
1038 	freemempos = pmap_bootstrap_l2(l1pt, KERNBASE + L1_SIZE, freemempos);
1039 	/* And the l3 tables for the early devmap */
1040 	freemempos = pmap_bootstrap_l3(l1pt,
1041 	    VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE), freemempos);
1042 
1043 	cpu_tlb_flushID();
1044 
1045 #define alloc_pages(var, np)						\
1046 	(var) = freemempos;						\
1047 	freemempos += (np * PAGE_SIZE);					\
1048 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
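	/*
	 * Note that alloc_pages() expands to three separate statements, so it
	 * is only safe in the straight-line uses below, never as the body of
	 * an unbraced conditional or loop.
	 */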
1049 
1050 	/* Allocate dynamic per-cpu area. */
1051 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1052 	dpcpu_init((void *)dpcpu, 0);
1053 
1054 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1055 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1056 	msgbufp = (void *)msgbufpv;
1057 
1058 	/* Reserve some VA space for early BIOS/ACPI mapping */
1059 	preinit_map_va = roundup2(freemempos, L2_SIZE);
1060 
1061 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1062 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
1063 	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1064 	kernel_vm_end = virtual_avail;
1065 
1066 	pa = pmap_early_vtophys(l1pt, freemempos);
1067 
1068 	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1069 
1070 	cpu_tlb_flushID();
1071 }
1072 
1073 /*
1074  *	Initialize a vm_page's machine-dependent fields.
1075  */
1076 void
1077 pmap_page_init(vm_page_t m)
1078 {
1079 
1080 	TAILQ_INIT(&m->md.pv_list);
1081 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1082 }
1083 
1084 static void
1085 pmap_init_asids(struct asid_set *set, int bits)
1086 {
1087 	int i;
1088 
1089 	set->asid_bits = bits;
1090 
1091 	/*
1092 	 * We may be too early in the overall initialization process to use
1093 	 * bit_alloc().
1094 	 */
1095 	set->asid_set_size = 1 << set->asid_bits;
1096 	set->asid_set = (bitstr_t *)kmem_malloc(bitstr_size(set->asid_set_size),
1097 	    M_WAITOK | M_ZERO);
1098 	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1099 		bit_set(set->asid_set, i);
1100 	set->asid_next = ASID_FIRST_AVAILABLE;
1101 	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1102 }
1103 
1104 /*
1105  *	Initialize the pmap module.
1106  *	Called by vm_init, to initialize any structures that the pmap
1107  *	system needs to map virtual memory.
1108  */
1109 void
1110 pmap_init(void)
1111 {
1112 	struct vm_phys_seg *seg, *next_seg;
1113 	struct md_page *pvh;
1114 	vm_size_t s;
1115 	uint64_t mmfr1;
1116 	int i, pv_npg, vmid_bits;
1117 
1118 	/*
1119 	 * Are large page mappings enabled?
1120 	 */
1121 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1122 	if (superpages_enabled) {
1123 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1124 		    ("pmap_init: can't assign to pagesizes[1]"));
1125 		pagesizes[1] = L2_SIZE;
1126 		KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1127 		    ("pmap_init: can't assign to pagesizes[2]"));
1128 		pagesizes[2] = L1_SIZE;
1129 	}
1130 
1131 	/*
1132 	 * Initialize the ASID allocator.
1133 	 */
1134 	pmap_init_asids(&asids,
1135 	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1136 
1137 	if (has_hyp()) {
1138 		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1139 		vmid_bits = 8;
1140 
1141 		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1142 		    ID_AA64MMFR1_VMIDBits_16)
1143 			vmid_bits = 16;
1144 		pmap_init_asids(&vmids, vmid_bits);
1145 	}
1146 
1147 	/*
1148 	 * Initialize the pv chunk list mutex.
1149 	 */
1150 	mtx_init(&pv_chunks_mutex, "pmap pv chunk list", NULL, MTX_DEF);
1151 
1152 	/*
1153 	 * Initialize the pool of pv list locks.
1154 	 */
1155 	for (i = 0; i < NPV_LIST_LOCKS; i++)
1156 		rw_init(&pv_list_locks[i], "pmap pv list");
1157 
1158 	/*
1159 	 * Calculate the size of the pv head table for superpages.
1160 	 */
1161 	pv_npg = 0;
1162 	for (i = 0; i < vm_phys_nsegs; i++) {
1163 		seg = &vm_phys_segs[i];
1164 		pv_npg += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1165 		    pmap_l2_pindex(seg->start);
1166 	}
1167 
1168 	/*
1169 	 * Allocate memory for the pv head table for superpages.
1170 	 */
1171 	s = (vm_size_t)(pv_npg * sizeof(struct md_page));
1172 	s = round_page(s);
1173 	pv_table = (struct md_page *)kmem_malloc(s, M_WAITOK | M_ZERO);
1174 	for (i = 0; i < pv_npg; i++)
1175 		TAILQ_INIT(&pv_table[i].pv_list);
1176 	TAILQ_INIT(&pv_dummy.pv_list);
1177 
1178 	/*
1179 	 * Set pointers from vm_phys_segs to pv_table.
1180 	 */
1181 	for (i = 0, pvh = pv_table; i < vm_phys_nsegs; i++) {
1182 		seg = &vm_phys_segs[i];
1183 		seg->md_first = pvh;
1184 		pvh += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1185 		    pmap_l2_pindex(seg->start);
1186 
1187 		/*
1188 		 * If there is a following segment, and the final
1189 		 * superpage of this segment and the initial superpage
 1190 		 * of the next segment are the same, then adjust the
1191 		 * pv_table entry for that next segment down by one so
1192 		 * that the pv_table entries will be shared.
1193 		 */
1194 		if (i + 1 < vm_phys_nsegs) {
1195 			next_seg = &vm_phys_segs[i + 1];
1196 			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1197 			    pmap_l2_pindex(next_seg->start)) {
1198 				pvh--;
1199 			}
1200 		}
1201 	}
1202 
1203 	vm_initialized = 1;
1204 }
1205 
1206 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1207     "2MB page mapping counters");
1208 
1209 static u_long pmap_l2_demotions;
1210 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1211     &pmap_l2_demotions, 0, "2MB page demotions");
1212 
1213 static u_long pmap_l2_mappings;
1214 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1215     &pmap_l2_mappings, 0, "2MB page mappings");
1216 
1217 static u_long pmap_l2_p_failures;
1218 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1219     &pmap_l2_p_failures, 0, "2MB page promotion failures");
1220 
1221 static u_long pmap_l2_promotions;
1222 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1223     &pmap_l2_promotions, 0, "2MB page promotions");
1224 
1225 /*
1226  * If the given value for "final_only" is false, then any cached intermediate-
1227  * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1228  * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1229  * Otherwise, just the cached final-level entry is invalidated.
1230  */
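/*
 * For example, pmap_kenter() below passes final_only == true because it
 * only ever changes L3 entries, while _pmap_unwire_l3() passes
 * final_only == false because it removes intermediate table entries.
 */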
1231 static __inline void
1232 pmap_invalidate_kernel(uint64_t r, bool final_only)
1233 {
1234 	if (final_only)
1235 		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1236 	else
1237 		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1238 }
1239 
1240 static __inline void
1241 pmap_invalidate_user(uint64_t r, bool final_only)
1242 {
1243 	if (final_only)
1244 		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
1245 	else
1246 		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
1247 }
1248 
1249 /*
1250  * Invalidates any cached final- and optionally intermediate-level TLB entries
1251  * for the specified virtual address in the given virtual address space.
1252  */
1253 static __inline void
1254 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1255 {
1256 	uint64_t r;
1257 
1258 	PMAP_ASSERT_STAGE1(pmap);
1259 
1260 	dsb(ishst);
1261 	if (pmap == kernel_pmap) {
1262 		r = atop(va);
1263 		pmap_invalidate_kernel(r, final_only);
1264 	} else {
1265 		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) | atop(va);
1266 		pmap_invalidate_user(r, final_only);
1267 	}
1268 	dsb(ish);
1269 	isb();
1270 }
1271 
1272 /*
1273  * Invalidates any cached final- and optionally intermediate-level TLB entries
1274  * for the specified virtual address range in the given virtual address space.
1275  */
1276 static __inline void
1277 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1278     bool final_only)
1279 {
1280 	uint64_t end, r, start;
1281 
1282 	PMAP_ASSERT_STAGE1(pmap);
1283 
1284 	dsb(ishst);
1285 	if (pmap == kernel_pmap) {
1286 		start = atop(sva);
1287 		end = atop(eva);
1288 		for (r = start; r < end; r++)
1289 			pmap_invalidate_kernel(r, final_only);
1290 	} else {
1291 		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1292 		start |= atop(sva);
1293 		end |= atop(eva);
1294 		for (r = start; r < end; r++)
1295 			pmap_invalidate_user(r, final_only);
1296 	}
1297 	dsb(ish);
1298 	isb();
1299 }
1300 
1301 /*
1302  * Invalidates all cached intermediate- and final-level TLB entries for the
1303  * given virtual address space.
1304  */
1305 static __inline void
1306 pmap_invalidate_all(pmap_t pmap)
1307 {
1308 	uint64_t r;
1309 
1310 	PMAP_ASSERT_STAGE1(pmap);
1311 
1312 	dsb(ishst);
1313 	if (pmap == kernel_pmap) {
1314 		__asm __volatile("tlbi vmalle1is");
1315 	} else {
1316 		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1317 		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
1318 	}
1319 	dsb(ish);
1320 	isb();
1321 }
1322 
1323 /*
1324  *	Routine:	pmap_extract
1325  *	Function:
1326  *		Extract the physical page address associated
1327  *		with the given map/virtual_address pair.
1328  */
1329 vm_paddr_t
1330 pmap_extract(pmap_t pmap, vm_offset_t va)
1331 {
1332 	pt_entry_t *pte, tpte;
1333 	vm_paddr_t pa;
1334 	int lvl;
1335 
1336 	pa = 0;
1337 	PMAP_LOCK(pmap);
1338 	/*
1339 	 * Find the block or page map for this virtual address. pmap_pte
1340 	 * will return either a valid block/page entry, or NULL.
1341 	 */
1342 	pte = pmap_pte(pmap, va, &lvl);
1343 	if (pte != NULL) {
1344 		tpte = pmap_load(pte);
1345 		pa = tpte & ~ATTR_MASK;
1346 		switch(lvl) {
1347 		case 1:
1348 			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1349 			    ("pmap_extract: Invalid L1 pte found: %lx",
1350 			    tpte & ATTR_DESCR_MASK));
1351 			pa |= (va & L1_OFFSET);
1352 			break;
1353 		case 2:
1354 			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1355 			    ("pmap_extract: Invalid L2 pte found: %lx",
1356 			    tpte & ATTR_DESCR_MASK));
1357 			pa |= (va & L2_OFFSET);
1358 			break;
1359 		case 3:
1360 			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1361 			    ("pmap_extract: Invalid L3 pte found: %lx",
1362 			    tpte & ATTR_DESCR_MASK));
1363 			pa |= (va & L3_OFFSET);
1364 			break;
1365 		}
1366 	}
1367 	PMAP_UNLOCK(pmap);
1368 	return (pa);
1369 }
1370 
1371 /*
1372  *	Routine:	pmap_extract_and_hold
1373  *	Function:
1374  *		Atomically extract and hold the physical page
1375  *		with the given pmap and virtual address pair
1376  *		if that mapping permits the given protection.
1377  */
1378 vm_page_t
1379 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1380 {
1381 	pt_entry_t *pte, tpte;
1382 	vm_offset_t off;
1383 	vm_page_t m;
1384 	int lvl;
1385 	bool use;
1386 
1387 	m = NULL;
1388 	PMAP_LOCK(pmap);
1389 	pte = pmap_pte(pmap, va, &lvl);
1390 	if (pte != NULL) {
1391 		tpte = pmap_load(pte);
1392 
1393 		KASSERT(lvl > 0 && lvl <= 3,
1394 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
1395 		CTASSERT(L1_BLOCK == L2_BLOCK);
1396 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1397 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1398 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1399 		     tpte & ATTR_DESCR_MASK));
1400 
1401 		use = false;
1402 		if ((prot & VM_PROT_WRITE) == 0)
1403 			use = true;
1404 		else if (pmap->pm_stage == PM_STAGE1 &&
1405 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1406 			use = true;
1407 		else if (pmap->pm_stage == PM_STAGE2 &&
1408 		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1409 		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1410 			use = true;
1411 
1412 		if (use) {
1413 			switch (lvl) {
1414 			case 1:
1415 				off = va & L1_OFFSET;
1416 				break;
1417 			case 2:
1418 				off = va & L2_OFFSET;
1419 				break;
1420 			case 3:
1421 			default:
1422 				off = 0;
1423 			}
1424 			m = PHYS_TO_VM_PAGE((tpte & ~ATTR_MASK) | off);
1425 			if (m != NULL && !vm_page_wire_mapped(m))
1426 				m = NULL;
1427 		}
1428 	}
1429 	PMAP_UNLOCK(pmap);
1430 	return (m);
1431 }
1432 
1433 /*
1434  * Walks the page tables to translate a kernel virtual address to a
 1435  * physical address.  Returns true if the kva is valid and, if pa is not
 1436  * NULL, stores the physical address there.
1437  */
1438 bool
1439 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
1440 {
1441 	pt_entry_t *pte, tpte;
1442 	register_t intr;
1443 	uint64_t par;
1444 
1445 	/*
1446 	 * Disable interrupts so we don't get interrupted between asking
1447 	 * for address translation, and getting the result back.
1448 	 */
1449 	intr = intr_disable();
1450 	par = arm64_address_translate_s1e1r(va);
1451 	intr_restore(intr);
1452 
1453 	if (PAR_SUCCESS(par)) {
1454 		if (pa != NULL)
1455 			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
1456 		return (true);
1457 	}
1458 
1459 	/*
1460 	 * Fall back to walking the page table. The address translation
1461 	 * instruction may fail when the page is in a break-before-make
1462 	 * sequence. As we only clear the valid bit in said sequence we
1463 	 * can walk the page table to find the physical address.
1464 	 */
1465 
1466 	pte = pmap_l1(kernel_pmap, va);
1467 	if (pte == NULL)
1468 		return (false);
1469 
1470 	/*
1471 	 * A concurrent pmap_update_entry() will clear the entry's valid bit
1472 	 * but leave the rest of the entry unchanged.  Therefore, we treat a
1473 	 * non-zero entry as being valid, and we ignore the valid bit when
1474 	 * determining whether the entry maps a block, page, or table.
1475 	 */
1476 	tpte = pmap_load(pte);
1477 	if (tpte == 0)
1478 		return (false);
1479 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1480 		if (pa != NULL)
1481 			*pa = (tpte & ~ATTR_MASK) | (va & L1_OFFSET);
1482 		return (true);
1483 	}
1484 	pte = pmap_l1_to_l2(&tpte, va);
1485 	tpte = pmap_load(pte);
1486 	if (tpte == 0)
1487 		return (false);
1488 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
1489 		if (pa != NULL)
1490 			*pa = (tpte & ~ATTR_MASK) | (va & L2_OFFSET);
1491 		return (true);
1492 	}
1493 	pte = pmap_l2_to_l3(&tpte, va);
1494 	tpte = pmap_load(pte);
1495 	if (tpte == 0)
1496 		return (false);
1497 	if (pa != NULL)
1498 		*pa = (tpte & ~ATTR_MASK) | (va & L3_OFFSET);
1499 	return (true);
1500 }
1501 
1502 vm_paddr_t
1503 pmap_kextract(vm_offset_t va)
1504 {
1505 	vm_paddr_t pa;
1506 
1507 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
1508 		return (DMAP_TO_PHYS(va));
1509 
1510 	if (pmap_klookup(va, &pa) == false)
1511 		return (0);
1512 	return (pa);
1513 }
1514 
1515 /***************************************************
1516  * Low level mapping routines.....
1517  ***************************************************/
1518 
1519 void
1520 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
1521 {
1522 	pd_entry_t *pde;
1523 	pt_entry_t *pte, attr;
1524 	vm_offset_t va;
1525 	int lvl;
1526 
1527 	KASSERT((pa & L3_OFFSET) == 0,
1528 	   ("pmap_kenter: Invalid physical address"));
1529 	KASSERT((sva & L3_OFFSET) == 0,
1530 	   ("pmap_kenter: Invalid virtual address"));
1531 	KASSERT((size & PAGE_MASK) == 0,
1532 	    ("pmap_kenter: Mapping is not page-sized"));
1533 
1534 	attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
1535 	    ATTR_S1_IDX(mode) | L3_PAGE;
1536 	va = sva;
1537 	while (size != 0) {
1538 		pde = pmap_pde(kernel_pmap, va, &lvl);
1539 		KASSERT(pde != NULL,
1540 		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
1541 		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
1542 
1543 		pte = pmap_l2_to_l3(pde, va);
1544 		pmap_load_store(pte, (pa & ~L3_OFFSET) | attr);
1545 
1546 		va += PAGE_SIZE;
1547 		pa += PAGE_SIZE;
1548 		size -= PAGE_SIZE;
1549 	}
1550 	pmap_invalidate_range(kernel_pmap, sva, va, true);
1551 }
1552 
1553 void
1554 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
1555 {
1556 
1557 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
1558 }
1559 
1560 /*
1561  * Remove a page from the kernel pagetables.
1562  */
1563 PMAP_INLINE void
1564 pmap_kremove(vm_offset_t va)
1565 {
1566 	pt_entry_t *pte;
1567 
1568 	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
1569 	pmap_clear(pte);
1570 	pmap_invalidate_page(kernel_pmap, va, true);
1571 }
1572 
1573 void
1574 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
1575 {
1576 	pt_entry_t *pte;
1577 	vm_offset_t va;
1578 
1579 	KASSERT((sva & L3_OFFSET) == 0,
1580 	   ("pmap_kremove_device: Invalid virtual address"));
1581 	KASSERT((size & PAGE_MASK) == 0,
1582 	    ("pmap_kremove_device: Mapping is not page-sized"));
1583 
1584 	va = sva;
1585 	while (size != 0) {
1586 		pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
1587 		pmap_clear(pte);
1588 
1589 		va += PAGE_SIZE;
1590 		size -= PAGE_SIZE;
1591 	}
1592 	pmap_invalidate_range(kernel_pmap, sva, va, true);
1593 }
1594 
1595 /*
1596  *	Used to map a range of physical addresses into kernel
1597  *	virtual address space.
1598  *
1599  *	The value passed in '*virt' is a suggested virtual address for
1600  *	the mapping. Architectures which can support a direct-mapped
1601  *	physical to virtual region can return the appropriate address
1602  *	within that region, leaving '*virt' unchanged. Other
1603  *	architectures should map the pages starting at '*virt' and
1604  *	update '*virt' with the first usable address after the mapped
1605  *	region.
1606  */
1607 vm_offset_t
1608 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
1609 {
1610 	return PHYS_TO_DMAP(start);
1611 }
1612 
1613 /*
 1614  * Add a list of wired pages to the kva.
 1615  * This routine is only used for temporary
1616  * kernel mappings that do not need to have
1617  * page modification or references recorded.
1618  * Note that old mappings are simply written
1619  * over.  The page *must* be wired.
 1620  * Note: SMP coherent.  Uses broadcast TLB invalidation rather than IPIs.
1621  */
1622 void
1623 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
1624 {
1625 	pd_entry_t *pde;
1626 	pt_entry_t *pte, pa;
1627 	vm_offset_t va;
1628 	vm_page_t m;
1629 	int i, lvl;
1630 
1631 	va = sva;
1632 	for (i = 0; i < count; i++) {
1633 		pde = pmap_pde(kernel_pmap, va, &lvl);
1634 		KASSERT(pde != NULL,
1635 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
1636 		KASSERT(lvl == 2,
1637 		    ("pmap_qenter: Invalid level %d", lvl));
1638 
1639 		m = ma[i];
1640 		pa = VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
1641 		    ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
1642 		    ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
1643 		pte = pmap_l2_to_l3(pde, va);
1644 		pmap_load_store(pte, pa);
1645 
1646 		va += L3_SIZE;
1647 	}
1648 	pmap_invalidate_range(kernel_pmap, sva, va, true);
1649 }
1650 
1651 /*
1652  * This routine tears out page mappings from the
1653  * kernel -- it is meant only for temporary mappings.
1654  */
1655 void
1656 pmap_qremove(vm_offset_t sva, int count)
1657 {
1658 	pt_entry_t *pte;
1659 	vm_offset_t va;
1660 
1661 	KASSERT(ADDR_IS_CANONICAL(sva),
1662 	    ("%s: Address not in canonical form: %lx", __func__, sva));
1663 	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
1664 
1665 	va = sva;
1666 	while (count-- > 0) {
1667 		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
1668 		if (pte != NULL) {
1669 			pmap_clear(pte);
1670 		}
1671 
1672 		va += PAGE_SIZE;
1673 	}
1674 	pmap_invalidate_range(kernel_pmap, sva, va, true);
1675 }
1676 
1677 /***************************************************
1678  * Page table page management routines.....
1679  ***************************************************/
1680 /*
1681  * Schedule the specified unused page table page to be freed.  Specifically,
1682  * add the page to the specified list of pages that will be released to the
1683  * physical memory manager after the TLB has been updated.
1684  */
1685 static __inline void
1686 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free,
1687     boolean_t set_PG_ZERO)
1688 {
1689 
1690 	if (set_PG_ZERO)
1691 		m->flags |= PG_ZERO;
1692 	else
1693 		m->flags &= ~PG_ZERO;
1694 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
1695 }
1696 
1697 /*
1698  * Decrements a page table page's reference count, which is used to record the
1699  * number of valid page table entries within the page.  If the reference count
1700  * drops to zero, then the page table page is unmapped.  Returns TRUE if the
1701  * page table page was unmapped and FALSE otherwise.
1702  */
1703 static inline boolean_t
1704 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1705 {
1706 
1707 	--m->ref_count;
1708 	if (m->ref_count == 0) {
1709 		_pmap_unwire_l3(pmap, va, m, free);
1710 		return (TRUE);
1711 	} else
1712 		return (FALSE);
1713 }
1714 
1715 static void
1716 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
1717 {
1718 
1719 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1720 	/*
1721 	 * unmap the page table page
1722 	 */
1723 	if (m->pindex >= (NUL2E + NUL1E)) {
1724 		/* l1 page */
1725 		pd_entry_t *l0;
1726 
1727 		l0 = pmap_l0(pmap, va);
1728 		pmap_clear(l0);
1729 	} else if (m->pindex >= NUL2E) {
1730 		/* l2 page */
1731 		pd_entry_t *l1;
1732 
1733 		l1 = pmap_l1(pmap, va);
1734 		pmap_clear(l1);
1735 	} else {
1736 		/* l3 page */
1737 		pd_entry_t *l2;
1738 
1739 		l2 = pmap_l2(pmap, va);
1740 		pmap_clear(l2);
1741 	}
1742 	pmap_resident_count_dec(pmap, 1);
1743 	if (m->pindex < NUL2E) {
1744 		/* We just released an l3, unhold the matching l2 */
1745 		pd_entry_t *l1, tl1;
1746 		vm_page_t l2pg;
1747 
1748 		l1 = pmap_l1(pmap, va);
1749 		tl1 = pmap_load(l1);
1750 		l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
1751 		pmap_unwire_l3(pmap, va, l2pg, free);
1752 	} else if (m->pindex < (NUL2E + NUL1E)) {
1753 		/* We just released an l2, unhold the matching l1 */
1754 		pd_entry_t *l0, tl0;
1755 		vm_page_t l1pg;
1756 
1757 		l0 = pmap_l0(pmap, va);
1758 		tl0 = pmap_load(l0);
1759 		l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1760 		pmap_unwire_l3(pmap, va, l1pg, free);
1761 	}
1762 	pmap_invalidate_page(pmap, va, false);
1763 
1764 	/*
1765 	 * Put the page on a list so that it is released only after
1766 	 * *ALL* TLB shootdowns are done.
1767 	 */
1768 	pmap_add_delayed_free_list(m, free, TRUE);
1769 }
1770 
1771 /*
1772  * After removing a page table entry, this routine is used to
1773  * conditionally free the page, and manage the reference count.
1774  */
1775 static int
1776 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
1777     struct spglist *free)
1778 {
1779 	vm_page_t mpte;
1780 
1781 	KASSERT(ADDR_IS_CANONICAL(va),
1782 	    ("%s: Address not in canonical form: %lx", __func__, va));
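	/*
	 * Kernel page table pages are never freed by this path, so kernel
	 * mappings do not contribute to page table page reference counts.
	 */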
1783 	if (ADDR_IS_KERNEL(va))
1784 		return (0);
1785 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
1786 	mpte = PHYS_TO_VM_PAGE(ptepde & ~ATTR_MASK);
1787 	return (pmap_unwire_l3(pmap, va, mpte, free));
1788 }
1789 
1790 /*
1791  * Release a page table page reference after a failed attempt to create a
1792  * mapping.
1793  */
1794 static void
1795 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
1796 {
1797 	struct spglist free;
1798 
1799 	SLIST_INIT(&free);
1800 	if (pmap_unwire_l3(pmap, va, mpte, &free))
1801 		vm_page_free_pages_toq(&free, true);
1802 }
1803 
1804 void
1805 pmap_pinit0(pmap_t pmap)
1806 {
1807 
1808 	PMAP_LOCK_INIT(pmap);
1809 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1810 	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
1811 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
1812 	vm_radix_init(&pmap->pm_root);
1813 	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
1814 	pmap->pm_stage = PM_STAGE1;
1815 	pmap->pm_levels = 4;
1816 	pmap->pm_ttbr = pmap->pm_l0_paddr;
1817 	pmap->pm_asid_set = &asids;
1818 
1819 	PCPU_SET(curpmap, pmap);
1820 }
1821 
1822 int
1823 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
1824 {
1825 	vm_page_t m;
1826 
1827 	/*
1828 	 * allocate the l0 page
1829 	 */
1830 	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
1831 	    VM_ALLOC_ZERO);
1832 	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
1833 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
1834 
1835 	vm_radix_init(&pmap->pm_root);
1836 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
1837 	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
1838 
1839 	MPASS(levels == 3 || levels == 4);
1840 	pmap->pm_levels = levels;
1841 	pmap->pm_stage = stage;
1842 	switch (stage) {
1843 	case PM_STAGE1:
1844 		pmap->pm_asid_set = &asids;
1845 		break;
1846 	case PM_STAGE2:
1847 		pmap->pm_asid_set = &vmids;
1848 		break;
1849 	default:
1850 		panic("%s: Invalid pmap type %d", __func__, stage);
1851 		break;
1852 	}
1853 
1854 	/* XXX Temporarily disable deferred ASID allocation. */
1855 	pmap_alloc_asid(pmap);
1856 
1857 	/*
1858 	 * Allocate the level 1 page table to use as the root.  This will
1859 	 * increase the refcount on the level 1 page so it won't be removed
1860 	 * until pmap_release() is called.
1861 	 */
1862 	if (pmap->pm_levels == 3) {
1863 		PMAP_LOCK(pmap);
1864 		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
1865 		PMAP_UNLOCK(pmap);
1866 	}
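	/*
	 * For a 4-level pmap the translation table base is the L0 page
	 * allocated above; for a 3-level pmap it is the L1 page that "m"
	 * now refers to.
	 */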
1867 	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
1868 
1869 	return (1);
1870 }
1871 
1872 int
1873 pmap_pinit(pmap_t pmap)
1874 {
1875 
1876 	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
1877 }
1878 
1879 /*
1880  * This routine is called if the desired page table page does not exist.
1881  *
1882  * If page table page allocation fails, this routine may sleep before
1883  * returning NULL.  It sleeps only if a lock pointer was given.
1884  *
1885  * Note: If a page allocation fails at page table level two or three,
1886  * one or two pages may be held during the wait, only to be released
1887  * afterwards.  This conservative approach is taken to avoid race
1888  * conditions.
1889  */
1890 static vm_page_t
1891 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
1892 {
1893 	vm_page_t m, l1pg, l2pg;
1894 
1895 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
1896 
1897 	/*
1898 	 * Allocate a page table page.
1899 	 */
1900 	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
1901 		if (lockp != NULL) {
1902 			RELEASE_PV_LIST_LOCK(lockp);
1903 			PMAP_UNLOCK(pmap);
1904 			vm_wait(NULL);
1905 			PMAP_LOCK(pmap);
1906 		}
1907 
1908 		/*
1909 		 * Indicate the need to retry.  While waiting, the page table
1910 		 * page may have been allocated.
1911 		 */
1912 		return (NULL);
1913 	}
1914 	m->pindex = ptepindex;
1915 
1916 	/*
1917 	 * Because of AArch64's weak memory consistency model, we must have a
1918 	 * barrier here to ensure that the stores for zeroing "m", whether by
1919 	 * pmap_zero_page() or an earlier function, are visible before adding
1920 	 * "m" to the page table.  Otherwise, a page table walk by another
1921 	 * processor's MMU could see the mapping to "m" and a stale, non-zero
1922 	 * PTE within "m".
1923 	 */
1924 	dmb(ishst);
1925 
1926 	/*
1927 	 * Map the pagetable page into the process address space, if
1928 	 * it isn't already there.
1929 	 */
1930 
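	/*
	 * The pindex encodes which level of page table page is being
	 * allocated: indices below NUL2E are L3 pages (linked from an L2
	 * entry), indices from NUL2E up to NUL2E + NUL1E - 1 are L2 pages
	 * (linked from an L1 entry), and larger indices are L1 pages
	 * (linked from an L0 entry).
	 */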
1931 	if (ptepindex >= (NUL2E + NUL1E)) {
1932 		pd_entry_t *l0p, l0e;
1933 		vm_pindex_t l0index;
1934 
1935 		l0index = ptepindex - (NUL2E + NUL1E);
1936 		l0p = &pmap->pm_l0[l0index];
1937 		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
1938 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
1939 		l0e = VM_PAGE_TO_PHYS(m) | L0_TABLE;
1940 
1941 		/*
1942 		 * Mark all kernel memory as not accessible from userspace
1943 		 * and userspace memory as not executable from the kernel.
1944 		 * This has been done for the bootstrap L0 entries in
1945 		 * locore.S.
1946 		 */
1947 		if (pmap == kernel_pmap)
1948 			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
1949 		else
1950 			l0e |= TATTR_PXN_TABLE;
1951 		pmap_store(l0p, l0e);
1952 	} else if (ptepindex >= NUL2E) {
1953 		vm_pindex_t l0index, l1index;
1954 		pd_entry_t *l0, *l1;
1955 		pd_entry_t tl0;
1956 
1957 		l1index = ptepindex - NUL2E;
1958 		l0index = l1index >> L0_ENTRIES_SHIFT;
1959 
1960 		l0 = &pmap->pm_l0[l0index];
1961 		tl0 = pmap_load(l0);
1962 		if (tl0 == 0) {
1963 			/* recurse for allocating page dir */
1964 			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
1965 			    lockp) == NULL) {
1966 				vm_page_unwire_noq(m);
1967 				vm_page_free_zero(m);
1968 				return (NULL);
1969 			}
1970 		} else {
1971 			l1pg = PHYS_TO_VM_PAGE(tl0 & ~ATTR_MASK);
1972 			l1pg->ref_count++;
1973 		}
1974 
1975 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l0) & ~ATTR_MASK);
1976 		l1 = &l1[ptepindex & Ln_ADDR_MASK];
1977 		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
1978 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
1979 		pmap_store(l1, VM_PAGE_TO_PHYS(m) | L1_TABLE);
1980 	} else {
1981 		vm_pindex_t l0index, l1index;
1982 		pd_entry_t *l0, *l1, *l2;
1983 		pd_entry_t tl0, tl1;
1984 
1985 		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
1986 		l0index = l1index >> L0_ENTRIES_SHIFT;
1987 
1988 		l0 = &pmap->pm_l0[l0index];
1989 		tl0 = pmap_load(l0);
1990 		if (tl0 == 0) {
1991 			/* recurse for allocating page dir */
1992 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
1993 			    lockp) == NULL) {
1994 				vm_page_unwire_noq(m);
1995 				vm_page_free_zero(m);
1996 				return (NULL);
1997 			}
1998 			tl0 = pmap_load(l0);
1999 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
2000 			l1 = &l1[l1index & Ln_ADDR_MASK];
2001 		} else {
2002 			l1 = (pd_entry_t *)PHYS_TO_DMAP(tl0 & ~ATTR_MASK);
2003 			l1 = &l1[l1index & Ln_ADDR_MASK];
2004 			tl1 = pmap_load(l1);
2005 			if (tl1 == 0) {
2006 				/* recurse for allocating page dir */
2007 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2008 				    lockp) == NULL) {
2009 					vm_page_unwire_noq(m);
2010 					vm_page_free_zero(m);
2011 					return (NULL);
2012 				}
2013 			} else {
2014 				l2pg = PHYS_TO_VM_PAGE(tl1 & ~ATTR_MASK);
2015 				l2pg->ref_count++;
2016 			}
2017 		}
2018 
2019 		l2 = (pd_entry_t *)PHYS_TO_DMAP(pmap_load(l1) & ~ATTR_MASK);
2020 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
2021 		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2022 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2023 		pmap_store(l2, VM_PAGE_TO_PHYS(m) | L2_TABLE);
2024 	}
2025 
2026 	pmap_resident_count_inc(pmap, 1);
2027 
2028 	return (m);
2029 }
2030 
2031 static pd_entry_t *
2032 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2033     struct rwlock **lockp)
2034 {
2035 	pd_entry_t *l1, *l2;
2036 	vm_page_t l2pg;
2037 	vm_pindex_t l2pindex;
2038 
2039 	KASSERT(ADDR_IS_CANONICAL(va),
2040 	    ("%s: Address not in canonical form: %lx", __func__, va));
2041 
2042 retry:
2043 	l1 = pmap_l1(pmap, va);
2044 	if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2045 		l2 = pmap_l1_to_l2(l1, va);
2046 		if (!ADDR_IS_KERNEL(va)) {
2047 			/* Add a reference to the L2 page. */
2048 			l2pg = PHYS_TO_VM_PAGE(pmap_load(l1) & ~ATTR_MASK);
2049 			l2pg->ref_count++;
2050 		} else
2051 			l2pg = NULL;
2052 	} else if (!ADDR_IS_KERNEL(va)) {
2053 		/* Allocate a L2 page. */
2054 		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2055 		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2056 		if (l2pg == NULL) {
2057 			if (lockp != NULL)
2058 				goto retry;
2059 			else
2060 				return (NULL);
2061 		}
2062 		l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2063 		l2 = &l2[pmap_l2_index(va)];
2064 	} else
2065 		panic("pmap_alloc_l2: missing page table page for va %#lx",
2066 		    va);
2067 	*l2pgp = l2pg;
2068 	return (l2);
2069 }
2070 
2071 static vm_page_t
2072 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2073 {
2074 	vm_pindex_t ptepindex;
2075 	pd_entry_t *pde, tpde;
2076 #ifdef INVARIANTS
2077 	pt_entry_t *pte;
2078 #endif
2079 	vm_page_t m;
2080 	int lvl;
2081 
2082 	/*
2083 	 * Calculate pagetable page index
2084 	 */
2085 	ptepindex = pmap_l2_pindex(va);
2086 retry:
2087 	/*
2088 	 * Get the page directory entry
2089 	 */
2090 	pde = pmap_pde(pmap, va, &lvl);
2091 
2092 	/*
2093 	 * If the page table page is mapped, we just increment the hold count,
2094 	 * and activate it. If we get a level 2 pde it will point to a level 3
2095 	 * table.
2096 	 */
2097 	switch (lvl) {
2098 	case -1:
2099 		break;
2100 	case 0:
2101 #ifdef INVARIANTS
2102 		pte = pmap_l0_to_l1(pde, va);
2103 		KASSERT(pmap_load(pte) == 0,
2104 		    ("pmap_alloc_l3: TODO: l0 superpages"));
2105 #endif
2106 		break;
2107 	case 1:
2108 #ifdef INVARIANTS
2109 		pte = pmap_l1_to_l2(pde, va);
2110 		KASSERT(pmap_load(pte) == 0,
2111 		    ("pmap_alloc_l3: TODO: l1 superpages"));
2112 #endif
2113 		break;
2114 	case 2:
2115 		tpde = pmap_load(pde);
2116 		if (tpde != 0) {
2117 			m = PHYS_TO_VM_PAGE(tpde & ~ATTR_MASK);
2118 			m->ref_count++;
2119 			return (m);
2120 		}
2121 		break;
2122 	default:
2123 		panic("pmap_alloc_l3: Invalid level %d", lvl);
2124 	}
2125 
2126 	/*
2127 	 * We get here if the page table page isn't mapped or was deallocated.
2128 	 */
2129 	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2130 	if (m == NULL && lockp != NULL)
2131 		goto retry;
2132 
2133 	return (m);
2134 }
2135 
2136 /***************************************************
2137  * Pmap allocation/deallocation routines.
2138  ***************************************************/
2139 
2140 /*
2141  * Release any resources held by the given physical map.
2142  * Called when a pmap initialized by pmap_pinit is being released.
2143  * Should only be called if the map contains no valid mappings.
2144  */
2145 void
2146 pmap_release(pmap_t pmap)
2147 {
2148 	boolean_t rv __diagused;
2149 	struct spglist free;
2150 	struct asid_set *set;
2151 	vm_page_t m;
2152 	int asid;
2153 
2154 	if (pmap->pm_levels != 4) {
2155 		PMAP_ASSERT_STAGE2(pmap);
2156 		KASSERT(pmap->pm_stats.resident_count == 1,
2157 		    ("pmap_release: pmap resident count %ld != 1",
2158 		    pmap->pm_stats.resident_count));
2159 		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2160 		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2161 
2162 		SLIST_INIT(&free);
2163 		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2164 		PMAP_LOCK(pmap);
2165 		rv = pmap_unwire_l3(pmap, 0, m, &free);
2166 		PMAP_UNLOCK(pmap);
2167 		MPASS(rv == TRUE);
2168 		vm_page_free_pages_toq(&free, true);
2169 	}
2170 
2171 	KASSERT(pmap->pm_stats.resident_count == 0,
2172 	    ("pmap_release: pmap resident count %ld != 0",
2173 	    pmap->pm_stats.resident_count));
2174 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2175 	    ("pmap_release: pmap has reserved page table page(s)"));
2176 
2177 	set = pmap->pm_asid_set;
2178 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2179 
2180 	/*
2181 	 * Allow the ASID to be reused.  For stage 2 pmaps we don't invalidate
2182 	 * the TLB entries when removing them, so we rely on a later TLB
2183 	 * invalidation, which happens when the VMID generation is updated.
2184 	 * Because of this we don't reuse VMIDs within a generation.
2185 	 */
2186 	if (pmap->pm_stage == PM_STAGE1) {
2187 		mtx_lock_spin(&set->asid_set_mutex);
2188 		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2189 			asid = COOKIE_TO_ASID(pmap->pm_cookie);
2190 			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2191 			    asid < set->asid_set_size,
2192 			    ("pmap_release: pmap cookie has out-of-range asid"));
2193 			bit_clear(set->asid_set, asid);
2194 		}
2195 		mtx_unlock_spin(&set->asid_set_mutex);
2196 	}
2197 
2198 	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2199 	vm_page_unwire_noq(m);
2200 	vm_page_free_zero(m);
2201 }
2202 
2203 static int
2204 kvm_size(SYSCTL_HANDLER_ARGS)
2205 {
2206 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2207 
2208 	return sysctl_handle_long(oidp, &ksize, 0, req);
2209 }
2210 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2211     0, 0, kvm_size, "LU",
2212     "Size of KVM");
2213 
2214 static int
2215 kvm_free(SYSCTL_HANDLER_ARGS)
2216 {
2217 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2218 
2219 	return sysctl_handle_long(oidp, &kfree, 0, req);
2220 }
2221 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2222     0, 0, kvm_free, "LU",
2223     "Amount of KVM free");
2224 
2225 /*
2226  * grow the number of kernel page table entries, if needed
2227  */
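/*
 * The kernel map is extended in L2_SIZE (2MB) steps.  For each step the
 * L0 and L1 entries covering kernel_vm_end are examined, and an empty L1
 * or L2 entry is filled with a newly allocated page table page.
 */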
2228 void
2229 pmap_growkernel(vm_offset_t addr)
2230 {
2231 	vm_paddr_t paddr;
2232 	vm_page_t nkpg;
2233 	pd_entry_t *l0, *l1, *l2;
2234 
2235 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2236 
2237 	addr = roundup2(addr, L2_SIZE);
2238 	if (addr - 1 >= vm_map_max(kernel_map))
2239 		addr = vm_map_max(kernel_map);
2240 	while (kernel_vm_end < addr) {
2241 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2242 		KASSERT(pmap_load(l0) != 0,
2243 		    ("pmap_growkernel: No level 0 kernel entry"));
2244 
2245 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2246 		if (pmap_load(l1) == 0) {
2247 			/* We need a new L1 table entry */
2248 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2249 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2250 			if (nkpg == NULL)
2251 				panic("pmap_growkernel: no memory to grow kernel");
2252 			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2253 			/* See the dmb() in _pmap_alloc_l3(). */
2254 			dmb(ishst);
2255 			paddr = VM_PAGE_TO_PHYS(nkpg);
2256 			pmap_store(l1, paddr | L1_TABLE);
2257 			continue; /* try again */
2258 		}
2259 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2260 		if (pmap_load(l2) != 0) {
2261 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2262 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2263 				kernel_vm_end = vm_map_max(kernel_map);
2264 				break;
2265 			}
2266 			continue;
2267 		}
2268 
2269 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2270 		    VM_ALLOC_ZERO);
2271 		if (nkpg == NULL)
2272 			panic("pmap_growkernel: no memory to grow kernel");
2273 		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2274 		/* See the dmb() in _pmap_alloc_l3(). */
2275 		dmb(ishst);
2276 		paddr = VM_PAGE_TO_PHYS(nkpg);
2277 		pmap_store(l2, paddr | L2_TABLE);
2278 
2279 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2280 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2281 			kernel_vm_end = vm_map_max(kernel_map);
2282 			break;
2283 		}
2284 	}
2285 }
2286 
2287 /***************************************************
2288  * page management routines.
2289  ***************************************************/
2290 
2291 CTASSERT(sizeof(struct pv_chunk) == PAGE_SIZE);
2292 CTASSERT(_NPCM == 3);
2293 CTASSERT(_NPCPV == 168);
2294 
2295 static __inline struct pv_chunk *
2296 pv_to_chunk(pv_entry_t pv)
2297 {
2298 
2299 	return ((struct pv_chunk *)((uintptr_t)pv & ~(uintptr_t)PAGE_MASK));
2300 }
2301 
2302 #define PV_PMAP(pv) (pv_to_chunk(pv)->pc_pmap)
2303 
2304 #define	PC_FREE0	0xfffffffffffffffful
2305 #define	PC_FREE1	0xfffffffffffffffful
2306 #define	PC_FREE2	0x000000fffffffffful
2307 
2308 static const uint64_t pc_freemask[_NPCM] = { PC_FREE0, PC_FREE1, PC_FREE2 };
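/*
 * Each pv_chunk provides _NPCPV (168) pv entries, tracked by the three
 * 64-bit words of pc_map: 64 + 64 + 40 bits, which is why PC_FREE2 has
 * only its low 40 bits set.
 */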
2309 
2310 #ifdef PV_STATS
2311 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2312 
2313 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2314 	"Current number of pv entry chunks");
2315 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2316 	"Current number of pv entry chunks allocated");
2317 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2318 	"Current number of pv entry chunks freed");
2319 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2320 	"Number of times tried to get a chunk page but failed.");
2321 
2322 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2323 static int pv_entry_spare;
2324 
2325 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2326 	"Current number of pv entry frees");
2327 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2328 	"Current number of pv entry allocs");
2329 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2330 	"Current number of pv entries");
2331 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
2332 	"Current number of spare pv entries");
2333 #endif
2334 
2335 /*
2336  * We are in a serious low memory condition.  Resort to
2337  * drastic measures to free some pages so we can allocate
2338  * another pv entry chunk.
2339  *
2340  * Returns NULL if PV entries were reclaimed from the specified pmap.
2341  *
2342  * We do not, however, unmap 2mpages because subsequent accesses will
2343  * allocate per-page pv entries until repromotion occurs, thereby
2344  * exacerbating the shortage of free pv entries.
2345  */
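/*
 * The scan of the global pv_chunks list is bracketed by two marker
 * chunks so that it can be resumed safely after pv_chunks_mutex has
 * been dropped to lock the pmap owning a chunk.
 */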
2346 static vm_page_t
2347 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
2348 {
2349 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
2350 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
2351 	struct md_page *pvh;
2352 	pd_entry_t *pde;
2353 	pmap_t next_pmap, pmap;
2354 	pt_entry_t *pte, tpte;
2355 	pv_entry_t pv;
2356 	vm_offset_t va;
2357 	vm_page_t m, m_pc;
2358 	struct spglist free;
2359 	uint64_t inuse;
2360 	int bit, field, freed, lvl;
2361 	static int active_reclaims = 0;
2362 
2363 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
2364 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
2365 
2366 	pmap = NULL;
2367 	m_pc = NULL;
2368 	SLIST_INIT(&free);
2369 	bzero(&pc_marker_b, sizeof(pc_marker_b));
2370 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
2371 	pc_marker = (struct pv_chunk *)&pc_marker_b;
2372 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
2373 
2374 	mtx_lock(&pv_chunks_mutex);
2375 	active_reclaims++;
2376 	TAILQ_INSERT_HEAD(&pv_chunks, pc_marker, pc_lru);
2377 	TAILQ_INSERT_TAIL(&pv_chunks, pc_marker_end, pc_lru);
2378 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
2379 	    SLIST_EMPTY(&free)) {
2380 		next_pmap = pc->pc_pmap;
2381 		if (next_pmap == NULL) {
2382 			/*
2383 			 * The next chunk is a marker.  However, it is
2384 			 * not our marker, so active_reclaims must be
2385 			 * > 1.  Consequently, the next_chunk code
2386 			 * will not rotate the pv_chunks list.
2387 			 */
2388 			goto next_chunk;
2389 		}
2390 		mtx_unlock(&pv_chunks_mutex);
2391 
2392 		/*
2393 		 * A pv_chunk can only be removed from the pc_lru list
2394 		 * when both pv_chunks_mutex is owned and the
2395 		 * corresponding pmap is locked.
2396 		 */
2397 		if (pmap != next_pmap) {
2398 			if (pmap != NULL && pmap != locked_pmap)
2399 				PMAP_UNLOCK(pmap);
2400 			pmap = next_pmap;
2401 			/* Avoid deadlock and lock recursion. */
2402 			if (pmap > locked_pmap) {
2403 				RELEASE_PV_LIST_LOCK(lockp);
2404 				PMAP_LOCK(pmap);
2405 				mtx_lock(&pv_chunks_mutex);
2406 				continue;
2407 			} else if (pmap != locked_pmap) {
2408 				if (PMAP_TRYLOCK(pmap)) {
2409 					mtx_lock(&pv_chunks_mutex);
2410 					continue;
2411 				} else {
2412 					pmap = NULL; /* pmap is not locked */
2413 					mtx_lock(&pv_chunks_mutex);
2414 					pc = TAILQ_NEXT(pc_marker, pc_lru);
2415 					if (pc == NULL ||
2416 					    pc->pc_pmap != next_pmap)
2417 						continue;
2418 					goto next_chunk;
2419 				}
2420 			}
2421 		}
2422 
2423 		/*
2424 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
2425 		 */
2426 		freed = 0;
2427 		for (field = 0; field < _NPCM; field++) {
2428 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
2429 			    inuse != 0; inuse &= ~(1UL << bit)) {
2430 				bit = ffsl(inuse) - 1;
2431 				pv = &pc->pc_pventry[field * 64 + bit];
2432 				va = pv->pv_va;
2433 				pde = pmap_pde(pmap, va, &lvl);
2434 				if (lvl != 2)
2435 					continue;
2436 				pte = pmap_l2_to_l3(pde, va);
2437 				tpte = pmap_load(pte);
2438 				if ((tpte & ATTR_SW_WIRED) != 0)
2439 					continue;
2440 				tpte = pmap_load_clear(pte);
2441 				m = PHYS_TO_VM_PAGE(tpte & ~ATTR_MASK);
2442 				if (pmap_pte_dirty(pmap, tpte))
2443 					vm_page_dirty(m);
2444 				if ((tpte & ATTR_AF) != 0) {
2445 					pmap_invalidate_page(pmap, va, true);
2446 					vm_page_aflag_set(m, PGA_REFERENCED);
2447 				}
2448 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2449 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
2450 				m->md.pv_gen++;
2451 				if (TAILQ_EMPTY(&m->md.pv_list) &&
2452 				    (m->flags & PG_FICTITIOUS) == 0) {
2453 					pvh = page_to_pvh(m);
2454 					if (TAILQ_EMPTY(&pvh->pv_list)) {
2455 						vm_page_aflag_clear(m,
2456 						    PGA_WRITEABLE);
2457 					}
2458 				}
2459 				pc->pc_map[field] |= 1UL << bit;
2460 				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
2461 				freed++;
2462 			}
2463 		}
2464 		if (freed == 0) {
2465 			mtx_lock(&pv_chunks_mutex);
2466 			goto next_chunk;
2467 		}
2468 		/* Every freed mapping is for a 4 KB page. */
2469 		pmap_resident_count_dec(pmap, freed);
2470 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
2471 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
2472 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
2473 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2474 		if (pc->pc_map[0] == PC_FREE0 && pc->pc_map[1] == PC_FREE1 &&
2475 		    pc->pc_map[2] == PC_FREE2) {
2476 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2477 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2478 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2479 			/* Entire chunk is free; return it. */
2480 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2481 			dump_drop_page(m_pc->phys_addr);
2482 			mtx_lock(&pv_chunks_mutex);
2483 			TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2484 			break;
2485 		}
2486 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2487 		mtx_lock(&pv_chunks_mutex);
2488 		/* One freed pv entry in locked_pmap is sufficient. */
2489 		if (pmap == locked_pmap)
2490 			break;
2491 
2492 next_chunk:
2493 		TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
2494 		TAILQ_INSERT_AFTER(&pv_chunks, pc, pc_marker, pc_lru);
2495 		if (active_reclaims == 1 && pmap != NULL) {
2496 			/*
2497 			 * Rotate the pv chunks list so that we do not
2498 			 * scan the same pv chunks that could not be
2499 			 * freed (because they contained a wired
2500 			 * and/or superpage mapping) on every
2501 			 * invocation of reclaim_pv_chunk().
2502 			 */
2503 			while ((pc = TAILQ_FIRST(&pv_chunks)) != pc_marker) {
2504 				MPASS(pc->pc_pmap != NULL);
2505 				TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2506 				TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2507 			}
2508 		}
2509 	}
2510 	TAILQ_REMOVE(&pv_chunks, pc_marker, pc_lru);
2511 	TAILQ_REMOVE(&pv_chunks, pc_marker_end, pc_lru);
2512 	active_reclaims--;
2513 	mtx_unlock(&pv_chunks_mutex);
2514 	if (pmap != NULL && pmap != locked_pmap)
2515 		PMAP_UNLOCK(pmap);
2516 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
2517 		m_pc = SLIST_FIRST(&free);
2518 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
2519 		/* Recycle a freed page table page. */
2520 		m_pc->ref_count = 1;
2521 	}
2522 	vm_page_free_pages_toq(&free, true);
2523 	return (m_pc);
2524 }
2525 
2526 /*
2527  * free the pv_entry back to the free list
2528  */
2529 static void
2530 free_pv_entry(pmap_t pmap, pv_entry_t pv)
2531 {
2532 	struct pv_chunk *pc;
2533 	int idx, field, bit;
2534 
2535 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2536 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
2537 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
2538 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
2539 	pc = pv_to_chunk(pv);
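	/*
	 * Mark the entry free in its chunk's bitmap; for example, the pv
	 * entry at index 70 within a chunk maps to pc_map[1], bit 6.
	 */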
2540 	idx = pv - &pc->pc_pventry[0];
2541 	field = idx / 64;
2542 	bit = idx % 64;
2543 	pc->pc_map[field] |= 1ul << bit;
2544 	if (pc->pc_map[0] != PC_FREE0 || pc->pc_map[1] != PC_FREE1 ||
2545 	    pc->pc_map[2] != PC_FREE2) {
2546 		/* 98% of the time, pc is already at the head of the list. */
2547 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
2548 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2549 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2550 		}
2551 		return;
2552 	}
2553 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2554 	free_pv_chunk(pc);
2555 }
2556 
2557 static void
2558 free_pv_chunk(struct pv_chunk *pc)
2559 {
2560 	vm_page_t m;
2561 
2562 	mtx_lock(&pv_chunks_mutex);
2563 	TAILQ_REMOVE(&pv_chunks, pc, pc_lru);
2564 	mtx_unlock(&pv_chunks_mutex);
2565 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
2566 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
2567 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
2568 	/* entire chunk is free, return it */
2569 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
2570 	dump_drop_page(m->phys_addr);
2571 	vm_page_unwire_noq(m);
2572 	vm_page_free(m);
2573 }
2574 
2575 /*
2576  * Returns a new PV entry, allocating a new PV chunk from the system when
2577  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
2578  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
2579  * returned.
2580  *
2581  * The given PV list lock may be released.
2582  */
2583 static pv_entry_t
2584 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
2585 {
2586 	int bit, field;
2587 	pv_entry_t pv;
2588 	struct pv_chunk *pc;
2589 	vm_page_t m;
2590 
2591 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2592 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
2593 retry:
2594 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2595 	if (pc != NULL) {
2596 		for (field = 0; field < _NPCM; field++) {
2597 			if (pc->pc_map[field]) {
2598 				bit = ffsl(pc->pc_map[field]) - 1;
2599 				break;
2600 			}
2601 		}
2602 		if (field < _NPCM) {
2603 			pv = &pc->pc_pventry[field * 64 + bit];
2604 			pc->pc_map[field] &= ~(1ul << bit);
2605 			/* If this was the last item, move it to tail */
2606 			if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 &&
2607 			    pc->pc_map[2] == 0) {
2608 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2609 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
2610 				    pc_list);
2611 			}
2612 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
2613 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
2614 			return (pv);
2615 		}
2616 	}
2617 	/* No free items, allocate another chunk */
2618 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2619 	if (m == NULL) {
2620 		if (lockp == NULL) {
2621 			PV_STAT(pc_chunk_tryfail++);
2622 			return (NULL);
2623 		}
2624 		m = reclaim_pv_chunk(pmap, lockp);
2625 		if (m == NULL)
2626 			goto retry;
2627 	}
2628 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2629 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2630 	dump_add_page(m->phys_addr);
2631 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2632 	pc->pc_pmap = pmap;
2633 	pc->pc_map[0] = PC_FREE0 & ~1ul;	/* preallocated bit 0 */
2634 	pc->pc_map[1] = PC_FREE1;
2635 	pc->pc_map[2] = PC_FREE2;
2636 	mtx_lock(&pv_chunks_mutex);
2637 	TAILQ_INSERT_TAIL(&pv_chunks, pc, pc_lru);
2638 	mtx_unlock(&pv_chunks_mutex);
2639 	pv = &pc->pc_pventry[0];
2640 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2641 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
2642 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
2643 	return (pv);
2644 }
2645 
2646 /*
2647  * Ensure that the number of spare PV entries in the specified pmap meets or
2648  * exceeds the given count, "needed".
2649  *
2650  * The given PV list lock may be released.
2651  */
2652 static void
2653 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
2654 {
2655 	struct pch new_tail;
2656 	struct pv_chunk *pc;
2657 	vm_page_t m;
2658 	int avail, free;
2659 	bool reclaimed;
2660 
2661 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2662 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
2663 
2664 	/*
2665 	 * Newly allocated PV chunks must be kept on a private list until
2666 	 * the required number of PV chunks have been allocated.  Otherwise,
2667 	 * reclaim_pv_chunk() could recycle one of these chunks.  The chunks
2668 	 * are, however, added to the pmap's own chunk list upon allocation.
2669 	 */
2670 	TAILQ_INIT(&new_tail);
2671 retry:
2672 	avail = 0;
2673 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
2674 		bit_count((bitstr_t *)pc->pc_map, 0,
2675 		    sizeof(pc->pc_map) * NBBY, &free);
2676 		if (free == 0)
2677 			break;
2678 		avail += free;
2679 		if (avail >= needed)
2680 			break;
2681 	}
2682 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
2683 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
2684 		if (m == NULL) {
2685 			m = reclaim_pv_chunk(pmap, lockp);
2686 			if (m == NULL)
2687 				goto retry;
2688 			reclaimed = true;
2689 		}
2690 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
2691 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
2692 		dump_add_page(m->phys_addr);
2693 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
2694 		pc->pc_pmap = pmap;
2695 		pc->pc_map[0] = PC_FREE0;
2696 		pc->pc_map[1] = PC_FREE1;
2697 		pc->pc_map[2] = PC_FREE2;
2698 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
2699 		TAILQ_INSERT_TAIL(&new_tail, pc, pc_lru);
2700 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
2701 
2702 		/*
2703 		 * The reclaim might have freed a chunk from the current pmap.
2704 		 * If that chunk contained available entries, we need to
2705 		 * re-count the number of available entries.
2706 		 */
2707 		if (reclaimed)
2708 			goto retry;
2709 	}
2710 	if (!TAILQ_EMPTY(&new_tail)) {
2711 		mtx_lock(&pv_chunks_mutex);
2712 		TAILQ_CONCAT(&pv_chunks, &new_tail, pc_lru);
2713 		mtx_unlock(&pv_chunks_mutex);
2714 	}
2715 }
2716 
2717 /*
2718  * First find and then remove the pv entry for the specified pmap and virtual
2719  * address from the specified pv list.  Returns the pv entry if found and NULL
2720  * otherwise.  This operation can be performed on pv lists for either 4KB or
2721  * 2MB page mappings.
2722  */
2723 static __inline pv_entry_t
2724 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2725 {
2726 	pv_entry_t pv;
2727 
2728 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
2729 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
2730 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
2731 			pvh->pv_gen++;
2732 			break;
2733 		}
2734 	}
2735 	return (pv);
2736 }
2737 
2738 /*
2739  * After demotion from a 2MB page mapping to 512 4KB page mappings,
2740  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
2741  * entries for each of the 4KB page mappings.
2742  */
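/*
 * The Ln_ENTRIES - 1 new pv entries are taken from spares that the caller
 * must already have reserved; see the "missing spare" assertion below.
 */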
2743 static void
2744 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
2745     struct rwlock **lockp)
2746 {
2747 	struct md_page *pvh;
2748 	struct pv_chunk *pc;
2749 	pv_entry_t pv;
2750 	vm_offset_t va_last;
2751 	vm_page_t m;
2752 	int bit, field;
2753 
2754 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2755 	KASSERT((va & L2_OFFSET) == 0,
2756 	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
2757 	KASSERT((pa & L2_OFFSET) == 0,
2758 	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
2759 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2760 
2761 	/*
2762 	 * Transfer the 2mpage's pv entry for this mapping to the first
2763 	 * page's pv list.  Once this transfer begins, the pv list lock
2764 	 * must not be released until the last pv entry is reinstantiated.
2765 	 */
2766 	pvh = pa_to_pvh(pa);
2767 	pv = pmap_pvh_remove(pvh, pmap, va);
2768 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
2769 	m = PHYS_TO_VM_PAGE(pa);
2770 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2771 	m->md.pv_gen++;
2772 	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
2773 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
2774 	va_last = va + L2_SIZE - PAGE_SIZE;
2775 	for (;;) {
2776 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
2777 		KASSERT(pc->pc_map[0] != 0 || pc->pc_map[1] != 0 ||
2778 		    pc->pc_map[2] != 0, ("pmap_pv_demote_l2: missing spare"));
2779 		for (field = 0; field < _NPCM; field++) {
2780 			while (pc->pc_map[field]) {
2781 				bit = ffsl(pc->pc_map[field]) - 1;
2782 				pc->pc_map[field] &= ~(1ul << bit);
2783 				pv = &pc->pc_pventry[field * 64 + bit];
2784 				va += PAGE_SIZE;
2785 				pv->pv_va = va;
2786 				m++;
2787 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
2788 			    ("pmap_pv_demote_l2: page %p is not managed", m));
2789 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2790 				m->md.pv_gen++;
2791 				if (va == va_last)
2792 					goto out;
2793 			}
2794 		}
2795 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2796 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2797 	}
2798 out:
2799 	if (pc->pc_map[0] == 0 && pc->pc_map[1] == 0 && pc->pc_map[2] == 0) {
2800 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
2801 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
2802 	}
2803 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
2804 	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
2805 }
2806 
2807 /*
2808  * First find and then destroy the pv entry for the specified pmap and virtual
2809  * address.  This operation can be performed on pv lists for either 4KB or 2MB
2810  * page mappings.
2811  */
2812 static void
2813 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
2814 {
2815 	pv_entry_t pv;
2816 
2817 	pv = pmap_pvh_remove(pvh, pmap, va);
2818 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
2819 	free_pv_entry(pmap, pv);
2820 }
2821 
2822 /*
2823  * Conditionally create the PV entry for a 4KB page mapping if the required
2824  * memory can be allocated without resorting to reclamation.
2825  */
2826 static boolean_t
2827 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
2828     struct rwlock **lockp)
2829 {
2830 	pv_entry_t pv;
2831 
2832 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2833 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2834 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
2835 		pv->pv_va = va;
2836 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2837 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
2838 		m->md.pv_gen++;
2839 		return (TRUE);
2840 	} else
2841 		return (FALSE);
2842 }
2843 
2844 /*
2845  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
2846  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
2847  * false if the PV entry cannot be allocated without resorting to reclamation.
2848  */
2849 static bool
2850 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
2851     struct rwlock **lockp)
2852 {
2853 	struct md_page *pvh;
2854 	pv_entry_t pv;
2855 	vm_paddr_t pa;
2856 
2857 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2858 	/* Pass NULL instead of the lock pointer to disable reclamation. */
2859 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
2860 	    NULL : lockp)) == NULL)
2861 		return (false);
2862 	pv->pv_va = va;
2863 	pa = l2e & ~ATTR_MASK;
2864 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
2865 	pvh = pa_to_pvh(pa);
2866 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
2867 	pvh->pv_gen++;
2868 	return (true);
2869 }
2870 
2871 static void
2872 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
2873 {
2874 	pt_entry_t newl2, oldl2 __diagused;
2875 	vm_page_t ml3;
2876 	vm_paddr_t ml3pa;
2877 
2878 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
2879 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
2880 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2881 
2882 	ml3 = pmap_remove_pt_page(pmap, va);
2883 	if (ml3 == NULL)
2884 		panic("pmap_remove_kernel_l2: Missing pt page");
2885 
2886 	ml3pa = VM_PAGE_TO_PHYS(ml3);
2887 	newl2 = ml3pa | L2_TABLE;
2888 
2889 	/*
2890 	 * If this page table page was unmapped by a promotion, then it
2891 	 * contains valid mappings.  Zero it to invalidate those mappings.
2892 	 */
2893 	if (ml3->valid != 0)
2894 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
2895 
2896 	/*
2897 	 * Demote the mapping.  The caller must have already invalidated the
2898 	 * mapping (i.e., the "break" in break-before-make).
2899 	 */
2900 	oldl2 = pmap_load_store(l2, newl2);
2901 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
2902 	    __func__, l2, oldl2));
2903 }
2904 
2905 /*
2906  * pmap_remove_l2: Remove a 2MB (level 2 block) superpage mapping.
2907  */
2908 static int
2909 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
2910     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
2911 {
2912 	struct md_page *pvh;
2913 	pt_entry_t old_l2;
2914 	vm_page_t m, ml3, mt;
2915 
2916 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2917 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
2918 	old_l2 = pmap_load_clear(l2);
2919 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
2920 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
2921 
2922 	/*
2923 	 * Since a promotion must break the 4KB page mappings before making
2924 	 * the 2MB page mapping, a pmap_invalidate_page() suffices.
2925 	 */
2926 	pmap_invalidate_page(pmap, sva, true);
2927 
2928 	if (old_l2 & ATTR_SW_WIRED)
2929 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
2930 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
2931 	if (old_l2 & ATTR_SW_MANAGED) {
2932 		m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
2933 		pvh = page_to_pvh(m);
2934 		CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, old_l2 & ~ATTR_MASK);
2935 		pmap_pvh_free(pvh, pmap, sva);
2936 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
2937 			if (pmap_pte_dirty(pmap, old_l2))
2938 				vm_page_dirty(mt);
2939 			if (old_l2 & ATTR_AF)
2940 				vm_page_aflag_set(mt, PGA_REFERENCED);
2941 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
2942 			    TAILQ_EMPTY(&pvh->pv_list))
2943 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
2944 		}
2945 	}
2946 	if (pmap == kernel_pmap) {
2947 		pmap_remove_kernel_l2(pmap, l2, sva);
2948 	} else {
2949 		ml3 = pmap_remove_pt_page(pmap, sva);
2950 		if (ml3 != NULL) {
2951 			KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
2952 			    ("pmap_remove_l2: l3 page not promoted"));
2953 			pmap_resident_count_dec(pmap, 1);
2954 			KASSERT(ml3->ref_count == NL3PG,
2955 			    ("pmap_remove_l2: l3 page ref count error"));
2956 			ml3->ref_count = 0;
2957 			pmap_add_delayed_free_list(ml3, free, FALSE);
2958 		}
2959 	}
2960 	return (pmap_unuse_pt(pmap, sva, l1e, free));
2961 }
2962 
2963 /*
2964  * pmap_remove_l3: Remove a 4KB (level 3) page mapping.
2965  */
2966 static int
2967 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
2968     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
2969 {
2970 	struct md_page *pvh;
2971 	pt_entry_t old_l3;
2972 	vm_page_t m;
2973 
2974 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2975 	old_l3 = pmap_load_clear(l3);
2976 	pmap_invalidate_page(pmap, va, true);
2977 	if (old_l3 & ATTR_SW_WIRED)
2978 		pmap->pm_stats.wired_count -= 1;
2979 	pmap_resident_count_dec(pmap, 1);
2980 	if (old_l3 & ATTR_SW_MANAGED) {
2981 		m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
2982 		if (pmap_pte_dirty(pmap, old_l3))
2983 			vm_page_dirty(m);
2984 		if (old_l3 & ATTR_AF)
2985 			vm_page_aflag_set(m, PGA_REFERENCED);
2986 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
2987 		pmap_pvh_free(&m->md, pmap, va);
2988 		if (TAILQ_EMPTY(&m->md.pv_list) &&
2989 		    (m->flags & PG_FICTITIOUS) == 0) {
2990 			pvh = page_to_pvh(m);
2991 			if (TAILQ_EMPTY(&pvh->pv_list))
2992 				vm_page_aflag_clear(m, PGA_WRITEABLE);
2993 		}
2994 	}
2995 	return (pmap_unuse_pt(pmap, va, l2e, free));
2996 }
2997 
2998 /*
2999  * Remove the specified range of addresses from the L3 page table that is
3000  * identified by the given L2 entry.
3001  */
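/*
 * TLB invalidations are batched: "va" records the start of the current
 * run of removed mappings, and pmap_invalidate_range() is issued whenever
 * the run is broken or the loop finishes.
 */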
3002 static void
3003 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3004     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3005 {
3006 	struct md_page *pvh;
3007 	struct rwlock *new_lock;
3008 	pt_entry_t *l3, old_l3;
3009 	vm_offset_t va;
3010 	vm_page_t l3pg, m;
3011 
3012 	KASSERT(ADDR_IS_CANONICAL(sva),
3013 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
3014 	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3015 	    ("%s: End address not in canonical form: %lx", __func__, eva));
3016 
3017 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3018 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3019 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3020 	l3pg = !ADDR_IS_KERNEL(sva) ? PHYS_TO_VM_PAGE(l2e & ~ATTR_MASK) : NULL;
3021 	va = eva;
3022 	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3023 		if (!pmap_l3_valid(pmap_load(l3))) {
3024 			if (va != eva) {
3025 				pmap_invalidate_range(pmap, va, sva, true);
3026 				va = eva;
3027 			}
3028 			continue;
3029 		}
3030 		old_l3 = pmap_load_clear(l3);
3031 		if ((old_l3 & ATTR_SW_WIRED) != 0)
3032 			pmap->pm_stats.wired_count--;
3033 		pmap_resident_count_dec(pmap, 1);
3034 		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
3035 			m = PHYS_TO_VM_PAGE(old_l3 & ~ATTR_MASK);
3036 			if (pmap_pte_dirty(pmap, old_l3))
3037 				vm_page_dirty(m);
3038 			if ((old_l3 & ATTR_AF) != 0)
3039 				vm_page_aflag_set(m, PGA_REFERENCED);
3040 			new_lock = PHYS_TO_PV_LIST_LOCK(VM_PAGE_TO_PHYS(m));
3041 			if (new_lock != *lockp) {
3042 				if (*lockp != NULL) {
3043 					/*
3044 					 * Pending TLB invalidations must be
3045 					 * performed before the PV list lock is
3046 					 * released.  Otherwise, a concurrent
3047 					 * pmap_remove_all() on a physical page
3048 					 * could return while a stale TLB entry
3049 					 * still provides access to that page.
3050 					 */
3051 					if (va != eva) {
3052 						pmap_invalidate_range(pmap, va,
3053 						    sva, true);
3054 						va = eva;
3055 					}
3056 					rw_wunlock(*lockp);
3057 				}
3058 				*lockp = new_lock;
3059 				rw_wlock(*lockp);
3060 			}
3061 			pmap_pvh_free(&m->md, pmap, sva);
3062 			if (TAILQ_EMPTY(&m->md.pv_list) &&
3063 			    (m->flags & PG_FICTITIOUS) == 0) {
3064 				pvh = page_to_pvh(m);
3065 				if (TAILQ_EMPTY(&pvh->pv_list))
3066 					vm_page_aflag_clear(m, PGA_WRITEABLE);
3067 			}
3068 		}
3069 		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3070 			/*
3071 			 * _pmap_unwire_l3() has already invalidated the TLB
3072 			 * entries at all levels for "sva".  So, we need not
3073 			 * perform "sva += L3_SIZE;" here.  Moreover, we need
3074 			 * not perform "va = sva;" if "sva" is at the start
3075 			 * of a new valid range consisting of a single page.
3076 			 */
3077 			break;
3078 		}
3079 		if (va == eva)
3080 			va = sva;
3081 	}
3082 	if (va != eva)
3083 		pmap_invalidate_range(pmap, va, sva, true);
3084 }
3085 
3086 /*
3087  *	Remove the given range of addresses from the specified map.
3088  *
3089  *	It is assumed that the start and end are properly
3090  *	rounded to the page size.
3091  */
3092 void
3093 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
3094 {
3095 	struct rwlock *lock;
3096 	vm_offset_t va_next;
3097 	pd_entry_t *l0, *l1, *l2;
3098 	pt_entry_t l3_paddr;
3099 	struct spglist free;
3100 
3101 	/*
3102 	 * Perform an unsynchronized read.  This is, however, safe.
3103 	 */
3104 	if (pmap->pm_stats.resident_count == 0)
3105 		return;
3106 
3107 	SLIST_INIT(&free);
3108 
3109 	PMAP_LOCK(pmap);
3110 
3111 	lock = NULL;
3112 	for (; sva < eva; sva = va_next) {
3113 		if (pmap->pm_stats.resident_count == 0)
3114 			break;
3115 
3116 		l0 = pmap_l0(pmap, sva);
3117 		if (pmap_load(l0) == 0) {
3118 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3119 			if (va_next < sva)
3120 				va_next = eva;
3121 			continue;
3122 		}
3123 
3124 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3125 		if (va_next < sva)
3126 			va_next = eva;
3127 		l1 = pmap_l0_to_l1(l0, sva);
3128 		if (pmap_load(l1) == 0)
3129 			continue;
3130 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
3131 			KASSERT(va_next <= eva,
3132 			    ("partial update of non-transparent 1G page "
3133 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
3134 			    pmap_load(l1), sva, eva, va_next));
3135 			MPASS(pmap != kernel_pmap);
3136 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
3137 			pmap_clear(l1);
3138 			pmap_invalidate_page(pmap, sva, true);
3139 			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
3140 			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
3141 			continue;
3142 		}
3143 
3144 		/*
3145 		 * Calculate index for next page table.
3146 		 */
3147 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3148 		if (va_next < sva)
3149 			va_next = eva;
3150 
3151 		l2 = pmap_l1_to_l2(l1, sva);
3152 		if (l2 == NULL)
3153 			continue;
3154 
3155 		l3_paddr = pmap_load(l2);
3156 
3157 		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
3158 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3159 				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
3160 				    &free, &lock);
3161 				continue;
3162 			} else if (pmap_demote_l2_locked(pmap, l2, sva,
3163 			    &lock) == NULL)
3164 				continue;
3165 			l3_paddr = pmap_load(l2);
3166 		}
3167 
3168 		/*
3169 		 * Weed out invalid mappings.
3170 		 */
3171 		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
3172 			continue;
3173 
3174 		/*
3175 		 * Limit our scan to either the end of the va represented
3176 		 * by the current page table page, or to the end of the
3177 		 * range being removed.
3178 		 */
3179 		if (va_next > eva)
3180 			va_next = eva;
3181 
3182 		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
3183 		    &lock);
3184 	}
3185 	if (lock != NULL)
3186 		rw_wunlock(lock);
3187 	PMAP_UNLOCK(pmap);
3188 	vm_page_free_pages_toq(&free, true);
3189 }
3190 
3191 /*
3192  *	Routine:	pmap_remove_all
3193  *	Function:
3194  *		Removes this physical page from
3195  *		all physical maps in which it resides.
3196  *		Reflects back modify bits to the pager.
3197  *
3198  *	Notes:
3199  *		Original versions of this routine were very
3200  *		inefficient because they iteratively called
3201  *		pmap_remove (slow...)
3202  */
3203 
3204 void
3205 pmap_remove_all(vm_page_t m)
3206 {
3207 	struct md_page *pvh;
3208 	pv_entry_t pv;
3209 	pmap_t pmap;
3210 	struct rwlock *lock;
3211 	pd_entry_t *pde, tpde;
3212 	pt_entry_t *pte, tpte;
3213 	vm_offset_t va;
3214 	struct spglist free;
3215 	int lvl, pvh_gen, md_gen;
3216 
3217 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3218 	    ("pmap_remove_all: page %p is not managed", m));
3219 	SLIST_INIT(&free);
3220 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3221 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
3222 	rw_wlock(lock);
3223 retry:
3224 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
3225 		pmap = PV_PMAP(pv);
3226 		if (!PMAP_TRYLOCK(pmap)) {
3227 			pvh_gen = pvh->pv_gen;
3228 			rw_wunlock(lock);
3229 			PMAP_LOCK(pmap);
3230 			rw_wlock(lock);
3231 			if (pvh_gen != pvh->pv_gen) {
3232 				PMAP_UNLOCK(pmap);
3233 				goto retry;
3234 			}
3235 		}
3236 		va = pv->pv_va;
3237 		pte = pmap_pte_exists(pmap, va, 2, __func__);
3238 		pmap_demote_l2_locked(pmap, pte, va, &lock);
3239 		PMAP_UNLOCK(pmap);
3240 	}
3241 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
3242 		pmap = PV_PMAP(pv);
3243 		PMAP_ASSERT_STAGE1(pmap);
3244 		if (!PMAP_TRYLOCK(pmap)) {
3245 			pvh_gen = pvh->pv_gen;
3246 			md_gen = m->md.pv_gen;
3247 			rw_wunlock(lock);
3248 			PMAP_LOCK(pmap);
3249 			rw_wlock(lock);
3250 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
3251 				PMAP_UNLOCK(pmap);
3252 				goto retry;
3253 			}
3254 		}
3255 		pmap_resident_count_dec(pmap, 1);
3256 
3257 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
3258 		KASSERT(pde != NULL,
3259 		    ("pmap_remove_all: no page directory entry found"));
3260 		KASSERT(lvl == 2,
3261 		    ("pmap_remove_all: invalid pde level %d", lvl));
3262 		tpde = pmap_load(pde);
3263 
3264 		pte = pmap_l2_to_l3(pde, pv->pv_va);
3265 		tpte = pmap_load_clear(pte);
3266 		if (tpte & ATTR_SW_WIRED)
3267 			pmap->pm_stats.wired_count--;
3268 		if ((tpte & ATTR_AF) != 0) {
3269 			pmap_invalidate_page(pmap, pv->pv_va, true);
3270 			vm_page_aflag_set(m, PGA_REFERENCED);
3271 		}
3272 
3273 		/*
3274 		 * Update the vm_page_t clean and reference bits.
3275 		 */
3276 		if (pmap_pte_dirty(pmap, tpte))
3277 			vm_page_dirty(m);
3278 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
3279 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3280 		m->md.pv_gen++;
3281 		free_pv_entry(pmap, pv);
3282 		PMAP_UNLOCK(pmap);
3283 	}
3284 	vm_page_aflag_clear(m, PGA_WRITEABLE);
3285 	rw_wunlock(lock);
3286 	vm_page_free_pages_toq(&free, true);
3287 }
3288 
3289 /*
3290  * pmap_protect_l2: Restrict access permissions on a 2MB (L2 block) mapping.
3291  */
3292 static void
3293 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
3294     pt_entry_t nbits)
3295 {
3296 	pd_entry_t old_l2;
3297 	vm_page_t m, mt;
3298 
3299 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3300 	PMAP_ASSERT_STAGE1(pmap);
3301 	KASSERT((sva & L2_OFFSET) == 0,
3302 	    ("pmap_protect_l2: sva is not 2mpage aligned"));
3303 	old_l2 = pmap_load(l2);
3304 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3305 	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
3306 
3307 	/*
3308 	 * Return if the L2 entry already has the desired access restrictions
3309 	 * in place.
3310 	 */
3311 	if ((old_l2 & mask) == nbits)
3312 		return;
3313 
3314 	while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
3315 		cpu_spinwait();
3316 
3317 	/*
3318 	 * When a dirty read/write superpage mapping is write protected,
3319 	 * update the dirty field of each of the superpage's constituent 4KB
3320 	 * pages.
3321 	 */
3322 	if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
3323 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3324 	    pmap_pte_dirty(pmap, old_l2)) {
3325 		m = PHYS_TO_VM_PAGE(old_l2 & ~ATTR_MASK);
3326 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
3327 			vm_page_dirty(mt);
3328 	}
3329 
3330 	/*
3331 	 * Since a promotion must break the 4KB page mappings before making
3332 	 * the 2MB page mapping, a pmap_invalidate_page() suffices.
3333 	 */
3334 	pmap_invalidate_page(pmap, sva, true);
3335 }
3336 
3337 /*
3338  *	Set the physical protection on the
3339  *	specified range of this map as requested.
3340  */
3341 void
3342 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
3343 {
3344 	vm_offset_t va, va_next;
3345 	pd_entry_t *l0, *l1, *l2;
3346 	pt_entry_t *l3p, l3, mask, nbits;
3347 
3348 	PMAP_ASSERT_STAGE1(pmap);
3349 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
3350 	if (prot == VM_PROT_NONE) {
3351 		pmap_remove(pmap, sva, eva);
3352 		return;
3353 	}
3354 
3355 	mask = nbits = 0;
3356 	if ((prot & VM_PROT_WRITE) == 0) {
3357 		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
3358 		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
3359 	}
3360 	if ((prot & VM_PROT_EXECUTE) == 0) {
3361 		mask |= ATTR_S1_XN;
3362 		nbits |= ATTR_S1_XN;
3363 	}
3364 	if (mask == 0)
3365 		return;
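	/*
	 * For example, removing only write access sets "mask" to
	 * ATTR_S1_AP_RW_BIT | ATTR_SW_DBM and "nbits" to
	 * ATTR_S1_AP(ATTR_S1_AP_RO), making each mapping read-only below.
	 */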
3366 
3367 	PMAP_LOCK(pmap);
3368 	for (; sva < eva; sva = va_next) {
3369 		l0 = pmap_l0(pmap, sva);
3370 		if (pmap_load(l0) == 0) {
3371 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3372 			if (va_next < sva)
3373 				va_next = eva;
3374 			continue;
3375 		}
3376 
3377 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3378 		if (va_next < sva)
3379 			va_next = eva;
3380 		l1 = pmap_l0_to_l1(l0, sva);
3381 		if (pmap_load(l1) == 0)
3382 			continue;
3383 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
3384 			KASSERT(va_next <= eva,
3385 			    ("partial update of non-transparent 1G page "
3386 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
3387 			    pmap_load(l1), sva, eva, va_next));
3388 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
3389 			if ((pmap_load(l1) & mask) != nbits) {
3390 				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
3391 				pmap_invalidate_page(pmap, sva, true);
3392 			}
3393 			continue;
3394 		}
3395 
3396 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
3397 		if (va_next < sva)
3398 			va_next = eva;
3399 
3400 		l2 = pmap_l1_to_l2(l1, sva);
3401 		if (pmap_load(l2) == 0)
3402 			continue;
3403 
3404 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
3405 			if (sva + L2_SIZE == va_next && eva >= va_next) {
3406 				pmap_protect_l2(pmap, l2, sva, mask, nbits);
3407 				continue;
3408 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
3409 				continue;
3410 		}
3411 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
3412 		    ("pmap_protect: Invalid L2 entry after demotion"));
3413 
3414 		if (va_next > eva)
3415 			va_next = eva;
3416 
3417 		va = va_next;
3418 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
3419 		    sva += L3_SIZE) {
3420 			l3 = pmap_load(l3p);
3421 
3422 			/*
3423 			 * Go to the next L3 entry if the current one is
3424 			 * invalid or already has the desired access
3425 			 * restrictions in place.  (The latter case occurs
3426 			 * frequently.  For example, in a "buildworld"
3427 			 * workload, almost 1 out of 4 L3 entries already
3428 			 * have the desired restrictions.)
3429 			 */
3430 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
3431 				if (va != va_next) {
3432 					pmap_invalidate_range(pmap, va, sva,
3433 					    true);
3434 					va = va_next;
3435 				}
3436 				continue;
3437 			}
3438 
3439 			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
3440 			    nbits))
3441 				cpu_spinwait();
3442 
3443 			/*
3444 			 * When a dirty read/write mapping is write protected,
3445 			 * update the page's dirty field.
3446 			 */
3447 			if ((l3 & ATTR_SW_MANAGED) != 0 &&
3448 			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
3449 			    pmap_pte_dirty(pmap, l3))
3450 				vm_page_dirty(PHYS_TO_VM_PAGE(l3 & ~ATTR_MASK));
3451 
3452 			if (va == va_next)
3453 				va = sva;
3454 		}
3455 		if (va != va_next)
3456 			pmap_invalidate_range(pmap, va, sva, true);
3457 	}
3458 	PMAP_UNLOCK(pmap);
3459 }
3460 
3461 /*
3462  * Inserts the specified page table page into the specified pmap's collection
3463  * of idle page table pages.  Each of a pmap's page table pages is responsible
3464  * for mapping a distinct range of virtual addresses.  The pmap's collection is
3465  * ordered by this virtual address range.
3466  *
3467  * If "promoted" is false, then the page table page "mpte" must be zero filled.
3468  */
3469 static __inline int
3470 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted)
3471 {
3472 
3473 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3474 	mpte->valid = promoted ? VM_PAGE_BITS_ALL : 0;
3475 	return (vm_radix_insert(&pmap->pm_root, mpte));
3476 }
3477 
3478 /*
3479  * Removes the page table page mapping the specified virtual address from the
3480  * specified pmap's collection of idle page table pages, and returns it.
3481  * Otherwise, returns NULL if there is no page table page corresponding to the
3482  * specified virtual address.
3483  */
3484 static __inline vm_page_t
3485 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
3486 {
3487 
3488 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3489 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
3490 }
3491 
3492 /*
3493  * Performs a break-before-make update of a pmap entry. This is needed when
3494  * either promoting or demoting pages to ensure the TLB doesn't get into an
3495  * inconsistent state.
3496  */
3497 static void
3498 pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
3499     vm_offset_t va, vm_size_t size)
3500 {
3501 	register_t intr;
3502 
3503 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3504 
3505 	/*
3506 	 * Ensure we don't get switched out with the page table in an
3507 	 * inconsistent state. We also need to ensure no interrupts fire
3508 	 * as they may make use of an address we are about to invalidate.
3509 	 */
3510 	intr = intr_disable();
3511 
3512 	/*
3513 	 * Clear the old mapping's valid bit, but leave the rest of the entry
3514 	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
3515 	 * lookup the physical address.
3516 	 */
3517 	pmap_clear_bits(pte, ATTR_DESCR_VALID);
3518 
3519 	/*
3520 	 * When promoting, the L{1,2}_TABLE entry that is being replaced might
3521 	 * be cached, so we invalidate intermediate entries as well as final
3522 	 * entries.
3523 	 */
3524 	pmap_invalidate_range(pmap, va, va + size, false);
3525 
3526 	/* Create the new mapping */
3527 	pmap_store(pte, newpte);
3528 	dsb(ishst);
3529 
3530 	intr_restore(intr);
3531 }
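
/*
 * Illustrative sketch (not compiled): how a caller might use
 * pmap_update_entry() to change the memory attribute index of an existing
 * 2MB block mapping in place.  The helper name and the "new_idx" parameter
 * are hypothetical; the real users are the promotion and demotion paths in
 * this file.  ATTR_S1_IDX_MASK is assumed to be the attribute-index field
 * mask from pte.h.
 */
#if 0
static void
pmap_example_change_l2_attr(pmap_t pmap, vm_offset_t va, int new_idx)
{
	pd_entry_t *l2, newl2;

	PMAP_LOCK(pmap);
	l2 = pmap_l2(pmap, va);
	KASSERT(l2 != NULL && (pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK,
	    ("%s: no 2MB block mapping at %#lx", __func__, va));
	newl2 = (pmap_load(l2) & ~ATTR_S1_IDX_MASK) | ATTR_S1_IDX(new_idx);
	/* Break-before-make: the old entry is invalidated before the store. */
	pmap_update_entry(pmap, l2, newl2, va & ~L2_OFFSET, L2_SIZE);
	PMAP_UNLOCK(pmap);
}
#endif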
3532 
3533 #if VM_NRESERVLEVEL > 0
3534 /*
3535  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
3536  * replace the many pv entries for the 4KB page mappings by a single pv entry
3537  * for the 2MB page mapping.
3538  */
3539 static void
3540 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3541     struct rwlock **lockp)
3542 {
3543 	struct md_page *pvh;
3544 	pv_entry_t pv;
3545 	vm_offset_t va_last;
3546 	vm_page_t m;
3547 
3548 	KASSERT((pa & L2_OFFSET) == 0,
3549 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
3550 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3551 
3552 	/*
3553 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
3554 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
3555 	 * a transfer avoids the possibility that get_pv_entry() calls
3556 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
3557 	 * mappings that is being promoted.
3558 	 */
3559 	m = PHYS_TO_VM_PAGE(pa);
3560 	va = va & ~L2_OFFSET;
3561 	pv = pmap_pvh_remove(&m->md, pmap, va);
3562 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
3563 	pvh = page_to_pvh(m);
3564 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3565 	pvh->pv_gen++;
3566 	/* Free the remaining NL3PG - 1 pv entries. */
3567 	va_last = va + L2_SIZE - PAGE_SIZE;
3568 	do {
3569 		m++;
3570 		va += PAGE_SIZE;
3571 		pmap_pvh_free(&m->md, pmap, va);
3572 	} while (va < va_last);
3573 }
3574 
3575 /*
3576  * Tries to promote the 512, contiguous 4KB page mappings that are within a
3577  * single level 2 table entry to a single 2MB page mapping.  For promotion
3578  * to occur, two conditions must be met: (1) the 4KB page mappings must map
3579  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
3580  * identical characteristics.
3581  */
3582 static void
3583 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va,
3584     struct rwlock **lockp)
3585 {
3586 	pt_entry_t *firstl3, *l3, newl2, oldl3, pa;
3587 	vm_page_t mpte;
3588 	vm_offset_t sva;
3589 
3590 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3591 	PMAP_ASSERT_STAGE1(pmap);
3592 
3593 	sva = va & ~L2_OFFSET;
3594 	firstl3 = pmap_l2_to_l3(l2, sva);
3595 	newl2 = pmap_load(firstl3);
3596 
3597 	if (((newl2 & (~ATTR_MASK | ATTR_AF)) & L2_OFFSET) != ATTR_AF) {
3598 		atomic_add_long(&pmap_l2_p_failures, 1);
3599 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
3600 		    " in pmap %p", va, pmap);
3601 		return;
3602 	}
3603 
3604 setl2:
3605 	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
3606 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
3607 		/*
3608 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
3609 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
3610 		 */
3611 		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
3612 			goto setl2;
3613 		newl2 &= ~ATTR_SW_DBM;
3614 	}
3615 
3616 	pa = newl2 + L2_SIZE - PAGE_SIZE;
3617 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
3618 		oldl3 = pmap_load(l3);
3619 setl3:
3620 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
3621 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
3622 			/*
3623 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
3624 			 * set, ATTR_SW_DBM can be cleared without a TLB
3625 			 * invalidation.
3626 			 */
3627 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
3628 			    ~ATTR_SW_DBM))
3629 				goto setl3;
3630 			oldl3 &= ~ATTR_SW_DBM;
3631 		}
3632 		if (oldl3 != pa) {
3633 			atomic_add_long(&pmap_l2_p_failures, 1);
3634 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
3635 			    " in pmap %p", va, pmap);
3636 			return;
3637 		}
3638 		pa -= PAGE_SIZE;
3639 	}
3640 
3641 	/*
3642 	 * Save the page table page in its current state until the L2
3643 	 * mapping the superpage is demoted by pmap_demote_l2() or
3644 	 * destroyed by pmap_remove_l3().
3645 	 */
3646 	mpte = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
3647 	KASSERT(mpte >= vm_page_array &&
3648 	    mpte < &vm_page_array[vm_page_array_size],
3649 	    ("pmap_promote_l2: page table page is out of range"));
3650 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
3651 	    ("pmap_promote_l2: page table page's pindex is wrong"));
3652 	if (pmap_insert_pt_page(pmap, mpte, true)) {
3653 		atomic_add_long(&pmap_l2_p_failures, 1);
3654 		CTR2(KTR_PMAP,
3655 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
3656 		    pmap);
3657 		return;
3658 	}
3659 
3660 	if ((newl2 & ATTR_SW_MANAGED) != 0)
3661 		pmap_pv_promote_l2(pmap, va, newl2 & ~ATTR_MASK, lockp);
3662 
3663 	newl2 &= ~ATTR_DESCR_MASK;
3664 	newl2 |= L2_BLOCK;
3665 
3666 	pmap_update_entry(pmap, l2, newl2, sva, L2_SIZE);
3667 
3668 	atomic_add_long(&pmap_l2_promotions, 1);
3669 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
3670 		    pmap);
3671 }
3672 #endif /* VM_NRESERVLEVEL > 0 */
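
/*
 * Illustrative sketch (not compiled): the physical-contiguity and
 * identical-attributes requirement for promotion, restated as a stand-alone
 * predicate over the 512 L3 entries of one page table page.  The helper name
 * is hypothetical; pmap_promote_l2() above performs the equivalent check
 * (walking from the last entry down) while also normalizing ATTR_SW_DBM.
 */
#if 0
static bool
pmap_example_l3_run_promotable(pt_entry_t *firstl3)
{
	pt_entry_t first, oldl3;
	int i;

	first = pmap_load(firstl3);
	for (i = 1; i < NL3PG; i++) {
		oldl3 = pmap_load(firstl3 + i);
		/*
		 * Identical attributes and contiguous physical addresses
		 * imply that entry i equals the first entry plus i pages.
		 */
		if (oldl3 != first + (pt_entry_t)i * PAGE_SIZE)
			return (false);
	}
	return (true);
}
#endif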
3673 
3674 static int
3675 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
3676     int psind)
3677 {
3678 	pd_entry_t *l0p, *l1p, *l2p, origpte;
3679 	vm_page_t mp;
3680 
3681 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3682 	KASSERT(psind > 0 && psind < MAXPAGESIZES,
3683 	    ("psind %d unexpected", psind));
3684 	KASSERT(((newpte & ~ATTR_MASK) & (pagesizes[psind] - 1)) == 0,
3685 	    ("unaligned phys address %#lx newpte %#lx psind %d",
3686 	    (newpte & ~ATTR_MASK), newpte, psind));
3687 
3688 restart:
3689 	if (psind == 2) {
3690 		l0p = pmap_l0(pmap, va);
3691 		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
3692 			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
3693 			if (mp == NULL) {
3694 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
3695 					return (KERN_RESOURCE_SHORTAGE);
3696 				PMAP_UNLOCK(pmap);
3697 				vm_wait(NULL);
3698 				PMAP_LOCK(pmap);
3699 				goto restart;
3700 			}
3701 			l1p = pmap_l0_to_l1(l0p, va);
3702 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
3703 			origpte = pmap_load(l1p);
3704 		} else {
3705 			l1p = pmap_l0_to_l1(l0p, va);
3706 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
3707 			origpte = pmap_load(l1p);
3708 			if ((origpte & ATTR_DESCR_VALID) == 0) {
3709 				mp = PHYS_TO_VM_PAGE(pmap_load(l0p) &
3710 				    ~ATTR_MASK);
3711 				mp->ref_count++;
3712 			}
3713 		}
3714 		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
3715 		    ((origpte & ATTR_DESCR_MASK) == L1_BLOCK &&
3716 		     (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
3717 		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
3718 		    va, origpte, newpte));
3719 		pmap_store(l1p, newpte);
3720 	} else /* (psind == 1) */ {
3721 		l2p = pmap_l2(pmap, va);
3722 		if (l2p == NULL) {
3723 			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
3724 			if (mp == NULL) {
3725 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
3726 					return (KERN_RESOURCE_SHORTAGE);
3727 				PMAP_UNLOCK(pmap);
3728 				vm_wait(NULL);
3729 				PMAP_LOCK(pmap);
3730 				goto restart;
3731 			}
3732 			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
3733 			l2p = &l2p[pmap_l2_index(va)];
3734 			origpte = pmap_load(l2p);
3735 		} else {
3736 			l1p = pmap_l1(pmap, va);
3737 			origpte = pmap_load(l2p);
3738 			if ((origpte & ATTR_DESCR_VALID) == 0) {
3739 				mp = PHYS_TO_VM_PAGE(pmap_load(l1p) &
3740 				    ~ATTR_MASK);
3741 				mp->ref_count++;
3742 			}
3743 		}
3744 		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
3745 		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
3746 		     (origpte & ~ATTR_MASK) == (newpte & ~ATTR_MASK)),
3747 		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
3748 		    va, origpte, newpte));
3749 		pmap_store(l2p, newpte);
3750 	}
3751 	dsb(ishst);
3752 
3753 	if ((origpte & ATTR_DESCR_VALID) == 0)
3754 		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
3755 	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
3756 		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
3757 	else if ((newpte & ATTR_SW_WIRED) == 0 &&
3758 	    (origpte & ATTR_SW_WIRED) != 0)
3759 		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
3760 
3761 	return (KERN_SUCCESS);
3762 }
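
/*
 * Illustrative sketch (not compiled): the correspondence between the psind
 * argument handled above and the translation table level that receives the
 * block entry.  pagesizes[] is assumed to be initialized elsewhere so that
 * pagesizes[1] == L2_SIZE (2MB) and, when 1GB pages are usable,
 * pagesizes[2] == L1_SIZE (1GB).  The helper name is hypothetical.
 */
#if 0
static vm_size_t
pmap_example_psind_size(int psind)
{

	KASSERT(psind > 0 && psind < MAXPAGESIZES,
	    ("psind %d unexpected", psind));
	/* psind 2 -> L1 block (1GB); psind 1 -> L2 block (2MB). */
	return (psind == 2 ? L1_SIZE : L2_SIZE);
}
#endif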
3763 
3764 /*
3765  *	Insert the given physical page (p) at
3766  *	the specified virtual address (v) in the
3767  *	target physical map with the protection requested.
3768  *
3769  *	If specified, the page will be wired down, meaning
3770  *	that the related pte can not be reclaimed.
3771  *
3772  *	NB:  This is the only routine which MAY NOT lazy-evaluate
3773  *	or lose information.  That is, this routine must actually
3774  *	insert this page into the given map NOW.
3775  */
3776 int
3777 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
3778     u_int flags, int8_t psind)
3779 {
3780 	struct rwlock *lock;
3781 	pd_entry_t *pde;
3782 	pt_entry_t new_l3, orig_l3;
3783 	pt_entry_t *l2, *l3;
3784 	pv_entry_t pv;
3785 	vm_paddr_t opa, pa;
3786 	vm_page_t mpte, om;
3787 	boolean_t nosleep;
3788 	int lvl, rv;
3789 
3790 	KASSERT(ADDR_IS_CANONICAL(va),
3791 	    ("%s: Address not in canonical form: %lx", __func__, va));
3792 
3793 	va = trunc_page(va);
3794 	if ((m->oflags & VPO_UNMANAGED) == 0)
3795 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
3796 	pa = VM_PAGE_TO_PHYS(m);
3797 	new_l3 = (pt_entry_t)(pa | ATTR_DEFAULT | L3_PAGE);
3798 	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
3799 	new_l3 |= pmap_pte_prot(pmap, prot);
3800 
3801 	if ((flags & PMAP_ENTER_WIRED) != 0)
3802 		new_l3 |= ATTR_SW_WIRED;
3803 	if (pmap->pm_stage == PM_STAGE1) {
3804 		if (!ADDR_IS_KERNEL(va))
3805 			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
3806 		else
3807 			new_l3 |= ATTR_S1_UXN;
3808 		if (pmap != kernel_pmap)
3809 			new_l3 |= ATTR_S1_nG;
3810 	} else {
3811 		/*
3812 		 * Clear the access flag on executable mappings; it will be
3813 		 * set later when the page is accessed. The fault handler is
3814 		 * required to invalidate the I-cache.
3815 		 *
3816 		 * TODO: Switch to the valid flag to allow hardware management
3817 		 * of the access flag. Much of the pmap code assumes the
3818 		 * valid flag is set and fails to destroy the old page tables
3819 		 * correctly if it is clear.
3820 		 */
3821 		if (prot & VM_PROT_EXECUTE)
3822 			new_l3 &= ~ATTR_AF;
3823 	}
3824 	if ((m->oflags & VPO_UNMANAGED) == 0) {
3825 		new_l3 |= ATTR_SW_MANAGED;
3826 		if ((prot & VM_PROT_WRITE) != 0) {
3827 			new_l3 |= ATTR_SW_DBM;
3828 			if ((flags & VM_PROT_WRITE) == 0) {
3829 				if (pmap->pm_stage == PM_STAGE1)
3830 					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
3831 				else
3832 					new_l3 &=
3833 					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
3834 			}
3835 		}
3836 	}
3837 
3838 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
3839 
3840 	lock = NULL;
3841 	PMAP_LOCK(pmap);
3842 	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
3843 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
3844 		    ("managed largepage va %#lx flags %#x", va, flags));
3845 		new_l3 &= ~L3_PAGE;
3846 		if (psind == 2)
3847 			new_l3 |= L1_BLOCK;
3848 		else /* (psind == 1) */
3849 			new_l3 |= L2_BLOCK;
3850 		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
3851 		goto out;
3852 	}
3853 	if (psind == 1) {
3854 		/* Assert the required virtual and physical alignment. */
3855 		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
3856 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
3857 		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
3858 		    flags, m, &lock);
3859 		goto out;
3860 	}
3861 	mpte = NULL;
3862 
3863 	/*
3864 	 * In the case that a page table page is not
3865 	 * resident, we are creating it here.
3866 	 */
3867 retry:
3868 	pde = pmap_pde(pmap, va, &lvl);
3869 	if (pde != NULL && lvl == 2) {
3870 		l3 = pmap_l2_to_l3(pde, va);
3871 		if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
3872 			mpte = PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
3873 			mpte->ref_count++;
3874 		}
3875 		goto havel3;
3876 	} else if (pde != NULL && lvl == 1) {
3877 		l2 = pmap_l1_to_l2(pde, va);
3878 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
3879 		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
3880 			l3 = &l3[pmap_l3_index(va)];
3881 			if (!ADDR_IS_KERNEL(va)) {
3882 				mpte = PHYS_TO_VM_PAGE(
3883 				    pmap_load(l2) & ~ATTR_MASK);
3884 				mpte->ref_count++;
3885 			}
3886 			goto havel3;
3887 		}
3888 		/* We need to allocate an L3 table. */
3889 	}
3890 	if (!ADDR_IS_KERNEL(va)) {
3891 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
3892 
3893 		/*
3894 		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
3895 		 * to handle the possibility that a superpage mapping for "va"
3896 		 * was created while we slept.
3897 		 */
3898 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
3899 		    nosleep ? NULL : &lock);
3900 		if (mpte == NULL && nosleep) {
3901 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
3902 			rv = KERN_RESOURCE_SHORTAGE;
3903 			goto out;
3904 		}
3905 		goto retry;
3906 	} else
3907 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
3908 
3909 havel3:
3910 	orig_l3 = pmap_load(l3);
3911 	opa = orig_l3 & ~ATTR_MASK;
3912 	pv = NULL;
3913 
3914 	/*
3915 	 * Is the specified virtual address already mapped?
3916 	 */
3917 	if (pmap_l3_valid(orig_l3)) {
3918 		/*
3919 		 * Only allow adding new entries on stage 2 tables for now.
3920 		 * This simplifies cache invalidation as we may need to call
3921 		 * into EL2 to perform such actions.
3922 		 */
3923 		PMAP_ASSERT_STAGE1(pmap);
3924 		/*
3925 		 * Wiring change, just update stats. We don't worry about
3926 		 * wiring PT pages as they remain resident as long as there
3927 		 * are valid mappings in them. Hence, if a user page is wired,
3928 		 * the PT page will be also.
3929 		 */
3930 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
3931 		    (orig_l3 & ATTR_SW_WIRED) == 0)
3932 			pmap->pm_stats.wired_count++;
3933 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
3934 		    (orig_l3 & ATTR_SW_WIRED) != 0)
3935 			pmap->pm_stats.wired_count--;
3936 
3937 		/*
3938 		 * Remove the extra PT page reference.
3939 		 */
3940 		if (mpte != NULL) {
3941 			mpte->ref_count--;
3942 			KASSERT(mpte->ref_count > 0,
3943 			    ("pmap_enter: missing reference to page table page,"
3944 			     " va: 0x%lx", va));
3945 		}
3946 
3947 		/*
3948 		 * Has the physical page changed?
3949 		 */
3950 		if (opa == pa) {
3951 			/*
3952 			 * No, might be a protection or wiring change.
3953 			 */
3954 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
3955 			    (new_l3 & ATTR_SW_DBM) != 0)
3956 				vm_page_aflag_set(m, PGA_WRITEABLE);
3957 			goto validate;
3958 		}
3959 
3960 		/*
3961 		 * The physical page has changed.  Temporarily invalidate
3962 		 * the mapping.
3963 		 */
3964 		orig_l3 = pmap_load_clear(l3);
3965 		KASSERT((orig_l3 & ~ATTR_MASK) == opa,
3966 		    ("pmap_enter: unexpected pa update for %#lx", va));
3967 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
3968 			om = PHYS_TO_VM_PAGE(opa);
3969 
3970 			/*
3971 			 * The pmap lock is sufficient to synchronize with
3972 			 * concurrent calls to pmap_page_test_mappings() and
3973 			 * pmap_ts_referenced().
3974 			 */
3975 			if (pmap_pte_dirty(pmap, orig_l3))
3976 				vm_page_dirty(om);
3977 			if ((orig_l3 & ATTR_AF) != 0) {
3978 				pmap_invalidate_page(pmap, va, true);
3979 				vm_page_aflag_set(om, PGA_REFERENCED);
3980 			}
3981 			CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, opa);
3982 			pv = pmap_pvh_remove(&om->md, pmap, va);
3983 			if ((m->oflags & VPO_UNMANAGED) != 0)
3984 				free_pv_entry(pmap, pv);
3985 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
3986 			    TAILQ_EMPTY(&om->md.pv_list) &&
3987 			    ((om->flags & PG_FICTITIOUS) != 0 ||
3988 			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
3989 				vm_page_aflag_clear(om, PGA_WRITEABLE);
3990 		} else {
3991 			KASSERT((orig_l3 & ATTR_AF) != 0,
3992 			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
3993 			pmap_invalidate_page(pmap, va, true);
3994 		}
3995 		orig_l3 = 0;
3996 	} else {
3997 		/*
3998 		 * Increment the counters.
3999 		 */
4000 		if ((new_l3 & ATTR_SW_WIRED) != 0)
4001 			pmap->pm_stats.wired_count++;
4002 		pmap_resident_count_inc(pmap, 1);
4003 	}
4004 	/*
4005 	 * Enter on the PV list if part of our managed memory.
4006 	 */
4007 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4008 		if (pv == NULL) {
4009 			pv = get_pv_entry(pmap, &lock);
4010 			pv->pv_va = va;
4011 		}
4012 		CHANGE_PV_LIST_LOCK_TO_PHYS(&lock, pa);
4013 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
4014 		m->md.pv_gen++;
4015 		if ((new_l3 & ATTR_SW_DBM) != 0)
4016 			vm_page_aflag_set(m, PGA_WRITEABLE);
4017 	}
4018 
4019 validate:
4020 	if (pmap->pm_stage == PM_STAGE1) {
4021 		/*
4022 		 * Sync the icache if exec permission and the attribute
4023 		 * VM_MEMATTR_WRITE_BACK are set.  Do it now, before the mapping
4024 		 * is stored and made valid for the hardware table walk.  If
4025 		 * done later, others could access this page before the caches
4026 		 * are properly synced.  Don't do it for kernel memory, which is
4027 		 * mapped with exec permission even if the memory isn't going to
4028 		 * hold executable code.  The only time an icache sync is needed
4029 		 * there is after a kernel module is loaded and the relocation
4030 		 * info is processed, and that is done in elf_cpu_load_file().
4031 		 */
4032 		if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
4033 		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
4034 		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
4035 			PMAP_ASSERT_STAGE1(pmap);
4036 			cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4037 		}
4038 	} else {
4039 		cpu_dcache_wb_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4040 	}
4041 
4042 	/*
4043 	 * Update the L3 entry
4044 	 */
4045 	if (pmap_l3_valid(orig_l3)) {
4046 		PMAP_ASSERT_STAGE1(pmap);
4047 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
4048 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
4049 			/* same PA, different attributes */
4050 			orig_l3 = pmap_load_store(l3, new_l3);
4051 			pmap_invalidate_page(pmap, va, true);
4052 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
4053 			    pmap_pte_dirty(pmap, orig_l3))
4054 				vm_page_dirty(m);
4055 		} else {
4056 			/*
4057 			 * orig_l3 == new_l3
4058 			 * This can happen if multiple threads simultaneously
4059 			 * access a not yet mapped page.  This is bad for
4060 			 * performance since it can cause a full
4061 			 * demotion-NOP-promotion cycle.
4062 			 * Other possible reasons are:
4063 			 * - the VM and pmap memory layouts have diverged
4064 			 * - a TLB flush is missing somewhere and the CPU
4065 			 *   doesn't see the actual mapping.
4066 			 */
4067 			CTR4(KTR_PMAP, "%s: already mapped page - "
4068 			    "pmap %p va 0x%#lx pte 0x%lx",
4069 			    __func__, pmap, va, new_l3);
4070 		}
4071 	} else {
4072 		/* New mapping */
4073 		pmap_store(l3, new_l3);
4074 		dsb(ishst);
4075 	}
4076 
4077 #if VM_NRESERVLEVEL > 0
4078 	/*
4079 	 * Try to promote from level 3 pages to a level 2 superpage. This
4080 	 * currently only works on stage 1 pmaps as pmap_promote_l2 looks at
4081 	 * stage 1 specific fields and performs a break-before-make sequence
4082 	 * that is incorrect for a stage 2 pmap.
4083 	 */
4084 	if ((mpte == NULL || mpte->ref_count == NL3PG) &&
4085 	    pmap_ps_enabled(pmap) && pmap->pm_stage == PM_STAGE1 &&
4086 	    (m->flags & PG_FICTITIOUS) == 0 &&
4087 	    vm_reserv_level_iffullpop(m) == 0) {
4088 		pmap_promote_l2(pmap, pde, va, &lock);
4089 	}
4090 #endif
4091 
4092 	rv = KERN_SUCCESS;
4093 out:
4094 	if (lock != NULL)
4095 		rw_wunlock(lock);
4096 	PMAP_UNLOCK(pmap);
4097 	return (rv);
4098 }
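
/*
 * Illustrative sketch (not compiled): a typical small-page call into
 * pmap_enter(), shaped like the one the fault handler issues.  The pmap, va,
 * page, and helper name are hypothetical.  Note that "flags" carries both
 * PMAP_ENTER_* bits and the faulting access type; pmap_enter() above tests
 * (flags & VM_PROT_WRITE) to decide whether the mapping starts out writable.
 */
#if 0
static int
pmap_example_enter_wired(pmap_t pmap, vm_offset_t va, vm_page_t m)
{

	/* psind 0 requests an ordinary 4KB L3 mapping. */
	return (pmap_enter(pmap, va, m, VM_PROT_READ | VM_PROT_WRITE,
	    PMAP_ENTER_WIRED | VM_PROT_WRITE, 0));
}
#endif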
4099 
4100 /*
4101  * Tries to create a read- and/or execute-only 2MB page mapping.  Returns true
4102  * if successful.  Returns false if (1) a page table page cannot be allocated
4103  * without sleeping, (2) a mapping already exists at the specified virtual
4104  * address, or (3) a PV entry cannot be allocated without reclaiming another
4105  * PV entry.
4106  */
4107 static bool
4108 pmap_enter_2mpage(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4109     struct rwlock **lockp)
4110 {
4111 	pd_entry_t new_l2;
4112 
4113 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4114 	PMAP_ASSERT_STAGE1(pmap);
4115 	KASSERT(ADDR_IS_CANONICAL(va),
4116 	    ("%s: Address not in canonical form: %lx", __func__, va));
4117 
4118 	new_l2 = (pd_entry_t)(VM_PAGE_TO_PHYS(m) | ATTR_DEFAULT |
4119 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
4120 	    L2_BLOCK);
4121 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4122 		new_l2 |= ATTR_SW_MANAGED;
4123 		new_l2 &= ~ATTR_AF;
4124 	}
4125 	if ((prot & VM_PROT_EXECUTE) == 0 ||
4126 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
4127 		new_l2 |= ATTR_S1_XN;
4128 	if (!ADDR_IS_KERNEL(va))
4129 		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4130 	else
4131 		new_l2 |= ATTR_S1_UXN;
4132 	if (pmap != kernel_pmap)
4133 		new_l2 |= ATTR_S1_nG;
4134 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
4135 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp) ==
4136 	    KERN_SUCCESS);
4137 }
4138 
4139 /*
4140  * Returns true if every page table entry in the specified page table is
4141  * zero.
4142  */
4143 static bool
4144 pmap_every_pte_zero(vm_paddr_t pa)
4145 {
4146 	pt_entry_t *pt_end, *pte;
4147 
4148 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
4149 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
4150 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
4151 		if (*pte != 0)
4152 			return (false);
4153 	}
4154 	return (true);
4155 }
4156 
4157 /*
4158  * Tries to create the specified 2MB page mapping.  Returns KERN_SUCCESS if
4159  * the mapping was created, and either KERN_FAILURE or KERN_RESOURCE_SHORTAGE
4160  * otherwise.  Returns KERN_FAILURE if PMAP_ENTER_NOREPLACE was specified and
4161  * a mapping already exists at the specified virtual address.  Returns
4162  * KERN_RESOURCE_SHORTAGE if PMAP_ENTER_NOSLEEP was specified and a page table
4163  * page allocation failed.  Returns KERN_RESOURCE_SHORTAGE if
4164  * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
4165  */
4166 static int
4167 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
4168     vm_page_t m, struct rwlock **lockp)
4169 {
4170 	struct spglist free;
4171 	pd_entry_t *l2, old_l2;
4172 	vm_page_t l2pg, mt;
4173 
4174 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4175 	KASSERT(ADDR_IS_CANONICAL(va),
4176 	    ("%s: Address not in canonical form: %lx", __func__, va));
4177 
4178 	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
4179 	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
4180 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
4181 		    va, pmap);
4182 		return (KERN_RESOURCE_SHORTAGE);
4183 	}
4184 
4185 	/*
4186 	 * If there are existing mappings, either abort or remove them.
4187 	 */
4188 	if ((old_l2 = pmap_load(l2)) != 0) {
4189 		KASSERT(l2pg == NULL || l2pg->ref_count > 1,
4190 		    ("pmap_enter_l2: l2pg's ref count is too low"));
4191 		if ((flags & PMAP_ENTER_NOREPLACE) != 0 &&
4192 		    (!ADDR_IS_KERNEL(va) ||
4193 		    (old_l2 & ATTR_DESCR_MASK) == L2_BLOCK ||
4194 		    !pmap_every_pte_zero(old_l2 & ~ATTR_MASK))) {
4195 			if (l2pg != NULL)
4196 				l2pg->ref_count--;
4197 			CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx"
4198 			    " in pmap %p", va, pmap);
4199 			return (KERN_FAILURE);
4200 		}
4201 		SLIST_INIT(&free);
4202 		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
4203 			(void)pmap_remove_l2(pmap, l2, va,
4204 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
4205 		else
4206 			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
4207 			    &free, lockp);
4208 		if (!ADDR_IS_KERNEL(va)) {
4209 			vm_page_free_pages_toq(&free, true);
4210 			KASSERT(pmap_load(l2) == 0,
4211 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
4212 		} else {
4213 			KASSERT(SLIST_EMPTY(&free),
4214 			    ("pmap_enter_l2: freed kernel page table page"));
4215 
4216 			/*
4217 			 * Both pmap_remove_l2() and pmap_remove_l3_range()
4218 			 * will leave the kernel page table page zero filled.
4219 			 * Nonetheless, the TLB could have an intermediate
4220 			 * entry for the kernel page table page, so request
4221 			 * an invalidation at all levels after clearing
4222 			 * the L2_TABLE entry.
4223 			 */
4224 			mt = PHYS_TO_VM_PAGE(pmap_load(l2) & ~ATTR_MASK);
4225 			if (pmap_insert_pt_page(pmap, mt, false))
4226 				panic("pmap_enter_l2: trie insert failed");
4227 			pmap_clear(l2);
4228 			pmap_invalidate_page(pmap, va, false);
4229 		}
4230 	}
4231 
4232 	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
4233 		/*
4234 		 * Abort this mapping if its PV entry could not be created.
4235 		 */
4236 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
4237 			if (l2pg != NULL)
4238 				pmap_abort_ptp(pmap, va, l2pg);
4239 			CTR2(KTR_PMAP,
4240 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
4241 			    va, pmap);
4242 			return (KERN_RESOURCE_SHORTAGE);
4243 		}
4244 		if ((new_l2 & ATTR_SW_DBM) != 0)
4245 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4246 				vm_page_aflag_set(mt, PGA_WRITEABLE);
4247 	}
4248 
4249 	/*
4250 	 * Increment counters.
4251 	 */
4252 	if ((new_l2 & ATTR_SW_WIRED) != 0)
4253 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
4254 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
4255 
4256 	/*
4257 	 * Conditionally sync the icache.  See pmap_enter() for details.
4258 	 */
4259 	if ((new_l2 & ATTR_S1_XN) == 0 && ((new_l2 & ~ATTR_MASK) !=
4260 	    (old_l2 & ~ATTR_MASK) || (old_l2 & ATTR_S1_XN) != 0) &&
4261 	    pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
4262 		cpu_icache_sync_range(PHYS_TO_DMAP(new_l2 & ~ATTR_MASK),
4263 		    L2_SIZE);
4264 	}
4265 
4266 	/*
4267 	 * Map the superpage.
4268 	 */
4269 	pmap_store(l2, new_l2);
4270 	dsb(ishst);
4271 
4272 	atomic_add_long(&pmap_l2_mappings, 1);
4273 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
4274 	    va, pmap);
4275 
4276 	return (KERN_SUCCESS);
4277 }
4278 
4279 /*
4280  * Maps a sequence of resident pages belonging to the same object.
4281  * The sequence begins with the given page m_start.  This page is
4282  * mapped at the given virtual address start.  Each subsequent page is
4283  * mapped at a virtual address that is offset from start by the same
4284  * amount as the page is offset from m_start within the object.  The
4285  * last page in the sequence is the page with the largest offset from
4286  * m_start that can be mapped at a virtual address less than the given
4287  * virtual address end.  Not every virtual page between start and end
4288  * is mapped; only those for which a resident page exists with the
4289  * corresponding offset from m_start are mapped.
4290  */
4291 void
4292 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
4293     vm_page_t m_start, vm_prot_t prot)
4294 {
4295 	struct rwlock *lock;
4296 	vm_offset_t va;
4297 	vm_page_t m, mpte;
4298 	vm_pindex_t diff, psize;
4299 
4300 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
4301 
4302 	psize = atop(end - start);
4303 	mpte = NULL;
4304 	m = m_start;
4305 	lock = NULL;
4306 	PMAP_LOCK(pmap);
4307 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
4308 		va = start + ptoa(diff);
4309 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
4310 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
4311 		    pmap_enter_2mpage(pmap, va, m, prot, &lock))
4312 			m = &m[L2_SIZE / PAGE_SIZE - 1];
4313 		else
4314 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
4315 			    &lock);
4316 		m = TAILQ_NEXT(m, listq);
4317 	}
4318 	if (lock != NULL)
4319 		rw_wunlock(lock);
4320 	PMAP_UNLOCK(pmap);
4321 }
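
/*
 * Illustrative sketch (not compiled): the superpage eligibility test from
 * the loop above, restated as a stand-alone predicate.  The helper name is
 * hypothetical; pmap_enter_object() additionally requires that the attempt
 * via pmap_enter_2mpage() actually succeeds before skipping ahead by
 * L2_SIZE / PAGE_SIZE pages.
 */
#if 0
static bool
pmap_example_can_map_2m(pmap_t pmap, vm_offset_t va, vm_offset_t end,
    vm_page_t m)
{

	return ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
	    m->psind == 1 && pmap_ps_enabled(pmap));
}
#endif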
4322 
4323 /*
4324  * this code makes some *MAJOR* assumptions:
4325  * 1. The current pmap and the given pmap exist.
4326  * 2. Not wired.
4327  * 3. Read access.
4328  * 4. No page table pages.
4329  * but is *MUCH* faster than pmap_enter...
4330  */
4331 
4332 void
4333 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
4334 {
4335 	struct rwlock *lock;
4336 
4337 	lock = NULL;
4338 	PMAP_LOCK(pmap);
4339 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
4340 	if (lock != NULL)
4341 		rw_wunlock(lock);
4342 	PMAP_UNLOCK(pmap);
4343 }
4344 
4345 static vm_page_t
4346 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
4347     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
4348 {
4349 	pd_entry_t *pde;
4350 	pt_entry_t *l2, *l3, l3_val;
4351 	vm_paddr_t pa;
4352 	int lvl;
4353 
4354 	KASSERT(!VA_IS_CLEANMAP(va) ||
4355 	    (m->oflags & VPO_UNMANAGED) != 0,
4356 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
4357 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4358 	PMAP_ASSERT_STAGE1(pmap);
4359 	KASSERT(ADDR_IS_CANONICAL(va),
4360 	    ("%s: Address not in canonical form: %lx", __func__, va));
4361 
4362 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
4363 	/*
4364 	 * In the case that a page table page is not
4365 	 * resident, we are creating it here.
4366 	 */
4367 	if (!ADDR_IS_KERNEL(va)) {
4368 		vm_pindex_t l2pindex;
4369 
4370 		/*
4371 		 * Calculate pagetable page index
4372 		 */
4373 		l2pindex = pmap_l2_pindex(va);
4374 		if (mpte && (mpte->pindex == l2pindex)) {
4375 			mpte->ref_count++;
4376 		} else {
4377 			/*
4378 			 * Get the l2 entry
4379 			 */
4380 			pde = pmap_pde(pmap, va, &lvl);
4381 
4382 			/*
4383 			 * If the page table page is mapped, we just increment
4384 			 * the hold count, and activate it.  Otherwise, we
4385 			 * attempt to allocate a page table page.  If this
4386 			 * attempt fails, we don't retry.  Instead, we give up.
4387 			 */
4388 			if (lvl == 1) {
4389 				l2 = pmap_l1_to_l2(pde, va);
4390 				if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
4391 				    L2_BLOCK)
4392 					return (NULL);
4393 			}
4394 			if (lvl == 2 && pmap_load(pde) != 0) {
4395 				mpte =
4396 				    PHYS_TO_VM_PAGE(pmap_load(pde) & ~ATTR_MASK);
4397 				mpte->ref_count++;
4398 			} else {
4399 				/*
4400 				 * Pass NULL instead of the PV list lock
4401 				 * pointer, because we don't intend to sleep.
4402 				 */
4403 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
4404 				if (mpte == NULL)
4405 					return (mpte);
4406 			}
4407 		}
4408 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
4409 		l3 = &l3[pmap_l3_index(va)];
4410 	} else {
4411 		mpte = NULL;
4412 		pde = pmap_pde(kernel_pmap, va, &lvl);
4413 		KASSERT(pde != NULL,
4414 		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
4415 		     va));
4416 		KASSERT(lvl == 2,
4417 		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
4418 		l3 = pmap_l2_to_l3(pde, va);
4419 	}
4420 
4421 	/*
4422 	 * Abort if a mapping already exists.
4423 	 */
4424 	if (pmap_load(l3) != 0) {
4425 		if (mpte != NULL)
4426 			mpte->ref_count--;
4427 		return (NULL);
4428 	}
4429 
4430 	/*
4431 	 * Enter on the PV list if part of our managed memory.
4432 	 */
4433 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
4434 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
4435 		if (mpte != NULL)
4436 			pmap_abort_ptp(pmap, va, mpte);
4437 		return (NULL);
4438 	}
4439 
4440 	/*
4441 	 * Increment counters
4442 	 */
4443 	pmap_resident_count_inc(pmap, 1);
4444 
4445 	pa = VM_PAGE_TO_PHYS(m);
4446 	l3_val = pa | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
4447 	    ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
4448 	if ((prot & VM_PROT_EXECUTE) == 0 ||
4449 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
4450 		l3_val |= ATTR_S1_XN;
4451 	if (!ADDR_IS_KERNEL(va))
4452 		l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
4453 	else
4454 		l3_val |= ATTR_S1_UXN;
4455 	if (pmap != kernel_pmap)
4456 		l3_val |= ATTR_S1_nG;
4457 
4458 	/*
4459 	 * Now validate mapping with RO protection
4460 	 */
4461 	if ((m->oflags & VPO_UNMANAGED) == 0) {
4462 		l3_val |= ATTR_SW_MANAGED;
4463 		l3_val &= ~ATTR_AF;
4464 	}
4465 
4466 	/* Sync icache before the mapping is stored to PTE */
4467 	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
4468 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
4469 		cpu_icache_sync_range(PHYS_TO_DMAP(pa), PAGE_SIZE);
4470 
4471 	pmap_store(l3, l3_val);
4472 	dsb(ishst);
4473 
4474 	return (mpte);
4475 }
4476 
4477 /*
4478  * This code maps large physical mmap regions into the
4479  * processor address space.  Note that some shortcuts
4480  * are taken, but the code works.
4481  */
4482 void
4483 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
4484     vm_pindex_t pindex, vm_size_t size)
4485 {
4486 
4487 	VM_OBJECT_ASSERT_WLOCKED(object);
4488 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
4489 	    ("pmap_object_init_pt: non-device object"));
4490 }
4491 
4492 /*
4493  *	Clear the wired attribute from the mappings for the specified range of
4494  *	addresses in the given pmap.  Every valid mapping within that range
4495  *	must have the wired attribute set.  In contrast, invalid mappings
4496  *	cannot have the wired attribute set, so they are ignored.
4497  *
4498  *	The wired attribute of the page table entry is not a hardware feature,
4499  *	so there is no need to invalidate any TLB entries.
4500  */
4501 void
4502 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4503 {
4504 	vm_offset_t va_next;
4505 	pd_entry_t *l0, *l1, *l2;
4506 	pt_entry_t *l3;
4507 
4508 	PMAP_LOCK(pmap);
4509 	for (; sva < eva; sva = va_next) {
4510 		l0 = pmap_l0(pmap, sva);
4511 		if (pmap_load(l0) == 0) {
4512 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4513 			if (va_next < sva)
4514 				va_next = eva;
4515 			continue;
4516 		}
4517 
4518 		l1 = pmap_l0_to_l1(l0, sva);
4519 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4520 		if (va_next < sva)
4521 			va_next = eva;
4522 		if (pmap_load(l1) == 0)
4523 			continue;
4524 
4525 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4526 			KASSERT(va_next <= eva,
4527 			    ("partial update of non-transparent 1G page "
4528 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4529 			    pmap_load(l1), sva, eva, va_next));
4530 			MPASS(pmap != kernel_pmap);
4531 			MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
4532 			    ATTR_SW_WIRED)) == ATTR_SW_WIRED);
4533 			pmap_clear_bits(l1, ATTR_SW_WIRED);
4534 			pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
4535 			continue;
4536 		}
4537 
4538 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4539 		if (va_next < sva)
4540 			va_next = eva;
4541 
4542 		l2 = pmap_l1_to_l2(l1, sva);
4543 		if (pmap_load(l2) == 0)
4544 			continue;
4545 
4546 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4547 			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
4548 				panic("pmap_unwire: l2 %#jx is missing "
4549 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
4550 
4551 			/*
4552 			 * Are we unwiring the entire large page?  If not,
4553 			 * demote the mapping and fall through.
4554 			 */
4555 			if (sva + L2_SIZE == va_next && eva >= va_next) {
4556 				pmap_clear_bits(l2, ATTR_SW_WIRED);
4557 				pmap->pm_stats.wired_count -= L2_SIZE /
4558 				    PAGE_SIZE;
4559 				continue;
4560 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
4561 				panic("pmap_unwire: demotion failed");
4562 		}
4563 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4564 		    ("pmap_unwire: Invalid l2 entry after demotion"));
4565 
4566 		if (va_next > eva)
4567 			va_next = eva;
4568 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
4569 		    sva += L3_SIZE) {
4570 			if (pmap_load(l3) == 0)
4571 				continue;
4572 			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
4573 				panic("pmap_unwire: l3 %#jx is missing "
4574 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
4575 
4576 			/*
4577 			 * ATTR_SW_WIRED must be cleared atomically.  Although
4578 			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
4579 			 * the System MMU may write to the entry concurrently.
4580 			 */
4581 			pmap_clear_bits(l3, ATTR_SW_WIRED);
4582 			pmap->pm_stats.wired_count--;
4583 		}
4584 	}
4585 	PMAP_UNLOCK(pmap);
4586 }
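
/*
 * Illustrative sketch (not compiled): unwiring a previously wired range,
 * roughly the machine-dependent step behind vm_map_unwire().  The range is
 * hypothetical; every valid mapping in the range must currently have
 * ATTR_SW_WIRED set, since pmap_unwire() panics on a wired-bit mismatch.
 */
#if 0
static void
pmap_example_unwire(pmap_t pmap, vm_offset_t sva, vm_size_t len)
{

	pmap_unwire(pmap, trunc_page(sva), round_page(sva + len));
}
#endif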
4587 
4588 /*
4589  *	Copy the range specified by src_addr/len
4590  *	from the source map to the range dst_addr/len
4591  *	in the destination map.
4592  *
4593  *	This routine is only advisory and need not do anything.
4594  *
4595  *	Because the executable mappings created by this routine are copied,
4596  *	it should not have to flush the instruction cache.
4597  */
4598 void
4599 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
4600     vm_offset_t src_addr)
4601 {
4602 	struct rwlock *lock;
4603 	pd_entry_t *l0, *l1, *l2, srcptepaddr;
4604 	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
4605 	vm_offset_t addr, end_addr, va_next;
4606 	vm_page_t dst_m, dstmpte, srcmpte;
4607 
4608 	PMAP_ASSERT_STAGE1(dst_pmap);
4609 	PMAP_ASSERT_STAGE1(src_pmap);
4610 
4611 	if (dst_addr != src_addr)
4612 		return;
4613 	end_addr = src_addr + len;
4614 	lock = NULL;
4615 	if (dst_pmap < src_pmap) {
4616 		PMAP_LOCK(dst_pmap);
4617 		PMAP_LOCK(src_pmap);
4618 	} else {
4619 		PMAP_LOCK(src_pmap);
4620 		PMAP_LOCK(dst_pmap);
4621 	}
4622 	for (addr = src_addr; addr < end_addr; addr = va_next) {
4623 		l0 = pmap_l0(src_pmap, addr);
4624 		if (pmap_load(l0) == 0) {
4625 			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
4626 			if (va_next < addr)
4627 				va_next = end_addr;
4628 			continue;
4629 		}
4630 
4631 		va_next = (addr + L1_SIZE) & ~L1_OFFSET;
4632 		if (va_next < addr)
4633 			va_next = end_addr;
4634 		l1 = pmap_l0_to_l1(l0, addr);
4635 		if (pmap_load(l1) == 0)
4636 			continue;
4637 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4638 			KASSERT(va_next <= end_addr,
4639 			    ("partial update of non-transparent 1G page "
4640 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
4641 			    pmap_load(l1), addr, end_addr, va_next));
4642 			srcptepaddr = pmap_load(l1);
4643 			l1 = pmap_l1(dst_pmap, addr);
4644 			if (l1 == NULL) {
4645 				if (_pmap_alloc_l3(dst_pmap,
4646 				    pmap_l0_pindex(addr), NULL) == NULL)
4647 					break;
4648 				l1 = pmap_l1(dst_pmap, addr);
4649 			} else {
4650 				l0 = pmap_l0(dst_pmap, addr);
4651 				dst_m = PHYS_TO_VM_PAGE(pmap_load(l0) &
4652 				    ~ATTR_MASK);
4653 				dst_m->ref_count++;
4654 			}
4655 			KASSERT(pmap_load(l1) == 0,
4656 			    ("1G mapping present in dst pmap "
4657 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
4658 			    pmap_load(l1), addr, end_addr, va_next));
4659 			pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
4660 			pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
4661 			continue;
4662 		}
4663 
4664 		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
4665 		if (va_next < addr)
4666 			va_next = end_addr;
4667 		l2 = pmap_l1_to_l2(l1, addr);
4668 		srcptepaddr = pmap_load(l2);
4669 		if (srcptepaddr == 0)
4670 			continue;
4671 		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4672 			/*
4673 			 * We can only virtual copy whole superpages.
4674 			 */
4675 			if ((addr & L2_OFFSET) != 0 ||
4676 			    addr + L2_SIZE > end_addr)
4677 				continue;
4678 			l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
4679 			if (l2 == NULL)
4680 				break;
4681 			if (pmap_load(l2) == 0 &&
4682 			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
4683 			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
4684 			    PMAP_ENTER_NORECLAIM, &lock))) {
4685 				/*
4686 				 * We leave the dirty bit unchanged because
4687 				 * managed read/write superpage mappings are
4688 				 * required to be dirty.  However, managed
4689 				 * superpage mappings are not required to
4690 				 * have their accessed bit set, so we clear
4691 				 * it because we don't know if this mapping
4692 				 * will be used.
4693 				 */
4694 				srcptepaddr &= ~ATTR_SW_WIRED;
4695 				if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
4696 					srcptepaddr &= ~ATTR_AF;
4697 				pmap_store(l2, srcptepaddr);
4698 				pmap_resident_count_inc(dst_pmap, L2_SIZE /
4699 				    PAGE_SIZE);
4700 				atomic_add_long(&pmap_l2_mappings, 1);
4701 			} else
4702 				pmap_abort_ptp(dst_pmap, addr, dst_m);
4703 			continue;
4704 		}
4705 		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
4706 		    ("pmap_copy: invalid L2 entry"));
4707 		srcptepaddr &= ~ATTR_MASK;
4708 		srcmpte = PHYS_TO_VM_PAGE(srcptepaddr);
4709 		KASSERT(srcmpte->ref_count > 0,
4710 		    ("pmap_copy: source page table page is unused"));
4711 		if (va_next > end_addr)
4712 			va_next = end_addr;
4713 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(srcptepaddr);
4714 		src_pte = &src_pte[pmap_l3_index(addr)];
4715 		dstmpte = NULL;
4716 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
4717 			ptetemp = pmap_load(src_pte);
4718 
4719 			/*
4720 			 * We only virtual copy managed pages.
4721 			 */
4722 			if ((ptetemp & ATTR_SW_MANAGED) == 0)
4723 				continue;
4724 
4725 			if (dstmpte != NULL) {
4726 				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
4727 				    ("dstmpte pindex/addr mismatch"));
4728 				dstmpte->ref_count++;
4729 			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
4730 			    NULL)) == NULL)
4731 				goto out;
4732 			dst_pte = (pt_entry_t *)
4733 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
4734 			dst_pte = &dst_pte[pmap_l3_index(addr)];
4735 			if (pmap_load(dst_pte) == 0 &&
4736 			    pmap_try_insert_pv_entry(dst_pmap, addr,
4737 			    PHYS_TO_VM_PAGE(ptetemp & ~ATTR_MASK), &lock)) {
4738 				/*
4739 				 * Clear the wired, modified, and accessed
4740 				 * (referenced) bits during the copy.
4741 				 */
4742 				mask = ATTR_AF | ATTR_SW_WIRED;
4743 				nbits = 0;
4744 				if ((ptetemp & ATTR_SW_DBM) != 0)
4745 					nbits |= ATTR_S1_AP_RW_BIT;
4746 				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
4747 				pmap_resident_count_inc(dst_pmap, 1);
4748 			} else {
4749 				pmap_abort_ptp(dst_pmap, addr, dstmpte);
4750 				goto out;
4751 			}
4752 			/* Have we copied all of the valid mappings? */
4753 			if (dstmpte->ref_count >= srcmpte->ref_count)
4754 				break;
4755 		}
4756 	}
4757 out:
4758 	/*
4759 	 * XXX This barrier may not be needed because the destination pmap is
4760 	 * not active.
4761 	 */
4762 	dsb(ishst);
4763 
4764 	if (lock != NULL)
4765 		rw_wunlock(lock);
4766 	PMAP_UNLOCK(src_pmap);
4767 	PMAP_UNLOCK(dst_pmap);
4768 }
4769 
4770 /*
4771  *	pmap_zero_page zeros the specified hardware page through the
4772  *	direct map, without creating a temporary mapping.
4773  */
4774 void
4775 pmap_zero_page(vm_page_t m)
4776 {
4777 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4778 
4779 	pagezero((void *)va);
4780 }
4781 
4782 /*
4783  *	pmap_zero_page_area zeros the specified area of a hardware page
4784  *	through the direct map, using bzero to clear its contents.
4785  *
4786  *	off and size may not cover an area beyond a single hardware page.
4787  */
4788 void
4789 pmap_zero_page_area(vm_page_t m, int off, int size)
4790 {
4791 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
4792 
4793 	if (off == 0 && size == PAGE_SIZE)
4794 		pagezero((void *)va);
4795 	else
4796 		bzero((char *)va + off, size);
4797 }
4798 
4799 /*
4800  *	pmap_copy_page copies the specified (machine independent)
4801  *	page by mapping the page into virtual memory and using
4802  *	bcopy to copy the page, one machine dependent page at a
4803  *	time.
4804  */
4805 void
4806 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
4807 {
4808 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
4809 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
4810 
4811 	pagecopy((void *)src, (void *)dst);
4812 }
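
/*
 * Illustrative sketch (not compiled): the direct-map idiom used by the zero
 * and copy helpers above.  Any page whose physical address is covered by the
 * DMAP can be touched through PHYS_TO_DMAP() without setting up a temporary
 * mapping.  The helper name and fill value are hypothetical.
 */
#if 0
static void
pmap_example_fill_page(vm_page_t m, int val)
{
	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

	memset((void *)va, val, PAGE_SIZE);
}
#endif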
4813 
4814 int unmapped_buf_allowed = 1;
4815 
4816 void
4817 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
4818     vm_offset_t b_offset, int xfersize)
4819 {
4820 	void *a_cp, *b_cp;
4821 	vm_page_t m_a, m_b;
4822 	vm_paddr_t p_a, p_b;
4823 	vm_offset_t a_pg_offset, b_pg_offset;
4824 	int cnt;
4825 
4826 	while (xfersize > 0) {
4827 		a_pg_offset = a_offset & PAGE_MASK;
4828 		m_a = ma[a_offset >> PAGE_SHIFT];
4829 		p_a = m_a->phys_addr;
4830 		b_pg_offset = b_offset & PAGE_MASK;
4831 		m_b = mb[b_offset >> PAGE_SHIFT];
4832 		p_b = m_b->phys_addr;
4833 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
4834 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
4835 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
4836 			panic("!DMAP a %lx", p_a);
4837 		} else {
4838 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
4839 		}
4840 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
4841 			panic("!DMAP b %lx", p_b);
4842 		} else {
4843 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
4844 		}
4845 		bcopy(a_cp, b_cp, cnt);
4846 		a_offset += cnt;
4847 		b_offset += cnt;
4848 		xfersize -= cnt;
4849 	}
4850 }
4851 
4852 vm_offset_t
4853 pmap_quick_enter_page(vm_page_t m)
4854 {
4855 
4856 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
4857 }
4858 
4859 void
4860 pmap_quick_remove_page(vm_offset_t addr)
4861 {
4862 }
4863 
4864 /*
4865  * Returns true if the pmap's pv is one of the first
4866  * 16 pvs linked to from this page.  This count may
4867  * be changed upwards or downwards in the future; it
4868  * is only necessary that true be returned for a small
4869  * subset of pmaps for proper page aging.
4870  */
4871 boolean_t
4872 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
4873 {
4874 	struct md_page *pvh;
4875 	struct rwlock *lock;
4876 	pv_entry_t pv;
4877 	int loops = 0;
4878 	boolean_t rv;
4879 
4880 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4881 	    ("pmap_page_exists_quick: page %p is not managed", m));
4882 	rv = FALSE;
4883 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4884 	rw_rlock(lock);
4885 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4886 		if (PV_PMAP(pv) == pmap) {
4887 			rv = TRUE;
4888 			break;
4889 		}
4890 		loops++;
4891 		if (loops >= 16)
4892 			break;
4893 	}
4894 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
4895 		pvh = page_to_pvh(m);
4896 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4897 			if (PV_PMAP(pv) == pmap) {
4898 				rv = TRUE;
4899 				break;
4900 			}
4901 			loops++;
4902 			if (loops >= 16)
4903 				break;
4904 		}
4905 	}
4906 	rw_runlock(lock);
4907 	return (rv);
4908 }
4909 
4910 /*
4911  *	pmap_page_wired_mappings:
4912  *
4913  *	Return the number of managed mappings to the given physical page
4914  *	that are wired.
4915  */
4916 int
4917 pmap_page_wired_mappings(vm_page_t m)
4918 {
4919 	struct rwlock *lock;
4920 	struct md_page *pvh;
4921 	pmap_t pmap;
4922 	pt_entry_t *pte;
4923 	pv_entry_t pv;
4924 	int count, md_gen, pvh_gen;
4925 
4926 	if ((m->oflags & VPO_UNMANAGED) != 0)
4927 		return (0);
4928 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4929 	rw_rlock(lock);
4930 restart:
4931 	count = 0;
4932 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
4933 		pmap = PV_PMAP(pv);
4934 		if (!PMAP_TRYLOCK(pmap)) {
4935 			md_gen = m->md.pv_gen;
4936 			rw_runlock(lock);
4937 			PMAP_LOCK(pmap);
4938 			rw_rlock(lock);
4939 			if (md_gen != m->md.pv_gen) {
4940 				PMAP_UNLOCK(pmap);
4941 				goto restart;
4942 			}
4943 		}
4944 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
4945 		if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
4946 			count++;
4947 		PMAP_UNLOCK(pmap);
4948 	}
4949 	if ((m->flags & PG_FICTITIOUS) == 0) {
4950 		pvh = page_to_pvh(m);
4951 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
4952 			pmap = PV_PMAP(pv);
4953 			if (!PMAP_TRYLOCK(pmap)) {
4954 				md_gen = m->md.pv_gen;
4955 				pvh_gen = pvh->pv_gen;
4956 				rw_runlock(lock);
4957 				PMAP_LOCK(pmap);
4958 				rw_rlock(lock);
4959 				if (md_gen != m->md.pv_gen ||
4960 				    pvh_gen != pvh->pv_gen) {
4961 					PMAP_UNLOCK(pmap);
4962 					goto restart;
4963 				}
4964 			}
4965 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
4966 			if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
4967 				count++;
4968 			PMAP_UNLOCK(pmap);
4969 		}
4970 	}
4971 	rw_runlock(lock);
4972 	return (count);
4973 }
4974 
4975 /*
4976  * Returns true if the given page is mapped individually or as part of
4977  * a 2mpage.  Otherwise, returns false.
4978  */
4979 bool
4980 pmap_page_is_mapped(vm_page_t m)
4981 {
4982 	struct rwlock *lock;
4983 	bool rv;
4984 
4985 	if ((m->oflags & VPO_UNMANAGED) != 0)
4986 		return (false);
4987 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4988 	rw_rlock(lock);
4989 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
4990 	    ((m->flags & PG_FICTITIOUS) == 0 &&
4991 	    !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
4992 	rw_runlock(lock);
4993 	return (rv);
4994 }
4995 
4996 /*
4997  * Destroy all managed, non-wired mappings in the given user-space
4998  * pmap.  This pmap cannot be active on any processor besides the
4999  * caller.
5000  *
5001  * This function cannot be applied to the kernel pmap.  Moreover, it
5002  * is not intended for general use.  It is only to be used during
5003  * process termination.  Consequently, it can be implemented in ways
5004  * that make it faster than pmap_remove().  First, it can more quickly
5005  * destroy mappings by iterating over the pmap's collection of PV
5006  * entries, rather than searching the page table.  Second, it doesn't
5007  * have to test and clear the page table entries atomically, because
5008  * no processor is currently accessing the user address space.  In
5009  * particular, a page table entry's dirty bit won't change state once
5010  * this function starts.
5011  */
5012 void
5013 pmap_remove_pages(pmap_t pmap)
5014 {
5015 	pd_entry_t *pde;
5016 	pt_entry_t *pte, tpte;
5017 	struct spglist free;
5018 	vm_page_t m, ml3, mt;
5019 	pv_entry_t pv;
5020 	struct md_page *pvh;
5021 	struct pv_chunk *pc, *npc;
5022 	struct rwlock *lock;
5023 	int64_t bit;
5024 	uint64_t inuse, bitmask;
5025 	int allfree, field, freed, idx, lvl;
5026 	vm_paddr_t pa;
5027 
5028 	lock = NULL;
5029 
5030 	SLIST_INIT(&free);
5031 	PMAP_LOCK(pmap);
5032 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
5033 		allfree = 1;
5034 		freed = 0;
5035 		for (field = 0; field < _NPCM; field++) {
5036 			inuse = ~pc->pc_map[field] & pc_freemask[field];
5037 			while (inuse != 0) {
5038 				bit = ffsl(inuse) - 1;
5039 				bitmask = 1UL << bit;
5040 				idx = field * 64 + bit;
5041 				pv = &pc->pc_pventry[idx];
5042 				inuse &= ~bitmask;
5043 
5044 				pde = pmap_pde(pmap, pv->pv_va, &lvl);
5045 				KASSERT(pde != NULL,
5046 				    ("Attempting to remove an unmapped page"));
5047 
5048 				switch(lvl) {
5049 				case 1:
5050 					pte = pmap_l1_to_l2(pde, pv->pv_va);
5051 					tpte = pmap_load(pte);
5052 					KASSERT((tpte & ATTR_DESCR_MASK) ==
5053 					    L2_BLOCK,
5054 					    ("Attempting to remove an invalid "
5055 					    "block: %lx", tpte));
5056 					break;
5057 				case 2:
5058 					pte = pmap_l2_to_l3(pde, pv->pv_va);
5059 					tpte = pmap_load(pte);
5060 					KASSERT((tpte & ATTR_DESCR_MASK) ==
5061 					    L3_PAGE,
5062 					    ("Attempting to remove an invalid "
5063 					     "page: %lx", tpte));
5064 					break;
5065 				default:
5066 					panic(
5067 					    "Invalid page directory level: %d",
5068 					    lvl);
5069 				}
5070 
5071 /*
5072  * We cannot remove wired pages from a process' mapping at this time
5073  */
5074 				if (tpte & ATTR_SW_WIRED) {
5075 					allfree = 0;
5076 					continue;
5077 				}
5078 
5079 				/* Mark free */
5080 				pc->pc_map[field] |= bitmask;
5081 
5082 				/*
5083 				 * Because this pmap is not active on other
5084 				 * processors, the dirty bit cannot have
5085 				 * changed state since we last loaded pte.
5086 				 */
5087 				pmap_clear(pte);
5088 
5089 				pa = tpte & ~ATTR_MASK;
5090 
5091 				m = PHYS_TO_VM_PAGE(pa);
5092 				KASSERT(m->phys_addr == pa,
5093 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
5094 				    m, (uintmax_t)m->phys_addr,
5095 				    (uintmax_t)tpte));
5096 
5097 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
5098 				    m < &vm_page_array[vm_page_array_size],
5099 				    ("pmap_remove_pages: bad pte %#jx",
5100 				    (uintmax_t)tpte));
5101 
5102 				/*
5103 				 * Update the vm_page_t clean/reference bits.
5104 				 */
5105 				if (pmap_pte_dirty(pmap, tpte)) {
5106 					switch (lvl) {
5107 					case 1:
5108 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5109 							vm_page_dirty(mt);
5110 						break;
5111 					case 2:
5112 						vm_page_dirty(m);
5113 						break;
5114 					}
5115 				}
5116 
5117 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5118 
5119 				switch (lvl) {
5120 				case 1:
5121 					pmap_resident_count_dec(pmap,
5122 					    L2_SIZE / PAGE_SIZE);
5123 					pvh = page_to_pvh(m);
5124 					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
5125 					pvh->pv_gen++;
5126 					if (TAILQ_EMPTY(&pvh->pv_list)) {
5127 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5128 							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
5129 							    TAILQ_EMPTY(&mt->md.pv_list))
5130 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
5131 					}
5132 					ml3 = pmap_remove_pt_page(pmap,
5133 					    pv->pv_va);
5134 					if (ml3 != NULL) {
5135 						KASSERT(ml3->valid == VM_PAGE_BITS_ALL,
5136 						    ("pmap_remove_pages: l3 page not promoted"));
5137 						pmap_resident_count_dec(pmap,1);
5138 						KASSERT(ml3->ref_count == NL3PG,
5139 						    ("pmap_remove_pages: l3 page ref count error"));
5140 						ml3->ref_count = 0;
5141 						pmap_add_delayed_free_list(ml3,
5142 						    &free, FALSE);
5143 					}
5144 					break;
5145 				case 2:
5146 					pmap_resident_count_dec(pmap, 1);
5147 					TAILQ_REMOVE(&m->md.pv_list, pv,
5148 					    pv_next);
5149 					m->md.pv_gen++;
5150 					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
5151 					    TAILQ_EMPTY(&m->md.pv_list) &&
5152 					    (m->flags & PG_FICTITIOUS) == 0) {
5153 						pvh = page_to_pvh(m);
5154 						if (TAILQ_EMPTY(&pvh->pv_list))
5155 							vm_page_aflag_clear(m,
5156 							    PGA_WRITEABLE);
5157 					}
5158 					break;
5159 				}
5160 				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
5161 				    &free);
5162 				freed++;
5163 			}
5164 		}
5165 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
5166 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
5167 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
5168 		if (allfree) {
5169 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
5170 			free_pv_chunk(pc);
5171 		}
5172 	}
5173 	if (lock != NULL)
5174 		rw_wunlock(lock);
5175 	pmap_invalidate_all(pmap);
5176 	PMAP_UNLOCK(pmap);
5177 	vm_page_free_pages_toq(&free, true);
5178 }
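
/*
 * Illustrative sketch (not compiled): the shape of the intended call site
 * for pmap_remove_pages(), i.e. tearing down a dying process's user mappings
 * during vmspace teardown, when no other CPU can be using the address space.
 * The helper name and "p" are hypothetical.
 */
#if 0
static void
pmap_example_teardown(struct proc *p)
{

	pmap_remove_pages(vmspace_pmap(p->p_vmspace));
}
#endif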
5179 
5180 /*
5181  * This is used to check if a page has been accessed or modified.
5182  */
5183 static boolean_t
5184 pmap_page_test_mappings(vm_page_t m, boolean_t accessed, boolean_t modified)
5185 {
5186 	struct rwlock *lock;
5187 	pv_entry_t pv;
5188 	struct md_page *pvh;
5189 	pt_entry_t *pte, mask, value;
5190 	pmap_t pmap;
5191 	int md_gen, pvh_gen;
5192 	boolean_t rv;
5193 
5194 	rv = FALSE;
5195 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5196 	rw_rlock(lock);
5197 restart:
5198 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5199 		pmap = PV_PMAP(pv);
5200 		PMAP_ASSERT_STAGE1(pmap);
5201 		if (!PMAP_TRYLOCK(pmap)) {
5202 			md_gen = m->md.pv_gen;
5203 			rw_runlock(lock);
5204 			PMAP_LOCK(pmap);
5205 			rw_rlock(lock);
5206 			if (md_gen != m->md.pv_gen) {
5207 				PMAP_UNLOCK(pmap);
5208 				goto restart;
5209 			}
5210 		}
5211 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5212 		mask = 0;
5213 		value = 0;
5214 		if (modified) {
5215 			mask |= ATTR_S1_AP_RW_BIT;
5216 			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5217 		}
5218 		if (accessed) {
5219 			mask |= ATTR_AF | ATTR_DESCR_MASK;
5220 			value |= ATTR_AF | L3_PAGE;
5221 		}
5222 		rv = (pmap_load(pte) & mask) == value;
5223 		PMAP_UNLOCK(pmap);
5224 		if (rv)
5225 			goto out;
5226 	}
5227 	if ((m->flags & PG_FICTITIOUS) == 0) {
5228 		pvh = page_to_pvh(m);
5229 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
5230 			pmap = PV_PMAP(pv);
5231 			PMAP_ASSERT_STAGE1(pmap);
5232 			if (!PMAP_TRYLOCK(pmap)) {
5233 				md_gen = m->md.pv_gen;
5234 				pvh_gen = pvh->pv_gen;
5235 				rw_runlock(lock);
5236 				PMAP_LOCK(pmap);
5237 				rw_rlock(lock);
5238 				if (md_gen != m->md.pv_gen ||
5239 				    pvh_gen != pvh->pv_gen) {
5240 					PMAP_UNLOCK(pmap);
5241 					goto restart;
5242 				}
5243 			}
5244 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
5245 			mask = 0;
5246 			value = 0;
5247 			if (modified) {
5248 				mask |= ATTR_S1_AP_RW_BIT;
5249 				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
5250 			}
5251 			if (accessed) {
5252 				mask |= ATTR_AF | ATTR_DESCR_MASK;
5253 				value |= ATTR_AF | L2_BLOCK;
5254 			}
5255 			rv = (pmap_load(pte) & mask) == value;
5256 			PMAP_UNLOCK(pmap);
5257 			if (rv)
5258 				goto out;
5259 		}
5260 	}
5261 out:
5262 	rw_runlock(lock);
5263 	return (rv);
5264 }
5265 
5266 /*
5267  *	pmap_is_modified:
5268  *
5269  *	Return whether or not the specified physical page was modified
5270  *	in any physical maps.
5271  */
5272 boolean_t
5273 pmap_is_modified(vm_page_t m)
5274 {
5275 
5276 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5277 	    ("pmap_is_modified: page %p is not managed", m));
5278 
5279 	/*
5280 	 * If the page is not busied then this check is racy.
5281 	 */
5282 	if (!pmap_page_is_write_mapped(m))
5283 		return (FALSE);
5284 	return (pmap_page_test_mappings(m, FALSE, TRUE));
5285 }
5286 
5287 /*
5288  *	pmap_is_prefaultable:
5289  *
5290  *	Return whether or not the specified virtual address is eligible
5291  *	for prefault.
5292  */
5293 boolean_t
5294 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
5295 {
5296 	pd_entry_t *pde;
5297 	pt_entry_t *pte;
5298 	boolean_t rv;
5299 	int lvl;
5300 
5301 	/*
5302 	 * Return TRUE if and only if the L3 entry for the specified virtual
5303 	 * address is allocated but invalid.
5304 	 */
5305 	rv = FALSE;
5306 	PMAP_LOCK(pmap);
5307 	pde = pmap_pde(pmap, addr, &lvl);
5308 	if (pde != NULL && lvl == 2) {
5309 		pte = pmap_l2_to_l3(pde, addr);
5310 		rv = pmap_load(pte) == 0;
5311 	}
5312 	PMAP_UNLOCK(pmap);
5313 	return (rv);
5314 }
5315 
5316 /*
5317  *	pmap_is_referenced:
5318  *
5319  *	Return whether or not the specified physical page was referenced
5320  *	in any physical maps.
5321  */
5322 boolean_t
5323 pmap_is_referenced(vm_page_t m)
5324 {
5325 
5326 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5327 	    ("pmap_is_referenced: page %p is not managed", m));
5328 	return (pmap_page_test_mappings(m, TRUE, FALSE));
5329 }
5330 
5331 /*
5332  * Clear the write and modified bits in each of the given page's mappings.
5333  */
5334 void
5335 pmap_remove_write(vm_page_t m)
5336 {
5337 	struct md_page *pvh;
5338 	pmap_t pmap;
5339 	struct rwlock *lock;
5340 	pv_entry_t next_pv, pv;
5341 	pt_entry_t oldpte, *pte;
5342 	vm_offset_t va;
5343 	int md_gen, pvh_gen;
5344 
5345 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5346 	    ("pmap_remove_write: page %p is not managed", m));
5347 	vm_page_assert_busied(m);
5348 
5349 	if (!pmap_page_is_write_mapped(m))
5350 		return;
5351 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5352 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
5353 	rw_wlock(lock);
5354 retry:
5355 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5356 		pmap = PV_PMAP(pv);
5357 		PMAP_ASSERT_STAGE1(pmap);
5358 		if (!PMAP_TRYLOCK(pmap)) {
5359 			pvh_gen = pvh->pv_gen;
5360 			rw_wunlock(lock);
5361 			PMAP_LOCK(pmap);
5362 			rw_wlock(lock);
5363 			if (pvh_gen != pvh->pv_gen) {
5364 				PMAP_UNLOCK(pmap);
5365 				goto retry;
5366 			}
5367 		}
5368 		va = pv->pv_va;
5369 		pte = pmap_pte_exists(pmap, va, 2, __func__);
5370 		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
5371 			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
5372 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
5373 		    ("inconsistent pv lock %p %p for page %p",
5374 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
5375 		PMAP_UNLOCK(pmap);
5376 	}
5377 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5378 		pmap = PV_PMAP(pv);
5379 		PMAP_ASSERT_STAGE1(pmap);
5380 		if (!PMAP_TRYLOCK(pmap)) {
5381 			pvh_gen = pvh->pv_gen;
5382 			md_gen = m->md.pv_gen;
5383 			rw_wunlock(lock);
5384 			PMAP_LOCK(pmap);
5385 			rw_wlock(lock);
5386 			if (pvh_gen != pvh->pv_gen ||
5387 			    md_gen != m->md.pv_gen) {
5388 				PMAP_UNLOCK(pmap);
5389 				goto retry;
5390 			}
5391 		}
5392 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5393 		oldpte = pmap_load(pte);
5394 		if ((oldpte & ATTR_SW_DBM) != 0) {
5395 			while (!atomic_fcmpset_64(pte, &oldpte,
5396 			    (oldpte | ATTR_S1_AP_RW_BIT) & ~ATTR_SW_DBM))
5397 				cpu_spinwait();
5398 			if ((oldpte & ATTR_S1_AP_RW_BIT) ==
5399 			    ATTR_S1_AP(ATTR_S1_AP_RW))
5400 				vm_page_dirty(m);
5401 			pmap_invalidate_page(pmap, pv->pv_va, true);
5402 		}
5403 		PMAP_UNLOCK(pmap);
5404 	}
5405 	rw_wunlock(lock);
5406 	vm_page_aflag_clear(m, PGA_WRITEABLE);
5407 }
5408 
5409 /*
5410  *	pmap_ts_referenced:
5411  *
5412  *	Return a count of reference bits for a page, clearing those bits.
5413  *	It is not necessary for every reference bit to be cleared, but it
5414  *	is necessary that 0 only be returned when there are truly no
5415  *	reference bits set.
5416  *
5417  *	As an optimization, update the page's dirty field if a modified bit is
5418  *	found while counting reference bits.  This opportunistic update can be
5419  *	performed at low cost and can eliminate the need for some future calls
5420  *	to pmap_is_modified().  However, since this function stops after
5421  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
5422  *	dirty pages.  Those dirty pages will only be detected by a future call
5423  *	to pmap_is_modified().
5424  */
5425 int
5426 pmap_ts_referenced(vm_page_t m)
5427 {
5428 	struct md_page *pvh;
5429 	pv_entry_t pv, pvf;
5430 	pmap_t pmap;
5431 	struct rwlock *lock;
5432 	pt_entry_t *pte, tpte;
5433 	vm_offset_t va;
5434 	vm_paddr_t pa;
5435 	int cleared, md_gen, not_cleared, pvh_gen;
5436 	struct spglist free;
5437 
5438 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5439 	    ("pmap_ts_referenced: page %p is not managed", m));
5440 	SLIST_INIT(&free);
5441 	cleared = 0;
5442 	pa = VM_PAGE_TO_PHYS(m);
5443 	lock = PHYS_TO_PV_LIST_LOCK(pa);
5444 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
5445 	rw_wlock(lock);
5446 retry:
5447 	not_cleared = 0;
5448 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
5449 		goto small_mappings;
5450 	pv = pvf;
5451 	do {
5452 		if (pvf == NULL)
5453 			pvf = pv;
5454 		pmap = PV_PMAP(pv);
5455 		if (!PMAP_TRYLOCK(pmap)) {
5456 			pvh_gen = pvh->pv_gen;
5457 			rw_wunlock(lock);
5458 			PMAP_LOCK(pmap);
5459 			rw_wlock(lock);
5460 			if (pvh_gen != pvh->pv_gen) {
5461 				PMAP_UNLOCK(pmap);
5462 				goto retry;
5463 			}
5464 		}
5465 		va = pv->pv_va;
5466 		pte = pmap_pte_exists(pmap, va, 2, __func__);
5467 		tpte = pmap_load(pte);
5468 		if (pmap_pte_dirty(pmap, tpte)) {
5469 			/*
5470 			 * Although "tpte" is mapping a 2MB page, because
5471 			 * this function is called at a 4KB page granularity,
5472 			 * we only update the 4KB page under test.
5473 			 */
5474 			vm_page_dirty(m);
5475 		}
5476 		if ((tpte & ATTR_AF) != 0) {
5477 			/*
5478 			 * Since this reference bit is shared by 512 4KB pages,
5479 			 * it should not be cleared every time it is tested.
5480 			 * Apply a simple "hash" function on the physical page
5481 			 * number, the virtual superpage number, and the pmap
5482 			 * address to select one 4KB page out of the 512 on
5483 			 * which testing the reference bit will result in
5484 			 * clearing that reference bit.  This function is
5485 			 * designed to avoid the selection of the same 4KB page
5486 			 * for every 2MB page mapping.
5487 			 *
5488 			 * On demotion, a mapping that hasn't been referenced
5489 			 * is simply destroyed.  To avoid the possibility of a
5490 			 * subsequent page fault on a demoted wired mapping,
5491 			 * always leave its reference bit set.  Moreover,
5492 			 * since the superpage is wired, the current state of
5493 			 * its reference bit won't affect page replacement.
5494 			 */
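			/*
			 * Illustrative note (added commentary): with 4KB base
			 * pages Ln_ENTRIES is 512, so the test below clears
			 * ATTR_AF on roughly 1 out of every 512 visits to a
			 * given (pmap, superpage) pair, and different pairs
			 * tend to hash to different 4KB pages.
			 */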
5495 			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
5496 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
5497 			    (tpte & ATTR_SW_WIRED) == 0) {
5498 				pmap_clear_bits(pte, ATTR_AF);
5499 				pmap_invalidate_page(pmap, va, true);
5500 				cleared++;
5501 			} else
5502 				not_cleared++;
5503 		}
5504 		PMAP_UNLOCK(pmap);
5505 		/* Rotate the PV list if it has more than one entry. */
5506 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
5507 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
5508 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
5509 			pvh->pv_gen++;
5510 		}
5511 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
5512 			goto out;
5513 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
5514 small_mappings:
5515 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
5516 		goto out;
5517 	pv = pvf;
5518 	do {
5519 		if (pvf == NULL)
5520 			pvf = pv;
5521 		pmap = PV_PMAP(pv);
5522 		if (!PMAP_TRYLOCK(pmap)) {
5523 			pvh_gen = pvh->pv_gen;
5524 			md_gen = m->md.pv_gen;
5525 			rw_wunlock(lock);
5526 			PMAP_LOCK(pmap);
5527 			rw_wlock(lock);
5528 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5529 				PMAP_UNLOCK(pmap);
5530 				goto retry;
5531 			}
5532 		}
5533 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
5534 		tpte = pmap_load(pte);
5535 		if (pmap_pte_dirty(pmap, tpte))
5536 			vm_page_dirty(m);
5537 		if ((tpte & ATTR_AF) != 0) {
5538 			if ((tpte & ATTR_SW_WIRED) == 0) {
5539 				pmap_clear_bits(pte, ATTR_AF);
5540 				pmap_invalidate_page(pmap, pv->pv_va, true);
5541 				cleared++;
5542 			} else
5543 				not_cleared++;
5544 		}
5545 		PMAP_UNLOCK(pmap);
5546 		/* Rotate the PV list if it has more than one entry. */
5547 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
5548 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
5549 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5550 			m->md.pv_gen++;
5551 		}
5552 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
5553 	    not_cleared < PMAP_TS_REFERENCED_MAX);
5554 out:
5555 	rw_wunlock(lock);
5556 	vm_page_free_pages_toq(&free, true);
5557 	return (cleared + not_cleared);
5558 }
5559 
5560 /*
5561  *	Apply the given advice to the specified range of addresses within the
5562  *	given pmap.  Depending on the advice, clear the referenced and/or
5563  *	modified flags in each mapping and set the mapped page's dirty field.
5564  */
5565 void
5566 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
5567 {
5568 	struct rwlock *lock;
5569 	vm_offset_t va, va_next;
5570 	vm_page_t m;
5571 	pd_entry_t *l0, *l1, *l2, oldl2;
5572 	pt_entry_t *l3, oldl3;
5573 
5574 	PMAP_ASSERT_STAGE1(pmap);
5575 
5576 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
5577 		return;
5578 
5579 	PMAP_LOCK(pmap);
5580 	for (; sva < eva; sva = va_next) {
5581 		l0 = pmap_l0(pmap, sva);
5582 		if (pmap_load(l0) == 0) {
5583 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
5584 			if (va_next < sva)
5585 				va_next = eva;
5586 			continue;
5587 		}
5588 
5589 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
5590 		if (va_next < sva)
5591 			va_next = eva;
5592 		l1 = pmap_l0_to_l1(l0, sva);
5593 		if (pmap_load(l1) == 0)
5594 			continue;
5595 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
5596 			KASSERT(va_next <= eva,
5597 			    ("partial update of non-transparent 1G page "
5598 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
5599 			    pmap_load(l1), sva, eva, va_next));
5600 			continue;
5601 		}
5602 
5603 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
5604 		if (va_next < sva)
5605 			va_next = eva;
5606 		l2 = pmap_l1_to_l2(l1, sva);
5607 		oldl2 = pmap_load(l2);
5608 		if (oldl2 == 0)
5609 			continue;
5610 		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5611 			if ((oldl2 & ATTR_SW_MANAGED) == 0)
5612 				continue;
5613 			lock = NULL;
5614 			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
5615 				if (lock != NULL)
5616 					rw_wunlock(lock);
5617 
5618 				/*
5619 				 * The 2MB page mapping was destroyed.
5620 				 */
5621 				continue;
5622 			}
5623 
5624 			/*
5625 			 * Unless the page mappings are wired, remove the
5626 			 * mapping to a single page so that a subsequent
5627 			 * access may repromote.  Choosing the last page
5628 			 * within the address range [sva, min(va_next, eva))
5629 			 * generally results in more repromotions.  Since the
5630 			 * underlying page table page is fully populated, this
5631 			 * removal never frees a page table page.
5632 			 */
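			/*
			 * Added example: if [sva, eva) covers this entire 2MB
			 * region, "va" below ends up at the last 4KB page of
			 * the region, i.e. min(va_next, eva) - PAGE_SIZE.
			 */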
5633 			if ((oldl2 & ATTR_SW_WIRED) == 0) {
5634 				va = eva;
5635 				if (va > va_next)
5636 					va = va_next;
5637 				va -= PAGE_SIZE;
5638 				KASSERT(va >= sva,
5639 				    ("pmap_advise: no address gap"));
5640 				l3 = pmap_l2_to_l3(l2, va);
5641 				KASSERT(pmap_load(l3) != 0,
5642 				    ("pmap_advise: invalid PTE"));
5643 				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
5644 				    NULL, &lock);
5645 			}
5646 			if (lock != NULL)
5647 				rw_wunlock(lock);
5648 		}
5649 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
5650 		    ("pmap_advise: invalid L2 entry after demotion"));
5651 		if (va_next > eva)
5652 			va_next = eva;
5653 		va = va_next;
5654 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
5655 		    sva += L3_SIZE) {
5656 			oldl3 = pmap_load(l3);
5657 			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
5658 			    (ATTR_SW_MANAGED | L3_PAGE))
5659 				goto maybe_invlrng;
5660 			else if (pmap_pte_dirty(pmap, oldl3)) {
5661 				if (advice == MADV_DONTNEED) {
5662 					/*
5663 					 * Future calls to pmap_is_modified()
5664 					 * can be avoided by making the page
5665 					 * dirty now.
5666 					 */
5667 					m = PHYS_TO_VM_PAGE(oldl3 & ~ATTR_MASK);
5668 					vm_page_dirty(m);
5669 				}
5670 				while (!atomic_fcmpset_long(l3, &oldl3,
5671 				    (oldl3 & ~ATTR_AF) |
5672 				    ATTR_S1_AP(ATTR_S1_AP_RO)))
5673 					cpu_spinwait();
5674 			} else if ((oldl3 & ATTR_AF) != 0)
5675 				pmap_clear_bits(l3, ATTR_AF);
5676 			else
5677 				goto maybe_invlrng;
5678 			if (va == va_next)
5679 				va = sva;
5680 			continue;
5681 maybe_invlrng:
5682 			if (va != va_next) {
5683 				pmap_invalidate_range(pmap, va, sva, true);
5684 				va = va_next;
5685 			}
5686 		}
5687 		if (va != va_next)
5688 			pmap_invalidate_range(pmap, va, sva, true);
5689 	}
5690 	PMAP_UNLOCK(pmap);
5691 }
5692 
5693 /*
5694  *	Clear the modify bits on the specified physical page.
5695  */
5696 void
5697 pmap_clear_modify(vm_page_t m)
5698 {
5699 	struct md_page *pvh;
5700 	struct rwlock *lock;
5701 	pmap_t pmap;
5702 	pv_entry_t next_pv, pv;
5703 	pd_entry_t *l2, oldl2;
5704 	pt_entry_t *l3, oldl3;
5705 	vm_offset_t va;
5706 	int md_gen, pvh_gen;
5707 
5708 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
5709 	    ("pmap_clear_modify: page %p is not managed", m));
5710 	vm_page_assert_busied(m);
5711 
5712 	if (!pmap_page_is_write_mapped(m))
5713 		return;
5714 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
5715 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
5716 	rw_wlock(lock);
5717 restart:
5718 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
5719 		pmap = PV_PMAP(pv);
5720 		PMAP_ASSERT_STAGE1(pmap);
5721 		if (!PMAP_TRYLOCK(pmap)) {
5722 			pvh_gen = pvh->pv_gen;
5723 			rw_wunlock(lock);
5724 			PMAP_LOCK(pmap);
5725 			rw_wlock(lock);
5726 			if (pvh_gen != pvh->pv_gen) {
5727 				PMAP_UNLOCK(pmap);
5728 				goto restart;
5729 			}
5730 		}
5731 		va = pv->pv_va;
5732 		l2 = pmap_l2(pmap, va);
5733 		oldl2 = pmap_load(l2);
5734 		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
5735 		if ((oldl2 & ATTR_SW_DBM) != 0 &&
5736 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
5737 		    (oldl2 & ATTR_SW_WIRED) == 0) {
5738 			/*
5739 			 * Write protect the mapping to a single page so that
5740 			 * a subsequent write access may repromote.
5741 			 */
5742 			va += VM_PAGE_TO_PHYS(m) - (oldl2 & ~ATTR_MASK);
5743 			l3 = pmap_l2_to_l3(l2, va);
5744 			oldl3 = pmap_load(l3);
5745 			while (!atomic_fcmpset_long(l3, &oldl3,
5746 			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
5747 				cpu_spinwait();
5748 			vm_page_dirty(m);
5749 			pmap_invalidate_page(pmap, va, true);
5750 		}
5751 		PMAP_UNLOCK(pmap);
5752 	}
5753 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
5754 		pmap = PV_PMAP(pv);
5755 		PMAP_ASSERT_STAGE1(pmap);
5756 		if (!PMAP_TRYLOCK(pmap)) {
5757 			md_gen = m->md.pv_gen;
5758 			pvh_gen = pvh->pv_gen;
5759 			rw_wunlock(lock);
5760 			PMAP_LOCK(pmap);
5761 			rw_wlock(lock);
5762 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
5763 				PMAP_UNLOCK(pmap);
5764 				goto restart;
5765 			}
5766 		}
5767 		l2 = pmap_l2(pmap, pv->pv_va);
5768 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
5769 		oldl3 = pmap_load(l3);
5770 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM){
5771 			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
5772 			pmap_invalidate_page(pmap, pv->pv_va, true);
5773 		}
5774 		PMAP_UNLOCK(pmap);
5775 	}
5776 	rw_wunlock(lock);
5777 }
5778 
5779 void *
5780 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
5781 {
5782 	struct pmap_preinit_mapping *ppim;
5783 	vm_offset_t va, offset;
5784 	pd_entry_t *pde;
5785 	pt_entry_t *l2;
5786 	int i, lvl, l2_blocks, free_l2_count, start_idx;
5787 
5788 	if (!vm_initialized) {
5789 		/*
5790 		 * No L3 ptables, so map entire L2 blocks where the start VA is:
5791 		 * 	preinit_map_va + start_idx * L2_SIZE
5792 		 * There may be duplicate mappings (multiple VA -> same PA) but
5793 		 * ARM64 dcache is always PIPT so that's acceptable.
5794 		 */
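		/*
		 * Hypothetical example (added for illustration): for
		 * pa = 0x80100000 and size = 0x3000, l2_blocks is 1,
		 * offset is 0x100000, and the returned pointer is
		 * preinit_map_va + start_idx * L2_SIZE + 0x100000.
		 */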
5795 		if (size == 0)
5796 			return (NULL);
5797 
5798 		/* Calculate how many L2 blocks are needed for the mapping */
5799 		l2_blocks = (roundup2(pa + size, L2_SIZE) -
5800 		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
5801 
5802 		offset = pa & L2_OFFSET;
5803 
5804 		if (preinit_map_va == 0)
5805 			return (NULL);
5806 
5807 		/* Map 2MiB L2 blocks from reserved VA space */
5808 
5809 		free_l2_count = 0;
5810 		start_idx = -1;
5811 		/* Find enough free contiguous VA space */
5812 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5813 			ppim = pmap_preinit_mapping + i;
5814 			if (free_l2_count > 0 && ppim->pa != 0) {
5815 				/* Not enough space here */
5816 				free_l2_count = 0;
5817 				start_idx = -1;
5818 				continue;
5819 			}
5820 
5821 			if (ppim->pa == 0) {
5822 				/* Free L2 block */
5823 				if (start_idx == -1)
5824 					start_idx = i;
5825 				free_l2_count++;
5826 				if (free_l2_count == l2_blocks)
5827 					break;
5828 			}
5829 		}
5830 		if (free_l2_count != l2_blocks)
5831 			panic("%s: too many preinit mappings", __func__);
5832 
5833 		va = preinit_map_va + (start_idx * L2_SIZE);
5834 		for (i = start_idx; i < start_idx + l2_blocks; i++) {
5835 			/* Mark entries as allocated */
5836 			ppim = pmap_preinit_mapping + i;
5837 			ppim->pa = pa;
5838 			ppim->va = va + offset;
5839 			ppim->size = size;
5840 		}
5841 
5842 		/* Map L2 blocks */
5843 		pa = rounddown2(pa, L2_SIZE);
5844 		for (i = 0; i < l2_blocks; i++) {
5845 			pde = pmap_pde(kernel_pmap, va, &lvl);
5846 			KASSERT(pde != NULL,
5847 			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
5848 			    va));
5849 			KASSERT(lvl == 1,
5850 			    ("pmap_mapbios: Invalid level %d", lvl));
5851 
5852 			/* Insert L2_BLOCK */
5853 			l2 = pmap_l1_to_l2(pde, va);
5854 			pmap_load_store(l2,
5855 			    pa | ATTR_DEFAULT | ATTR_S1_XN |
5856 			    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
5857 
5858 			va += L2_SIZE;
5859 			pa += L2_SIZE;
5860 		}
5861 		pmap_invalidate_all(kernel_pmap);
5862 
5863 		va = preinit_map_va + (start_idx * L2_SIZE);
5864 
5865 	} else {
5866 		/* kva_alloc may be used to map the pages */
5867 		offset = pa & PAGE_MASK;
5868 		size = round_page(offset + size);
5869 
5870 		va = kva_alloc(size);
5871 		if (va == 0)
5872 			panic("%s: Couldn't allocate KVA", __func__);
5873 
5874 		pde = pmap_pde(kernel_pmap, va, &lvl);
5875 		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
5876 
5877 		/* L3 table is linked */
5878 		va = trunc_page(va);
5879 		pa = trunc_page(pa);
5880 		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
5881 	}
5882 
5883 	return ((void *)(va + offset));
5884 }
5885 
5886 void
5887 pmap_unmapbios(vm_offset_t va, vm_size_t size)
5888 {
5889 	struct pmap_preinit_mapping *ppim;
5890 	vm_offset_t offset, tmpsize, va_trunc;
5891 	pd_entry_t *pde;
5892 	pt_entry_t *l2;
5893 	int i, lvl, l2_blocks, block;
5894 	bool preinit_map;
5895 
5896 	l2_blocks =
5897 	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
5898 	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
5899 
5900 	/* Remove preinit mapping */
5901 	preinit_map = false;
5902 	block = 0;
5903 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
5904 		ppim = pmap_preinit_mapping + i;
5905 		if (ppim->va == va) {
5906 			KASSERT(ppim->size == size,
5907 			    ("pmap_unmapbios: size mismatch"));
5908 			ppim->va = 0;
5909 			ppim->pa = 0;
5910 			ppim->size = 0;
5911 			preinit_map = true;
5912 			offset = block * L2_SIZE;
5913 			va_trunc = rounddown2(va, L2_SIZE) + offset;
5914 
5915 			/* Remove L2_BLOCK */
5916 			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
5917 			KASSERT(pde != NULL,
5918 			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
5919 			    va_trunc));
5920 			l2 = pmap_l1_to_l2(pde, va_trunc);
5921 			pmap_clear(l2);
5922 
5923 			if (block == (l2_blocks - 1))
5924 				break;
5925 			block++;
5926 		}
5927 	}
5928 	if (preinit_map) {
5929 		pmap_invalidate_all(kernel_pmap);
5930 		return;
5931 	}
5932 
5933 	/* Unmap the pages reserved with kva_alloc. */
5934 	if (vm_initialized) {
5935 		offset = va & PAGE_MASK;
5936 		size = round_page(offset + size);
5937 		va = trunc_page(va);
5938 
5939 		pde = pmap_pde(kernel_pmap, va, &lvl);
5940 		KASSERT(pde != NULL,
5941 		    ("pmap_unmapbios: Invalid page entry, va: 0x%lx", va));
5942 		KASSERT(lvl == 2, ("pmap_unmapbios: Invalid level %d", lvl));
5943 
5944 		/* Unmap and invalidate the pages */
5945 		for (tmpsize = 0; tmpsize < size; tmpsize += PAGE_SIZE)
5946 			pmap_kremove(va + tmpsize);
5947 
5948 		kva_free(va, size);
5949 	}
5950 }
5951 
5952 /*
5953  * Sets the memory attribute for the specified page.
5954  */
5955 void
5956 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
5957 {
5958 
5959 	m->md.pv_memattr = ma;
5960 
5961 	/*
5962 	 * If "m" is a normal page, update its direct mapping.  This update
5963 	 * can be relied upon to perform any cache operations that are
5964 	 * required for data coherence.
5965 	 */
5966 	if ((m->flags & PG_FICTITIOUS) == 0 &&
5967 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
5968 	    m->md.pv_memattr) != 0)
5969 		panic("memory attribute change on the direct map failed");
5970 }
5971 
5972 /*
5973  * Changes the specified virtual address range's memory type to that given by
5974  * the parameter "mode".  The specified virtual address range must be
5975  * completely contained within either the direct map or the kernel map.  If
5976  * the virtual address range is contained within the kernel map, then the
5977  * memory type for each of the corresponding ranges of the direct map is also
5978  * changed.  (The corresponding ranges of the direct map are those ranges that
5979  * map the same physical pages as the specified virtual address range.)  These
5980  * changes to the direct map are necessary because Intel describes the
5981  * behavior of their processors as "undefined" if two or more mappings to the
5982  * same physical page have different memory types.
5983  *
5984  * Returns zero if the change completed successfully, and either EINVAL or
5985  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
5986  * of the virtual address range was not mapped, and ENOMEM is returned if
5987  * there was insufficient memory available to complete the change.  In the
5988  * latter case, the memory type may have been changed on some part of the
5989  * virtual address range or the direct map.
5990  */
5991 int
5992 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
5993 {
5994 	int error;
5995 
5996 	PMAP_LOCK(kernel_pmap);
5997 	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
5998 	PMAP_UNLOCK(kernel_pmap);
5999 	return (error);
6000 }
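
/*
 * Illustrative caller sketch (added commentary, not from the original
 * source; "va", "size", and "error" are hypothetical): a driver that wants
 * an uncached view of a buffer it mapped in the kernel map might do
 *
 *	error = pmap_change_attr(va, size, VM_MEMATTR_UNCACHEABLE);
 *	if (error != 0)
 *		return (error);
 */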
6001 
6002 /*
6003  * Changes the specified virtual address range's protections to those
6004  * specified by "prot".  Like pmap_change_attr(), protections for aliases
6005  * in the direct map are updated as well.  Protections on aliasing mappings may
6006  * be a subset of the requested protections; for example, mappings in the direct
6007  * map are never executable.
6008  */
6009 int
6010 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
6011 {
6012 	int error;
6013 
6014 	/* Only supported within the kernel map. */
6015 	if (va < VM_MIN_KERNEL_ADDRESS)
6016 		return (EINVAL);
6017 
6018 	PMAP_LOCK(kernel_pmap);
6019 	error = pmap_change_props_locked(va, size, prot, -1, false);
6020 	PMAP_UNLOCK(kernel_pmap);
6021 	return (error);
6022 }
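
/*
 * Illustrative caller sketch (added commentary; "va" and "size" are
 * hypothetical): write-protecting a range of kernel memory could look like
 *
 *	error = pmap_change_prot(va, size, VM_PROT_READ);
 *
 * where, per the check above, "va" must lie within the kernel map.
 */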
6023 
6024 static int
6025 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
6026     int mode, bool skip_unmapped)
6027 {
6028 	vm_offset_t base, offset, tmpva;
6029 	vm_size_t pte_size;
6030 	vm_paddr_t pa;
6031 	pt_entry_t pte, *ptep, *newpte;
6032 	pt_entry_t bits, mask;
6033 	int lvl, rv;
6034 
6035 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
6036 	base = trunc_page(va);
6037 	offset = va & PAGE_MASK;
6038 	size = round_page(offset + size);
6039 
6040 	if (!VIRT_IN_DMAP(base) &&
6041 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
6042 		return (EINVAL);
6043 
6044 	bits = 0;
6045 	mask = 0;
6046 	if (mode != -1) {
6047 		bits = ATTR_S1_IDX(mode);
6048 		mask = ATTR_S1_IDX_MASK;
6049 		if (mode == VM_MEMATTR_DEVICE) {
6050 			mask |= ATTR_S1_XN;
6051 			bits |= ATTR_S1_XN;
6052 		}
6053 	}
6054 	if (prot != VM_PROT_NONE) {
6055 		/* Don't mark the DMAP as executable. It never is on arm64. */
6056 		if (VIRT_IN_DMAP(base)) {
6057 			prot &= ~VM_PROT_EXECUTE;
6058 			/*
6059 			 * XXX Mark the DMAP as writable for now. We rely
6060 			 * on this in ddb & dtrace to insert breakpoint
6061 			 * instructions.
6062 			 */
6063 			prot |= VM_PROT_WRITE;
6064 		}
6065 
6066 		if ((prot & VM_PROT_WRITE) == 0) {
6067 			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
6068 		}
6069 		if ((prot & VM_PROT_EXECUTE) == 0) {
6070 			bits |= ATTR_S1_PXN;
6071 		}
6072 		bits |= ATTR_S1_UXN;
6073 		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
6074 	}
6075 
6076 	for (tmpva = base; tmpva < base + size; ) {
6077 		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
6078 		if (ptep == NULL && !skip_unmapped) {
6079 			return (EINVAL);
6080 		} else if ((ptep == NULL && skip_unmapped) ||
6081 		    (pmap_load(ptep) & mask) == bits) {
6082 			/*
6083 			 * We already have the correct attribute or there
6084 			 * is no memory mapped at this address and we are
6085 			 * skipping unmapped memory.
6086 			 */
6087 			switch (lvl) {
6088 			default:
6089 				panic("Invalid DMAP table level: %d\n", lvl);
6090 			case 1:
6091 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
6092 				break;
6093 			case 2:
6094 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
6095 				break;
6096 			case 3:
6097 				tmpva += PAGE_SIZE;
6098 				break;
6099 			}
6100 		} else {
6101 			/*
6102 			 * Split the entry to an level 3 table, then
6103 			 * Split the entry to a level 3 table, then
6104 			 */
6105 			switch (lvl) {
6106 			default:
6107 				panic("Invalid DMAP table level: %d\n", lvl);
6108 			case 1:
6109 				if ((tmpva & L1_OFFSET) == 0 &&
6110 				    (base + size - tmpva) >= L1_SIZE) {
6111 					pte_size = L1_SIZE;
6112 					break;
6113 				}
6114 				newpte = pmap_demote_l1(kernel_pmap, ptep,
6115 				    tmpva & ~L1_OFFSET);
6116 				if (newpte == NULL)
6117 					return (EINVAL);
6118 				ptep = pmap_l1_to_l2(ptep, tmpva);
6119 				/* FALLTHROUGH */
6120 			case 2:
6121 				if ((tmpva & L2_OFFSET) == 0 &&
6122 				    (base + size - tmpva) >= L2_SIZE) {
6123 					pte_size = L2_SIZE;
6124 					break;
6125 				}
6126 				newpte = pmap_demote_l2(kernel_pmap, ptep,
6127 				    tmpva);
6128 				if (newpte == NULL)
6129 					return (EINVAL);
6130 				ptep = pmap_l2_to_l3(ptep, tmpva);
6131 				/* FALLTHROUGH */
6132 			case 3:
6133 				pte_size = PAGE_SIZE;
6134 				break;
6135 			}
6136 
6137 			/* Update the entry */
6138 			pte = pmap_load(ptep);
6139 			pte &= ~mask;
6140 			pte |= bits;
6141 
6142 			pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
6143 			    pte_size);
6144 
6145 			pa = pte & ~ATTR_MASK;
6146 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
6147 				/*
6148 				 * Keep the DMAP memory in sync.
6149 				 */
6150 				rv = pmap_change_props_locked(
6151 				    PHYS_TO_DMAP(pa), pte_size,
6152 				    prot, mode, true);
6153 				if (rv != 0)
6154 					return (rv);
6155 			}
6156 
6157 			/*
6158 			 * If moving to a non-cacheable entry, flush
6159 			 * the cache.
6160 			 */
6161 			if (mode == VM_MEMATTR_UNCACHEABLE)
6162 				cpu_dcache_wbinv_range(tmpva, pte_size);
6163 			tmpva += pte_size;
6164 		}
6165 	}
6166 
6167 	return (0);
6168 }
6169 
6170 /*
6171  * Create an L2 table to map all addresses within an L1 mapping.
6172  */
6173 static pt_entry_t *
6174 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
6175 {
6176 	pt_entry_t *l2, newl2, oldl1;
6177 	vm_offset_t tmpl1;
6178 	vm_paddr_t l2phys, phys;
6179 	vm_page_t ml2;
6180 	int i;
6181 
6182 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6183 	oldl1 = pmap_load(l1);
6184 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
6185 	    ("pmap_demote_l1: Demoting a non-block entry"));
6186 	KASSERT((va & L1_OFFSET) == 0,
6187 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
6188 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
6189 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
6190 
6191 	tmpl1 = 0;
6192 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
6193 		tmpl1 = kva_alloc(PAGE_SIZE);
6194 		if (tmpl1 == 0)
6195 			return (NULL);
6196 	}
6197 
6198 	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
6199 	    NULL) {
6200 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
6201 		    " in pmap %p", va, pmap);
6202 		l2 = NULL;
6203 		goto fail;
6204 	}
6205 
6206 	l2phys = VM_PAGE_TO_PHYS(ml2);
6207 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
6208 
6209 	/* Address the range points at */
6210 	/* The address the range points at */
6211 	phys = oldl1 & ~ATTR_MASK;
6212 	/* The attributes from the old l1 table to be copied */
6213 
6214 	/* Create the new entries */
6215 	for (i = 0; i < Ln_ENTRIES; i++) {
6216 		l2[i] = newl2 | phys;
6217 		phys += L2_SIZE;
6218 	}
6219 	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
6220 	    ("Invalid l2 page (%lx != %lx)", l2[0],
6221 	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
6222 
6223 	if (tmpl1 != 0) {
6224 		pmap_kenter(tmpl1, PAGE_SIZE,
6225 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
6226 		    VM_MEMATTR_WRITE_BACK);
6227 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
6228 	}
6229 
6230 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
6231 
6232 fail:
6233 	if (tmpl1 != 0) {
6234 		pmap_kremove(tmpl1);
6235 		kva_free(tmpl1, PAGE_SIZE);
6236 	}
6237 
6238 	return (l2);
6239 }
6240 
6241 static void
6242 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
6243 {
6244 	pt_entry_t *l3;
6245 
6246 	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
6247 		*l3 = newl3;
6248 		newl3 += L3_SIZE;
6249 	}
6250 }
6251 
6252 static void
6253 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
6254     struct rwlock **lockp)
6255 {
6256 	struct spglist free;
6257 
6258 	SLIST_INIT(&free);
6259 	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
6260 	    lockp);
6261 	vm_page_free_pages_toq(&free, true);
6262 }
6263 
6264 /*
6265  * Create an L3 table to map all addresses within an L2 mapping.
6266  */
6267 static pt_entry_t *
6268 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
6269     struct rwlock **lockp)
6270 {
6271 	pt_entry_t *l3, newl3, oldl2;
6272 	vm_offset_t tmpl2;
6273 	vm_paddr_t l3phys;
6274 	vm_page_t ml3;
6275 
6276 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6277 	PMAP_ASSERT_STAGE1(pmap);
6278 	KASSERT(ADDR_IS_CANONICAL(va),
6279 	    ("%s: Address not in canonical form: %lx", __func__, va));
6280 
6281 	l3 = NULL;
6282 	oldl2 = pmap_load(l2);
6283 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
6284 	    ("pmap_demote_l2: Demoting a non-block entry"));
6285 	va &= ~L2_OFFSET;
6286 
6287 	tmpl2 = 0;
6288 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
6289 		tmpl2 = kva_alloc(PAGE_SIZE);
6290 		if (tmpl2 == 0)
6291 			return (NULL);
6292 	}
6293 
6294 	/*
6295 	 * Invalidate the 2MB page mapping and return "failure" if the
6296 	 * mapping was never accessed.
6297 	 */
6298 	if ((oldl2 & ATTR_AF) == 0) {
6299 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
6300 		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
6301 		pmap_demote_l2_abort(pmap, va, l2, lockp);
6302 		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
6303 		    va, pmap);
6304 		goto fail;
6305 	}
6306 
6307 	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
6308 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
6309 		    ("pmap_demote_l2: page table page for a wired mapping"
6310 		    " is missing"));
6311 
6312 		/*
6313 		 * If the page table page is missing and the mapping
6314 		 * is for a kernel address, the mapping must belong to
6315 		 * either the direct map or the early kernel memory.
6316 		 * Page table pages are preallocated for every other
6317 		 * part of the kernel address space, so the direct map
6318 		 * region and early kernel memory are the only parts of the
6319 		 * kernel address space that must be handled here.
6320 		 */
6321 		KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
6322 		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
6323 		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
6324 
6325 		/*
6326 		 * If the 2MB page mapping belongs to the direct map
6327 		 * region of the kernel's address space, then the page
6328 		 * allocation request specifies the highest possible
6329 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
6330 		 * priority is normal.
6331 		 */
6332 		ml3 = vm_page_alloc_noobj(
6333 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
6334 		    VM_ALLOC_WIRED);
6335 
6336 		/*
6337 		 * If the allocation of the new page table page fails,
6338 		 * invalidate the 2MB page mapping and return "failure".
6339 		 */
6340 		if (ml3 == NULL) {
6341 			pmap_demote_l2_abort(pmap, va, l2, lockp);
6342 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
6343 			    " in pmap %p", va, pmap);
6344 			goto fail;
6345 		}
6346 		ml3->pindex = pmap_l2_pindex(va);
6347 
6348 		if (!ADDR_IS_KERNEL(va)) {
6349 			ml3->ref_count = NL3PG;
6350 			pmap_resident_count_inc(pmap, 1);
6351 		}
6352 	}
6353 	l3phys = VM_PAGE_TO_PHYS(ml3);
6354 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
6355 	newl3 = (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
6356 	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
6357 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
6358 	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
6359 
6360 	/*
6361 	 * If the page table page is not leftover from an earlier promotion,
6362 	 * or the mapping attributes have changed, (re)initialize the L3 table.
6363 	 *
6364 	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
6365 	 * performs a dsb().  That dsb() ensures that the stores for filling
6366 	 * "l3" are visible before "l3" is added to the page table.
6367 	 */
6368 	if (ml3->valid == 0 || (l3[0] & ATTR_MASK) != (newl3 & ATTR_MASK))
6369 		pmap_fill_l3(l3, newl3);
6370 
6371 	/*
6372 	 * Map the temporary page so we don't lose access to the l2 table.
6373 	 */
6374 	if (tmpl2 != 0) {
6375 		pmap_kenter(tmpl2, PAGE_SIZE,
6376 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
6377 		    VM_MEMATTR_WRITE_BACK);
6378 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
6379 	}
6380 
6381 	/*
6382 	 * The spare PV entries must be reserved prior to demoting the
6383 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
6384 	 * of the L2 and the PV lists will be inconsistent, which can result
6385 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
6386 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
6387 	 * PV entry for the 2MB page mapping that is being demoted.
6388 	 */
6389 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
6390 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
6391 
6392 	/*
6393 	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
6394 	 * the 2MB page mapping.
6395 	 */
6396 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
6397 
6398 	/*
6399 	 * Demote the PV entry.
6400 	 */
6401 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
6402 		pmap_pv_demote_l2(pmap, va, oldl2 & ~ATTR_MASK, lockp);
6403 
6404 	atomic_add_long(&pmap_l2_demotions, 1);
6405 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
6406 	    " in pmap %p %lx", va, pmap, l3[0]);
6407 
6408 fail:
6409 	if (tmpl2 != 0) {
6410 		pmap_kremove(tmpl2);
6411 		kva_free(tmpl2, PAGE_SIZE);
6412 	}
6413 
6414 	return (l3);
6416 }
6417 
6418 static pt_entry_t *
6419 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
6420 {
6421 	struct rwlock *lock;
6422 	pt_entry_t *l3;
6423 
6424 	lock = NULL;
6425 	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
6426 	if (lock != NULL)
6427 		rw_wunlock(lock);
6428 	return (l3);
6429 }
6430 
6431 /*
6432  * Perform the pmap work for mincore(2).  If the page is not both referenced and
6433  * modified by this pmap, returns its physical address so that the caller can
6434  * find other mappings.
6435  */
6436 int
6437 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
6438 {
6439 	pt_entry_t *pte, tpte;
6440 	vm_paddr_t mask, pa;
6441 	int lvl, val;
6442 	bool managed;
6443 
6444 	PMAP_ASSERT_STAGE1(pmap);
6445 	PMAP_LOCK(pmap);
6446 	pte = pmap_pte(pmap, addr, &lvl);
6447 	if (pte != NULL) {
6448 		tpte = pmap_load(pte);
6449 
6450 		switch (lvl) {
6451 		case 3:
6452 			mask = L3_OFFSET;
6453 			break;
6454 		case 2:
6455 			mask = L2_OFFSET;
6456 			break;
6457 		case 1:
6458 			mask = L1_OFFSET;
6459 			break;
6460 		default:
6461 			panic("pmap_mincore: invalid level %d", lvl);
6462 		}
6463 
6464 		managed = (tpte & ATTR_SW_MANAGED) != 0;
6465 		val = MINCORE_INCORE;
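		/*
		 * Added note: lvl 2 is a 2MB block and lvl 1 a 1GB block
		 * here, so the MINCORE_PSIND() term below reports superpage
		 * index 1 or 2, respectively, for those larger mappings.
		 */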
6466 		if (lvl != 3)
6467 			val |= MINCORE_PSIND(3 - lvl);
6468 		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
6469 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
6470 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
6471 		if ((tpte & ATTR_AF) == ATTR_AF)
6472 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
6473 
6474 		pa = (tpte & ~ATTR_MASK) | (addr & mask);
6475 	} else {
6476 		managed = false;
6477 		val = 0;
6478 	}
6479 
6480 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
6481 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
6482 		*pap = pa;
6483 	}
6484 	PMAP_UNLOCK(pmap);
6485 	return (val);
6486 }
6487 
6488 /*
6489  * Garbage collect every ASID that is neither active on a processor nor
6490  * reserved.
6491  */
6492 static void
6493 pmap_reset_asid_set(pmap_t pmap)
6494 {
6495 	pmap_t curpmap;
6496 	int asid, cpuid, epoch;
6497 	struct asid_set *set;
6498 	enum pmap_stage stage;
6499 
6500 	set = pmap->pm_asid_set;
6501 	stage = pmap->pm_stage;
6502 
6504 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
6505 	mtx_assert(&set->asid_set_mutex, MA_OWNED);
6506 
6507 	/*
6508 	 * Ensure that the store to asid_epoch is globally visible before the
6509 	 * loads from pc_curpmap are performed.
6510 	 */
6511 	epoch = set->asid_epoch + 1;
6512 	if (epoch == INT_MAX)
6513 		epoch = 0;
6514 	set->asid_epoch = epoch;
6515 	dsb(ishst);
6516 	if (stage == PM_STAGE1) {
6517 		__asm __volatile("tlbi vmalle1is");
6518 	} else {
6519 		KASSERT(pmap_clean_stage2_tlbi != NULL,
6520 		    ("%s: Unset stage 2 tlb invalidation callback\n",
6521 		    __func__));
6522 		pmap_clean_stage2_tlbi();
6523 	}
6524 	dsb(ish);
6525 	bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
6526 	    set->asid_set_size - 1);
6527 	CPU_FOREACH(cpuid) {
6528 		if (cpuid == curcpu)
6529 			continue;
6530 		if (stage == PM_STAGE1) {
6531 			curpmap = pcpu_find(cpuid)->pc_curpmap;
6532 			PMAP_ASSERT_STAGE1(pmap);
6533 		} else {
6534 			curpmap = pcpu_find(cpuid)->pc_curvmpmap;
6535 			if (curpmap == NULL)
6536 				continue;
6537 			PMAP_ASSERT_STAGE2(pmap);
6538 		}
6539 		KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
6540 		asid = COOKIE_TO_ASID(curpmap->pm_cookie);
6541 		if (asid == -1)
6542 			continue;
6543 		bit_set(set->asid_set, asid);
6544 		curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
6545 	}
6546 }
6547 
6548 /*
6549  * Allocate a new ASID for the specified pmap.
6550  */
6551 static void
6552 pmap_alloc_asid(pmap_t pmap)
6553 {
6554 	struct asid_set *set;
6555 	int new_asid;
6556 
6557 	set = pmap->pm_asid_set;
6558 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
6559 
6560 	mtx_lock_spin(&set->asid_set_mutex);
6561 
6562 	/*
6563 	 * While this processor was waiting to acquire the asid set mutex,
6564 	 * pmap_reset_asid_set() running on another processor might have
6565 	 * updated this pmap's cookie to the current epoch.  In which case, we
6566 	 * don't need to allocate a new ASID.
6567 	 */
6568 	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
6569 		goto out;
6570 
6571 	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
6572 	    &new_asid);
6573 	if (new_asid == -1) {
6574 		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
6575 		    set->asid_next, &new_asid);
6576 		if (new_asid == -1) {
6577 			pmap_reset_asid_set(pmap);
6578 			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
6579 			    set->asid_set_size, &new_asid);
6580 			KASSERT(new_asid != -1, ("ASID allocation failure"));
6581 		}
6582 	}
6583 	bit_set(set->asid_set, new_asid);
6584 	set->asid_next = new_asid + 1;
6585 	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
6586 out:
6587 	mtx_unlock_spin(&set->asid_set_mutex);
6588 }
6589 
6590 /*
6591  * Compute the value that should be stored in ttbr0 to activate the specified
6592  * pmap.  This value may change from time to time.
6593  */
6594 uint64_t
6595 pmap_to_ttbr0(pmap_t pmap)
6596 {
6597 
6598 	return (ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie)) |
6599 	    pmap->pm_ttbr);
6600 }
6601 
6602 static bool
6603 pmap_activate_int(pmap_t pmap)
6604 {
6605 	struct asid_set *set;
6606 	int epoch;
6607 
6608 	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
6609 	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
6610 
6611 	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
6612 	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
6613 		/*
6614 		 * Handle the possibility that the old thread was preempted
6615 		 * after an "ic" or "tlbi" instruction but before it performed
6616 		 * a "dsb" instruction.  If the old thread migrates to a new
6617 		 * processor, its completion of a "dsb" instruction on that
6618 		 * new processor does not guarantee that the "ic" or "tlbi"
6619 		 * instructions performed on the old processor have completed.
6620 		 */
6621 		dsb(ish);
6622 		return (false);
6623 	}
6624 
6625 	set = pmap->pm_asid_set;
6626 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
6627 
6628 	/*
6629 	 * Ensure that the store to curpmap is globally visible before the
6630 	 * load from asid_epoch is performed.
6631 	 */
6632 	if (pmap->pm_stage == PM_STAGE1)
6633 		PCPU_SET(curpmap, pmap);
6634 	else
6635 		PCPU_SET(curvmpmap, pmap);
6636 	dsb(ish);
6637 	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
6638 	if (epoch >= 0 && epoch != set->asid_epoch)
6639 		pmap_alloc_asid(pmap);
6640 
6641 	if (pmap->pm_stage == PM_STAGE1) {
6642 		set_ttbr0(pmap_to_ttbr0(pmap));
6643 		if (PCPU_GET(bcast_tlbi_workaround) != 0)
6644 			invalidate_local_icache();
6645 	}
6646 	return (true);
6647 }
6648 
6649 void
6650 pmap_activate_vm(pmap_t pmap)
6651 {
6652 
6653 	PMAP_ASSERT_STAGE2(pmap);
6654 
6655 	(void)pmap_activate_int(pmap);
6656 }
6657 
6658 void
6659 pmap_activate(struct thread *td)
6660 {
6661 	pmap_t	pmap;
6662 
6663 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
6664 	PMAP_ASSERT_STAGE1(pmap);
6665 	critical_enter();
6666 	(void)pmap_activate_int(pmap);
6667 	critical_exit();
6668 }
6669 
6670 /*
6671  * Activate the thread we are switching to.
6672  * To simplify the assembly in cpu_throw return the new threads pcb.
6673  * To simplify the assembly in cpu_throw, return the new thread's pcb.
6674 struct pcb *
6675 pmap_switch(struct thread *new)
6676 {
6677 	pcpu_bp_harden bp_harden;
6678 	struct pcb *pcb;
6679 
6680 	/* Store the new curthread */
6681 	PCPU_SET(curthread, new);
6682 #if defined(PERTHREAD_SSP)
6683 	/* Set the new thread's SSP canary */
6684 	__asm("msr	sp_el0, %0" :: "r"(&new->td_md.md_canary));
6685 #endif
6686 
6687 	/* And the new pcb */
6688 	pcb = new->td_pcb;
6689 	PCPU_SET(curpcb, pcb);
6690 
6691 	/*
6692 	 * TODO: We may need to flush the cache here if switching
6693 	 * to a user process.
6694 	 */
6695 
6696 	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
6697 		/*
6698 		 * Stop userspace from training the branch predictor against
6699 		 * other processes. This will call into a CPU specific
6700 		 * function that clears the branch predictor state.
6701 		 */
6702 		bp_harden = PCPU_GET(bp_harden);
6703 		if (bp_harden != NULL)
6704 			bp_harden();
6705 	}
6706 
6707 	return (pcb);
6708 }
6709 
6710 void
6711 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
6712 {
6713 
6714 	PMAP_ASSERT_STAGE1(pmap);
6715 	KASSERT(ADDR_IS_CANONICAL(va),
6716 	    ("%s: Address not in canonical form: %lx", __func__, va));
6717 
6718 	if (ADDR_IS_KERNEL(va)) {
6719 		cpu_icache_sync_range(va, sz);
6720 	} else {
6721 		u_int len, offset;
6722 		vm_paddr_t pa;
6723 
6724 		/* Find the length of data in this page to flush */
6725 		offset = va & PAGE_MASK;
6726 		len = imin(PAGE_SIZE - offset, sz);
6727 
6728 		while (sz != 0) {
6729 			/* Extract the physical address & find it in the DMAP */
6730 			pa = pmap_extract(pmap, va);
6731 			if (pa != 0)
6732 				cpu_icache_sync_range(PHYS_TO_DMAP(pa), len);
6733 
6734 			/* Move to the next page */
6735 			sz -= len;
6736 			va += len;
6737 			/* Set the length for the next iteration */
6738 			len = imin(PAGE_SIZE, sz);
6739 		}
6740 	}
6741 }
6742 
6743 static int
6744 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
6745 {
6746 	pd_entry_t *pdep;
6747 	pt_entry_t *ptep, pte;
6748 	int rv, lvl, dfsc;
6749 
6750 	PMAP_ASSERT_STAGE2(pmap);
6751 	rv = KERN_FAILURE;
6752 
6753 	/* Data and insn aborts use same encoding for FSC field. */
6754 	dfsc = esr & ISS_DATA_DFSC_MASK;
6755 	switch (dfsc) {
6756 	case ISS_DATA_DFSC_TF_L0:
6757 	case ISS_DATA_DFSC_TF_L1:
6758 	case ISS_DATA_DFSC_TF_L2:
6759 	case ISS_DATA_DFSC_TF_L3:
6760 		PMAP_LOCK(pmap);
6761 		pdep = pmap_pde(pmap, far, &lvl);
6762 		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
6763 			PMAP_UNLOCK(pmap);
6764 			break;
6765 		}
6766 
6767 		switch (lvl) {
6768 		case 0:
6769 			ptep = pmap_l0_to_l1(pdep, far);
6770 			break;
6771 		case 1:
6772 			ptep = pmap_l1_to_l2(pdep, far);
6773 			break;
6774 		case 2:
6775 			ptep = pmap_l2_to_l3(pdep, far);
6776 			break;
6777 		default:
6778 			panic("%s: Invalid pde level %d", __func__, lvl);
6779 		}
6780 		goto fault_exec;
6781 
6782 	case ISS_DATA_DFSC_AFF_L1:
6783 	case ISS_DATA_DFSC_AFF_L2:
6784 	case ISS_DATA_DFSC_AFF_L3:
6785 		PMAP_LOCK(pmap);
6786 		ptep = pmap_pte(pmap, far, &lvl);
6787 fault_exec:
6788 		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
6789 			if (icache_vmid) {
6790 				pmap_invalidate_vpipt_icache();
6791 			} else {
6792 				/*
6793 				 * If accessing an executable page invalidate
6794 				 * the I-cache so it will be valid when we
6795 				 * continue execution in the guest. The D-cache
6796 				 * is assumed to already be clean to the Point
6797 				 * of Coherency.
6798 				 */
6799 				if ((pte & ATTR_S2_XN_MASK) !=
6800 				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
6801 					invalidate_icache();
6802 				}
6803 			}
6804 			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
6805 			rv = KERN_SUCCESS;
6806 		}
6807 		PMAP_UNLOCK(pmap);
6808 		break;
6809 	}
6810 
6811 	return (rv);
6812 }
6813 
6814 int
6815 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
6816 {
6817 	pt_entry_t pte, *ptep;
6818 	register_t intr;
6819 	uint64_t ec, par;
6820 	int lvl, rv;
6821 
6822 	rv = KERN_FAILURE;
6823 
6824 	ec = ESR_ELx_EXCEPTION(esr);
6825 	switch (ec) {
6826 	case EXCP_INSN_ABORT_L:
6827 	case EXCP_INSN_ABORT:
6828 	case EXCP_DATA_ABORT_L:
6829 	case EXCP_DATA_ABORT:
6830 		break;
6831 	default:
6832 		return (rv);
6833 	}
6834 
6835 	if (pmap->pm_stage == PM_STAGE2)
6836 		return (pmap_stage2_fault(pmap, esr, far));
6837 
6838 	/* Data and insn aborts use same encoding for FSC field. */
6839 	switch (esr & ISS_DATA_DFSC_MASK) {
6840 	case ISS_DATA_DFSC_AFF_L1:
6841 	case ISS_DATA_DFSC_AFF_L2:
6842 	case ISS_DATA_DFSC_AFF_L3:
6843 		PMAP_LOCK(pmap);
6844 		ptep = pmap_pte(pmap, far, &lvl);
6845 		if (ptep != NULL) {
6846 			pmap_set_bits(ptep, ATTR_AF);
6847 			rv = KERN_SUCCESS;
6848 			/*
6849 			 * XXXMJ as an optimization we could mark the entry
6850 			 * dirty if this is a write fault.
6851 			 */
6852 		}
6853 		PMAP_UNLOCK(pmap);
6854 		break;
6855 	case ISS_DATA_DFSC_PF_L1:
6856 	case ISS_DATA_DFSC_PF_L2:
6857 	case ISS_DATA_DFSC_PF_L3:
6858 		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
6859 		    (esr & ISS_DATA_WnR) == 0)
6860 			return (rv);
6861 		PMAP_LOCK(pmap);
6862 		ptep = pmap_pte(pmap, far, &lvl);
6863 		if (ptep != NULL &&
6864 		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
6865 			if ((pte & ATTR_S1_AP_RW_BIT) ==
6866 			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
6867 				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
6868 				pmap_invalidate_page(pmap, far, true);
6869 			}
6870 			rv = KERN_SUCCESS;
6871 		}
6872 		PMAP_UNLOCK(pmap);
6873 		break;
6874 	case ISS_DATA_DFSC_TF_L0:
6875 	case ISS_DATA_DFSC_TF_L1:
6876 	case ISS_DATA_DFSC_TF_L2:
6877 	case ISS_DATA_DFSC_TF_L3:
6878 		/*
6879 		 * Retry the translation.  A break-before-make sequence can
6880 		 * produce a transient fault.
6881 		 */
6882 		if (pmap == kernel_pmap) {
6883 			/*
6884 			 * The translation fault may have occurred within a
6885 			 * critical section.  Therefore, we must check the
6886 			 * address without acquiring the kernel pmap's lock.
6887 			 */
6888 			if (pmap_klookup(far, NULL))
6889 				rv = KERN_SUCCESS;
6890 		} else {
6891 			PMAP_LOCK(pmap);
6892 			/* Ask the MMU to check the address. */
6893 			intr = intr_disable();
6894 			par = arm64_address_translate_s1e0r(far);
6895 			intr_restore(intr);
6896 			PMAP_UNLOCK(pmap);
6897 
6898 			/*
6899 			 * If the translation was successful, then we can
6900 			 * return success to the trap handler.
6901 			 */
6902 			if (PAR_SUCCESS(par))
6903 				rv = KERN_SUCCESS;
6904 		}
6905 		break;
6906 	}
6907 
6908 	return (rv);
6909 }
6910 
6911 /*
6912  *	Increase the starting virtual address of the given mapping if a
6913  *	different alignment might result in more superpage mappings.
6914  */
6915 void
6916 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
6917     vm_offset_t *addr, vm_size_t size)
6918 {
6919 	vm_offset_t superpage_offset;
6920 
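	/*
	 * Worked example (added commentary, values hypothetical): with
	 * L2_SIZE = 2MB, offset = 0x1ff000, *addr = 0x40000000, and a
	 * sufficiently large size, superpage_offset is 0x1ff000 and *addr
	 * is advanced to 0x401ff000, giving the virtual address the same
	 * 2MB residue as the offset so that later faults can be mapped by
	 * L2 blocks.
	 */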
6921 	if (size < L2_SIZE)
6922 		return;
6923 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
6924 		offset += ptoa(object->pg_color);
6925 	superpage_offset = offset & L2_OFFSET;
6926 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
6927 	    (*addr & L2_OFFSET) == superpage_offset)
6928 		return;
6929 	if ((*addr & L2_OFFSET) < superpage_offset)
6930 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
6931 	else
6932 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
6933 }
6934 
6935 /**
6936  * Get the kernel virtual address of a set of physical pages. If there are
6937  * physical addresses not covered by the DMAP, perform a transient mapping
6938  * that will be removed when calling pmap_unmap_io_transient.
6939  *
6940  * \param page        The pages for which the caller wishes to obtain
6941  *                    virtual addresses in the kernel memory map.
6942  * \param vaddr       On return contains the kernel virtual memory address
6943  *                    of the pages passed in the page parameter.
6944  * \param count       Number of pages passed in.
6945  * \param can_fault   TRUE if the thread using the mapped pages can take
6946  *                    page faults, FALSE otherwise.
6947  *
6948  * \returns TRUE if the caller must call pmap_unmap_io_transient when
6949  *          finished or FALSE otherwise.
6950  *
6951  */
6952 boolean_t
6953 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
6954     boolean_t can_fault)
6955 {
6956 	vm_paddr_t paddr;
6957 	boolean_t needs_mapping;
6958 	int error __diagused, i;
6959 
6960 	/*
6961 	 * Allocate any KVA space that we need; this is done in a separate
6962 	 * loop to prevent calling vmem_alloc while pinned.
6963 	 */
6964 	needs_mapping = FALSE;
6965 	for (i = 0; i < count; i++) {
6966 		paddr = VM_PAGE_TO_PHYS(page[i]);
6967 		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
6968 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
6969 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
6970 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
6971 			needs_mapping = TRUE;
6972 		} else {
6973 			vaddr[i] = PHYS_TO_DMAP(paddr);
6974 		}
6975 	}
6976 
6977 	/* Exit early if everything is covered by the DMAP */
6978 	if (!needs_mapping)
6979 		return (FALSE);
6980 
6981 	if (!can_fault)
6982 		sched_pin();
6983 	for (i = 0; i < count; i++) {
6984 		paddr = VM_PAGE_TO_PHYS(page[i]);
6985 		if (!PHYS_IN_DMAP(paddr)) {
6986 			panic(
6987 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
6988 		}
6989 	}
6990 
6991 	return (needs_mapping);
6992 }
6993 
6994 void
6995 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
6996     boolean_t can_fault)
6997 {
6998 	vm_paddr_t paddr;
6999 	int i;
7000 
7001 	if (!can_fault)
7002 		sched_unpin();
7003 	for (i = 0; i < count; i++) {
7004 		paddr = VM_PAGE_TO_PHYS(page[i]);
7005 		if (!PHYS_IN_DMAP(paddr)) {
7006 			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
7007 		}
7008 	}
7009 }
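
/*
 * Typical usage of the two functions above (illustrative sketch only):
 *
 *	vm_page_t m = ...;
 *	vm_offset_t va[1];
 *	boolean_t mapped;
 *
 *	mapped = pmap_map_io_transient(&m, va, 1, FALSE);
 *	... access the page through va[0] ...
 *	if (mapped)
 *		pmap_unmap_io_transient(&m, va, 1, FALSE);
 */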
7010 
7011 boolean_t
7012 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
7013 {
7014 
7015 	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
7016 }
7017 
7018 /*
7019  * Track a range of the kernel's virtual address space that is contiguous
7020  * in various mapping attributes.
7021  */
7022 struct pmap_kernel_map_range {
7023 	vm_offset_t sva;
7024 	pt_entry_t attrs;
7025 	int l3pages;
7026 	int l3contig;
7027 	int l2blocks;
7028 	int l1blocks;
7029 };
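
/*
 * Example of one line emitted by sysctl_kmaps_dump() below (hypothetical
 * values): a writable, kernel-executable, write-back 2MB block could show
 * up as
 *
 *	0xffff000000000000-0xffff000000200000 rwx-s  WB 0 1 0 0
 */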
7030 
7031 static void
7032 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
7033     vm_offset_t eva)
7034 {
7035 	const char *mode;
7036 	int index;
7037 
7038 	if (eva <= range->sva)
7039 		return;
7040 
7041 	index = range->attrs & ATTR_S1_IDX_MASK;
7042 	switch (index) {
7043 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
7044 		mode = "DEV";
7045 		break;
7046 	case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
7047 		mode = "UC";
7048 		break;
7049 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
7050 		mode = "WB";
7051 		break;
7052 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
7053 		mode = "WT";
7054 		break;
7055 	default:
7056 		printf(
7057 		    "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
7058 		    __func__, index, range->sva, eva);
7059 		mode = "??";
7060 		break;
7061 	}
7062 
7063 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c %3s %d %d %d %d\n",
7064 	    range->sva, eva,
7065 	    (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
7066 	    (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
7067 	    (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
7068 	    (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
7069 	    mode, range->l1blocks, range->l2blocks, range->l3contig,
7070 	    range->l3pages);
7071 
7072 	/* Reset to sentinel value. */
7073 	range->sva = 0xfffffffffffffffful;
7074 }
7075 
7076 /*
7077  * Determine whether the attributes specified by a page table entry match those
7078  * being tracked by the current range.
7079  */
7080 static bool
7081 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
7082 {
7083 
7084 	return (range->attrs == attrs);
7085 }
7086 
7087 static void
7088 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
7089     pt_entry_t attrs)
7090 {
7091 
7092 	memset(range, 0, sizeof(*range));
7093 	range->sva = va;
7094 	range->attrs = attrs;
7095 }
7096 
7097 /* Get the block/page attributes that correspond to the table attributes */
7098 static pt_entry_t
7099 sysctl_kmaps_table_attrs(pd_entry_t table)
7100 {
7101 	pt_entry_t attrs;
7102 
7103 	attrs = 0;
7104 	if ((table & TATTR_UXN_TABLE) != 0)
7105 		attrs |= ATTR_S1_UXN;
7106 	if ((table & TATTR_PXN_TABLE) != 0)
7107 		attrs |= ATTR_S1_PXN;
7108 	if ((table & TATTR_AP_TABLE_RO) != 0)
7109 		attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
7110 
7111 	return (attrs);
7112 }
7113 
7114 /* Read the block/page attributes we care about */
7115 static pt_entry_t
7116 sysctl_kmaps_block_attrs(pt_entry_t block)
7117 {
7118 	return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK));
7119 }
7120 
7121 /*
7122  * Given a leaf PTE, derive the mapping's attributes, folding in any
7123  * restrictions inherited from the table entries above it.  If they do not
7124  * match those of the current run, dump that range and begin a new run.
7125  */
7126 static void
7127 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
7128     vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
7129     pt_entry_t l3e)
7130 {
7131 	pt_entry_t attrs;
7132 
7133 	attrs = sysctl_kmaps_table_attrs(l0e);
7134 
7135 	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7136 		attrs |= sysctl_kmaps_block_attrs(l1e);
7137 		goto done;
7138 	}
7139 	attrs |= sysctl_kmaps_table_attrs(l1e);
7140 
7141 	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
7142 		attrs |= sysctl_kmaps_block_attrs(l2e);
7143 		goto done;
7144 	}
7145 	attrs |= sysctl_kmaps_table_attrs(l2e);
7146 	attrs |= sysctl_kmaps_block_attrs(l3e);
7147 
7148 done:
7149 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
7150 		sysctl_kmaps_dump(sb, range, va);
7151 		sysctl_kmaps_reinit(range, va, attrs);
7152 	}
7153 }
7154 
7155 static int
7156 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
7157 {
7158 	struct pmap_kernel_map_range range;
7159 	struct sbuf sbuf, *sb;
7160 	pd_entry_t l0e, *l1, l1e, *l2, l2e;
7161 	pt_entry_t *l3, l3e;
7162 	vm_offset_t sva;
7163 	vm_paddr_t pa;
7164 	int error, i, j, k, l;
7165 
7166 	error = sysctl_wire_old_buffer(req, 0);
7167 	if (error != 0)
7168 		return (error);
7169 	sb = &sbuf;
7170 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
7171 
7172 	/* Sentinel value. */
7173 	range.sva = 0xfffffffffffffffful;
7174 
7175 	/*
7176 	 * Iterate over the kernel page tables without holding the kernel pmap
7177 	 * lock.  Kernel page table pages are never freed, so at worst we will
7178 	 * observe inconsistencies in the output.
7179 	 */
7180 	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
7181 	    i++) {
7182 		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
7183 			sbuf_printf(sb, "\nDirect map:\n");
7184 		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
7185 			sbuf_printf(sb, "\nKernel map:\n");
7186 
7187 		l0e = kernel_pmap->pm_l0[i];
7188 		if ((l0e & ATTR_DESCR_VALID) == 0) {
7189 			sysctl_kmaps_dump(sb, &range, sva);
7190 			sva += L0_SIZE;
7191 			continue;
7192 		}
7193 		pa = l0e & ~ATTR_MASK;
7194 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
7195 
7196 		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
7197 			l1e = l1[j];
7198 			if ((l1e & ATTR_DESCR_VALID) == 0) {
7199 				sysctl_kmaps_dump(sb, &range, sva);
7200 				sva += L1_SIZE;
7201 				continue;
7202 			}
7203 			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
7204 				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
7205 				    0, 0);
7206 				range.l1blocks++;
7207 				sva += L1_SIZE;
7208 				continue;
7209 			}
7210 			pa = l1e & ~ATTR_MASK;
7211 			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
7212 
7213 			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
7214 				l2e = l2[k];
7215 				if ((l2e & ATTR_DESCR_VALID) == 0) {
7216 					sysctl_kmaps_dump(sb, &range, sva);
7217 					sva += L2_SIZE;
7218 					continue;
7219 				}
7220 				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
7221 					sysctl_kmaps_check(sb, &range, sva,
7222 					    l0e, l1e, l2e, 0);
7223 					range.l2blocks++;
7224 					sva += L2_SIZE;
7225 					continue;
7226 				}
7227 				pa = l2e & ~ATTR_MASK;
7228 				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
7229 
7230 				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
7231 				    l++, sva += L3_SIZE) {
7232 					l3e = l3[l];
7233 					if ((l3e & ATTR_DESCR_VALID) == 0) {
7234 						sysctl_kmaps_dump(sb, &range,
7235 						    sva);
7236 						continue;
7237 					}
7238 					sysctl_kmaps_check(sb, &range, sva,
7239 					    l0e, l1e, l2e, l3e);
7240 					if ((l3e & ATTR_CONTIGUOUS) != 0)
7241 						range.l3contig += l % 16 == 0 ?
7242 						    1 : 0;
7243 					else
7244 						range.l3pages++;
7245 				}
7246 			}
7247 		}
7248 	}
7249 
7250 	error = sbuf_finish(sb);
7251 	sbuf_delete(sb);
7252 	return (error);
7253 }
7254 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
7255     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
7256     NULL, 0, sysctl_kmaps, "A",
7257     "Dump kernel address layout");
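
/*
 * The resulting dump can be read with "sysctl vm.pmap.kernel_maps";
 * CTLFLAG_SKIP only hides the node from sysctl listings, it does not block
 * an explicit read of the node.
 */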
7258