xref: /freebsd/sys/arm64/arm64/pmap.c (revision b5a1f040)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2003 Peter Wemm
9  * All rights reserved.
10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11  * All rights reserved.
12  * Copyright (c) 2014 Andrew Turner
13  * All rights reserved.
14  * Copyright (c) 2014-2016 The FreeBSD Foundation
15  * All rights reserved.
16  *
17  * This code is derived from software contributed to Berkeley by
18  * the Systems Programming Group of the University of Utah Computer
19  * Science Department and William Jolitz of UUNET Technologies Inc.
20  *
21  * This software was developed by Andrew Turner under sponsorship from
22  * the FreeBSD Foundation.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  * 3. All advertising materials mentioning features or use of this software
33  *    must display the following acknowledgement:
34  *	This product includes software developed by the University of
35  *	California, Berkeley and its contributors.
36  * 4. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  */
52 /*-
53  * Copyright (c) 2003 Networks Associates Technology, Inc.
54  * All rights reserved.
55  *
56  * This software was developed for the FreeBSD Project by Jake Burkholder,
57  * Safeport Network Services, and Network Associates Laboratories, the
58  * Security Research Division of Network Associates, Inc. under
59  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60  * CHATS research program.
61  *
62  * Redistribution and use in source and binary forms, with or without
63  * modification, are permitted provided that the following conditions
64  * are met:
65  * 1. Redistributions of source code must retain the above copyright
66  *    notice, this list of conditions and the following disclaimer.
67  * 2. Redistributions in binary form must reproduce the above copyright
68  *    notice, this list of conditions and the following disclaimer in the
69  *    documentation and/or other materials provided with the distribution.
70  *
71  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81  * SUCH DAMAGE.
82  */
83 
84 #include <sys/cdefs.h>
85 /*
86  *	Manages physical address maps.
87  *
88  *	Since the information managed by this module is
89  *	also stored by the logical address mapping module,
90  *	this module may throw away valid virtual-to-physical
91  *	mappings at almost any time.  However, invalidations
92  *	of virtual-to-physical mappings must be done as
93  *	requested.
94  *
95  *	In order to cope with hardware architectures which
96  *	make virtual-to-physical map invalidates expensive,
97  *	this module may delay invalidate or reduced protection
98  *	operations until such time as they are actually
99  *	necessary.  This module is given full information as
100  *	to which processors are currently using which maps,
101  *	and to when physical maps must be made correct.
102  */
103 
104 #include "opt_vm.h"
105 
106 #include <sys/param.h>
107 #include <sys/asan.h>
108 #include <sys/bitstring.h>
109 #include <sys/bus.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/ktr.h>
113 #include <sys/limits.h>
114 #include <sys/lock.h>
115 #include <sys/malloc.h>
116 #include <sys/mman.h>
117 #include <sys/msan.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/physmem.h>
121 #include <sys/proc.h>
122 #include <sys/rangeset.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132 
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147 
148 #include <machine/asan.h>
149 #include <machine/machdep.h>
150 #include <machine/md_var.h>
151 #include <machine/pcb.h>
152 
153 #ifdef NUMA
154 #define	PMAP_MEMDOM	MAXMEMDOM
155 #else
156 #define	PMAP_MEMDOM	1
157 #endif
158 
159 #define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
160 #define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)
161 
162 #define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
163 #define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
164 #define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
165 #define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
166 
167 #define	NUL0E		L0_ENTRIES
168 #define	NUL1E		(NUL0E * NL1PG)
169 #define	NUL2E		(NUL1E * NL2PG)
170 
171 #ifdef PV_STATS
172 #define PV_STAT(x)	do { x ; } while (0)
173 #define __pvused
174 #else
175 #define PV_STAT(x)	do { } while (0)
176 #define __pvused	__unused
177 #endif
178 
179 #define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
180 #define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
181 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
182 
183 #ifdef __ARM_FEATURE_BTI_DEFAULT
184 #define	ATTR_KERN_GP		ATTR_S1_GP
185 #else
186 #define	ATTR_KERN_GP		0
187 #endif
188 #define	PMAP_SAN_PTE_BITS	(ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | \
189 	ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
190 
191 struct pmap_large_md_page {
192 	struct rwlock   pv_lock;
193 	struct md_page  pv_page;
194 	/* Pad to a power of 2, see pmap_init_pv_table(). */
195 	int		pv_pad[2];
196 };
197 
198 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
199 #define pv_dummy pv_dummy_large.pv_page
200 __read_mostly static struct pmap_large_md_page *pv_table;
201 
202 static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)203 _pa_to_pmdp(vm_paddr_t pa)
204 {
205 	struct vm_phys_seg *seg;
206 
207 	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
208 		return ((struct pmap_large_md_page *)seg->md_first +
209 		    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
210 	return (NULL);
211 }
212 
213 static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)214 pa_to_pmdp(vm_paddr_t pa)
215 {
216 	struct pmap_large_md_page *pvd;
217 
218 	pvd = _pa_to_pmdp(pa);
219 	if (pvd == NULL)
220 		panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
221 	return (pvd);
222 }
223 
224 static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)225 page_to_pmdp(vm_page_t m)
226 {
227 	struct vm_phys_seg *seg;
228 
229 	seg = &vm_phys_segs[m->segind];
230 	return ((struct pmap_large_md_page *)seg->md_first +
231 	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
232 }
233 
234 #define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
235 #define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))
236 
237 #define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
238 	struct pmap_large_md_page *_pvd;			\
239 	struct rwlock *_lock;					\
240 	_pvd = _pa_to_pmdp(pa);					\
241 	if (__predict_false(_pvd == NULL))			\
242 		_lock = &pv_dummy_large.pv_lock;		\
243 	else							\
244 		_lock = &(_pvd->pv_lock);			\
245 	_lock;							\
246 })
247 
248 static struct rwlock *
VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)249 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
250 {
251 	if ((m->flags & PG_FICTITIOUS) == 0)
252 		return (&page_to_pmdp(m)->pv_lock);
253 	else
254 		return (&pv_dummy_large.pv_lock);
255 }
256 
257 #define	CHANGE_PV_LIST_LOCK(lockp, new_lock)	do {	\
258 	struct rwlock **_lockp = (lockp);		\
259 	struct rwlock *_new_lock = (new_lock);		\
260 							\
261 	if (_new_lock != *_lockp) {			\
262 		if (*_lockp != NULL)			\
263 			rw_wunlock(*_lockp);		\
264 		*_lockp = _new_lock;			\
265 		rw_wlock(*_lockp);			\
266 	}						\
267 } while (0)
268 
269 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)		\
270 			CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))
271 
272 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
273 			CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))
274 
275 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
276 	struct rwlock **_lockp = (lockp);		\
277 							\
278 	if (*_lockp != NULL) {				\
279 		rw_wunlock(*_lockp);			\
280 		*_lockp = NULL;				\
281 	}						\
282 } while (0)
283 
284 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
285 #define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
286 
287 /*
288  * The presence of this flag indicates that the mapping is writeable.
289  * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
290  * it is dirty.  This flag may only be set on managed mappings.
291  *
292  * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
293  * as a software managed bit.
294  */
295 #define	ATTR_SW_DBM	ATTR_DBM
296 
297 struct pmap kernel_pmap_store;
298 
299 /* Used for mapping ACPI memory before VM is initialized */
300 #define	PMAP_PREINIT_MAPPING_COUNT	32
301 #define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
302 static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
303 static int vm_initialized = 0;		/* No need to use pre-init maps when set */
304 
305 /*
306  * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
307  * Always map entire L2 block for simplicity.
308  * VA of L2 block = preinit_map_va + i * L2_SIZE
309  */
310 static struct pmap_preinit_mapping {
311 	vm_paddr_t	pa;
312 	vm_offset_t	va;
313 	vm_size_t	size;
314 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
315 
316 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
317 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
318 vm_offset_t kernel_vm_end = 0;
319 
320 /*
321  * Data for the pv entry allocation mechanism.
322  */
323 #ifdef NUMA
324 static __inline int
pc_to_domain(struct pv_chunk * pc)325 pc_to_domain(struct pv_chunk *pc)
326 {
327 	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
328 }
329 #else
330 static __inline int
pc_to_domain(struct pv_chunk * pc __unused)331 pc_to_domain(struct pv_chunk *pc __unused)
332 {
333 	return (0);
334 }
335 #endif
336 
337 struct pv_chunks_list {
338 	struct mtx pvc_lock;
339 	TAILQ_HEAD(pch, pv_chunk) pvc_list;
340 	int active_reclaims;
341 } __aligned(CACHE_LINE_SIZE);
342 
343 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
344 
345 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
346 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
347 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
348 
349 extern pt_entry_t pagetable_l0_ttbr1[];
350 
351 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
352 static vm_paddr_t physmap[PHYSMAP_SIZE];
353 static u_int physmap_idx;
354 
355 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
356     "VM/pmap parameters");
357 
358 #if PAGE_SIZE == PAGE_SIZE_4K
359 #define	L1_BLOCKS_SUPPORTED	1
360 #else
361 /* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
362 #define	L1_BLOCKS_SUPPORTED	0
363 #endif
364 
365 #define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)
366 
367 /*
368  * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
369  * that it has currently allocated to a pmap, a cursor ("asid_next") to
370  * optimize its search for a free ASID in the bit vector, and an epoch number
371  * ("asid_epoch") to indicate when it has reclaimed all previously allocated
372  * ASIDs that are not currently active on a processor.
373  *
374  * The current epoch number is always in the range [0, INT_MAX).  Negative
375  * numbers and INT_MAX are reserved for special cases that are described
376  * below.
377  */
378 struct asid_set {
379 	int asid_bits;
380 	bitstr_t *asid_set;
381 	int asid_set_size;
382 	int asid_next;
383 	int asid_epoch;
384 	struct mtx asid_set_mutex;
385 };
386 
387 static struct asid_set asids;
388 static struct asid_set vmids;
389 
390 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
391     "ASID allocator");
392 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
393     "The number of bits in an ASID");
394 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
395     "The last allocated ASID plus one");
396 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
397     "The current epoch number");
398 
399 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
400 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
401     "The number of bits in an VMID");
402 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
403     "The last allocated VMID plus one");
404 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
405     "The current epoch number");
406 
407 void (*pmap_clean_stage2_tlbi)(void);
408 void (*pmap_invalidate_vpipt_icache)(void);
409 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
410 void (*pmap_stage2_invalidate_all)(uint64_t);
411 
412 /*
413  * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
414  * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
415  * dynamically allocated ASIDs have a non-negative epoch number.
416  *
417  * An invalid ASID is represented by -1.
418  *
419  * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
420  * which indicates that an ASID should never be allocated to the pmap, and
421  * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
422  * allocated when the pmap is next activated.
423  */
424 #define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
425 					    ((u_long)(epoch) << 32)))
426 #define	COOKIE_TO_ASID(cookie)		((int)(cookie))
427 #define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
428 
429 #define	TLBI_VA_SHIFT			12
430 #define	TLBI_VA_MASK			((1ul << 44) - 1)
431 #define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
432 #define	TLBI_VA_L3_INCR			(L3_SIZE >> TLBI_VA_SHIFT)
433 
434 static int __read_frequently superpages_enabled = 1;
435 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
436     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
437     "Are large page mappings enabled?");
438 
439 /*
440  * True when Branch Target Identification should be used by userspace. This
441  * allows pmap to mark pages as guarded with ATTR_S1_GP.
442  */
443 __read_mostly static bool pmap_bti_support = false;
444 
445 /*
446  * Internal flags for pmap_enter()'s helper functions.
447  */
448 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
449 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
450 
451 TAILQ_HEAD(pv_chunklist, pv_chunk);
452 
453 static void	free_pv_chunk(struct pv_chunk *pc);
454 static void	free_pv_chunk_batch(struct pv_chunklist *batch);
455 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
456 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
457 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
458 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
459 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
460 		    vm_offset_t va);
461 
462 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
463 static bool pmap_activate_int(pmap_t pmap);
464 static void pmap_alloc_asid(pmap_t pmap);
465 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
466     vm_prot_t prot, int mode, bool skip_unmapped);
467 static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
468     pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
469 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
470 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
471     vm_offset_t va, struct rwlock **lockp);
472 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
473 static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
474 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
475     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
476 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
477     u_int flags, vm_page_t m, struct rwlock **lockp);
478 static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
479     vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
480 static bool pmap_every_pte_zero(vm_paddr_t pa);
481 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
482     bool all_l3e_AF_set);
483 static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
484 static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
485     vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
486 static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
487     struct rwlock **lockp);
488 static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
489 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
490     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
491 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
492     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
493 static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
494     vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
495     struct rwlock **lockp);
496 static void pmap_reset_asid_set(pmap_t pmap);
497 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
498     vm_page_t m, struct rwlock **lockp);
499 
500 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
501 		struct rwlock **lockp);
502 
503 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
504     struct spglist *free);
505 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
506 static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
507     vm_offset_t va, vm_size_t size);
508 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
509 
510 static uma_zone_t pmap_bti_ranges_zone;
511 static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
512 static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
513 static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
514 static void *bti_dup_range(void *ctx, void *data);
515 static void bti_free_range(void *ctx, void *node);
516 static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
517 static void pmap_bti_deassign_all(pmap_t pmap);
518 
519 /*
520  * These load the old table data and store the new value.
521  * They need to be atomic as the System MMU may write to the table at
522  * the same time as the CPU.
523  */
524 #define	pmap_clear(table)		atomic_store_64(table, 0)
525 #define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
526 #define	pmap_load(table)		(*table)
527 #define	pmap_load_clear(table)		atomic_swap_64(table, 0)
528 #define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
529 #define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
530 #define	pmap_store(table, entry)	atomic_store_64(table, entry)
531 
532 /********************/
533 /* Inline functions */
534 /********************/
535 
536 static __inline void
pagecopy(void * s,void * d)537 pagecopy(void *s, void *d)
538 {
539 
540 	memcpy(d, s, PAGE_SIZE);
541 }
542 
543 static __inline pd_entry_t *
pmap_l0(pmap_t pmap,vm_offset_t va)544 pmap_l0(pmap_t pmap, vm_offset_t va)
545 {
546 
547 	return (&pmap->pm_l0[pmap_l0_index(va)]);
548 }
549 
550 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t * l0,vm_offset_t va)551 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
552 {
553 	pd_entry_t *l1;
554 
555 	l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
556 	return (&l1[pmap_l1_index(va)]);
557 }
558 
559 static __inline pd_entry_t *
pmap_l1(pmap_t pmap,vm_offset_t va)560 pmap_l1(pmap_t pmap, vm_offset_t va)
561 {
562 	pd_entry_t *l0;
563 
564 	l0 = pmap_l0(pmap, va);
565 	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
566 		return (NULL);
567 
568 	return (pmap_l0_to_l1(l0, va));
569 }
570 
571 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t * l1p,vm_offset_t va)572 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
573 {
574 	pd_entry_t l1, *l2p;
575 
576 	l1 = pmap_load(l1p);
577 
578 	KASSERT(ADDR_IS_CANONICAL(va),
579 	    ("%s: Address not in canonical form: %lx", __func__, va));
580 	/*
581 	 * The valid bit may be clear if pmap_update_entry() is concurrently
582 	 * modifying the entry, so for KVA only the entry type may be checked.
583 	 */
584 	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
585 	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
586 	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
587 	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
588 	l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
589 	return (&l2p[pmap_l2_index(va)]);
590 }
591 
592 static __inline pd_entry_t *
pmap_l2(pmap_t pmap,vm_offset_t va)593 pmap_l2(pmap_t pmap, vm_offset_t va)
594 {
595 	pd_entry_t *l1;
596 
597 	l1 = pmap_l1(pmap, va);
598 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
599 		return (NULL);
600 
601 	return (pmap_l1_to_l2(l1, va));
602 }
603 
604 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t * l2p,vm_offset_t va)605 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
606 {
607 	pd_entry_t l2;
608 	pt_entry_t *l3p;
609 
610 	l2 = pmap_load(l2p);
611 
612 	KASSERT(ADDR_IS_CANONICAL(va),
613 	    ("%s: Address not in canonical form: %lx", __func__, va));
614 	/*
615 	 * The valid bit may be clear if pmap_update_entry() is concurrently
616 	 * modifying the entry, so for KVA only the entry type may be checked.
617 	 */
618 	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
619 	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
620 	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
621 	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
622 	l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
623 	return (&l3p[pmap_l3_index(va)]);
624 }
625 
626 /*
627  * Returns the lowest valid pde for a given virtual address.
628  * The next level may or may not point to a valid page or block.
629  */
630 static __inline pd_entry_t *
pmap_pde(pmap_t pmap,vm_offset_t va,int * level)631 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
632 {
633 	pd_entry_t *l0, *l1, *l2, desc;
634 
635 	l0 = pmap_l0(pmap, va);
636 	desc = pmap_load(l0) & ATTR_DESCR_MASK;
637 	if (desc != L0_TABLE) {
638 		*level = -1;
639 		return (NULL);
640 	}
641 
642 	l1 = pmap_l0_to_l1(l0, va);
643 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
644 	if (desc != L1_TABLE) {
645 		*level = 0;
646 		return (l0);
647 	}
648 
649 	l2 = pmap_l1_to_l2(l1, va);
650 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
651 	if (desc != L2_TABLE) {
652 		*level = 1;
653 		return (l1);
654 	}
655 
656 	*level = 2;
657 	return (l2);
658 }
659 
660 /*
661  * Returns the lowest valid pte block or table entry for a given virtual
662  * address. If there are no valid entries return NULL and set the level to
663  * the first invalid level.
664  */
665 static __inline pt_entry_t *
pmap_pte(pmap_t pmap,vm_offset_t va,int * level)666 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
667 {
668 	pd_entry_t *l1, *l2, desc;
669 	pt_entry_t *l3;
670 
671 	l1 = pmap_l1(pmap, va);
672 	if (l1 == NULL) {
673 		*level = 0;
674 		return (NULL);
675 	}
676 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
677 	if (desc == L1_BLOCK) {
678 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
679 		*level = 1;
680 		return (l1);
681 	}
682 
683 	if (desc != L1_TABLE) {
684 		*level = 1;
685 		return (NULL);
686 	}
687 
688 	l2 = pmap_l1_to_l2(l1, va);
689 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
690 	if (desc == L2_BLOCK) {
691 		*level = 2;
692 		return (l2);
693 	}
694 
695 	if (desc != L2_TABLE) {
696 		*level = 2;
697 		return (NULL);
698 	}
699 
700 	*level = 3;
701 	l3 = pmap_l2_to_l3(l2, va);
702 	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
703 		return (NULL);
704 
705 	return (l3);
706 }
707 
708 /*
709  * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
710  * level that maps the specified virtual address, then a pointer to that entry
711  * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
712  * and a diagnostic message is provided, in which case this function panics.
713  */
714 static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap,vm_offset_t va,int level,const char * diag)715 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
716 {
717 	pd_entry_t *l0p, *l1p, *l2p;
718 	pt_entry_t desc, *l3p;
719 	int walk_level __diagused;
720 
721 	KASSERT(level >= 0 && level < 4,
722 	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
723 	    level));
724 	l0p = pmap_l0(pmap, va);
725 	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
726 	if (desc == L0_TABLE && level > 0) {
727 		l1p = pmap_l0_to_l1(l0p, va);
728 		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
729 		if (desc == L1_BLOCK && level == 1) {
730 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
731 			return (l1p);
732 		}
733 		if (desc == L1_TABLE && level > 1) {
734 			l2p = pmap_l1_to_l2(l1p, va);
735 			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
736 			if (desc == L2_BLOCK && level == 2)
737 				return (l2p);
738 			else if (desc == L2_TABLE && level > 2) {
739 				l3p = pmap_l2_to_l3(l2p, va);
740 				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
741 				if (desc == L3_PAGE && level == 3)
742 					return (l3p);
743 				else
744 					walk_level = 3;
745 			} else
746 				walk_level = 2;
747 		} else
748 			walk_level = 1;
749 	} else
750 		walk_level = 0;
751 	KASSERT(diag == NULL,
752 	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
753 	    diag, va, level, desc, walk_level));
754 	return (NULL);
755 }
756 
757 bool
pmap_ps_enabled(pmap_t pmap)758 pmap_ps_enabled(pmap_t pmap)
759 {
760 	/*
761 	 * Promotion requires a hypervisor call when the kernel is running
762 	 * in EL1. To stop this disable superpage support on non-stage 1
763 	 * pmaps for now.
764 	 */
765 	if (pmap->pm_stage != PM_STAGE1)
766 		return (false);
767 
768 #ifdef KMSAN
769 	/*
770 	 * The break-before-make in pmap_update_entry() results in a situation
771 	 * where a CPU may call into the KMSAN runtime while the entry is
772 	 * invalid.  If the entry is used to map the current thread structure,
773 	 * then the runtime will attempt to access unmapped memory.  Avoid this
774 	 * by simply disabling superpage promotion for the kernel map.
775 	 */
776 	if (pmap == kernel_pmap)
777 		return (false);
778 #endif
779 
780 	return (superpages_enabled != 0);
781 }
782 
783 bool
pmap_get_tables(pmap_t pmap,vm_offset_t va,pd_entry_t ** l0,pd_entry_t ** l1,pd_entry_t ** l2,pt_entry_t ** l3)784 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
785     pd_entry_t **l2, pt_entry_t **l3)
786 {
787 	pd_entry_t *l0p, *l1p, *l2p;
788 
789 	if (pmap->pm_l0 == NULL)
790 		return (false);
791 
792 	l0p = pmap_l0(pmap, va);
793 	*l0 = l0p;
794 
795 	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
796 		return (false);
797 
798 	l1p = pmap_l0_to_l1(l0p, va);
799 	*l1 = l1p;
800 
801 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
802 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
803 		*l2 = NULL;
804 		*l3 = NULL;
805 		return (true);
806 	}
807 
808 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
809 		return (false);
810 
811 	l2p = pmap_l1_to_l2(l1p, va);
812 	*l2 = l2p;
813 
814 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
815 		*l3 = NULL;
816 		return (true);
817 	}
818 
819 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
820 		return (false);
821 
822 	*l3 = pmap_l2_to_l3(l2p, va);
823 
824 	return (true);
825 }
826 
827 static __inline int
pmap_l3_valid(pt_entry_t l3)828 pmap_l3_valid(pt_entry_t l3)
829 {
830 
831 	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
832 }
833 
834 CTASSERT(L1_BLOCK == L2_BLOCK);
835 
836 static pt_entry_t
pmap_pte_memattr(pmap_t pmap,vm_memattr_t memattr)837 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
838 {
839 	pt_entry_t val;
840 
841 	if (pmap->pm_stage == PM_STAGE1) {
842 		val = ATTR_S1_IDX(memattr);
843 		if (memattr == VM_MEMATTR_DEVICE)
844 			val |= ATTR_S1_XN;
845 		return (val);
846 	}
847 
848 	val = 0;
849 
850 	switch (memattr) {
851 	case VM_MEMATTR_DEVICE:
852 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
853 		    ATTR_S2_XN(ATTR_S2_XN_ALL));
854 	case VM_MEMATTR_UNCACHEABLE:
855 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
856 	case VM_MEMATTR_WRITE_BACK:
857 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
858 	case VM_MEMATTR_WRITE_THROUGH:
859 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
860 	default:
861 		panic("%s: invalid memory attribute %x", __func__, memattr);
862 	}
863 }
864 
865 static pt_entry_t
pmap_pte_prot(pmap_t pmap,vm_prot_t prot)866 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
867 {
868 	pt_entry_t val;
869 
870 	val = 0;
871 	if (pmap->pm_stage == PM_STAGE1) {
872 		if ((prot & VM_PROT_EXECUTE) == 0)
873 			val |= ATTR_S1_XN;
874 		if ((prot & VM_PROT_WRITE) == 0)
875 			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
876 	} else {
877 		if ((prot & VM_PROT_WRITE) != 0)
878 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
879 		if ((prot & VM_PROT_READ) != 0)
880 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
881 		if ((prot & VM_PROT_EXECUTE) == 0)
882 			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
883 	}
884 
885 	return (val);
886 }
887 
888 /*
889  * Checks if the PTE is dirty.
890  */
891 static inline int
pmap_pte_dirty(pmap_t pmap,pt_entry_t pte)892 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
893 {
894 
895 	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
896 
897 	if (pmap->pm_stage == PM_STAGE1) {
898 		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
899 		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
900 
901 		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
902 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
903 	}
904 
905 	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
906 	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
907 }
908 
909 static __inline void
pmap_resident_count_inc(pmap_t pmap,int count)910 pmap_resident_count_inc(pmap_t pmap, int count)
911 {
912 
913 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
914 	pmap->pm_stats.resident_count += count;
915 }
916 
917 static __inline void
pmap_resident_count_dec(pmap_t pmap,int count)918 pmap_resident_count_dec(pmap_t pmap, int count)
919 {
920 
921 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
922 	KASSERT(pmap->pm_stats.resident_count >= count,
923 	    ("pmap %p resident count underflow %ld %d", pmap,
924 	    pmap->pm_stats.resident_count, count));
925 	pmap->pm_stats.resident_count -= count;
926 }
927 
928 static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)929 pmap_early_vtophys(vm_offset_t va)
930 {
931 	vm_paddr_t pa_page;
932 
933 	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
934 	return (pa_page | (va & PAR_LOW_MASK));
935 }
936 
937 /* State of the bootstrapped DMAP page tables */
938 struct pmap_bootstrap_state {
939 	pt_entry_t	*l1;
940 	pt_entry_t	*l2;
941 	pt_entry_t	*l3;
942 	vm_offset_t	freemempos;
943 	vm_offset_t	va;
944 	vm_paddr_t	pa;
945 	pt_entry_t	table_attrs;
946 	u_int		l0_slot;
947 	u_int		l1_slot;
948 	u_int		l2_slot;
949 	bool		dmap_valid;
950 };
951 
952 /* The bootstrap state */
953 static struct pmap_bootstrap_state bs_state = {
954 	.l1 = NULL,
955 	.l2 = NULL,
956 	.l3 = NULL,
957 	.table_attrs = TATTR_PXN_TABLE,
958 	.l0_slot = L0_ENTRIES,
959 	.l1_slot = Ln_ENTRIES,
960 	.l2_slot = Ln_ENTRIES,
961 	.dmap_valid = false,
962 };
963 
964 static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state * state)965 pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
966 {
967 	vm_paddr_t l1_pa;
968 	pd_entry_t l0e;
969 	u_int l0_slot;
970 
971 	/* Link the level 0 table to a level 1 table */
972 	l0_slot = pmap_l0_index(state->va);
973 	if (l0_slot != state->l0_slot) {
974 		/*
975 		 * Make sure we move from a low address to high address
976 		 * before the DMAP region is ready. This ensures we never
977 		 * modify an existing mapping until we can map from a
978 		 * physical address to a virtual address.
979 		 */
980 		MPASS(state->l0_slot < l0_slot ||
981 		    state->l0_slot == L0_ENTRIES ||
982 		    state->dmap_valid);
983 
984 		/* Reset lower levels */
985 		state->l2 = NULL;
986 		state->l3 = NULL;
987 		state->l1_slot = Ln_ENTRIES;
988 		state->l2_slot = Ln_ENTRIES;
989 
990 		/* Check the existing L0 entry */
991 		state->l0_slot = l0_slot;
992 		if (state->dmap_valid) {
993 			l0e = pagetable_l0_ttbr1[l0_slot];
994 			if ((l0e & ATTR_DESCR_VALID) != 0) {
995 				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
996 				l1_pa = PTE_TO_PHYS(l0e);
997 				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
998 				return;
999 			}
1000 		}
1001 
1002 		/* Create a new L0 table entry */
1003 		state->l1 = (pt_entry_t *)state->freemempos;
1004 		memset(state->l1, 0, PAGE_SIZE);
1005 		state->freemempos += PAGE_SIZE;
1006 
1007 		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
1008 		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
1009 		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
1010 		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
1011 		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
1012 	}
1013 	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
1014 }
1015 
1016 static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state * state)1017 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
1018 {
1019 	vm_paddr_t l2_pa;
1020 	pd_entry_t l1e;
1021 	u_int l1_slot;
1022 
1023 	/* Make sure there is a valid L0 -> L1 table */
1024 	pmap_bootstrap_l0_table(state);
1025 
1026 	/* Link the level 1 table to a level 2 table */
1027 	l1_slot = pmap_l1_index(state->va);
1028 	if (l1_slot != state->l1_slot) {
1029 		/* See pmap_bootstrap_l0_table for a description */
1030 		MPASS(state->l1_slot < l1_slot ||
1031 		    state->l1_slot == Ln_ENTRIES ||
1032 		    state->dmap_valid);
1033 
1034 		/* Reset lower levels */
1035 		state->l3 = NULL;
1036 		state->l2_slot = Ln_ENTRIES;
1037 
1038 		/* Check the existing L1 entry */
1039 		state->l1_slot = l1_slot;
1040 		if (state->dmap_valid) {
1041 			l1e = state->l1[l1_slot];
1042 			if ((l1e & ATTR_DESCR_VALID) != 0) {
1043 				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
1044 				l2_pa = PTE_TO_PHYS(l1e);
1045 				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
1046 				return;
1047 			}
1048 		}
1049 
1050 		/* Create a new L1 table entry */
1051 		state->l2 = (pt_entry_t *)state->freemempos;
1052 		memset(state->l2, 0, PAGE_SIZE);
1053 		state->freemempos += PAGE_SIZE;
1054 
1055 		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1056 		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1057 		MPASS(state->l1[l1_slot] == 0);
1058 		pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
1059 		    state->table_attrs | L1_TABLE);
1060 	}
1061 	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1062 }
1063 
1064 static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state * state)1065 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1066 {
1067 	vm_paddr_t l3_pa;
1068 	pd_entry_t l2e;
1069 	u_int l2_slot;
1070 
1071 	/* Make sure there is a valid L1 -> L2 table */
1072 	pmap_bootstrap_l1_table(state);
1073 
1074 	/* Link the level 2 table to a level 3 table */
1075 	l2_slot = pmap_l2_index(state->va);
1076 	if (l2_slot != state->l2_slot) {
1077 		/* See pmap_bootstrap_l0_table for a description */
1078 		MPASS(state->l2_slot < l2_slot ||
1079 		    state->l2_slot == Ln_ENTRIES ||
1080 		    state->dmap_valid);
1081 
1082 		/* Check the existing L2 entry */
1083 		state->l2_slot = l2_slot;
1084 		if (state->dmap_valid) {
1085 			l2e = state->l2[l2_slot];
1086 			if ((l2e & ATTR_DESCR_VALID) != 0) {
1087 				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1088 				l3_pa = PTE_TO_PHYS(l2e);
1089 				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1090 				return;
1091 			}
1092 		}
1093 
1094 		/* Create a new L2 table entry */
1095 		state->l3 = (pt_entry_t *)state->freemempos;
1096 		memset(state->l3, 0, PAGE_SIZE);
1097 		state->freemempos += PAGE_SIZE;
1098 
1099 		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1100 		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1101 		MPASS(state->l2[l2_slot] == 0);
1102 		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
1103 		    state->table_attrs | L2_TABLE);
1104 	}
1105 	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1106 }
1107 
1108 static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state * state,int i)1109 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1110 {
1111 	u_int l2_slot;
1112 	bool first;
1113 
1114 	if ((physmap[i + 1] - state->pa) < L2_SIZE)
1115 		return;
1116 
1117 	/* Make sure there is a valid L1 table */
1118 	pmap_bootstrap_l1_table(state);
1119 
1120 	MPASS((state->va & L2_OFFSET) == 0);
1121 	for (first = true;
1122 	    state->va < DMAP_MAX_ADDRESS &&
1123 	    (physmap[i + 1] - state->pa) >= L2_SIZE;
1124 	    state->va += L2_SIZE, state->pa += L2_SIZE) {
1125 		/*
1126 		 * Stop if we are about to walk off the end of what the
1127 		 * current L1 slot can address.
1128 		 */
1129 		if (!first && (state->pa & L1_OFFSET) == 0)
1130 			break;
1131 
1132 		first = false;
1133 		l2_slot = pmap_l2_index(state->va);
1134 		MPASS((state->pa & L2_OFFSET) == 0);
1135 		MPASS(state->l2[l2_slot] == 0);
1136 		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
1137 		    ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1138 		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
1139 	}
1140 	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1141 }
1142 
1143 static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state * state,int i)1144 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1145 {
1146 	pt_entry_t contig;
1147 	u_int l3_slot;
1148 	bool first;
1149 
1150 	if (physmap[i + 1] - state->pa < L3_SIZE)
1151 		return;
1152 
1153 	/* Make sure there is a valid L2 table */
1154 	pmap_bootstrap_l2_table(state);
1155 
1156 	MPASS((state->va & L3_OFFSET) == 0);
1157 	for (first = true, contig = 0;
1158 	    state->va < DMAP_MAX_ADDRESS &&
1159 	    physmap[i + 1] - state->pa >= L3_SIZE;
1160 	    state->va += L3_SIZE, state->pa += L3_SIZE) {
1161 		/*
1162 		 * Stop if we are about to walk off the end of what the
1163 		 * current L2 slot can address.
1164 		 */
1165 		if (!first && (state->pa & L2_OFFSET) == 0)
1166 			break;
1167 
1168 		/*
1169 		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
1170 		 * L3 pages, set the contiguous bit within each PTE so that
1171 		 * the chunk can be cached using only one TLB entry.
1172 		 */
1173 		if ((state->pa & L3C_OFFSET) == 0) {
1174 			if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
1175 			    physmap[i + 1] - state->pa >= L3C_SIZE) {
1176 				contig = ATTR_CONTIGUOUS;
1177 			} else {
1178 				contig = 0;
1179 			}
1180 		}
1181 
1182 		first = false;
1183 		l3_slot = pmap_l3_index(state->va);
1184 		MPASS((state->pa & L3_OFFSET) == 0);
1185 		MPASS(state->l3[l3_slot] == 0);
1186 		pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
1187 		    ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1188 		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
1189 	}
1190 	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1191 }
1192 
1193 static void
pmap_bootstrap_dmap(vm_paddr_t min_pa)1194 pmap_bootstrap_dmap(vm_paddr_t min_pa)
1195 {
1196 	int i;
1197 
1198 	dmap_phys_base = min_pa & ~L1_OFFSET;
1199 	dmap_phys_max = 0;
1200 	dmap_max_addr = 0;
1201 
1202 	for (i = 0; i < (physmap_idx * 2); i += 2) {
1203 		bs_state.pa = physmap[i] & ~L3_OFFSET;
1204 		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1205 
1206 		/* Create L3 mappings at the start of the region */
1207 		if ((bs_state.pa & L2_OFFSET) != 0)
1208 			pmap_bootstrap_l3_page(&bs_state, i);
1209 		MPASS(bs_state.pa <= physmap[i + 1]);
1210 
1211 		if (L1_BLOCKS_SUPPORTED) {
1212 			/* Create L2 mappings at the start of the region */
1213 			if ((bs_state.pa & L1_OFFSET) != 0)
1214 				pmap_bootstrap_l2_block(&bs_state, i);
1215 			MPASS(bs_state.pa <= physmap[i + 1]);
1216 
1217 			/* Create the main L1 block mappings */
1218 			for (; bs_state.va < DMAP_MAX_ADDRESS &&
1219 			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1220 			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1221 				/* Make sure there is a valid L1 table */
1222 				pmap_bootstrap_l0_table(&bs_state);
1223 				MPASS((bs_state.pa & L1_OFFSET) == 0);
1224 				pmap_store(
1225 				    &bs_state.l1[pmap_l1_index(bs_state.va)],
1226 				    PHYS_TO_PTE(bs_state.pa) | ATTR_DEFAULT |
1227 				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1228 				    ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
1229 			}
1230 			MPASS(bs_state.pa <= physmap[i + 1]);
1231 
1232 			/* Create L2 mappings at the end of the region */
1233 			pmap_bootstrap_l2_block(&bs_state, i);
1234 		} else {
1235 			while (bs_state.va < DMAP_MAX_ADDRESS &&
1236 			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1237 				pmap_bootstrap_l2_block(&bs_state, i);
1238 			}
1239 		}
1240 		MPASS(bs_state.pa <= physmap[i + 1]);
1241 
1242 		/* Create L3 mappings at the end of the region */
1243 		pmap_bootstrap_l3_page(&bs_state, i);
1244 		MPASS(bs_state.pa == physmap[i + 1]);
1245 
1246 		if (bs_state.pa > dmap_phys_max) {
1247 			dmap_phys_max = bs_state.pa;
1248 			dmap_max_addr = bs_state.va;
1249 		}
1250 	}
1251 
1252 	cpu_tlb_flushID();
1253 }
1254 
1255 static void
pmap_bootstrap_l2(vm_offset_t va)1256 pmap_bootstrap_l2(vm_offset_t va)
1257 {
1258 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1259 
1260 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1261 	bs_state.va = va;
1262 
1263 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1264 		pmap_bootstrap_l1_table(&bs_state);
1265 }
1266 
1267 static void
pmap_bootstrap_l3(vm_offset_t va)1268 pmap_bootstrap_l3(vm_offset_t va)
1269 {
1270 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1271 
1272 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1273 	bs_state.va = va;
1274 
1275 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1276 		pmap_bootstrap_l2_table(&bs_state);
1277 }
1278 
1279 /*
1280  *	Bootstrap the system enough to run with virtual memory.
1281  */
1282 void
pmap_bootstrap(vm_size_t kernlen)1283 pmap_bootstrap(vm_size_t kernlen)
1284 {
1285 	vm_offset_t dpcpu, msgbufpv;
1286 	vm_paddr_t start_pa, pa, min_pa;
1287 	int i;
1288 
1289 	/* Verify that the ASID is set through TTBR0. */
1290 	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
1291 	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1292 
1293 	/* Set this early so we can use the pagetable walking functions */
1294 	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1295 	PMAP_LOCK_INIT(kernel_pmap);
1296 	kernel_pmap->pm_l0_paddr =
1297 	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1298 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1299 	vm_radix_init(&kernel_pmap->pm_root);
1300 	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1301 	kernel_pmap->pm_stage = PM_STAGE1;
1302 	kernel_pmap->pm_levels = 4;
1303 	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1304 	kernel_pmap->pm_asid_set = &asids;
1305 
1306 	/* Assume the address we were loaded to is a valid physical address */
1307 	min_pa = pmap_early_vtophys(KERNBASE);
1308 
1309 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1310 	physmap_idx /= 2;
1311 
1312 	/*
1313 	 * Find the minimum physical address. physmap is sorted,
1314 	 * but may contain empty ranges.
1315 	 */
1316 	for (i = 0; i < physmap_idx * 2; i += 2) {
1317 		if (physmap[i] == physmap[i + 1])
1318 			continue;
1319 		if (physmap[i] <= min_pa)
1320 			min_pa = physmap[i];
1321 	}
1322 
1323 	bs_state.freemempos = KERNBASE + kernlen;
1324 	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1325 
1326 	/* Create a direct map region early so we can use it for pa -> va */
1327 	pmap_bootstrap_dmap(min_pa);
1328 	bs_state.dmap_valid = true;
1329 	/*
1330 	 * We only use PXN when we know nothing will be executed from it, e.g.
1331 	 * the DMAP region.
1332 	 */
1333 	bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1334 
1335 	start_pa = pa = pmap_early_vtophys(KERNBASE);
1336 
1337 	/*
1338 	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
1339 	 * loader allocated the first and only l2 page table page used to map
1340 	 * the kernel, preloaded files and module metadata.
1341 	 */
1342 	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1343 	/* And the l3 tables for the early devmap */
1344 	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1345 
1346 	cpu_tlb_flushID();
1347 
1348 #define alloc_pages(var, np)						\
1349 	(var) = bs_state.freemempos;					\
1350 	bs_state.freemempos += (np * PAGE_SIZE);			\
1351 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
1352 
1353 	/* Allocate dynamic per-cpu area. */
1354 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1355 	dpcpu_init((void *)dpcpu, 0);
1356 
1357 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1358 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1359 	msgbufp = (void *)msgbufpv;
1360 
1361 	/* Reserve some VA space for early BIOS/ACPI mapping */
1362 	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1363 
1364 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1365 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
1366 	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1367 	kernel_vm_end = virtual_avail;
1368 
1369 	pa = pmap_early_vtophys(bs_state.freemempos);
1370 
1371 	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1372 
1373 	cpu_tlb_flushID();
1374 }
1375 
1376 #if defined(KASAN) || defined(KMSAN)
1377 static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa,vm_paddr_t end_pa,vm_offset_t * vap,vm_offset_t eva)1378 pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1379     vm_offset_t *vap, vm_offset_t eva)
1380 {
1381 	vm_paddr_t pa;
1382 	vm_offset_t va;
1383 	pd_entry_t *l2;
1384 
1385 	va = *vap;
1386 	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1387 	for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
1388 		l2 = pmap_l2(kernel_pmap, va);
1389 
1390 		/*
1391 		 * KASAN stack checking results in us having already allocated
1392 		 * part of our shadow map, so we can just skip those segments.
1393 		 */
1394 		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1395 			pa += L2_SIZE;
1396 			continue;
1397 		}
1398 
1399 		bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
1400 		physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
1401 		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
1402 	}
1403 	*vap = va;
1404 }
1405 
1406 /*
1407  * Finish constructing the initial shadow map:
1408  * - Count how many pages from KERNBASE to virtual_avail (scaled for
1409  *   shadow map)
1410  * - Map that entire range using L2 superpages.
1411  */
1412 static void
pmap_bootstrap_san1(vm_offset_t va,int scale)1413 pmap_bootstrap_san1(vm_offset_t va, int scale)
1414 {
1415 	vm_offset_t eva;
1416 	vm_paddr_t kernstart;
1417 	int i;
1418 
1419 	kernstart = pmap_early_vtophys(KERNBASE);
1420 
1421 	/*
1422 	 * Rebuild physmap one more time, we may have excluded more regions from
1423 	 * allocation since pmap_bootstrap().
1424 	 */
1425 	bzero(physmap, sizeof(physmap));
1426 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1427 	physmap_idx /= 2;
1428 
1429 	eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1430 
1431 	/*
1432 	 * Find a slot in the physmap large enough for what we needed.  We try to put
1433 	 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1434 	 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1435 	 */
1436 	for (i = (physmap_idx * 2) - 2; i >= 0; i -= 2) {
1437 		vm_paddr_t plow, phigh;
1438 
1439 		/* L2 mappings must be backed by memory that is L2-aligned */
1440 		plow = roundup2(physmap[i], L2_SIZE);
1441 		phigh = physmap[i + 1];
1442 		if (plow >= phigh)
1443 			continue;
1444 		if (kernstart >= plow && kernstart < phigh)
1445 			phigh = kernstart;
1446 		if (phigh - plow >= L2_SIZE) {
1447 			pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1448 			if (va >= eva)
1449 				break;
1450 		}
1451 	}
1452 	if (i < 0)
1453 		panic("Could not find phys region for shadow map");
1454 
1455 	/*
1456 	 * Done. We should now have a valid shadow address mapped for all KVA
1457 	 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1458 	 * shadow accesses by the sanitizer runtime will succeed for this range.
1459 	 * When the kernel virtual address range is later expanded, as will
1460 	 * happen in vm_mem_init(), the shadow map will be grown as well. This
1461 	 * is handled by pmap_san_enter().
1462 	 */
1463 }
1464 
1465 void
pmap_bootstrap_san(void)1466 pmap_bootstrap_san(void)
1467 {
1468 #ifdef KASAN
1469 	pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
1470 #else
1471 	static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1472 	static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1473 	pd_entry_t *l0, *l1;
1474 
1475 	if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
1476 		panic("initial kernel map is too large");
1477 
1478 	l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
1479 	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1480 	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
1481 	l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
1482 	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1483 	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
1484 	pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);
1485 
1486 	l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
1487 	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1488 	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
1489 	l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
1490 	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1491 	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
1492 	pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
1493 #endif
1494 }
1495 #endif
1496 
1497 /*
1498  *	Initialize a vm_page's machine-dependent fields.
1499  */
1500 void
pmap_page_init(vm_page_t m)1501 pmap_page_init(vm_page_t m)
1502 {
1503 
1504 	TAILQ_INIT(&m->md.pv_list);
1505 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1506 }
1507 
1508 static void
pmap_init_asids(struct asid_set * set,int bits)1509 pmap_init_asids(struct asid_set *set, int bits)
1510 {
1511 	int i;
1512 
1513 	set->asid_bits = bits;
1514 
1515 	/*
1516 	 * We may be too early in the overall initialization process to use
1517 	 * bit_alloc().
1518 	 */
1519 	set->asid_set_size = 1 << set->asid_bits;
1520 	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1521 	    M_WAITOK | M_ZERO);
1522 	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1523 		bit_set(set->asid_set, i);
1524 	set->asid_next = ASID_FIRST_AVAILABLE;
1525 	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1526 }
1527 
1528 static void
pmap_init_pv_table(void)1529 pmap_init_pv_table(void)
1530 {
1531 	struct vm_phys_seg *seg, *next_seg;
1532 	struct pmap_large_md_page *pvd;
1533 	vm_size_t s;
1534 	int domain, i, j, pages;
1535 
1536 	/*
1537 	 * We strongly depend on the size being a power of two, so the assert
1538 	 * is overzealous. However, should the struct be resized to a
1539 	 * different power of two, the code below needs to be revisited.
1540 	 */
1541 	CTASSERT((sizeof(*pvd) == 64));
1542 
1543 	/*
1544 	 * Calculate the size of the array.
1545 	 */
1546 	s = 0;
1547 	for (i = 0; i < vm_phys_nsegs; i++) {
1548 		seg = &vm_phys_segs[i];
1549 		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1550 		    pmap_l2_pindex(seg->start);
1551 		s += round_page(pages * sizeof(*pvd));
1552 	}
1553 	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1554 	if (pv_table == NULL)
1555 		panic("%s: kva_alloc failed\n", __func__);
1556 
1557 	/*
1558 	 * Iterate physical segments to allocate domain-local memory for PV
1559 	 * list headers.
1560 	 */
1561 	pvd = pv_table;
1562 	for (i = 0; i < vm_phys_nsegs; i++) {
1563 		seg = &vm_phys_segs[i];
1564 		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1565 		    pmap_l2_pindex(seg->start);
1566 		domain = seg->domain;
1567 
1568 		s = round_page(pages * sizeof(*pvd));
1569 
1570 		for (j = 0; j < s; j += PAGE_SIZE) {
1571 			vm_page_t m = vm_page_alloc_noobj_domain(domain,
1572 			    VM_ALLOC_ZERO);
1573 			if (m == NULL)
1574 				panic("failed to allocate PV table page");
1575 			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1576 		}
1577 
1578 		for (j = 0; j < s / sizeof(*pvd); j++) {
1579 			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1580 			TAILQ_INIT(&pvd->pv_page.pv_list);
1581 			pvd++;
1582 		}
1583 	}
1584 	pvd = &pv_dummy_large;
1585 	memset(pvd, 0, sizeof(*pvd));
1586 	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1587 	TAILQ_INIT(&pvd->pv_page.pv_list);
1588 
1589 	/*
1590 	 * Set pointers from vm_phys_segs to pv_table.
1591 	 */
1592 	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1593 		seg = &vm_phys_segs[i];
1594 		seg->md_first = pvd;
1595 		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1596 		    pmap_l2_pindex(seg->start);
1597 
1598 		/*
1599 		 * If there is a following segment, and the final
1600 		 * superpage of this segment and the initial superpage
1601 		 * of the next segment are the same then adjust the
1602 		 * pv_table entry for that next segment down by one so
1603 		 * that the pv_table entries will be shared.
1604 		 */
1605 		if (i + 1 < vm_phys_nsegs) {
1606 			next_seg = &vm_phys_segs[i + 1];
1607 			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1608 			    pmap_l2_pindex(next_seg->start)) {
1609 				pvd--;
1610 			}
1611 		}
1612 	}
1613 }
1614 
1615 /*
1616  *	Initialize the pmap module.
1617  *	Called by vm_init, to initialize any structures that the pmap
1618  *	system needs to map virtual memory.
1619  */
1620 void
pmap_init(void)1621 pmap_init(void)
1622 {
1623 	uint64_t mmfr1;
1624 	int i, vmid_bits;
1625 
1626 	/*
1627 	 * Are large page mappings enabled?
1628 	 */
1629 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1630 	if (superpages_enabled) {
1631 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1632 		    ("pmap_init: can't assign to pagesizes[1]"));
1633 		pagesizes[1] = L2_SIZE;
1634 		if (L1_BLOCKS_SUPPORTED) {
1635 			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1636 			    ("pmap_init: can't assign to pagesizes[2]"));
1637 			pagesizes[2] = L1_SIZE;
1638 		}
1639 	}
1640 
1641 	/*
1642 	 * Initialize the ASID allocator.
1643 	 */
1644 	pmap_init_asids(&asids,
1645 	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1646 
1647 	if (has_hyp()) {
1648 		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1649 		vmid_bits = 8;
1650 
1651 		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1652 		    ID_AA64MMFR1_VMIDBits_16)
1653 			vmid_bits = 16;
1654 		pmap_init_asids(&vmids, vmid_bits);
1655 	}
1656 
1657 	/*
1658 	 * Initialize pv chunk lists.
1659 	 */
1660 	for (i = 0; i < PMAP_MEMDOM; i++) {
1661 		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1662 		    MTX_DEF);
1663 		TAILQ_INIT(&pv_chunks[i].pvc_list);
1664 	}
1665 	pmap_init_pv_table();
1666 
1667 	vm_initialized = 1;
1668 }
1669 
1670 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1671     "2MB page mapping counters");
1672 
1673 static u_long pmap_l2_demotions;
1674 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1675     &pmap_l2_demotions, 0, "2MB page demotions");
1676 
1677 static u_long pmap_l2_mappings;
1678 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1679     &pmap_l2_mappings, 0, "2MB page mappings");
1680 
1681 static u_long pmap_l2_p_failures;
1682 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1683     &pmap_l2_p_failures, 0, "2MB page promotion failures");
1684 
1685 static u_long pmap_l2_promotions;
1686 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1687     &pmap_l2_promotions, 0, "2MB page promotions");
1688 
1689 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1690     "L3C (64KB/2MB) page mapping counters");
1691 
1692 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
1693 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
1694     &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");
1695 
1696 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
1697 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
1698     &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");
1699 
1700 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
1701 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
1702     &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");
1703 
1704 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
1705 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
1706     &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1707 
1708 /*
1709  * If the given value for "final_only" is false, then any cached intermediate-
1710  * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1711  * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1712  * Otherwise, just the cached final-level entry is invalidated.
1713  */
1714 static __inline void
pmap_s1_invalidate_kernel(uint64_t r,bool final_only)1715 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1716 {
1717 	if (final_only)
1718 		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1719 	else
1720 		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1721 }
1722 
1723 static __inline void
pmap_s1_invalidate_user(uint64_t r,bool final_only)1724 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1725 {
1726 	if (final_only)
1727 		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
1728 	else
1729 		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
1730 }
1731 
1732 /*
1733  * Invalidates any cached final- and optionally intermediate-level TLB entries
1734  * for the specified virtual address in the given virtual address space.
1735  */
1736 static __inline void
pmap_s1_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1737 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1738 {
1739 	uint64_t r;
1740 
1741 	PMAP_ASSERT_STAGE1(pmap);
1742 
1743 	dsb(ishst);
1744 	r = TLBI_VA(va);
1745 	if (pmap == kernel_pmap) {
1746 		pmap_s1_invalidate_kernel(r, final_only);
1747 	} else {
1748 		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1749 		pmap_s1_invalidate_user(r, final_only);
1750 	}
1751 	dsb(ish);
1752 	isb();
1753 }
1754 
1755 static __inline void
pmap_s2_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1756 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1757 {
1758 	PMAP_ASSERT_STAGE2(pmap);
1759 	MPASS(pmap_stage2_invalidate_range != NULL);
1760 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1761 	    final_only);
1762 }
1763 
1764 static __inline void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1765 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1766 {
1767 	if (pmap->pm_stage == PM_STAGE1)
1768 		pmap_s1_invalidate_page(pmap, va, final_only);
1769 	else
1770 		pmap_s2_invalidate_page(pmap, va, final_only);
1771 }
1772 
1773 /*
1774  * Invalidates any cached final- and optionally intermediate-level TLB entries
1775  * for the specified virtual address range in the given virtual address space.
1776  */
1777 static __inline void
pmap_s1_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)1778 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1779     bool final_only)
1780 {
1781 	uint64_t end, r, start;
1782 
1783 	PMAP_ASSERT_STAGE1(pmap);
1784 
1785 	dsb(ishst);
1786 	if (pmap == kernel_pmap) {
1787 		start = TLBI_VA(sva);
1788 		end = TLBI_VA(eva);
1789 		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1790 			pmap_s1_invalidate_kernel(r, final_only);
1791 	} else {
1792 		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1793 		start |= TLBI_VA(sva);
1794 		end |= TLBI_VA(eva);
1795 		for (r = start; r < end; r += TLBI_VA_L3_INCR)
1796 			pmap_s1_invalidate_user(r, final_only);
1797 	}
1798 	dsb(ish);
1799 	isb();
1800 }
1801 
1802 static __inline void
pmap_s2_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)1803 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1804     bool final_only)
1805 {
1806 	PMAP_ASSERT_STAGE2(pmap);
1807 	MPASS(pmap_stage2_invalidate_range != NULL);
1808 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1809 }
1810 
1811 static __inline void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)1812 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1813     bool final_only)
1814 {
1815 	if (pmap->pm_stage == PM_STAGE1)
1816 		pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1817 	else
1818 		pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1819 }
1820 
1821 /*
1822  * Invalidates all cached intermediate- and final-level TLB entries for the
1823  * given virtual address space.
1824  */
1825 static __inline void
pmap_s1_invalidate_all(pmap_t pmap)1826 pmap_s1_invalidate_all(pmap_t pmap)
1827 {
1828 	uint64_t r;
1829 
1830 	PMAP_ASSERT_STAGE1(pmap);
1831 
1832 	dsb(ishst);
1833 	if (pmap == kernel_pmap) {
1834 		__asm __volatile("tlbi vmalle1is");
1835 	} else {
1836 		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1837 		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
1838 	}
1839 	dsb(ish);
1840 	isb();
1841 }
1842 
1843 static __inline void
pmap_s2_invalidate_all(pmap_t pmap)1844 pmap_s2_invalidate_all(pmap_t pmap)
1845 {
1846 	PMAP_ASSERT_STAGE2(pmap);
1847 	MPASS(pmap_stage2_invalidate_all != NULL);
1848 	pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1849 }
1850 
1851 static __inline void
pmap_invalidate_all(pmap_t pmap)1852 pmap_invalidate_all(pmap_t pmap)
1853 {
1854 	if (pmap->pm_stage == PM_STAGE1)
1855 		pmap_s1_invalidate_all(pmap);
1856 	else
1857 		pmap_s2_invalidate_all(pmap);
1858 }
1859 
1860 /*
1861  *	Routine:	pmap_extract
1862  *	Function:
1863  *		Extract the physical page address associated
1864  *		with the given map/virtual_address pair.
1865  */
1866 vm_paddr_t
pmap_extract(pmap_t pmap,vm_offset_t va)1867 pmap_extract(pmap_t pmap, vm_offset_t va)
1868 {
1869 	pt_entry_t *pte, tpte;
1870 	vm_paddr_t pa;
1871 	int lvl;
1872 
1873 	pa = 0;
1874 	PMAP_LOCK(pmap);
1875 	/*
1876 	 * Find the block or page map for this virtual address. pmap_pte
1877 	 * will return either a valid block/page entry, or NULL.
1878 	 */
1879 	pte = pmap_pte(pmap, va, &lvl);
1880 	if (pte != NULL) {
1881 		tpte = pmap_load(pte);
1882 		pa = PTE_TO_PHYS(tpte);
1883 		switch(lvl) {
1884 		case 1:
1885 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
1886 			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1887 			    ("pmap_extract: Invalid L1 pte found: %lx",
1888 			    tpte & ATTR_DESCR_MASK));
1889 			pa |= (va & L1_OFFSET);
1890 			break;
1891 		case 2:
1892 			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1893 			    ("pmap_extract: Invalid L2 pte found: %lx",
1894 			    tpte & ATTR_DESCR_MASK));
1895 			pa |= (va & L2_OFFSET);
1896 			break;
1897 		case 3:
1898 			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1899 			    ("pmap_extract: Invalid L3 pte found: %lx",
1900 			    tpte & ATTR_DESCR_MASK));
1901 			pa |= (va & L3_OFFSET);
1902 			break;
1903 		}
1904 	}
1905 	PMAP_UNLOCK(pmap);
1906 	return (pa);
1907 }
1908 
1909 /*
1910  *	Routine:	pmap_extract_and_hold
1911  *	Function:
1912  *		Atomically extract and hold the physical page
1913  *		with the given pmap and virtual address pair
1914  *		if that mapping permits the given protection.
1915  */
1916 vm_page_t
pmap_extract_and_hold(pmap_t pmap,vm_offset_t va,vm_prot_t prot)1917 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1918 {
1919 	pt_entry_t *pte, tpte;
1920 	vm_offset_t off;
1921 	vm_page_t m;
1922 	int lvl;
1923 	bool use;
1924 
1925 	m = NULL;
1926 	PMAP_LOCK(pmap);
1927 	pte = pmap_pte(pmap, va, &lvl);
1928 	if (pte != NULL) {
1929 		tpte = pmap_load(pte);
1930 
1931 		KASSERT(lvl > 0 && lvl <= 3,
1932 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
1933 		/*
1934 		 * Check that the pte is either a L3 page, or a L1 or L2 block
1935 		 * entry. We can assume L1_BLOCK == L2_BLOCK.
1936 		 */
1937 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1938 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1939 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1940 		     tpte & ATTR_DESCR_MASK));
1941 
1942 		use = false;
1943 		if ((prot & VM_PROT_WRITE) == 0)
1944 			use = true;
1945 		else if (pmap->pm_stage == PM_STAGE1 &&
1946 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1947 			use = true;
1948 		else if (pmap->pm_stage == PM_STAGE2 &&
1949 		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1950 		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1951 			use = true;
1952 
1953 		if (use) {
1954 			switch (lvl) {
1955 			case 1:
1956 				off = va & L1_OFFSET;
1957 				break;
1958 			case 2:
1959 				off = va & L2_OFFSET;
1960 				break;
1961 			case 3:
1962 			default:
1963 				off = 0;
1964 			}
1965 			m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
1966 			if (m != NULL && !vm_page_wire_mapped(m))
1967 				m = NULL;
1968 		}
1969 	}
1970 	PMAP_UNLOCK(pmap);
1971 	return (m);
1972 }
1973 
1974 /*
1975  * Walks the page tables to translate a kernel virtual address to a
1976  * physical address. Returns true if the kva is valid and stores the
1977  * physical address in pa if it is not NULL.
1978  *
1979  * See the comment above data_abort() for the rationale for specifying
1980  * NO_PERTHREAD_SSP here.
1981  */
1982 bool NO_PERTHREAD_SSP
pmap_klookup(vm_offset_t va,vm_paddr_t * pa)1983 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
1984 {
1985 	pt_entry_t *pte, tpte;
1986 	register_t intr;
1987 	uint64_t par;
1988 
1989 	/*
1990 	 * Disable interrupts so we don't get interrupted between asking
1991 	 * for address translation, and getting the result back.
1992 	 */
1993 	intr = intr_disable();
1994 	par = arm64_address_translate_s1e1r(va);
1995 	intr_restore(intr);
1996 
1997 	if (PAR_SUCCESS(par)) {
1998 		if (pa != NULL)
1999 			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2000 		return (true);
2001 	}
2002 
2003 	/*
2004 	 * Fall back to walking the page table. The address translation
2005 	 * instruction may fail when the page is in a break-before-make
2006 	 * sequence. As we only clear the valid bit in said sequence we
2007 	 * can walk the page table to find the physical address.
2008 	 */
2009 
2010 	pte = pmap_l1(kernel_pmap, va);
2011 	if (pte == NULL)
2012 		return (false);
2013 
2014 	/*
2015 	 * A concurrent pmap_update_entry() will clear the entry's valid bit
2016 	 * but leave the rest of the entry unchanged.  Therefore, we treat a
2017 	 * non-zero entry as being valid, and we ignore the valid bit when
2018 	 * determining whether the entry maps a block, page, or table.
2019 	 */
2020 	tpte = pmap_load(pte);
2021 	if (tpte == 0)
2022 		return (false);
2023 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2024 		if (pa != NULL)
2025 			*pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2026 		return (true);
2027 	}
2028 	pte = pmap_l1_to_l2(&tpte, va);
2029 	tpte = pmap_load(pte);
2030 	if (tpte == 0)
2031 		return (false);
2032 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2033 		if (pa != NULL)
2034 			*pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2035 		return (true);
2036 	}
2037 	pte = pmap_l2_to_l3(&tpte, va);
2038 	tpte = pmap_load(pte);
2039 	if (tpte == 0)
2040 		return (false);
2041 	if (pa != NULL)
2042 		*pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2043 	return (true);
2044 }
2045 
2046 /*
2047  *	Routine:	pmap_kextract
2048  *	Function:
2049  *		Extract the physical page address associated with the given kernel
2050  *		virtual address.
2051  */
2052 vm_paddr_t
pmap_kextract(vm_offset_t va)2053 pmap_kextract(vm_offset_t va)
2054 {
2055 	vm_paddr_t pa;
2056 
2057 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2058 		return (DMAP_TO_PHYS(va));
2059 
2060 	if (pmap_klookup(va, &pa) == false)
2061 		return (0);
2062 	return (pa);
2063 }
2064 
2065 /***************************************************
2066  * Low level mapping routines.....
2067  ***************************************************/
2068 
2069 void
pmap_kenter(vm_offset_t sva,vm_size_t size,vm_paddr_t pa,int mode)2070 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
2071 {
2072 	pd_entry_t *pde;
2073 	pt_entry_t attr, old_l3e, *pte;
2074 	vm_offset_t va;
2075 	vm_page_t mpte;
2076 	int error, lvl;
2077 
2078 	KASSERT((pa & L3_OFFSET) == 0,
2079 	    ("pmap_kenter: Invalid physical address"));
2080 	KASSERT((sva & L3_OFFSET) == 0,
2081 	    ("pmap_kenter: Invalid virtual address"));
2082 	KASSERT((size & PAGE_MASK) == 0,
2083 	    ("pmap_kenter: Mapping is not page-sized"));
2084 
2085 	attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2086 	    ATTR_KERN_GP | ATTR_S1_IDX(mode);
2087 	old_l3e = 0;
2088 	va = sva;
2089 	while (size != 0) {
2090 		pde = pmap_pde(kernel_pmap, va, &lvl);
2091 		KASSERT(pde != NULL,
2092 		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
2093 		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2094 
2095 		/*
2096 		 * If we have an aligned, contiguous chunk of L2_SIZE, try
2097 		 * to create an L2_BLOCK mapping.
2098 		 */
2099 		if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
2100 		    (pa & L2_OFFSET) == 0 && vm_initialized) {
2101 			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
2102 			KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
2103 			    ("pmap_kenter: Unexpected mapping"));
2104 			PMAP_LOCK(kernel_pmap);
2105 			error = pmap_insert_pt_page(kernel_pmap, mpte, false,
2106 			    false);
2107 			if (error == 0) {
2108 				attr &= ~ATTR_CONTIGUOUS;
2109 
2110 				/*
2111 				 * Although the page table page "mpte" should
2112 				 * be devoid of mappings, the TLB might hold
2113 				 * intermediate entries that reference it, so
2114 				 * we perform a single-page invalidation.
2115 				 */
2116 				pmap_update_entry(kernel_pmap, pde,
2117 				    PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
2118 				    PAGE_SIZE);
2119 			}
2120 			PMAP_UNLOCK(kernel_pmap);
2121 			if (error == 0) {
2122 				va += L2_SIZE;
2123 				pa += L2_SIZE;
2124 				size -= L2_SIZE;
2125 				continue;
2126 			}
2127 		}
2128 
2129 		/*
2130 		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
2131 		 * L3 pages, set the contiguous bit within each PTE so that
2132 		 * the chunk can be cached using only one TLB entry.
2133 		 */
2134 		if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
2135 			if (size >= L3C_SIZE)
2136 				attr |= ATTR_CONTIGUOUS;
2137 			else
2138 				attr &= ~ATTR_CONTIGUOUS;
2139 		}
2140 
2141 		pte = pmap_l2_to_l3(pde, va);
2142 		old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
2143 		    L3_PAGE);
2144 
2145 		va += PAGE_SIZE;
2146 		pa += PAGE_SIZE;
2147 		size -= PAGE_SIZE;
2148 	}
2149 	if ((old_l3e & ATTR_DESCR_VALID) != 0)
2150 		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2151 	else {
2152 		/*
2153 		 * Because the old entries were invalid and the new mappings
2154 		 * are not executable, an isb is not required.
2155 		 */
2156 		dsb(ishst);
2157 	}
2158 }
2159 
2160 void
pmap_kenter_device(vm_offset_t sva,vm_size_t size,vm_paddr_t pa)2161 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2162 {
2163 
2164 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2165 }
2166 
2167 /*
2168  * Remove a page from the kernel pagetables.
2169  */
2170 void
pmap_kremove(vm_offset_t va)2171 pmap_kremove(vm_offset_t va)
2172 {
2173 	pt_entry_t *pte;
2174 
2175 	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2176 	KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2177 	    ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2178 	pmap_clear(pte);
2179 	pmap_s1_invalidate_page(kernel_pmap, va, true);
2180 }
2181 
2182 /*
2183  * Remove the specified range of mappings from the kernel address space.
2184  *
2185  * Should only be applied to mappings that were created by pmap_kenter() or
2186  * pmap_kenter_device().  Nothing about this function is actually specific
2187  * to device mappings.
2188  */
2189 void
pmap_kremove_device(vm_offset_t sva,vm_size_t size)2190 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2191 {
2192 	pt_entry_t *ptep, *ptep_end;
2193 	vm_offset_t va;
2194 	int lvl;
2195 
2196 	KASSERT((sva & L3_OFFSET) == 0,
2197 	    ("pmap_kremove_device: Invalid virtual address"));
2198 	KASSERT((size & PAGE_MASK) == 0,
2199 	    ("pmap_kremove_device: Mapping is not page-sized"));
2200 
2201 	va = sva;
2202 	while (size != 0) {
2203 		ptep = pmap_pte(kernel_pmap, va, &lvl);
2204 		KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2205 		switch (lvl) {
2206 		case 2:
2207 			KASSERT((va & L2_OFFSET) == 0,
2208 			    ("Unaligned virtual address"));
2209 			KASSERT(size >= L2_SIZE, ("Insufficient size"));
2210 
2211 			if (va != sva) {
2212 				pmap_s1_invalidate_range(kernel_pmap, sva, va,
2213 				    true);
2214 			}
2215 			pmap_clear(ptep);
2216 			pmap_s1_invalidate_page(kernel_pmap, va, true);
2217 			PMAP_LOCK(kernel_pmap);
2218 			pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2219 			PMAP_UNLOCK(kernel_pmap);
2220 
2221 			va += L2_SIZE;
2222 			sva = va;
2223 			size -= L2_SIZE;
2224 			break;
2225 		case 3:
2226 			if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2227 				KASSERT((va & L3C_OFFSET) == 0,
2228 				    ("Unaligned L3C virtual address"));
2229 				KASSERT(size >= L3C_SIZE,
2230 				    ("Insufficient L3C size"));
2231 
2232 				ptep_end = ptep + L3C_ENTRIES;
2233 				for (; ptep < ptep_end; ptep++)
2234 					pmap_clear(ptep);
2235 
2236 				va += L3C_SIZE;
2237 				size -= L3C_SIZE;
2238 				break;
2239 			}
2240 			pmap_clear(ptep);
2241 
2242 			va += PAGE_SIZE;
2243 			size -= PAGE_SIZE;
2244 			break;
2245 		default:
2246 			__assert_unreachable();
2247 			break;
2248 		}
2249 	}
2250 	if (va != sva)
2251 		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2252 }
2253 
2254 /*
2255  *	Used to map a range of physical addresses into kernel
2256  *	virtual address space.
2257  *
2258  *	The value passed in '*virt' is a suggested virtual address for
2259  *	the mapping. Architectures which can support a direct-mapped
2260  *	physical to virtual region can return the appropriate address
2261  *	within that region, leaving '*virt' unchanged. Other
2262  *	architectures should map the pages starting at '*virt' and
2263  *	update '*virt' with the first usable address after the mapped
2264  *	region.
2265  */
2266 vm_offset_t
pmap_map(vm_offset_t * virt,vm_paddr_t start,vm_paddr_t end,int prot)2267 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2268 {
2269 	return PHYS_TO_DMAP(start);
2270 }
2271 
2272 /*
2273  * Add a list of wired pages to the kva
2274  * this routine is only used for temporary
2275  * kernel mappings that do not need to have
2276  * page modification or references recorded.
2277  * Note that old mappings are simply written
2278  * over.  The page *must* be wired.
2279  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2280  */
2281 void
pmap_qenter(vm_offset_t sva,vm_page_t * ma,int count)2282 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2283 {
2284 	pd_entry_t *pde;
2285 	pt_entry_t attr, old_l3e, *pte;
2286 	vm_offset_t va;
2287 	vm_page_t m;
2288 	int i, lvl;
2289 
2290 	old_l3e = 0;
2291 	va = sva;
2292 	for (i = 0; i < count; i++) {
2293 		pde = pmap_pde(kernel_pmap, va, &lvl);
2294 		KASSERT(pde != NULL,
2295 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2296 		KASSERT(lvl == 2,
2297 		    ("pmap_qenter: Invalid level %d", lvl));
2298 
2299 		m = ma[i];
2300 		attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2301 		    ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2302 		pte = pmap_l2_to_l3(pde, va);
2303 		old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2304 
2305 		va += L3_SIZE;
2306 	}
2307 	if ((old_l3e & ATTR_DESCR_VALID) != 0)
2308 		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2309 	else {
2310 		/*
2311 		 * Because the old entries were invalid and the new mappings
2312 		 * are not executable, an isb is not required.
2313 		 */
2314 		dsb(ishst);
2315 	}
2316 }
2317 
2318 /*
2319  * This routine tears out page mappings from the
2320  * kernel -- it is meant only for temporary mappings.
2321  */
2322 void
pmap_qremove(vm_offset_t sva,int count)2323 pmap_qremove(vm_offset_t sva, int count)
2324 {
2325 	pt_entry_t *pte;
2326 	vm_offset_t va;
2327 
2328 	KASSERT(ADDR_IS_CANONICAL(sva),
2329 	    ("%s: Address not in canonical form: %lx", __func__, sva));
2330 	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2331 
2332 	va = sva;
2333 	while (count-- > 0) {
2334 		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2335 		if (pte != NULL) {
2336 			pmap_clear(pte);
2337 		}
2338 
2339 		va += PAGE_SIZE;
2340 	}
2341 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2342 }
2343 
2344 /***************************************************
2345  * Page table page management routines.....
2346  ***************************************************/
2347 /*
2348  * Schedule the specified unused page table page to be freed.  Specifically,
2349  * add the page to the specified list of pages that will be released to the
2350  * physical memory manager after the TLB has been updated.
2351  */
2352 static __inline void
pmap_add_delayed_free_list(vm_page_t m,struct spglist * free,bool set_PG_ZERO)2353 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2354 {
2355 
2356 	if (set_PG_ZERO)
2357 		m->flags |= PG_ZERO;
2358 	else
2359 		m->flags &= ~PG_ZERO;
2360 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2361 }
2362 
2363 /*
2364  * Decrements a page table page's reference count, which is used to record the
2365  * number of valid page table entries within the page.  If the reference count
2366  * drops to zero, then the page table page is unmapped.  Returns true if the
2367  * page table page was unmapped and false otherwise.
2368  */
2369 static inline bool
pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)2370 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2371 {
2372 
2373 	--m->ref_count;
2374 	if (m->ref_count == 0) {
2375 		_pmap_unwire_l3(pmap, va, m, free);
2376 		return (true);
2377 	} else
2378 		return (false);
2379 }
2380 
2381 static void
_pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)2382 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2383 {
2384 
2385 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2386 	/*
2387 	 * unmap the page table page
2388 	 */
2389 	if (m->pindex >= (NUL2E + NUL1E)) {
2390 		/* l1 page */
2391 		pd_entry_t *l0;
2392 
2393 		l0 = pmap_l0(pmap, va);
2394 		pmap_clear(l0);
2395 	} else if (m->pindex >= NUL2E) {
2396 		/* l2 page */
2397 		pd_entry_t *l1;
2398 
2399 		l1 = pmap_l1(pmap, va);
2400 		pmap_clear(l1);
2401 	} else {
2402 		/* l3 page */
2403 		pd_entry_t *l2;
2404 
2405 		l2 = pmap_l2(pmap, va);
2406 		pmap_clear(l2);
2407 	}
2408 	pmap_resident_count_dec(pmap, 1);
2409 	if (m->pindex < NUL2E) {
2410 		/* We just released an l3, unhold the matching l2 */
2411 		pd_entry_t *l1, tl1;
2412 		vm_page_t l2pg;
2413 
2414 		l1 = pmap_l1(pmap, va);
2415 		tl1 = pmap_load(l1);
2416 		l2pg = PTE_TO_VM_PAGE(tl1);
2417 		pmap_unwire_l3(pmap, va, l2pg, free);
2418 	} else if (m->pindex < (NUL2E + NUL1E)) {
2419 		/* We just released an l2, unhold the matching l1 */
2420 		pd_entry_t *l0, tl0;
2421 		vm_page_t l1pg;
2422 
2423 		l0 = pmap_l0(pmap, va);
2424 		tl0 = pmap_load(l0);
2425 		l1pg = PTE_TO_VM_PAGE(tl0);
2426 		pmap_unwire_l3(pmap, va, l1pg, free);
2427 	}
2428 	pmap_invalidate_page(pmap, va, false);
2429 
2430 	/*
2431 	 * Put page on a list so that it is released after
2432 	 * *ALL* TLB shootdown is done
2433 	 */
2434 	pmap_add_delayed_free_list(m, free, true);
2435 }
2436 
2437 /*
2438  * After removing a page table entry, this routine is used to
2439  * conditionally free the page, and manage the reference count.
2440  */
2441 static int
pmap_unuse_pt(pmap_t pmap,vm_offset_t va,pd_entry_t ptepde,struct spglist * free)2442 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2443     struct spglist *free)
2444 {
2445 	vm_page_t mpte;
2446 
2447 	KASSERT(ADDR_IS_CANONICAL(va),
2448 	    ("%s: Address not in canonical form: %lx", __func__, va));
2449 	if (ADDR_IS_KERNEL(va))
2450 		return (0);
2451 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2452 	mpte = PTE_TO_VM_PAGE(ptepde);
2453 	return (pmap_unwire_l3(pmap, va, mpte, free));
2454 }
2455 
2456 /*
2457  * Release a page table page reference after a failed attempt to create a
2458  * mapping.
2459  */
2460 static void
pmap_abort_ptp(pmap_t pmap,vm_offset_t va,vm_page_t mpte)2461 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2462 {
2463 	struct spglist free;
2464 
2465 	SLIST_INIT(&free);
2466 	if (pmap_unwire_l3(pmap, va, mpte, &free))
2467 		vm_page_free_pages_toq(&free, true);
2468 }
2469 
2470 void
pmap_pinit0(pmap_t pmap)2471 pmap_pinit0(pmap_t pmap)
2472 {
2473 
2474 	PMAP_LOCK_INIT(pmap);
2475 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2476 	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2477 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2478 	TAILQ_INIT(&pmap->pm_pvchunk);
2479 	vm_radix_init(&pmap->pm_root);
2480 	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2481 	pmap->pm_stage = PM_STAGE1;
2482 	pmap->pm_levels = 4;
2483 	pmap->pm_ttbr = pmap->pm_l0_paddr;
2484 	pmap->pm_asid_set = &asids;
2485 	pmap->pm_bti = NULL;
2486 
2487 	PCPU_SET(curpmap, pmap);
2488 }
2489 
2490 int
pmap_pinit_stage(pmap_t pmap,enum pmap_stage stage,int levels)2491 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2492 {
2493 	vm_page_t m;
2494 
2495 	/*
2496 	 * allocate the l0 page
2497 	 */
2498 	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2499 	    VM_ALLOC_ZERO);
2500 	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2501 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2502 
2503 	TAILQ_INIT(&pmap->pm_pvchunk);
2504 	vm_radix_init(&pmap->pm_root);
2505 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2506 	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2507 
2508 	MPASS(levels == 3 || levels == 4);
2509 	pmap->pm_levels = levels;
2510 	pmap->pm_stage = stage;
2511 	pmap->pm_bti = NULL;
2512 	switch (stage) {
2513 	case PM_STAGE1:
2514 		pmap->pm_asid_set = &asids;
2515 		if (pmap_bti_support) {
2516 			pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
2517 			    M_ZERO | M_WAITOK);
2518 			rangeset_init(pmap->pm_bti, bti_dup_range,
2519 			    bti_free_range, pmap, M_NOWAIT);
2520 		}
2521 		break;
2522 	case PM_STAGE2:
2523 		pmap->pm_asid_set = &vmids;
2524 		break;
2525 	default:
2526 		panic("%s: Invalid pmap type %d", __func__, stage);
2527 		break;
2528 	}
2529 
2530 	/* XXX Temporarily disable deferred ASID allocation. */
2531 	pmap_alloc_asid(pmap);
2532 
2533 	/*
2534 	 * Allocate the level 1 entry to use as the root. This will increase
2535 	 * the refcount on the level 1 page so it won't be removed until
2536 	 * pmap_release() is called.
2537 	 */
2538 	if (pmap->pm_levels == 3) {
2539 		PMAP_LOCK(pmap);
2540 		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2541 		PMAP_UNLOCK(pmap);
2542 	}
2543 	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2544 
2545 	return (1);
2546 }
2547 
2548 int
pmap_pinit(pmap_t pmap)2549 pmap_pinit(pmap_t pmap)
2550 {
2551 
2552 	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2553 }
2554 
2555 /*
2556  * This routine is called if the desired page table page does not exist.
2557  *
2558  * If page table page allocation fails, this routine may sleep before
2559  * returning NULL.  It sleeps only if a lock pointer was given.
2560  *
2561  * Note: If a page allocation fails at page table level two or three,
2562  * one or two pages may be held during the wait, only to be released
2563  * afterwards.  This conservative approach is easily argued to avoid
2564  * race conditions.
2565  */
2566 static vm_page_t
_pmap_alloc_l3(pmap_t pmap,vm_pindex_t ptepindex,struct rwlock ** lockp)2567 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2568 {
2569 	vm_page_t m, l1pg, l2pg;
2570 
2571 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2572 
2573 	/*
2574 	 * Allocate a page table page.
2575 	 */
2576 	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2577 		if (lockp != NULL) {
2578 			RELEASE_PV_LIST_LOCK(lockp);
2579 			PMAP_UNLOCK(pmap);
2580 			vm_wait(NULL);
2581 			PMAP_LOCK(pmap);
2582 		}
2583 
2584 		/*
2585 		 * Indicate the need to retry.  While waiting, the page table
2586 		 * page may have been allocated.
2587 		 */
2588 		return (NULL);
2589 	}
2590 	m->pindex = ptepindex;
2591 
2592 	/*
2593 	 * Because of AArch64's weak memory consistency model, we must have a
2594 	 * barrier here to ensure that the stores for zeroing "m", whether by
2595 	 * pmap_zero_page() or an earlier function, are visible before adding
2596 	 * "m" to the page table.  Otherwise, a page table walk by another
2597 	 * processor's MMU could see the mapping to "m" and a stale, non-zero
2598 	 * PTE within "m".
2599 	 */
2600 	dmb(ishst);
2601 
2602 	/*
2603 	 * Map the pagetable page into the process address space, if
2604 	 * it isn't already there.
2605 	 */
2606 
2607 	if (ptepindex >= (NUL2E + NUL1E)) {
2608 		pd_entry_t *l0p, l0e;
2609 		vm_pindex_t l0index;
2610 
2611 		l0index = ptepindex - (NUL2E + NUL1E);
2612 		l0p = &pmap->pm_l0[l0index];
2613 		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2614 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2615 		l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2616 
2617 		/*
2618 		 * Mark all kernel memory as not accessible from userspace
2619 		 * and userspace memory as not executable from the kernel.
2620 		 * This has been done for the bootstrap L0 entries in
2621 		 * locore.S.
2622 		 */
2623 		if (pmap == kernel_pmap)
2624 			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2625 		else
2626 			l0e |= TATTR_PXN_TABLE;
2627 		pmap_store(l0p, l0e);
2628 	} else if (ptepindex >= NUL2E) {
2629 		vm_pindex_t l0index, l1index;
2630 		pd_entry_t *l0, *l1;
2631 		pd_entry_t tl0;
2632 
2633 		l1index = ptepindex - NUL2E;
2634 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2635 
2636 		l0 = &pmap->pm_l0[l0index];
2637 		tl0 = pmap_load(l0);
2638 		if (tl0 == 0) {
2639 			/* recurse for allocating page dir */
2640 			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2641 			    lockp) == NULL) {
2642 				vm_page_unwire_noq(m);
2643 				vm_page_free_zero(m);
2644 				return (NULL);
2645 			}
2646 		} else {
2647 			l1pg = PTE_TO_VM_PAGE(tl0);
2648 			l1pg->ref_count++;
2649 		}
2650 
2651 		l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2652 		l1 = &l1[ptepindex & Ln_ADDR_MASK];
2653 		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2654 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2655 		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2656 	} else {
2657 		vm_pindex_t l0index, l1index;
2658 		pd_entry_t *l0, *l1, *l2;
2659 		pd_entry_t tl0, tl1;
2660 
2661 		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2662 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2663 
2664 		l0 = &pmap->pm_l0[l0index];
2665 		tl0 = pmap_load(l0);
2666 		if (tl0 == 0) {
2667 			/* recurse for allocating page dir */
2668 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2669 			    lockp) == NULL) {
2670 				vm_page_unwire_noq(m);
2671 				vm_page_free_zero(m);
2672 				return (NULL);
2673 			}
2674 			tl0 = pmap_load(l0);
2675 			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2676 			l1 = &l1[l1index & Ln_ADDR_MASK];
2677 		} else {
2678 			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2679 			l1 = &l1[l1index & Ln_ADDR_MASK];
2680 			tl1 = pmap_load(l1);
2681 			if (tl1 == 0) {
2682 				/* recurse for allocating page dir */
2683 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2684 				    lockp) == NULL) {
2685 					vm_page_unwire_noq(m);
2686 					vm_page_free_zero(m);
2687 					return (NULL);
2688 				}
2689 			} else {
2690 				l2pg = PTE_TO_VM_PAGE(tl1);
2691 				l2pg->ref_count++;
2692 			}
2693 		}
2694 
2695 		l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2696 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
2697 		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2698 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2699 		pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
2700 	}
2701 
2702 	pmap_resident_count_inc(pmap, 1);
2703 
2704 	return (m);
2705 }
2706 
2707 static pd_entry_t *
pmap_alloc_l2(pmap_t pmap,vm_offset_t va,vm_page_t * l2pgp,struct rwlock ** lockp)2708 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2709     struct rwlock **lockp)
2710 {
2711 	pd_entry_t *l1, *l2;
2712 	vm_page_t l2pg;
2713 	vm_pindex_t l2pindex;
2714 
2715 	KASSERT(ADDR_IS_CANONICAL(va),
2716 	    ("%s: Address not in canonical form: %lx", __func__, va));
2717 
2718 retry:
2719 	l1 = pmap_l1(pmap, va);
2720 	if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2721 		l2 = pmap_l1_to_l2(l1, va);
2722 		if (!ADDR_IS_KERNEL(va)) {
2723 			/* Add a reference to the L2 page. */
2724 			l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
2725 			l2pg->ref_count++;
2726 		} else
2727 			l2pg = NULL;
2728 	} else if (!ADDR_IS_KERNEL(va)) {
2729 		/* Allocate a L2 page. */
2730 		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2731 		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2732 		if (l2pg == NULL) {
2733 			if (lockp != NULL)
2734 				goto retry;
2735 			else
2736 				return (NULL);
2737 		}
2738 		l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2739 		l2 = &l2[pmap_l2_index(va)];
2740 	} else
2741 		panic("pmap_alloc_l2: missing page table page for va %#lx",
2742 		    va);
2743 	*l2pgp = l2pg;
2744 	return (l2);
2745 }
2746 
2747 static vm_page_t
pmap_alloc_l3(pmap_t pmap,vm_offset_t va,struct rwlock ** lockp)2748 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2749 {
2750 	vm_pindex_t ptepindex;
2751 	pd_entry_t *pde, tpde;
2752 #ifdef INVARIANTS
2753 	pt_entry_t *pte;
2754 #endif
2755 	vm_page_t m;
2756 	int lvl;
2757 
2758 	/*
2759 	 * Calculate pagetable page index
2760 	 */
2761 	ptepindex = pmap_l2_pindex(va);
2762 retry:
2763 	/*
2764 	 * Get the page directory entry
2765 	 */
2766 	pde = pmap_pde(pmap, va, &lvl);
2767 
2768 	/*
2769 	 * If the page table page is mapped, we just increment the hold count,
2770 	 * and activate it. If we get a level 2 pde it will point to a level 3
2771 	 * table.
2772 	 */
2773 	switch (lvl) {
2774 	case -1:
2775 		break;
2776 	case 0:
2777 #ifdef INVARIANTS
2778 		pte = pmap_l0_to_l1(pde, va);
2779 		KASSERT(pmap_load(pte) == 0,
2780 		    ("pmap_alloc_l3: TODO: l0 superpages"));
2781 #endif
2782 		break;
2783 	case 1:
2784 #ifdef INVARIANTS
2785 		pte = pmap_l1_to_l2(pde, va);
2786 		KASSERT(pmap_load(pte) == 0,
2787 		    ("pmap_alloc_l3: TODO: l1 superpages"));
2788 #endif
2789 		break;
2790 	case 2:
2791 		tpde = pmap_load(pde);
2792 		if (tpde != 0) {
2793 			m = PTE_TO_VM_PAGE(tpde);
2794 			m->ref_count++;
2795 			return (m);
2796 		}
2797 		break;
2798 	default:
2799 		panic("pmap_alloc_l3: Invalid level %d", lvl);
2800 	}
2801 
2802 	/*
2803 	 * Here if the pte page isn't mapped, or if it has been deallocated.
2804 	 */
2805 	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2806 	if (m == NULL && lockp != NULL)
2807 		goto retry;
2808 
2809 	return (m);
2810 }
2811 
2812 /***************************************************
2813  * Pmap allocation/deallocation routines.
2814  ***************************************************/
2815 
2816 /*
2817  * Release any resources held by the given physical map.
2818  * Called when a pmap initialized by pmap_pinit is being released.
2819  * Should only be called if the map contains no valid mappings.
2820  */
2821 void
pmap_release(pmap_t pmap)2822 pmap_release(pmap_t pmap)
2823 {
2824 	bool rv __diagused;
2825 	struct spglist freelist;
2826 	struct asid_set *set;
2827 	vm_page_t m;
2828 	int asid;
2829 
2830 	if (pmap->pm_levels != 4) {
2831 		PMAP_ASSERT_STAGE2(pmap);
2832 		KASSERT(pmap->pm_stats.resident_count == 1,
2833 		    ("pmap_release: pmap resident count %ld != 0",
2834 		    pmap->pm_stats.resident_count));
2835 		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2836 		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2837 
2838 		SLIST_INIT(&freelist);
2839 		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2840 		PMAP_LOCK(pmap);
2841 		rv = pmap_unwire_l3(pmap, 0, m, &freelist);
2842 		PMAP_UNLOCK(pmap);
2843 		MPASS(rv == true);
2844 		vm_page_free_pages_toq(&freelist, true);
2845 	}
2846 
2847 	KASSERT(pmap->pm_stats.resident_count == 0,
2848 	    ("pmap_release: pmap resident count %ld != 0",
2849 	    pmap->pm_stats.resident_count));
2850 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2851 	    ("pmap_release: pmap has reserved page table page(s)"));
2852 
2853 	set = pmap->pm_asid_set;
2854 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2855 
2856 	/*
2857 	 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate
2858 	 * the entries when removing them so rely on a later tlb invalidation.
2859 	 * this will happen when updating the VMID generation. Because of this
2860 	 * we don't reuse VMIDs within a generation.
2861 	 */
2862 	if (pmap->pm_stage == PM_STAGE1) {
2863 		mtx_lock_spin(&set->asid_set_mutex);
2864 		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2865 			asid = COOKIE_TO_ASID(pmap->pm_cookie);
2866 			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2867 			    asid < set->asid_set_size,
2868 			    ("pmap_release: pmap cookie has out-of-range asid"));
2869 			bit_clear(set->asid_set, asid);
2870 		}
2871 		mtx_unlock_spin(&set->asid_set_mutex);
2872 
2873 		if (pmap->pm_bti != NULL) {
2874 			rangeset_fini(pmap->pm_bti);
2875 			free(pmap->pm_bti, M_DEVBUF);
2876 		}
2877 	}
2878 
2879 	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2880 	vm_page_unwire_noq(m);
2881 	vm_page_free_zero(m);
2882 }
2883 
2884 static int
kvm_size(SYSCTL_HANDLER_ARGS)2885 kvm_size(SYSCTL_HANDLER_ARGS)
2886 {
2887 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2888 
2889 	return sysctl_handle_long(oidp, &ksize, 0, req);
2890 }
2891 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2892     0, 0, kvm_size, "LU",
2893     "Size of KVM");
2894 
2895 static int
kvm_free(SYSCTL_HANDLER_ARGS)2896 kvm_free(SYSCTL_HANDLER_ARGS)
2897 {
2898 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2899 
2900 	return sysctl_handle_long(oidp, &kfree, 0, req);
2901 }
2902 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2903     0, 0, kvm_free, "LU",
2904     "Amount of KVM free");
2905 
2906 /*
2907  * grow the number of kernel page table entries, if needed
2908  */
2909 void
pmap_growkernel(vm_offset_t addr)2910 pmap_growkernel(vm_offset_t addr)
2911 {
2912 	vm_page_t nkpg;
2913 	pd_entry_t *l0, *l1, *l2;
2914 
2915 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2916 
2917 	addr = roundup2(addr, L2_SIZE);
2918 	if (addr - 1 >= vm_map_max(kernel_map))
2919 		addr = vm_map_max(kernel_map);
2920 	if (kernel_vm_end < addr) {
2921 		kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2922 		kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2923 	}
2924 	while (kernel_vm_end < addr) {
2925 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2926 		KASSERT(pmap_load(l0) != 0,
2927 		    ("pmap_growkernel: No level 0 kernel entry"));
2928 
2929 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2930 		if (pmap_load(l1) == 0) {
2931 			/* We need a new PDP entry */
2932 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2933 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2934 			if (nkpg == NULL)
2935 				panic("pmap_growkernel: no memory to grow kernel");
2936 			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2937 			/* See the dmb() in _pmap_alloc_l3(). */
2938 			dmb(ishst);
2939 			pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
2940 			continue; /* try again */
2941 		}
2942 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2943 		if (pmap_load(l2) != 0) {
2944 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2945 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2946 				kernel_vm_end = vm_map_max(kernel_map);
2947 				break;
2948 			}
2949 			continue;
2950 		}
2951 
2952 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2953 		    VM_ALLOC_ZERO);
2954 		if (nkpg == NULL)
2955 			panic("pmap_growkernel: no memory to grow kernel");
2956 		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2957 		/* See the dmb() in _pmap_alloc_l3(). */
2958 		dmb(ishst);
2959 		pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
2960 
2961 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2962 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2963 			kernel_vm_end = vm_map_max(kernel_map);
2964 			break;
2965 		}
2966 	}
2967 }
2968 
2969 /***************************************************
2970  * page management routines.
2971  ***************************************************/
2972 
2973 static const uint64_t pc_freemask[_NPCM] = {
2974 	[0 ... _NPCM - 2] = PC_FREEN,
2975 	[_NPCM - 1] = PC_FREEL
2976 };
2977 
2978 #ifdef PV_STATS
2979 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2980 
2981 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2982 	"Current number of pv entry chunks");
2983 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2984 	"Current number of pv entry chunks allocated");
2985 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2986 	"Current number of pv entry chunks frees");
2987 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2988 	"Number of times tried to get a chunk page but failed.");
2989 
2990 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2991 static int pv_entry_spare;
2992 
2993 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2994 	"Current number of pv entry frees");
2995 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2996 	"Current number of pv entry allocs");
2997 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2998 	"Current number of pv entries");
2999 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3000 	"Current number of spare pv entries");
3001 #endif
3002 
3003 /*
3004  * We are in a serious low memory condition.  Resort to
3005  * drastic measures to free some pages so we can allocate
3006  * another pv entry chunk.
3007  *
3008  * Returns NULL if PV entries were reclaimed from the specified pmap.
3009  *
3010  * We do not, however, unmap 2mpages because subsequent accesses will
3011  * allocate per-page pv entries until repromotion occurs, thereby
3012  * exacerbating the shortage of free pv entries.
3013  */
3014 static vm_page_t
reclaim_pv_chunk_domain(pmap_t locked_pmap,struct rwlock ** lockp,int domain)3015 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3016 {
3017 	struct pv_chunks_list *pvc;
3018 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3019 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3020 	struct md_page *pvh;
3021 	pd_entry_t *pde;
3022 	pmap_t next_pmap, pmap;
3023 	pt_entry_t *pte, tpte;
3024 	pv_entry_t pv;
3025 	vm_offset_t va;
3026 	vm_page_t m, m_pc;
3027 	struct spglist free;
3028 	uint64_t inuse;
3029 	int bit, field, freed, lvl;
3030 
3031 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3032 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3033 
3034 	pmap = NULL;
3035 	m_pc = NULL;
3036 	SLIST_INIT(&free);
3037 	bzero(&pc_marker_b, sizeof(pc_marker_b));
3038 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3039 	pc_marker = (struct pv_chunk *)&pc_marker_b;
3040 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3041 
3042 	pvc = &pv_chunks[domain];
3043 	mtx_lock(&pvc->pvc_lock);
3044 	pvc->active_reclaims++;
3045 	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3046 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3047 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3048 	    SLIST_EMPTY(&free)) {
3049 		next_pmap = pc->pc_pmap;
3050 		if (next_pmap == NULL) {
3051 			/*
3052 			 * The next chunk is a marker.  However, it is
3053 			 * not our marker, so active_reclaims must be
3054 			 * > 1.  Consequently, the next_chunk code
3055 			 * will not rotate the pv_chunks list.
3056 			 */
3057 			goto next_chunk;
3058 		}
3059 		mtx_unlock(&pvc->pvc_lock);
3060 
3061 		/*
3062 		 * A pv_chunk can only be removed from the pc_lru list
3063 		 * when both pvc->pvc_lock is owned and the
3064 		 * corresponding pmap is locked.
3065 		 */
3066 		if (pmap != next_pmap) {
3067 			if (pmap != NULL && pmap != locked_pmap)
3068 				PMAP_UNLOCK(pmap);
3069 			pmap = next_pmap;
3070 			/* Avoid deadlock and lock recursion. */
3071 			if (pmap > locked_pmap) {
3072 				RELEASE_PV_LIST_LOCK(lockp);
3073 				PMAP_LOCK(pmap);
3074 				mtx_lock(&pvc->pvc_lock);
3075 				continue;
3076 			} else if (pmap != locked_pmap) {
3077 				if (PMAP_TRYLOCK(pmap)) {
3078 					mtx_lock(&pvc->pvc_lock);
3079 					continue;
3080 				} else {
3081 					pmap = NULL; /* pmap is not locked */
3082 					mtx_lock(&pvc->pvc_lock);
3083 					pc = TAILQ_NEXT(pc_marker, pc_lru);
3084 					if (pc == NULL ||
3085 					    pc->pc_pmap != next_pmap)
3086 						continue;
3087 					goto next_chunk;
3088 				}
3089 			}
3090 		}
3091 
3092 		/*
3093 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
3094 		 */
3095 		freed = 0;
3096 		for (field = 0; field < _NPCM; field++) {
3097 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3098 			    inuse != 0; inuse &= ~(1UL << bit)) {
3099 				bit = ffsl(inuse) - 1;
3100 				pv = &pc->pc_pventry[field * 64 + bit];
3101 				va = pv->pv_va;
3102 				pde = pmap_pde(pmap, va, &lvl);
3103 				if (lvl != 2)
3104 					continue;
3105 				pte = pmap_l2_to_l3(pde, va);
3106 				tpte = pmap_load(pte);
3107 				if ((tpte & ATTR_SW_WIRED) != 0)
3108 					continue;
3109 				if ((tpte & ATTR_CONTIGUOUS) != 0)
3110 					(void)pmap_demote_l3c(pmap, pte, va);
3111 				tpte = pmap_load_clear(pte);
3112 				m = PTE_TO_VM_PAGE(tpte);
3113 				if (pmap_pte_dirty(pmap, tpte))
3114 					vm_page_dirty(m);
3115 				if ((tpte & ATTR_AF) != 0) {
3116 					pmap_s1_invalidate_page(pmap, va, true);
3117 					vm_page_aflag_set(m, PGA_REFERENCED);
3118 				}
3119 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3120 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3121 				m->md.pv_gen++;
3122 				if (TAILQ_EMPTY(&m->md.pv_list) &&
3123 				    (m->flags & PG_FICTITIOUS) == 0) {
3124 					pvh = page_to_pvh(m);
3125 					if (TAILQ_EMPTY(&pvh->pv_list)) {
3126 						vm_page_aflag_clear(m,
3127 						    PGA_WRITEABLE);
3128 					}
3129 				}
3130 				pc->pc_map[field] |= 1UL << bit;
3131 				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3132 				freed++;
3133 			}
3134 		}
3135 		if (freed == 0) {
3136 			mtx_lock(&pvc->pvc_lock);
3137 			goto next_chunk;
3138 		}
3139 		/* Every freed mapping is for a 4 KB page. */
3140 		pmap_resident_count_dec(pmap, freed);
3141 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3142 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3143 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3144 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3145 		if (pc_is_free(pc)) {
3146 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3147 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3148 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3149 			/* Entire chunk is free; return it. */
3150 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3151 			dump_drop_page(m_pc->phys_addr);
3152 			mtx_lock(&pvc->pvc_lock);
3153 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3154 			break;
3155 		}
3156 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3157 		mtx_lock(&pvc->pvc_lock);
3158 		/* One freed pv entry in locked_pmap is sufficient. */
3159 		if (pmap == locked_pmap)
3160 			break;
3161 
3162 next_chunk:
3163 		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3164 		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3165 		if (pvc->active_reclaims == 1 && pmap != NULL) {
3166 			/*
3167 			 * Rotate the pv chunks list so that we do not
3168 			 * scan the same pv chunks that could not be
3169 			 * freed (because they contained a wired
3170 			 * and/or superpage mapping) on every
3171 			 * invocation of reclaim_pv_chunk().
3172 			 */
3173 			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3174 				MPASS(pc->pc_pmap != NULL);
3175 				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3176 				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3177 			}
3178 		}
3179 	}
3180 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3181 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3182 	pvc->active_reclaims--;
3183 	mtx_unlock(&pvc->pvc_lock);
3184 	if (pmap != NULL && pmap != locked_pmap)
3185 		PMAP_UNLOCK(pmap);
3186 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3187 		m_pc = SLIST_FIRST(&free);
3188 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3189 		/* Recycle a freed page table page. */
3190 		m_pc->ref_count = 1;
3191 	}
3192 	vm_page_free_pages_toq(&free, true);
3193 	return (m_pc);
3194 }
3195 
3196 static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap,struct rwlock ** lockp)3197 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3198 {
3199 	vm_page_t m;
3200 	int i, domain;
3201 
3202 	domain = PCPU_GET(domain);
3203 	for (i = 0; i < vm_ndomains; i++) {
3204 		m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3205 		if (m != NULL)
3206 			break;
3207 		domain = (domain + 1) % vm_ndomains;
3208 	}
3209 
3210 	return (m);
3211 }
3212 
3213 /*
3214  * free the pv_entry back to the free list
3215  */
3216 static void
free_pv_entry(pmap_t pmap,pv_entry_t pv)3217 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3218 {
3219 	struct pv_chunk *pc;
3220 	int idx, field, bit;
3221 
3222 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3223 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3224 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3225 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3226 	pc = pv_to_chunk(pv);
3227 	idx = pv - &pc->pc_pventry[0];
3228 	field = idx / 64;
3229 	bit = idx % 64;
3230 	pc->pc_map[field] |= 1ul << bit;
3231 	if (!pc_is_free(pc)) {
3232 		/* 98% of the time, pc is already at the head of the list. */
3233 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3234 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3235 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3236 		}
3237 		return;
3238 	}
3239 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3240 	free_pv_chunk(pc);
3241 }
3242 
3243 static void
free_pv_chunk_dequeued(struct pv_chunk * pc)3244 free_pv_chunk_dequeued(struct pv_chunk *pc)
3245 {
3246 	vm_page_t m;
3247 
3248 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3249 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3250 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3251 	/* entire chunk is free, return it */
3252 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3253 	dump_drop_page(m->phys_addr);
3254 	vm_page_unwire_noq(m);
3255 	vm_page_free(m);
3256 }
3257 
3258 static void
free_pv_chunk(struct pv_chunk * pc)3259 free_pv_chunk(struct pv_chunk *pc)
3260 {
3261 	struct pv_chunks_list *pvc;
3262 
3263 	pvc = &pv_chunks[pc_to_domain(pc)];
3264 	mtx_lock(&pvc->pvc_lock);
3265 	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3266 	mtx_unlock(&pvc->pvc_lock);
3267 	free_pv_chunk_dequeued(pc);
3268 }
3269 
3270 static void
free_pv_chunk_batch(struct pv_chunklist * batch)3271 free_pv_chunk_batch(struct pv_chunklist *batch)
3272 {
3273 	struct pv_chunks_list *pvc;
3274 	struct pv_chunk *pc, *npc;
3275 	int i;
3276 
3277 	for (i = 0; i < vm_ndomains; i++) {
3278 		if (TAILQ_EMPTY(&batch[i]))
3279 			continue;
3280 		pvc = &pv_chunks[i];
3281 		mtx_lock(&pvc->pvc_lock);
3282 		TAILQ_FOREACH(pc, &batch[i], pc_list) {
3283 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3284 		}
3285 		mtx_unlock(&pvc->pvc_lock);
3286 	}
3287 
3288 	for (i = 0; i < vm_ndomains; i++) {
3289 		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3290 			free_pv_chunk_dequeued(pc);
3291 		}
3292 	}
3293 }
3294 
3295 /*
3296  * Returns a new PV entry, allocating a new PV chunk from the system when
3297  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3298  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3299  * returned.
3300  *
3301  * The given PV list lock may be released.
3302  */
3303 static pv_entry_t
get_pv_entry(pmap_t pmap,struct rwlock ** lockp)3304 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3305 {
3306 	struct pv_chunks_list *pvc;
3307 	int bit, field;
3308 	pv_entry_t pv;
3309 	struct pv_chunk *pc;
3310 	vm_page_t m;
3311 
3312 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3313 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3314 retry:
3315 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3316 	if (pc != NULL) {
3317 		for (field = 0; field < _NPCM; field++) {
3318 			if (pc->pc_map[field]) {
3319 				bit = ffsl(pc->pc_map[field]) - 1;
3320 				break;
3321 			}
3322 		}
3323 		if (field < _NPCM) {
3324 			pv = &pc->pc_pventry[field * 64 + bit];
3325 			pc->pc_map[field] &= ~(1ul << bit);
3326 			/* If this was the last item, move it to tail */
3327 			if (pc_is_full(pc)) {
3328 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3329 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3330 				    pc_list);
3331 			}
3332 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
3333 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3334 			return (pv);
3335 		}
3336 	}
3337 	/* No free items, allocate another chunk */
3338 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3339 	if (m == NULL) {
3340 		if (lockp == NULL) {
3341 			PV_STAT(pc_chunk_tryfail++);
3342 			return (NULL);
3343 		}
3344 		m = reclaim_pv_chunk(pmap, lockp);
3345 		if (m == NULL)
3346 			goto retry;
3347 	}
3348 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3349 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3350 	dump_add_page(m->phys_addr);
3351 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3352 	pc->pc_pmap = pmap;
3353 	memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3354 	pc->pc_map[0] &= ~1ul;		/* preallocated bit 0 */
3355 	pvc = &pv_chunks[vm_page_domain(m)];
3356 	mtx_lock(&pvc->pvc_lock);
3357 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3358 	mtx_unlock(&pvc->pvc_lock);
3359 	pv = &pc->pc_pventry[0];
3360 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3361 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
3362 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3363 	return (pv);
3364 }
3365 
3366 /*
3367  * Ensure that the number of spare PV entries in the specified pmap meets or
3368  * exceeds the given count, "needed".
3369  *
3370  * The given PV list lock may be released.
3371  */
3372 static void
reserve_pv_entries(pmap_t pmap,int needed,struct rwlock ** lockp)3373 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3374 {
3375 	struct pv_chunks_list *pvc;
3376 	struct pch new_tail[PMAP_MEMDOM];
3377 	struct pv_chunk *pc;
3378 	vm_page_t m;
3379 	int avail, free, i;
3380 	bool reclaimed;
3381 
3382 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3383 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3384 
3385 	/*
3386 	 * Newly allocated PV chunks must be stored in a private list until
3387 	 * the required number of PV chunks have been allocated.  Otherwise,
3388 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
3389 	 * contrast, these chunks must be added to the pmap upon allocation.
3390 	 */
3391 	for (i = 0; i < PMAP_MEMDOM; i++)
3392 		TAILQ_INIT(&new_tail[i]);
3393 retry:
3394 	avail = 0;
3395 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3396 		bit_count((bitstr_t *)pc->pc_map, 0,
3397 		    sizeof(pc->pc_map) * NBBY, &free);
3398 		if (free == 0)
3399 			break;
3400 		avail += free;
3401 		if (avail >= needed)
3402 			break;
3403 	}
3404 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
3405 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3406 		if (m == NULL) {
3407 			m = reclaim_pv_chunk(pmap, lockp);
3408 			if (m == NULL)
3409 				goto retry;
3410 			reclaimed = true;
3411 		}
3412 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3413 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3414 		dump_add_page(m->phys_addr);
3415 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3416 		pc->pc_pmap = pmap;
3417 		memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3418 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3419 		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3420 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3421 
3422 		/*
3423 		 * The reclaim might have freed a chunk from the current pmap.
3424 		 * If that chunk contained available entries, we need to
3425 		 * re-count the number of available entries.
3426 		 */
3427 		if (reclaimed)
3428 			goto retry;
3429 	}
3430 	for (i = 0; i < vm_ndomains; i++) {
3431 		if (TAILQ_EMPTY(&new_tail[i]))
3432 			continue;
3433 		pvc = &pv_chunks[i];
3434 		mtx_lock(&pvc->pvc_lock);
3435 		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3436 		mtx_unlock(&pvc->pvc_lock);
3437 	}
3438 }
3439 
3440 /*
3441  * First find and then remove the pv entry for the specified pmap and virtual
3442  * address from the specified pv list.  Returns the pv entry if found and NULL
3443  * otherwise.  This operation can be performed on pv lists for either 4KB or
3444  * 2MB page mappings.
3445  */
3446 static __inline pv_entry_t
pmap_pvh_remove(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3447 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3448 {
3449 	pv_entry_t pv;
3450 
3451 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3452 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3453 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3454 			pvh->pv_gen++;
3455 			break;
3456 		}
3457 	}
3458 	return (pv);
3459 }
3460 
3461 /*
3462  * After demotion from a 2MB page mapping to 512 4KB page mappings,
3463  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3464  * entries for each of the 4KB page mappings.
3465  */
3466 static void
pmap_pv_demote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)3467 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3468     struct rwlock **lockp)
3469 {
3470 	struct md_page *pvh;
3471 	struct pv_chunk *pc;
3472 	pv_entry_t pv;
3473 	vm_offset_t va_last;
3474 	vm_page_t m;
3475 	int bit, field;
3476 
3477 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3478 	KASSERT((va & L2_OFFSET) == 0,
3479 	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3480 	KASSERT((pa & L2_OFFSET) == 0,
3481 	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3482 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3483 
3484 	/*
3485 	 * Transfer the 2mpage's pv entry for this mapping to the first
3486 	 * page's pv list.  Once this transfer begins, the pv list lock
3487 	 * must not be released until the last pv entry is reinstantiated.
3488 	 */
3489 	pvh = pa_to_pvh(pa);
3490 	pv = pmap_pvh_remove(pvh, pmap, va);
3491 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3492 	m = PHYS_TO_VM_PAGE(pa);
3493 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3494 	m->md.pv_gen++;
3495 	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3496 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3497 	va_last = va + L2_SIZE - PAGE_SIZE;
3498 	for (;;) {
3499 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3500 		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3501 		for (field = 0; field < _NPCM; field++) {
3502 			while (pc->pc_map[field]) {
3503 				bit = ffsl(pc->pc_map[field]) - 1;
3504 				pc->pc_map[field] &= ~(1ul << bit);
3505 				pv = &pc->pc_pventry[field * 64 + bit];
3506 				va += PAGE_SIZE;
3507 				pv->pv_va = va;
3508 				m++;
3509 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3510 			    ("pmap_pv_demote_l2: page %p is not managed", m));
3511 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3512 				m->md.pv_gen++;
3513 				if (va == va_last)
3514 					goto out;
3515 			}
3516 		}
3517 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3518 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3519 	}
3520 out:
3521 	if (pc_is_full(pc)) {
3522 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3523 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3524 	}
3525 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3526 	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3527 }
3528 
3529 /*
3530  * First find and then destroy the pv entry for the specified pmap and virtual
3531  * address.  This operation can be performed on pv lists for either 4KB or 2MB
3532  * page mappings.
3533  */
3534 static void
pmap_pvh_free(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3535 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3536 {
3537 	pv_entry_t pv;
3538 
3539 	pv = pmap_pvh_remove(pvh, pmap, va);
3540 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3541 	free_pv_entry(pmap, pv);
3542 }
3543 
3544 /*
3545  * Conditionally create the PV entry for a 4KB page mapping if the required
3546  * memory can be allocated without resorting to reclamation.
3547  */
3548 static bool
pmap_try_insert_pv_entry(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)3549 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3550     struct rwlock **lockp)
3551 {
3552 	pv_entry_t pv;
3553 
3554 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3555 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3556 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3557 		pv->pv_va = va;
3558 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3559 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3560 		m->md.pv_gen++;
3561 		return (true);
3562 	} else
3563 		return (false);
3564 }
3565 
3566 /*
3567  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
3568  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
3569  * false if the PV entry cannot be allocated without resorting to reclamation.
3570  */
3571 static bool
pmap_pv_insert_l2(pmap_t pmap,vm_offset_t va,pd_entry_t l2e,u_int flags,struct rwlock ** lockp)3572 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3573     struct rwlock **lockp)
3574 {
3575 	struct md_page *pvh;
3576 	pv_entry_t pv;
3577 	vm_paddr_t pa;
3578 
3579 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3580 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3581 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3582 	    NULL : lockp)) == NULL)
3583 		return (false);
3584 	pv->pv_va = va;
3585 	pa = PTE_TO_PHYS(l2e);
3586 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3587 	pvh = pa_to_pvh(pa);
3588 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3589 	pvh->pv_gen++;
3590 	return (true);
3591 }
3592 
3593 /*
3594  * Conditionally creates the PV entries for a L3C superpage mapping if
3595  * the required memory can be allocated without resorting to reclamation.
3596  */
3597 static bool
pmap_pv_insert_l3c(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)3598 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3599     struct rwlock **lockp)
3600 {
3601 	pv_entry_t pv;
3602 	vm_offset_t tva;
3603 	vm_paddr_t pa __diagused;
3604 	vm_page_t mt;
3605 
3606 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3607 	KASSERT((va & L3C_OFFSET) == 0,
3608 	    ("pmap_pv_insert_l3c: va is not aligned"));
3609 	pa = VM_PAGE_TO_PHYS(m);
3610 	KASSERT((pa & L3C_OFFSET) == 0,
3611 	    ("pmap_pv_insert_l3c: pa is not aligned"));
3612 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3613 	for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3614 		/* Pass NULL instead of lockp to disable reclamation. */
3615 		pv = get_pv_entry(pmap, NULL);
3616 		if (__predict_false(pv == NULL)) {
3617 			while (tva > va) {
3618 				mt--;
3619 				tva -= L3_SIZE;
3620 				pmap_pvh_free(&mt->md, pmap, tva);
3621 			}
3622 			return (false);
3623 		}
3624 		pv->pv_va = tva;
3625 		TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3626 		mt->md.pv_gen++;
3627 	}
3628 	return (true);
3629 }
3630 
3631 static void
pmap_remove_kernel_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)3632 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3633 {
3634 	pt_entry_t newl2, oldl2 __diagused;
3635 	vm_page_t ml3;
3636 	vm_paddr_t ml3pa;
3637 
3638 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3639 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3640 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3641 
3642 	ml3 = pmap_remove_pt_page(pmap, va);
3643 	if (ml3 == NULL)
3644 		panic("pmap_remove_kernel_l2: Missing pt page");
3645 
3646 	ml3pa = VM_PAGE_TO_PHYS(ml3);
3647 	newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3648 
3649 	/*
3650 	 * If this page table page was unmapped by a promotion, then it
3651 	 * contains valid mappings.  Zero it to invalidate those mappings.
3652 	 */
3653 	if (vm_page_any_valid(ml3))
3654 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
3655 
3656 	/*
3657 	 * Demote the mapping.  The caller must have already invalidated the
3658 	 * mapping (i.e., the "break" in break-before-make).
3659 	 */
3660 	oldl2 = pmap_load_store(l2, newl2);
3661 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3662 	    __func__, l2, oldl2));
3663 }
3664 
3665 /*
3666  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
3667  */
3668 static int
pmap_remove_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pd_entry_t l1e,struct spglist * free,struct rwlock ** lockp)3669 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3670     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3671 {
3672 	struct md_page *pvh;
3673 	pt_entry_t old_l2;
3674 	vm_page_t m, ml3, mt;
3675 
3676 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3677 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3678 	old_l2 = pmap_load_clear(l2);
3679 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3680 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3681 
3682 	/*
3683 	 * Since a promotion must break the 4KB page mappings before making
3684 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3685 	 */
3686 	pmap_s1_invalidate_page(pmap, sva, true);
3687 
3688 	if (old_l2 & ATTR_SW_WIRED)
3689 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3690 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3691 	if (old_l2 & ATTR_SW_MANAGED) {
3692 		m = PTE_TO_VM_PAGE(old_l2);
3693 		pvh = page_to_pvh(m);
3694 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3695 		pmap_pvh_free(pvh, pmap, sva);
3696 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3697 			if (pmap_pte_dirty(pmap, old_l2))
3698 				vm_page_dirty(mt);
3699 			if (old_l2 & ATTR_AF)
3700 				vm_page_aflag_set(mt, PGA_REFERENCED);
3701 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3702 			    TAILQ_EMPTY(&pvh->pv_list))
3703 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3704 		}
3705 	}
3706 	if (pmap == kernel_pmap) {
3707 		pmap_remove_kernel_l2(pmap, l2, sva);
3708 	} else {
3709 		ml3 = pmap_remove_pt_page(pmap, sva);
3710 		if (ml3 != NULL) {
3711 			KASSERT(vm_page_any_valid(ml3),
3712 			    ("pmap_remove_l2: l3 page not promoted"));
3713 			pmap_resident_count_dec(pmap, 1);
3714 			KASSERT(ml3->ref_count == NL3PG,
3715 			    ("pmap_remove_l2: l3 page ref count error"));
3716 			ml3->ref_count = 0;
3717 			pmap_add_delayed_free_list(ml3, free, false);
3718 		}
3719 	}
3720 	return (pmap_unuse_pt(pmap, sva, l1e, free));
3721 }
3722 
3723 /*
3724  * pmap_remove_l3: do the things to unmap a page in a process
3725  */
3726 static int
pmap_remove_l3(pmap_t pmap,pt_entry_t * l3,vm_offset_t va,pd_entry_t l2e,struct spglist * free,struct rwlock ** lockp)3727 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3728     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3729 {
3730 	struct md_page *pvh;
3731 	pt_entry_t old_l3;
3732 	vm_page_t m;
3733 
3734 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3735 	old_l3 = pmap_load(l3);
3736 	if ((old_l3 & ATTR_CONTIGUOUS) != 0)
3737 		(void)pmap_demote_l3c(pmap, l3, va);
3738 	old_l3 = pmap_load_clear(l3);
3739 	pmap_s1_invalidate_page(pmap, va, true);
3740 	if (old_l3 & ATTR_SW_WIRED)
3741 		pmap->pm_stats.wired_count -= 1;
3742 	pmap_resident_count_dec(pmap, 1);
3743 	if (old_l3 & ATTR_SW_MANAGED) {
3744 		m = PTE_TO_VM_PAGE(old_l3);
3745 		if (pmap_pte_dirty(pmap, old_l3))
3746 			vm_page_dirty(m);
3747 		if (old_l3 & ATTR_AF)
3748 			vm_page_aflag_set(m, PGA_REFERENCED);
3749 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3750 		pmap_pvh_free(&m->md, pmap, va);
3751 		if (TAILQ_EMPTY(&m->md.pv_list) &&
3752 		    (m->flags & PG_FICTITIOUS) == 0) {
3753 			pvh = page_to_pvh(m);
3754 			if (TAILQ_EMPTY(&pvh->pv_list))
3755 				vm_page_aflag_clear(m, PGA_WRITEABLE);
3756 		}
3757 	}
3758 	return (pmap_unuse_pt(pmap, va, l2e, free));
3759 }
3760 
3761 /*
3762  * Removes the specified L3C superpage mapping.  Requests TLB invalidations
3763  * to be performed by the caller through the returned "*vap". Returns true
3764  * if the level 3 table "ml3" was unmapped and added to the spglist "free".
3765  * Otherwise, returns false.
3766  */
3767 static bool
pmap_remove_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,vm_offset_t * vap,vm_offset_t va_next,vm_page_t ml3,struct spglist * free,struct rwlock ** lockp)3768 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
3769     vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
3770     struct rwlock **lockp)
3771 {
3772 	struct md_page *pvh;
3773 	struct rwlock *new_lock;
3774 	pt_entry_t first_l3e, l3e, *tl3p;
3775 	vm_offset_t tva;
3776 	vm_page_t m, mt;
3777 
3778 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3779 	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
3780 	    0, ("pmap_remove_l3c: l3p is not aligned"));
3781 	KASSERT((va & L3C_OFFSET) == 0,
3782 	    ("pmap_remove_l3c: va is not aligned"));
3783 
3784 	/*
3785 	 * Hardware accessed and dirty bit maintenance might only update a
3786 	 * single L3 entry, so we must combine the accessed and dirty bits
3787 	 * from this entire set of contiguous L3 entries.
3788 	 */
3789 	first_l3e = pmap_load_clear(l3p);
3790 	for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
3791 		l3e = pmap_load_clear(tl3p);
3792 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
3793 		    ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
3794 		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
3795 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
3796 			first_l3e &= ~ATTR_S1_AP_RW_BIT;
3797 		first_l3e |= l3e & ATTR_AF;
3798 	}
3799 	if ((first_l3e & ATTR_SW_WIRED) != 0)
3800 		pmap->pm_stats.wired_count -= L3C_ENTRIES;
3801 	pmap_resident_count_dec(pmap, L3C_ENTRIES);
3802 	if ((first_l3e & ATTR_SW_MANAGED) != 0) {
3803 		m = PTE_TO_VM_PAGE(first_l3e);
3804 		new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3805 		if (new_lock != *lockp) {
3806 			if (*lockp != NULL) {
3807 				/*
3808 				 * Pending TLB invalidations must be
3809 				 * performed before the PV list lock is
3810 				 * released.  Otherwise, a concurrent
3811 				 * pmap_remove_all() on a physical page
3812 				 * could return while a stale TLB entry
3813 				 * still provides access to that page.
3814 				 */
3815 				if (*vap != va_next) {
3816 					pmap_invalidate_range(pmap, *vap, va,
3817 					    true);
3818 					*vap = va_next;
3819 				}
3820 				rw_wunlock(*lockp);
3821 			}
3822 			*lockp = new_lock;
3823 			rw_wlock(*lockp);
3824 		}
3825 		pvh = page_to_pvh(m);
3826 		for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
3827 		    L3_SIZE) {
3828 			if (pmap_pte_dirty(pmap, first_l3e))
3829 				vm_page_dirty(mt);
3830 			if ((first_l3e & ATTR_AF) != 0)
3831 				vm_page_aflag_set(mt, PGA_REFERENCED);
3832 			pmap_pvh_free(&mt->md, pmap, tva);
3833 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3834 			    TAILQ_EMPTY(&pvh->pv_list))
3835 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3836 		}
3837 	}
3838 	if (*vap == va_next)
3839 		*vap = va;
3840 	if (ml3 != NULL) {
3841 		ml3->ref_count -= L3C_ENTRIES;
3842 		if (ml3->ref_count == 0) {
3843 			_pmap_unwire_l3(pmap, va, ml3, free);
3844 			return (true);
3845 		}
3846 	}
3847 	return (false);
3848 }
3849 
3850 /*
3851  * Remove the specified range of addresses from the L3 page table that is
3852  * identified by the given L2 entry.
3853  */
3854 static void
pmap_remove_l3_range(pmap_t pmap,pd_entry_t l2e,vm_offset_t sva,vm_offset_t eva,struct spglist * free,struct rwlock ** lockp)3855 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3856     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3857 {
3858 	struct md_page *pvh;
3859 	struct rwlock *new_lock;
3860 	pt_entry_t *l3, old_l3;
3861 	vm_offset_t va;
3862 	vm_page_t l3pg, m;
3863 
3864 	KASSERT(ADDR_IS_CANONICAL(sva),
3865 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
3866 	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3867 	    ("%s: End address not in canonical form: %lx", __func__, eva));
3868 
3869 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3870 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3871 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3872 	l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
3873 	va = eva;
3874 	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3875 		old_l3 = pmap_load(l3);
3876 		if (!pmap_l3_valid(old_l3)) {
3877 			if (va != eva) {
3878 				pmap_invalidate_range(pmap, va, sva, true);
3879 				va = eva;
3880 			}
3881 			continue;
3882 		}
3883 		if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
3884 			/*
3885 			 * Is this entire set of contiguous L3 entries being
3886 			 * removed?  Handle the possibility that "eva" is zero
3887 			 * because of address wraparound.
3888 			 */
3889 			if ((sva & L3C_OFFSET) == 0 &&
3890 			    sva + L3C_OFFSET <= eva - 1) {
3891 				if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
3892 				    l3pg, free, lockp)) {
3893 					/* The L3 table was unmapped. */
3894 					sva += L3C_SIZE;
3895 					break;
3896 				}
3897 				l3 += L3C_ENTRIES - 1;
3898 				sva += L3C_SIZE - L3_SIZE;
3899 				continue;
3900 			}
3901 
3902 			(void)pmap_demote_l3c(pmap, l3, sva);
3903 		}
3904 		old_l3 = pmap_load_clear(l3);
3905 		if ((old_l3 & ATTR_SW_WIRED) != 0)
3906 			pmap->pm_stats.wired_count--;
3907 		pmap_resident_count_dec(pmap, 1);
3908 		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
3909 			m = PTE_TO_VM_PAGE(old_l3);
3910 			if (pmap_pte_dirty(pmap, old_l3))
3911 				vm_page_dirty(m);
3912 			if ((old_l3 & ATTR_AF) != 0)
3913 				vm_page_aflag_set(m, PGA_REFERENCED);
3914 			new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3915 			if (new_lock != *lockp) {
3916 				if (*lockp != NULL) {
3917 					/*
3918 					 * Pending TLB invalidations must be
3919 					 * performed before the PV list lock is
3920 					 * released.  Otherwise, a concurrent
3921 					 * pmap_remove_all() on a physical page
3922 					 * could return while a stale TLB entry
3923 					 * still provides access to that page.
3924 					 */
3925 					if (va != eva) {
3926 						pmap_invalidate_range(pmap, va,
3927 						    sva, true);
3928 						va = eva;
3929 					}
3930 					rw_wunlock(*lockp);
3931 				}
3932 				*lockp = new_lock;
3933 				rw_wlock(*lockp);
3934 			}
3935 			pmap_pvh_free(&m->md, pmap, sva);
3936 			if (TAILQ_EMPTY(&m->md.pv_list) &&
3937 			    (m->flags & PG_FICTITIOUS) == 0) {
3938 				pvh = page_to_pvh(m);
3939 				if (TAILQ_EMPTY(&pvh->pv_list))
3940 					vm_page_aflag_clear(m, PGA_WRITEABLE);
3941 			}
3942 		}
3943 		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3944 			/*
3945 			 * _pmap_unwire_l3() has already invalidated the TLB
3946 			 * entries at all levels for "sva".  So, we need not
3947 			 * perform "sva += L3_SIZE;" here.  Moreover, we need
3948 			 * not perform "va = sva;" if "sva" is at the start
3949 			 * of a new valid range consisting of a single page.
3950 			 */
3951 			break;
3952 		}
3953 		if (va == eva)
3954 			va = sva;
3955 	}
3956 	if (va != eva)
3957 		pmap_invalidate_range(pmap, va, sva, true);
3958 }
3959 
3960 static void
pmap_remove1(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool map_delete)3961 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
3962 {
3963 	struct rwlock *lock;
3964 	vm_offset_t va_next;
3965 	pd_entry_t *l0, *l1, *l2;
3966 	pt_entry_t l3_paddr;
3967 	struct spglist free;
3968 
3969 	/*
3970 	 * Perform an unsynchronized read.  This is, however, safe.
3971 	 */
3972 	if (pmap->pm_stats.resident_count == 0)
3973 		return;
3974 
3975 	SLIST_INIT(&free);
3976 
3977 	PMAP_LOCK(pmap);
3978 	if (map_delete)
3979 		pmap_bti_on_remove(pmap, sva, eva);
3980 
3981 	lock = NULL;
3982 	for (; sva < eva; sva = va_next) {
3983 		if (pmap->pm_stats.resident_count == 0)
3984 			break;
3985 
3986 		l0 = pmap_l0(pmap, sva);
3987 		if (pmap_load(l0) == 0) {
3988 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3989 			if (va_next < sva)
3990 				va_next = eva;
3991 			continue;
3992 		}
3993 
3994 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3995 		if (va_next < sva)
3996 			va_next = eva;
3997 		l1 = pmap_l0_to_l1(l0, sva);
3998 		if (pmap_load(l1) == 0)
3999 			continue;
4000 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4001 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4002 			KASSERT(va_next <= eva,
4003 			    ("partial update of non-transparent 1G page "
4004 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4005 			    pmap_load(l1), sva, eva, va_next));
4006 			MPASS(pmap != kernel_pmap);
4007 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4008 			pmap_clear(l1);
4009 			pmap_s1_invalidate_page(pmap, sva, true);
4010 			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
4011 			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
4012 			continue;
4013 		}
4014 
4015 		/*
4016 		 * Calculate index for next page table.
4017 		 */
4018 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4019 		if (va_next < sva)
4020 			va_next = eva;
4021 
4022 		l2 = pmap_l1_to_l2(l1, sva);
4023 		if (l2 == NULL)
4024 			continue;
4025 
4026 		l3_paddr = pmap_load(l2);
4027 
4028 		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4029 			if (sva + L2_SIZE == va_next && eva >= va_next) {
4030 				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
4031 				    &free, &lock);
4032 				continue;
4033 			} else if (pmap_demote_l2_locked(pmap, l2, sva,
4034 			    &lock) == NULL)
4035 				continue;
4036 			l3_paddr = pmap_load(l2);
4037 		}
4038 
4039 		/*
4040 		 * Weed out invalid mappings.
4041 		 */
4042 		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
4043 			continue;
4044 
4045 		/*
4046 		 * Limit our scan to either the end of the va represented
4047 		 * by the current page table page, or to the end of the
4048 		 * range being removed.
4049 		 */
4050 		if (va_next > eva)
4051 			va_next = eva;
4052 
4053 		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
4054 		    &lock);
4055 	}
4056 	if (lock != NULL)
4057 		rw_wunlock(lock);
4058 	PMAP_UNLOCK(pmap);
4059 	vm_page_free_pages_toq(&free, true);
4060 }
4061 
4062 /*
4063  *	Remove the given range of addresses from the specified map.
4064  *
4065  *	It is assumed that the start and end are properly
4066  *	rounded to the page size.
4067  */
4068 void
pmap_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4069 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4070 {
4071 	pmap_remove1(pmap, sva, eva, false);
4072 }
4073 
4074 /*
4075  *	Remove the given range of addresses as part of a logical unmap
4076  *	operation. This has the effect of calling pmap_remove(), but
4077  *	also clears any metadata that should persist for the lifetime
4078  *	of a logical mapping.
4079  */
4080 void
pmap_map_delete(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4081 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4082 {
4083 	pmap_remove1(pmap, sva, eva, true);
4084 }
4085 
4086 /*
4087  *	Routine:	pmap_remove_all
4088  *	Function:
4089  *		Removes this physical page from
4090  *		all physical maps in which it resides.
4091  *		Reflects back modify bits to the pager.
4092  *
4093  *	Notes:
4094  *		Original versions of this routine were very
4095  *		inefficient because they iteratively called
4096  *		pmap_remove (slow...)
4097  */
4098 
4099 void
pmap_remove_all(vm_page_t m)4100 pmap_remove_all(vm_page_t m)
4101 {
4102 	struct md_page *pvh;
4103 	pv_entry_t pv;
4104 	pmap_t pmap;
4105 	struct rwlock *lock;
4106 	pd_entry_t *pde, tpde;
4107 	pt_entry_t *pte, tpte;
4108 	vm_offset_t va;
4109 	struct spglist free;
4110 	int lvl, pvh_gen, md_gen;
4111 
4112 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4113 	    ("pmap_remove_all: page %p is not managed", m));
4114 	SLIST_INIT(&free);
4115 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4116 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4117 	rw_wlock(lock);
4118 retry:
4119 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4120 		pmap = PV_PMAP(pv);
4121 		if (!PMAP_TRYLOCK(pmap)) {
4122 			pvh_gen = pvh->pv_gen;
4123 			rw_wunlock(lock);
4124 			PMAP_LOCK(pmap);
4125 			rw_wlock(lock);
4126 			if (pvh_gen != pvh->pv_gen) {
4127 				PMAP_UNLOCK(pmap);
4128 				goto retry;
4129 			}
4130 		}
4131 		va = pv->pv_va;
4132 		pte = pmap_pte_exists(pmap, va, 2, __func__);
4133 		pmap_demote_l2_locked(pmap, pte, va, &lock);
4134 		PMAP_UNLOCK(pmap);
4135 	}
4136 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4137 		pmap = PV_PMAP(pv);
4138 		if (!PMAP_TRYLOCK(pmap)) {
4139 			pvh_gen = pvh->pv_gen;
4140 			md_gen = m->md.pv_gen;
4141 			rw_wunlock(lock);
4142 			PMAP_LOCK(pmap);
4143 			rw_wlock(lock);
4144 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4145 				PMAP_UNLOCK(pmap);
4146 				goto retry;
4147 			}
4148 		}
4149 		pmap_resident_count_dec(pmap, 1);
4150 
4151 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
4152 		KASSERT(pde != NULL,
4153 		    ("pmap_remove_all: no page directory entry found"));
4154 		KASSERT(lvl == 2,
4155 		    ("pmap_remove_all: invalid pde level %d", lvl));
4156 		tpde = pmap_load(pde);
4157 
4158 		pte = pmap_l2_to_l3(pde, pv->pv_va);
4159 		tpte = pmap_load(pte);
4160 		if ((tpte & ATTR_CONTIGUOUS) != 0)
4161 			(void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4162 		tpte = pmap_load_clear(pte);
4163 		if (tpte & ATTR_SW_WIRED)
4164 			pmap->pm_stats.wired_count--;
4165 		if ((tpte & ATTR_AF) != 0) {
4166 			pmap_invalidate_page(pmap, pv->pv_va, true);
4167 			vm_page_aflag_set(m, PGA_REFERENCED);
4168 		}
4169 
4170 		/*
4171 		 * Update the vm_page_t clean and reference bits.
4172 		 */
4173 		if (pmap_pte_dirty(pmap, tpte))
4174 			vm_page_dirty(m);
4175 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4176 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4177 		m->md.pv_gen++;
4178 		free_pv_entry(pmap, pv);
4179 		PMAP_UNLOCK(pmap);
4180 	}
4181 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4182 	rw_wunlock(lock);
4183 	vm_page_free_pages_toq(&free, true);
4184 }
4185 
4186 /*
4187  * Masks and sets bits in a level 2 page table entries in the specified pmap
4188  */
4189 static void
pmap_protect_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pt_entry_t mask,pt_entry_t nbits)4190 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4191     pt_entry_t nbits)
4192 {
4193 	pd_entry_t old_l2;
4194 	vm_page_t m, mt;
4195 
4196 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4197 	PMAP_ASSERT_STAGE1(pmap);
4198 	KASSERT((sva & L2_OFFSET) == 0,
4199 	    ("pmap_protect_l2: sva is not 2mpage aligned"));
4200 	old_l2 = pmap_load(l2);
4201 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4202 	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4203 
4204 	/*
4205 	 * Return if the L2 entry already has the desired access restrictions
4206 	 * in place.
4207 	 */
4208 	if ((old_l2 & mask) == nbits)
4209 		return;
4210 
4211 	while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4212 		cpu_spinwait();
4213 
4214 	/*
4215 	 * When a dirty read/write superpage mapping is write protected,
4216 	 * update the dirty field of each of the superpage's constituent 4KB
4217 	 * pages.
4218 	 */
4219 	if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4220 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4221 	    pmap_pte_dirty(pmap, old_l2)) {
4222 		m = PTE_TO_VM_PAGE(old_l2);
4223 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4224 			vm_page_dirty(mt);
4225 	}
4226 
4227 	/*
4228 	 * Since a promotion must break the 4KB page mappings before making
4229 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4230 	 */
4231 	pmap_s1_invalidate_page(pmap, sva, true);
4232 }
4233 
4234 /*
4235  * Masks and sets bits in the specified L3C superpage mapping.
4236  *
4237  * Requests TLB invalidations to be performed by the caller through the
4238  * returned "*vap".
4239  */
4240 static void
pmap_mask_set_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,vm_offset_t * vap,vm_offset_t va_next,pt_entry_t mask,pt_entry_t nbits)4241 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4242     vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4243 {
4244 	pt_entry_t l3e, *tl3p;
4245 	vm_page_t m, mt;
4246 	bool dirty;
4247 
4248 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4249 	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4250 	    0, ("pmap_mask_set_l3c: l3p is not aligned"));
4251 	KASSERT((va & L3C_OFFSET) == 0,
4252 	    ("pmap_mask_set_l3c: va is not aligned"));
4253 	dirty = false;
4254 	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4255 		l3e = pmap_load(tl3p);
4256 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4257 		    ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4258 		while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4259 			cpu_spinwait();
4260 		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4261 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4262 			dirty = true;
4263 	}
4264 
4265 	/*
4266 	 * When a dirty read/write superpage mapping is write protected,
4267 	 * update the dirty field of each of the superpage's constituent 4KB
4268 	 * pages.
4269 	 */
4270 	if ((l3e & ATTR_SW_MANAGED) != 0 &&
4271 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4272 	    dirty) {
4273 		m = PTE_TO_VM_PAGE(pmap_load(l3p));
4274 		for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4275 			vm_page_dirty(mt);
4276 	}
4277 
4278 	if (*vap == va_next)
4279 		*vap = va;
4280 }
4281 
4282 /*
4283  * Masks and sets bits in last level page table entries in the specified
4284  * pmap and range
4285  */
4286 static void
pmap_mask_set_locked(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t mask,pt_entry_t nbits,bool invalidate)4287 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4288     pt_entry_t nbits, bool invalidate)
4289 {
4290 	vm_offset_t va, va_next;
4291 	pd_entry_t *l0, *l1, *l2;
4292 	pt_entry_t *l3p, l3;
4293 
4294 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4295 	for (; sva < eva; sva = va_next) {
4296 		l0 = pmap_l0(pmap, sva);
4297 		if (pmap_load(l0) == 0) {
4298 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4299 			if (va_next < sva)
4300 				va_next = eva;
4301 			continue;
4302 		}
4303 
4304 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4305 		if (va_next < sva)
4306 			va_next = eva;
4307 		l1 = pmap_l0_to_l1(l0, sva);
4308 		if (pmap_load(l1) == 0)
4309 			continue;
4310 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4311 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4312 			KASSERT(va_next <= eva,
4313 			    ("partial update of non-transparent 1G page "
4314 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4315 			    pmap_load(l1), sva, eva, va_next));
4316 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4317 			if ((pmap_load(l1) & mask) != nbits) {
4318 				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4319 				if (invalidate)
4320 					pmap_s1_invalidate_page(pmap, sva, true);
4321 			}
4322 			continue;
4323 		}
4324 
4325 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4326 		if (va_next < sva)
4327 			va_next = eva;
4328 
4329 		l2 = pmap_l1_to_l2(l1, sva);
4330 		if (pmap_load(l2) == 0)
4331 			continue;
4332 
4333 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4334 			if (sva + L2_SIZE == va_next && eva >= va_next) {
4335 				pmap_protect_l2(pmap, l2, sva, mask, nbits);
4336 				continue;
4337 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
4338 				continue;
4339 		}
4340 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4341 		    ("pmap_protect: Invalid L2 entry after demotion"));
4342 
4343 		if (va_next > eva)
4344 			va_next = eva;
4345 
4346 		va = va_next;
4347 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4348 		    sva += L3_SIZE) {
4349 			l3 = pmap_load(l3p);
4350 
4351 			/*
4352 			 * Go to the next L3 entry if the current one is
4353 			 * invalid or already has the desired access
4354 			 * restrictions in place.  (The latter case occurs
4355 			 * frequently.  For example, in a "buildworld"
4356 			 * workload, almost 1 out of 4 L3 entries already
4357 			 * have the desired restrictions.)
4358 			 */
4359 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4360 				if (va != va_next) {
4361 					if (invalidate)
4362 						pmap_s1_invalidate_range(pmap,
4363 						    va, sva, true);
4364 					va = va_next;
4365 				}
4366 				if ((l3 & ATTR_CONTIGUOUS) != 0) {
4367 					l3p += L3C_ENTRIES - 1;
4368 					sva += L3C_SIZE - L3_SIZE;
4369 				}
4370 				continue;
4371 			}
4372 
4373 			if ((l3 & ATTR_CONTIGUOUS) != 0) {
4374 				/*
4375 				 * Is this entire set of contiguous L3 entries
4376 				 * being protected?  Handle the possibility
4377 				 * that "va_next" is zero because of address
4378 				 * wraparound.
4379 				 */
4380 				if ((sva & L3C_OFFSET) == 0 &&
4381 				    sva + L3C_OFFSET <= va_next - 1) {
4382 					pmap_mask_set_l3c(pmap, l3p, sva, &va,
4383 					    va_next, mask, nbits);
4384 					l3p += L3C_ENTRIES - 1;
4385 					sva += L3C_SIZE - L3_SIZE;
4386 					continue;
4387 				}
4388 
4389 				(void)pmap_demote_l3c(pmap, l3p, sva);
4390 
4391 				/*
4392 				 * The L3 entry's accessed bit may have changed.
4393 				 */
4394 				l3 = pmap_load(l3p);
4395 			}
4396 			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4397 			    nbits))
4398 				cpu_spinwait();
4399 
4400 			/*
4401 			 * When a dirty read/write mapping is write protected,
4402 			 * update the page's dirty field.
4403 			 */
4404 			if ((l3 & ATTR_SW_MANAGED) != 0 &&
4405 			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4406 			    pmap_pte_dirty(pmap, l3))
4407 				vm_page_dirty(PTE_TO_VM_PAGE(l3));
4408 
4409 			if (va == va_next)
4410 				va = sva;
4411 		}
4412 		if (va != va_next && invalidate)
4413 			pmap_s1_invalidate_range(pmap, va, sva, true);
4414 	}
4415 }
4416 
4417 static void
pmap_mask_set(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t mask,pt_entry_t nbits,bool invalidate)4418 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4419     pt_entry_t nbits, bool invalidate)
4420 {
4421 	PMAP_LOCK(pmap);
4422 	pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4423 	PMAP_UNLOCK(pmap);
4424 }
4425 
4426 /*
4427  *	Set the physical protection on the
4428  *	specified range of this map as requested.
4429  */
4430 void
pmap_protect(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_prot_t prot)4431 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4432 {
4433 	pt_entry_t mask, nbits;
4434 
4435 	PMAP_ASSERT_STAGE1(pmap);
4436 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4437 	if (prot == VM_PROT_NONE) {
4438 		pmap_remove(pmap, sva, eva);
4439 		return;
4440 	}
4441 
4442 	mask = nbits = 0;
4443 	if ((prot & VM_PROT_WRITE) == 0) {
4444 		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4445 		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4446 	}
4447 	if ((prot & VM_PROT_EXECUTE) == 0) {
4448 		mask |= ATTR_S1_XN;
4449 		nbits |= ATTR_S1_XN;
4450 	}
4451 	if (pmap == kernel_pmap) {
4452 		mask |= ATTR_KERN_GP;
4453 		nbits |= ATTR_KERN_GP;
4454 	}
4455 	if (mask == 0)
4456 		return;
4457 
4458 	pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4459 }
4460 
4461 void
pmap_disable_promotion(vm_offset_t sva,vm_size_t size)4462 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4463 {
4464 
4465 	MPASS((sva & L3_OFFSET) == 0);
4466 	MPASS(((sva + size) & L3_OFFSET) == 0);
4467 
4468 	pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4469 	    ATTR_SW_NO_PROMOTE, false);
4470 }
4471 
4472 /*
4473  * Inserts the specified page table page into the specified pmap's collection
4474  * of idle page table pages.  Each of a pmap's page table pages is responsible
4475  * for mapping a distinct range of virtual addresses.  The pmap's collection is
4476  * ordered by this virtual address range.
4477  *
4478  * If "promoted" is false, then the page table page "mpte" must be zero filled;
4479  * "mpte"'s valid field will be set to 0.
4480  *
4481  * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4482  * contain valid mappings with identical attributes except for ATTR_AF;
4483  * "mpte"'s valid field will be set to 1.
4484  *
4485  * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4486  * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4487  * field will be set to VM_PAGE_BITS_ALL.
4488  */
4489 static __inline int
pmap_insert_pt_page(pmap_t pmap,vm_page_t mpte,bool promoted,bool all_l3e_AF_set)4490 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4491     bool all_l3e_AF_set)
4492 {
4493 
4494 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4495 	KASSERT(promoted || !all_l3e_AF_set,
4496 	    ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4497 	mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4498 	return (vm_radix_insert(&pmap->pm_root, mpte));
4499 }
4500 
4501 /*
4502  * Removes the page table page mapping the specified virtual address from the
4503  * specified pmap's collection of idle page table pages, and returns it.
4504  * Otherwise, returns NULL if there is no page table page corresponding to the
4505  * specified virtual address.
4506  */
4507 static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap,vm_offset_t va)4508 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4509 {
4510 
4511 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4512 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4513 }
4514 
4515 /*
4516  * Performs a break-before-make update of a pmap entry. This is needed when
4517  * either promoting or demoting pages to ensure the TLB doesn't get into an
4518  * inconsistent state.
4519  */
4520 static void
pmap_update_entry(pmap_t pmap,pd_entry_t * ptep,pd_entry_t newpte,vm_offset_t va,vm_size_t size)4521 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4522     vm_offset_t va, vm_size_t size)
4523 {
4524 	pd_entry_t *lip, *ptep_end;
4525 	register_t intr;
4526 
4527 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4528 
4529 	if ((newpte & ATTR_SW_NO_PROMOTE) != 0)
4530 		panic("%s: Updating non-promote pte", __func__);
4531 
4532 	if (size == L3C_SIZE)
4533 		ptep_end = ptep + L3C_ENTRIES;
4534 	else
4535 		ptep_end = ptep + 1;
4536 
4537 	/*
4538 	 * Ensure we don't get switched out with the page table in an
4539 	 * inconsistent state. We also need to ensure no interrupts fire
4540 	 * as they may make use of an address we are about to invalidate.
4541 	 */
4542 	intr = intr_disable();
4543 
4544 	/*
4545 	 * Clear the old mapping's valid bit, but leave the rest of the entry
4546 	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4547 	 * lookup the physical address.
4548 	 */
4549 	for (lip = ptep; lip < ptep_end; lip++)
4550 		pmap_clear_bits(lip, ATTR_DESCR_VALID);
4551 
4552 	/*
4553 	 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4554 	 * be cached, so we invalidate intermediate entries as well as final
4555 	 * entries.
4556 	 */
4557 	pmap_s1_invalidate_range(pmap, va, va + size, size == L3C_SIZE);
4558 
4559 	/* Create the new mapping */
4560 	for (lip = ptep; lip < ptep_end; lip++) {
4561 		pmap_store(lip, newpte);
4562 		newpte += PAGE_SIZE;
4563 	}
4564 	dsb(ishst);
4565 
4566 	intr_restore(intr);
4567 }
4568 
4569 #if VM_NRESERVLEVEL > 0
4570 /*
4571  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4572  * replace the many pv entries for the 4KB page mappings by a single pv entry
4573  * for the 2MB page mapping.
4574  */
4575 static void
pmap_pv_promote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)4576 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4577     struct rwlock **lockp)
4578 {
4579 	struct md_page *pvh;
4580 	pv_entry_t pv;
4581 	vm_offset_t va_last;
4582 	vm_page_t m;
4583 
4584 	KASSERT((pa & L2_OFFSET) == 0,
4585 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4586 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4587 
4588 	/*
4589 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
4590 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
4591 	 * a transfer avoids the possibility that get_pv_entry() calls
4592 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4593 	 * mappings that is being promoted.
4594 	 */
4595 	m = PHYS_TO_VM_PAGE(pa);
4596 	va = va & ~L2_OFFSET;
4597 	pv = pmap_pvh_remove(&m->md, pmap, va);
4598 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4599 	pvh = page_to_pvh(m);
4600 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4601 	pvh->pv_gen++;
4602 	/* Free the remaining NPTEPG - 1 pv entries. */
4603 	va_last = va + L2_SIZE - PAGE_SIZE;
4604 	do {
4605 		m++;
4606 		va += PAGE_SIZE;
4607 		pmap_pvh_free(&m->md, pmap, va);
4608 	} while (va < va_last);
4609 }
4610 
4611 /*
4612  * Tries to promote the 512, contiguous 4KB page mappings that are within a
4613  * single level 2 table entry to a single 2MB page mapping.  For promotion
4614  * to occur, two conditions must be met: (1) the 4KB page mappings must map
4615  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4616  * identical characteristics.
4617  */
4618 static bool
pmap_promote_l2(pmap_t pmap,pd_entry_t * l2,vm_offset_t va,vm_page_t mpte,struct rwlock ** lockp)4619 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4620     struct rwlock **lockp)
4621 {
4622 	pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4623 
4624 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4625 
4626 	/*
4627 	 * Currently, this function only supports promotion on stage 1 pmaps
4628 	 * because it tests stage 1 specific fields and performs a break-
4629 	 * before-make sequence that is incorrect for stage 2 pmaps.
4630 	 */
4631 	if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4632 		return (false);
4633 
4634 	/*
4635 	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
4636 	 * ineligible for promotion...
4637 	 */
4638 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
4639 	newl2 = pmap_load(firstl3);
4640 	if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4641 		return (false);
4642 	/* ... is not the first physical page within an L2 block */
4643 	if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
4644 	    ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
4645 		atomic_add_long(&pmap_l2_p_failures, 1);
4646 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4647 		    " in pmap %p", va, pmap);
4648 		return (false);
4649 	}
4650 
4651 	/*
4652 	 * Both here and in the below "for" loop, to allow for repromotion
4653 	 * after MADV_FREE, conditionally write protect a clean L3E before
4654 	 * possibly aborting the promotion due to other L3E attributes.  Why?
4655 	 * Suppose that MADV_FREE is applied to a part of a superpage, the
4656 	 * address range [S, E).  pmap_advise() will demote the superpage
4657 	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4658 	 * set AP_RO and clear AF in the L3Es for the rest of [S, E).  Later,
4659 	 * imagine that the memory in [S, E) is recycled, but the last 4KB
4660 	 * page in [S, E) is not the last to be rewritten, or simply accessed.
4661 	 * In other words, there is still a 4KB page in [S, E), call it P,
4662 	 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4663 	 * Unless we write protect P before aborting the promotion, if and
4664 	 * when P is finally rewritten, there won't be a page fault to trigger
4665 	 * repromotion.
4666 	 */
4667 setl2:
4668 	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4669 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4670 		/*
4671 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4672 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4673 		 */
4674 		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4675 			goto setl2;
4676 		newl2 &= ~ATTR_SW_DBM;
4677 		CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
4678 		    " in pmap %p", va & ~L2_OFFSET, pmap);
4679 	}
4680 
4681 	/*
4682 	 * Examine each of the other L3Es in the specified PTP.  Abort if this
4683 	 * L3E maps an unexpected 4KB physical page or does not have identical
4684 	 * characteristics to the first L3E.  If ATTR_AF is not set in every
4685 	 * PTE, then request that the PTP be refilled on demotion.
4686 	 */
4687 	all_l3e_AF = newl2 & ATTR_AF;
4688 	pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
4689 	    + L2_SIZE - PAGE_SIZE;
4690 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4691 		oldl3 = pmap_load(l3);
4692 		if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4693 			atomic_add_long(&pmap_l2_p_failures, 1);
4694 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4695 			    " in pmap %p", va, pmap);
4696 			return (false);
4697 		}
4698 setl3:
4699 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4700 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4701 			/*
4702 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4703 			 * set, ATTR_SW_DBM can be cleared without a TLB
4704 			 * invalidation.
4705 			 */
4706 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4707 			    ~ATTR_SW_DBM))
4708 				goto setl3;
4709 			oldl3 &= ~ATTR_SW_DBM;
4710 		}
4711 		if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
4712 			atomic_add_long(&pmap_l2_p_failures, 1);
4713 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4714 			    " in pmap %p", va, pmap);
4715 			return (false);
4716 		}
4717 		all_l3e_AF &= oldl3;
4718 		pa -= PAGE_SIZE;
4719 	}
4720 
4721 	/*
4722 	 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4723 	 * mapping, so that promotions triggered by speculative mappings,
4724 	 * such as pmap_enter_quick(), don't automatically mark the
4725 	 * underlying pages as referenced.
4726 	 */
4727 	newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
4728 
4729 	/*
4730 	 * Save the page table page in its current state until the L2
4731 	 * mapping the superpage is demoted by pmap_demote_l2() or
4732 	 * destroyed by pmap_remove_l3().
4733 	 */
4734 	if (mpte == NULL)
4735 		mpte = PTE_TO_VM_PAGE(pmap_load(l2));
4736 	KASSERT(mpte >= vm_page_array &&
4737 	    mpte < &vm_page_array[vm_page_array_size],
4738 	    ("pmap_promote_l2: page table page is out of range"));
4739 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
4740 	    ("pmap_promote_l2: page table page's pindex is wrong"));
4741 	if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
4742 		atomic_add_long(&pmap_l2_p_failures, 1);
4743 		CTR2(KTR_PMAP,
4744 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
4745 		    pmap);
4746 		return (false);
4747 	}
4748 
4749 	if ((newl2 & ATTR_SW_MANAGED) != 0)
4750 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
4751 
4752 	pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
4753 
4754 	atomic_add_long(&pmap_l2_promotions, 1);
4755 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
4756 	    pmap);
4757 	return (true);
4758 }
4759 
4760 /*
4761  * Tries to promote an aligned, contiguous set of base page mappings to a
4762  * single L3C page mapping.  For promotion to occur, two conditions must be
4763  * met: (1) the base page mappings must map aligned, contiguous physical
4764  * memory and (2) the base page mappings must have identical characteristics
4765  * except for the accessed flag.
4766  */
4767 static bool
pmap_promote_l3c(pmap_t pmap,pd_entry_t * l3p,vm_offset_t va)4768 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
4769 {
4770 	pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
4771 
4772 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4773 
4774 	/*
4775 	 * Currently, this function only supports promotion on stage 1 pmaps
4776 	 * because it tests stage 1 specific fields and performs a break-
4777 	 * before-make sequence that is incorrect for stage 2 pmaps.
4778 	 */
4779 	if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4780 		return (false);
4781 
4782 	/*
4783 	 * Compute the address of the first L3 entry in the superpage
4784 	 * candidate.
4785 	 */
4786 	l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
4787 	    sizeof(pt_entry_t)) - 1));
4788 
4789 	firstl3c = pmap_load(l3p);
4790 
4791 	/*
4792 	 * Examine the first L3 entry. Abort if this L3E is ineligible for
4793 	 * promotion...
4794 	 */
4795 	if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
4796 		return (false);
4797 	/* ...is not properly aligned... */
4798 	if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
4799 	    (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
4800 		counter_u64_add(pmap_l3c_p_failures, 1);
4801 		CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4802 		    " in pmap %p", va, pmap);
4803 		return (false);
4804 	}
4805 
4806 	/*
4807 	 * If the first L3 entry is a clean read-write mapping, convert it
4808 	 * to a read-only mapping.  See pmap_promote_l2() for the rationale.
4809 	 */
4810 set_first:
4811 	if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4812 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4813 		/*
4814 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4815 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4816 		 */
4817 		if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
4818 			goto set_first;
4819 		firstl3c &= ~ATTR_SW_DBM;
4820 		CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4821 		    " in pmap %p", va & ~L3C_OFFSET, pmap);
4822 	}
4823 
4824 	/*
4825 	 * Check that the rest of the L3 entries are compatible with the first,
4826 	 * and convert clean read-write mappings to read-only mappings.
4827 	 */
4828 	all_l3e_AF = firstl3c & ATTR_AF;
4829 	pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
4830 	    L3C_SIZE - PAGE_SIZE;
4831 	for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
4832 		oldl3 = pmap_load(l3);
4833 		if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4834 			counter_u64_add(pmap_l3c_p_failures, 1);
4835 			CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4836 			    " in pmap %p", va, pmap);
4837 			return (false);
4838 		}
4839 set_l3:
4840 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4841 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4842 			/*
4843 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4844 			 * set, ATTR_SW_DBM can be cleared without a TLB
4845 			 * invalidation.
4846 			 */
4847 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4848 			    ~ATTR_SW_DBM))
4849 				goto set_l3;
4850 			oldl3 &= ~ATTR_SW_DBM;
4851 			CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4852 			    " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
4853 			    (va & ~L3C_OFFSET), pmap);
4854 		}
4855 		if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
4856 			counter_u64_add(pmap_l3c_p_failures, 1);
4857 			CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4858 			    " in pmap %p", va, pmap);
4859 			return (false);
4860 		}
4861 		all_l3e_AF &= oldl3;
4862 		pa -= PAGE_SIZE;
4863 	}
4864 
4865 	/*
4866 	 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4867 	 * mapping, so that promotions triggered by speculative mappings,
4868 	 * such as pmap_enter_quick(), don't automatically mark the
4869 	 * underlying pages as referenced.
4870 	 */
4871 	firstl3c &= ~ATTR_AF | all_l3e_AF;
4872 
4873 	/*
4874 	 * Remake the mappings with the contiguous bit set.
4875 	 */
4876 	pmap_update_entry(pmap, l3p, firstl3c | ATTR_CONTIGUOUS, va &
4877 	    ~L3C_OFFSET, L3C_SIZE);
4878 
4879 	counter_u64_add(pmap_l3c_promotions, 1);
4880 	CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
4881 	    pmap);
4882 	return (true);
4883 }
4884 #endif /* VM_NRESERVLEVEL > 0 */
4885 
4886 static int
pmap_enter_largepage(pmap_t pmap,vm_offset_t va,pt_entry_t newpte,int flags,int psind)4887 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
4888     int psind)
4889 {
4890 	pd_entry_t *l0p, *l1p, *l2p, origpte;
4891 	vm_page_t mp;
4892 
4893 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4894 	KASSERT(psind > 0 && psind < MAXPAGESIZES,
4895 	    ("psind %d unexpected", psind));
4896 	KASSERT((PTE_TO_PHYS(newpte) & (pagesizes[psind] - 1)) == 0,
4897 	    ("unaligned phys address %#lx newpte %#lx psind %d",
4898 	    PTE_TO_PHYS(newpte), newpte, psind));
4899 
4900 restart:
4901 	if (!pmap_bti_same(pmap, va, va + pagesizes[psind]))
4902 		return (KERN_PROTECTION_FAILURE);
4903 	if (psind == 2) {
4904 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4905 
4906 		l0p = pmap_l0(pmap, va);
4907 		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
4908 			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
4909 			if (mp == NULL) {
4910 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4911 					return (KERN_RESOURCE_SHORTAGE);
4912 				PMAP_UNLOCK(pmap);
4913 				vm_wait(NULL);
4914 				PMAP_LOCK(pmap);
4915 				goto restart;
4916 			}
4917 			l1p = pmap_l0_to_l1(l0p, va);
4918 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4919 			origpte = pmap_load(l1p);
4920 		} else {
4921 			l1p = pmap_l0_to_l1(l0p, va);
4922 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4923 			origpte = pmap_load(l1p);
4924 			if ((origpte & ATTR_DESCR_VALID) == 0) {
4925 				mp = PTE_TO_VM_PAGE(pmap_load(l0p));
4926 				mp->ref_count++;
4927 			}
4928 		}
4929 		KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
4930 		    (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
4931 		    (origpte & ATTR_DESCR_VALID) == 0,
4932 		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
4933 		    va, origpte, newpte));
4934 		pmap_store(l1p, newpte);
4935 	} else /* (psind == 1) */ {
4936 		l2p = pmap_l2(pmap, va);
4937 		if (l2p == NULL) {
4938 			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
4939 			if (mp == NULL) {
4940 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4941 					return (KERN_RESOURCE_SHORTAGE);
4942 				PMAP_UNLOCK(pmap);
4943 				vm_wait(NULL);
4944 				PMAP_LOCK(pmap);
4945 				goto restart;
4946 			}
4947 			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
4948 			l2p = &l2p[pmap_l2_index(va)];
4949 			origpte = pmap_load(l2p);
4950 		} else {
4951 			l1p = pmap_l1(pmap, va);
4952 			origpte = pmap_load(l2p);
4953 			if ((origpte & ATTR_DESCR_VALID) == 0) {
4954 				mp = PTE_TO_VM_PAGE(pmap_load(l1p));
4955 				mp->ref_count++;
4956 			}
4957 		}
4958 		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
4959 		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
4960 		    PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
4961 		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
4962 		    va, origpte, newpte));
4963 		pmap_store(l2p, newpte);
4964 	}
4965 	dsb(ishst);
4966 
4967 	if ((origpte & ATTR_DESCR_VALID) == 0)
4968 		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
4969 	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
4970 		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
4971 	else if ((newpte & ATTR_SW_WIRED) == 0 &&
4972 	    (origpte & ATTR_SW_WIRED) != 0)
4973 		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
4974 
4975 	return (KERN_SUCCESS);
4976 }
4977 
4978 /*
4979  *	Insert the given physical page (p) at
4980  *	the specified virtual address (v) in the
4981  *	target physical map with the protection requested.
4982  *
4983  *	If specified, the page will be wired down, meaning
4984  *	that the related pte can not be reclaimed.
4985  *
4986  *	NB:  This is the only routine which MAY NOT lazy-evaluate
4987  *	or lose information.  That is, this routine must actually
4988  *	insert this page into the given map NOW.
4989  */
4990 int
pmap_enter(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,u_int flags,int8_t psind)4991 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4992     u_int flags, int8_t psind)
4993 {
4994 	struct rwlock *lock;
4995 	pd_entry_t *pde;
4996 	pt_entry_t new_l3, orig_l3;
4997 	pt_entry_t *l2, *l3;
4998 	pv_entry_t pv;
4999 	vm_paddr_t opa, pa;
5000 	vm_page_t mpte, om;
5001 	bool nosleep;
5002 	int lvl, rv;
5003 
5004 	KASSERT(ADDR_IS_CANONICAL(va),
5005 	    ("%s: Address not in canonical form: %lx", __func__, va));
5006 
5007 	va = trunc_page(va);
5008 	if ((m->oflags & VPO_UNMANAGED) == 0)
5009 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
5010 	pa = VM_PAGE_TO_PHYS(m);
5011 	new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_DEFAULT | L3_PAGE);
5012 	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5013 	new_l3 |= pmap_pte_prot(pmap, prot);
5014 	if ((flags & PMAP_ENTER_WIRED) != 0)
5015 		new_l3 |= ATTR_SW_WIRED;
5016 	if (pmap->pm_stage == PM_STAGE1) {
5017 		if (!ADDR_IS_KERNEL(va))
5018 			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5019 		else
5020 			new_l3 |= ATTR_S1_UXN;
5021 		if (pmap != kernel_pmap)
5022 			new_l3 |= ATTR_S1_nG;
5023 	} else {
5024 		/*
5025 		 * Clear the access flag on executable mappings, this will be
5026 		 * set later when the page is accessed. The fault handler is
5027 		 * required to invalidate the I-cache.
5028 		 *
5029 		 * TODO: Switch to the valid flag to allow hardware management
5030 		 * of the access flag. Much of the pmap code assumes the
5031 		 * valid flag is set and fails to destroy the old page tables
5032 		 * correctly if it is clear.
5033 		 */
5034 		if (prot & VM_PROT_EXECUTE)
5035 			new_l3 &= ~ATTR_AF;
5036 	}
5037 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5038 		new_l3 |= ATTR_SW_MANAGED;
5039 		if ((prot & VM_PROT_WRITE) != 0) {
5040 			new_l3 |= ATTR_SW_DBM;
5041 			if ((flags & VM_PROT_WRITE) == 0) {
5042 				if (pmap->pm_stage == PM_STAGE1)
5043 					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5044 				else
5045 					new_l3 &=
5046 					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5047 			}
5048 		}
5049 	}
5050 
5051 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5052 
5053 	lock = NULL;
5054 	PMAP_LOCK(pmap);
5055 	/* Wait until we lock the pmap to protect the bti rangeset */
5056 	new_l3 |= pmap_pte_bti(pmap, va);
5057 
5058 	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5059 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5060 		    ("managed largepage va %#lx flags %#x", va, flags));
5061 		new_l3 &= ~L3_PAGE;
5062 		if (psind == 2) {
5063 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5064 			new_l3 |= L1_BLOCK;
5065 		} else /* (psind == 1) */
5066 			new_l3 |= L2_BLOCK;
5067 		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5068 		goto out;
5069 	}
5070 	if (psind == 1) {
5071 		/* Assert the required virtual and physical alignment. */
5072 		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5073 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5074 		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5075 		    flags, m, &lock);
5076 		goto out;
5077 	}
5078 	mpte = NULL;
5079 
5080 	/*
5081 	 * In the case that a page table page is not
5082 	 * resident, we are creating it here.
5083 	 */
5084 retry:
5085 	pde = pmap_pde(pmap, va, &lvl);
5086 	if (pde != NULL && lvl == 2) {
5087 		l3 = pmap_l2_to_l3(pde, va);
5088 		if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
5089 			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5090 			mpte->ref_count++;
5091 		}
5092 		goto havel3;
5093 	} else if (pde != NULL && lvl == 1) {
5094 		l2 = pmap_l1_to_l2(pde, va);
5095 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5096 		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5097 			l3 = &l3[pmap_l3_index(va)];
5098 			if (!ADDR_IS_KERNEL(va)) {
5099 				mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5100 				mpte->ref_count++;
5101 			}
5102 			goto havel3;
5103 		}
5104 		/* We need to allocate an L3 table. */
5105 	}
5106 	if (!ADDR_IS_KERNEL(va)) {
5107 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5108 
5109 		/*
5110 		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5111 		 * to handle the possibility that a superpage mapping for "va"
5112 		 * was created while we slept.
5113 		 */
5114 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5115 		    nosleep ? NULL : &lock);
5116 		if (mpte == NULL && nosleep) {
5117 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5118 			rv = KERN_RESOURCE_SHORTAGE;
5119 			goto out;
5120 		}
5121 		goto retry;
5122 	} else
5123 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5124 
5125 havel3:
5126 	orig_l3 = pmap_load(l3);
5127 	opa = PTE_TO_PHYS(orig_l3);
5128 	pv = NULL;
5129 
5130 	/*
5131 	 * Is the specified virtual address already mapped?
5132 	 */
5133 	if (pmap_l3_valid(orig_l3)) {
5134 		/*
5135 		 * Wiring change, just update stats. We don't worry about
5136 		 * wiring PT pages as they remain resident as long as there
5137 		 * are valid mappings in them. Hence, if a user page is wired,
5138 		 * the PT page will be also.
5139 		 */
5140 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
5141 		    (orig_l3 & ATTR_SW_WIRED) == 0)
5142 			pmap->pm_stats.wired_count++;
5143 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5144 		    (orig_l3 & ATTR_SW_WIRED) != 0)
5145 			pmap->pm_stats.wired_count--;
5146 
5147 		/*
5148 		 * Remove the extra PT page reference.
5149 		 */
5150 		if (mpte != NULL) {
5151 			mpte->ref_count--;
5152 			KASSERT(mpte->ref_count > 0,
5153 			    ("pmap_enter: missing reference to page table page,"
5154 			     " va: 0x%lx", va));
5155 		}
5156 
5157 		/*
5158 		 * Has the physical page changed?
5159 		 */
5160 		if (opa == pa) {
5161 			/*
5162 			 * No, might be a protection or wiring change.
5163 			 */
5164 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5165 			    (new_l3 & ATTR_SW_DBM) != 0)
5166 				vm_page_aflag_set(m, PGA_WRITEABLE);
5167 			goto validate;
5168 		}
5169 
5170 		/*
5171 		 * The physical page has changed.  Temporarily invalidate
5172 		 * the mapping.
5173 		 */
5174 		if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5175 			(void)pmap_demote_l3c(pmap, l3, va);
5176 		orig_l3 = pmap_load_clear(l3);
5177 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5178 		    ("pmap_enter: unexpected pa update for %#lx", va));
5179 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5180 			om = PHYS_TO_VM_PAGE(opa);
5181 
5182 			/*
5183 			 * The pmap lock is sufficient to synchronize with
5184 			 * concurrent calls to pmap_page_test_mappings() and
5185 			 * pmap_ts_referenced().
5186 			 */
5187 			if (pmap_pte_dirty(pmap, orig_l3))
5188 				vm_page_dirty(om);
5189 			if ((orig_l3 & ATTR_AF) != 0) {
5190 				pmap_invalidate_page(pmap, va, true);
5191 				vm_page_aflag_set(om, PGA_REFERENCED);
5192 			}
5193 			CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
5194 			pv = pmap_pvh_remove(&om->md, pmap, va);
5195 			if ((m->oflags & VPO_UNMANAGED) != 0)
5196 				free_pv_entry(pmap, pv);
5197 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5198 			    TAILQ_EMPTY(&om->md.pv_list) &&
5199 			    ((om->flags & PG_FICTITIOUS) != 0 ||
5200 			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5201 				vm_page_aflag_clear(om, PGA_WRITEABLE);
5202 		} else {
5203 			KASSERT((orig_l3 & ATTR_AF) != 0,
5204 			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5205 			pmap_invalidate_page(pmap, va, true);
5206 		}
5207 		orig_l3 = 0;
5208 	} else {
5209 		/*
5210 		 * Increment the counters.
5211 		 */
5212 		if ((new_l3 & ATTR_SW_WIRED) != 0)
5213 			pmap->pm_stats.wired_count++;
5214 		pmap_resident_count_inc(pmap, 1);
5215 	}
5216 	/*
5217 	 * Enter on the PV list if part of our managed memory.
5218 	 */
5219 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5220 		if (pv == NULL) {
5221 			pv = get_pv_entry(pmap, &lock);
5222 			pv->pv_va = va;
5223 		}
5224 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5225 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5226 		m->md.pv_gen++;
5227 		if ((new_l3 & ATTR_SW_DBM) != 0)
5228 			vm_page_aflag_set(m, PGA_WRITEABLE);
5229 	}
5230 
5231 validate:
5232 	if (pmap->pm_stage == PM_STAGE1) {
5233 		/*
5234 		 * Sync icache if exec permission and attribute
5235 		 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping
5236 		 * is stored and made valid for hardware table walk. If done
5237 		 * later, then other can access this page before caches are
5238 		 * properly synced. Don't do it for kernel memory which is
5239 		 * mapped with exec permission even if the memory isn't going
5240 		 * to hold executable code. The only time when icache sync is
5241 		 * needed is after kernel module is loaded and the relocation
5242 		 * info is processed. And it's done in elf_cpu_load_file().
5243 		*/
5244 		if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
5245 		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5246 		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5247 			PMAP_ASSERT_STAGE1(pmap);
5248 			cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
5249 			    PAGE_SIZE);
5250 		}
5251 	} else {
5252 		cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5253 	}
5254 
5255 	/*
5256 	 * Update the L3 entry
5257 	 */
5258 	if (pmap_l3_valid(orig_l3)) {
5259 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
5260 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5261 			/* same PA, different attributes */
5262 			if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5263 				(void)pmap_demote_l3c(pmap, l3, va);
5264 			orig_l3 = pmap_load_store(l3, new_l3);
5265 			pmap_invalidate_page(pmap, va, true);
5266 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5267 			    pmap_pte_dirty(pmap, orig_l3))
5268 				vm_page_dirty(m);
5269 		} else {
5270 			/*
5271 			 * orig_l3 == new_l3
5272 			 * This can happens if multiple threads simultaneously
5273 			 * access not yet mapped page. This bad for performance
5274 			 * since this can cause full demotion-NOP-promotion
5275 			 * cycle.
5276 			 * Another possible reasons are:
5277 			 * - VM and pmap memory layout are diverged
5278 			 * - tlb flush is missing somewhere and CPU doesn't see
5279 			 *   actual mapping.
5280 			 */
5281 			CTR4(KTR_PMAP, "%s: already mapped page - "
5282 			    "pmap %p va 0x%#lx pte 0x%lx",
5283 			    __func__, pmap, va, new_l3);
5284 		}
5285 	} else {
5286 		/* New mapping */
5287 		pmap_store(l3, new_l3);
5288 		dsb(ishst);
5289 	}
5290 
5291 #if VM_NRESERVLEVEL > 0
5292 	/*
5293 	 * First, attempt L3C promotion, if the virtual and physical addresses
5294 	 * are aligned with each other and an underlying reservation has the
5295 	 * neighboring L3 pages allocated.  The first condition is simply an
5296 	 * optimization that recognizes some eventual promotion failures early
5297 	 * at a lower run-time cost.  Then, if both the page table page and
5298 	 * the reservation are fully populated, attempt L2 promotion.
5299 	 */
5300 	if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5301 	    (m->flags & PG_FICTITIOUS) == 0 &&
5302 	    vm_reserv_is_populated(m, L3C_ENTRIES) &&
5303 	    pmap_promote_l3c(pmap, l3, va) &&
5304 	    (mpte == NULL || mpte->ref_count == NL3PG) &&
5305 	    vm_reserv_level_iffullpop(m) == 0)
5306 		(void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5307 #endif
5308 
5309 	rv = KERN_SUCCESS;
5310 out:
5311 	if (lock != NULL)
5312 		rw_wunlock(lock);
5313 	PMAP_UNLOCK(pmap);
5314 	return (rv);
5315 }
5316 
5317 /*
5318  * Tries to create a read- and/or execute-only L2 page mapping.  Returns
5319  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
5320  * value.  See pmap_enter_l2() for the possible error values when "no sleep",
5321  * "no replace", and "no reclaim" are specified.
5322  */
5323 static int
pmap_enter_l2_rx(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,struct rwlock ** lockp)5324 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5325     struct rwlock **lockp)
5326 {
5327 	pd_entry_t new_l2;
5328 
5329 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5330 	PMAP_ASSERT_STAGE1(pmap);
5331 	KASSERT(ADDR_IS_CANONICAL(va),
5332 	    ("%s: Address not in canonical form: %lx", __func__, va));
5333 
5334 	new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5335 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5336 	    L2_BLOCK);
5337 	new_l2 |= pmap_pte_bti(pmap, va);
5338 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5339 		new_l2 |= ATTR_SW_MANAGED;
5340 		new_l2 &= ~ATTR_AF;
5341 	}
5342 	if ((prot & VM_PROT_EXECUTE) == 0 ||
5343 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5344 		new_l2 |= ATTR_S1_XN;
5345 	if (!ADDR_IS_KERNEL(va))
5346 		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5347 	else
5348 		new_l2 |= ATTR_S1_UXN;
5349 	if (pmap != kernel_pmap)
5350 		new_l2 |= ATTR_S1_nG;
5351 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5352 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5353 }
5354 
5355 /*
5356  * Returns true if every page table entry in the specified page table is
5357  * zero.
5358  */
5359 static bool
pmap_every_pte_zero(vm_paddr_t pa)5360 pmap_every_pte_zero(vm_paddr_t pa)
5361 {
5362 	pt_entry_t *pt_end, *pte;
5363 
5364 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5365 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
5366 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5367 		if (*pte != 0)
5368 			return (false);
5369 	}
5370 	return (true);
5371 }
5372 
5373 /*
5374  * Tries to create the specified L2 page mapping.  Returns KERN_SUCCESS if
5375  * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
5376  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
5377  * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
5378  * within the L2 virtual address range starting at the specified virtual
5379  * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
5380  * L2 page mapping already exists at the specified virtual address.  Returns
5381  * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
5382  * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
5383  * and a PV entry allocation failed.
5384  */
5385 static int
pmap_enter_l2(pmap_t pmap,vm_offset_t va,pd_entry_t new_l2,u_int flags,vm_page_t m,struct rwlock ** lockp)5386 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
5387     vm_page_t m, struct rwlock **lockp)
5388 {
5389 	struct spglist free;
5390 	pd_entry_t *l2, old_l2;
5391 	vm_page_t l2pg, mt;
5392 	vm_page_t uwptpg;
5393 
5394 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5395 	KASSERT(ADDR_IS_CANONICAL(va),
5396 	    ("%s: Address not in canonical form: %lx", __func__, va));
5397 
5398 	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
5399 	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
5400 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
5401 		    va, pmap);
5402 		return (KERN_RESOURCE_SHORTAGE);
5403 	}
5404 
5405 	/*
5406 	 * If bti is not the same for the whole l2 range, return failure
5407 	 * and let vm_fault() cope.  Check after l2 allocation, since
5408 	 * it could sleep.
5409 	 */
5410 	if (!pmap_bti_same(pmap, va, va + L2_SIZE)) {
5411 		KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
5412 		pmap_abort_ptp(pmap, va, l2pg);
5413 		return (KERN_PROTECTION_FAILURE);
5414 	}
5415 
5416 	/*
5417 	 * If there are existing mappings, either abort or remove them.
5418 	 */
5419 	if ((old_l2 = pmap_load(l2)) != 0) {
5420 		KASSERT(l2pg == NULL || l2pg->ref_count > 1,
5421 		    ("pmap_enter_l2: l2pg's ref count is too low"));
5422 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5423 			if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5424 				if (l2pg != NULL)
5425 					l2pg->ref_count--;
5426 				CTR2(KTR_PMAP,
5427 				    "pmap_enter_l2: no space for va %#lx"
5428 				    " in pmap %p", va, pmap);
5429 				return (KERN_NO_SPACE);
5430 			} else if (!ADDR_IS_KERNEL(va) ||
5431 			    !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
5432 				if (l2pg != NULL)
5433 					l2pg->ref_count--;
5434 				CTR2(KTR_PMAP,
5435 				    "pmap_enter_l2: failure for va %#lx"
5436 				    " in pmap %p", va, pmap);
5437 				return (KERN_FAILURE);
5438 			}
5439 		}
5440 		SLIST_INIT(&free);
5441 		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
5442 			(void)pmap_remove_l2(pmap, l2, va,
5443 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
5444 		else
5445 			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
5446 			    &free, lockp);
5447 		if (!ADDR_IS_KERNEL(va)) {
5448 			vm_page_free_pages_toq(&free, true);
5449 			KASSERT(pmap_load(l2) == 0,
5450 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
5451 		} else {
5452 			KASSERT(SLIST_EMPTY(&free),
5453 			    ("pmap_enter_l2: freed kernel page table page"));
5454 
5455 			/*
5456 			 * Both pmap_remove_l2() and pmap_remove_l3_range()
5457 			 * will leave the kernel page table page zero filled.
5458 			 * Nonetheless, the TLB could have an intermediate
5459 			 * entry for the kernel page table page, so request
5460 			 * an invalidation at all levels after clearing
5461 			 * the L2_TABLE entry.
5462 			 */
5463 			mt = PTE_TO_VM_PAGE(pmap_load(l2));
5464 			if (pmap_insert_pt_page(pmap, mt, false, false))
5465 				panic("pmap_enter_l2: trie insert failed");
5466 			pmap_clear(l2);
5467 			pmap_s1_invalidate_page(pmap, va, false);
5468 		}
5469 	}
5470 
5471 	/*
5472 	 * Allocate leaf ptpage for wired userspace pages.
5473 	 */
5474 	uwptpg = NULL;
5475 	if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
5476 		uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5477 		if (uwptpg == NULL) {
5478 			return (KERN_RESOURCE_SHORTAGE);
5479 		}
5480 		uwptpg->pindex = pmap_l2_pindex(va);
5481 		if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
5482 			vm_page_unwire_noq(uwptpg);
5483 			vm_page_free(uwptpg);
5484 			return (KERN_RESOURCE_SHORTAGE);
5485 		}
5486 		pmap_resident_count_inc(pmap, 1);
5487 		uwptpg->ref_count = NL3PG;
5488 	}
5489 	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
5490 		/*
5491 		 * Abort this mapping if its PV entry could not be created.
5492 		 */
5493 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
5494 			if (l2pg != NULL)
5495 				pmap_abort_ptp(pmap, va, l2pg);
5496 			if (uwptpg != NULL) {
5497 				mt = pmap_remove_pt_page(pmap, va);
5498 				KASSERT(mt == uwptpg,
5499 				    ("removed pt page %p, expected %p", mt,
5500 				    uwptpg));
5501 				pmap_resident_count_dec(pmap, 1);
5502 				uwptpg->ref_count = 1;
5503 				vm_page_unwire_noq(uwptpg);
5504 				vm_page_free(uwptpg);
5505 			}
5506 			CTR2(KTR_PMAP,
5507 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
5508 			    va, pmap);
5509 			return (KERN_RESOURCE_SHORTAGE);
5510 		}
5511 		if ((new_l2 & ATTR_SW_DBM) != 0)
5512 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5513 				vm_page_aflag_set(mt, PGA_WRITEABLE);
5514 	}
5515 
5516 	/*
5517 	 * Increment counters.
5518 	 */
5519 	if ((new_l2 & ATTR_SW_WIRED) != 0)
5520 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
5521 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
5522 
5523 	/*
5524 	 * Conditionally sync the icache.  See pmap_enter() for details.
5525 	 */
5526 	if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
5527 	    PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
5528 	    pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
5529 		cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
5530 		    L2_SIZE);
5531 	}
5532 
5533 	/*
5534 	 * Map the superpage.
5535 	 */
5536 	pmap_store(l2, new_l2);
5537 	dsb(ishst);
5538 
5539 	atomic_add_long(&pmap_l2_mappings, 1);
5540 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
5541 	    va, pmap);
5542 
5543 	return (KERN_SUCCESS);
5544 }
5545 
5546 /*
5547  * Tries to create a read- and/or execute-only L3C page mapping.  Returns
5548  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
5549  * value.
5550  */
5551 static int
pmap_enter_l3c_rx(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_page_t * ml3p,vm_prot_t prot,struct rwlock ** lockp)5552 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
5553     vm_prot_t prot, struct rwlock **lockp)
5554 {
5555 	pt_entry_t l3e;
5556 
5557 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5558 	PMAP_ASSERT_STAGE1(pmap);
5559 	KASSERT(ADDR_IS_CANONICAL(va),
5560 	    ("%s: Address not in canonical form: %lx", __func__, va));
5561 
5562 	l3e = VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5563 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5564 	    ATTR_CONTIGUOUS | L3_PAGE;
5565 	l3e |= pmap_pte_bti(pmap, va);
5566 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5567 		l3e |= ATTR_SW_MANAGED;
5568 		l3e &= ~ATTR_AF;
5569 	}
5570 	if ((prot & VM_PROT_EXECUTE) == 0 ||
5571 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5572 		l3e |= ATTR_S1_XN;
5573 	if (!ADDR_IS_KERNEL(va))
5574 		l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5575 	else
5576 		l3e |= ATTR_S1_UXN;
5577 	if (pmap != kernel_pmap)
5578 		l3e |= ATTR_S1_nG;
5579 	return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
5580 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
5581 }
5582 
5583 static int
pmap_enter_l3c(pmap_t pmap,vm_offset_t va,pt_entry_t l3e,u_int flags,vm_page_t m,vm_page_t * ml3p,struct rwlock ** lockp)5584 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
5585     vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
5586 {
5587 	pd_entry_t *l2p, *pde;
5588 	pt_entry_t *l3p, *tl3p;
5589 	vm_page_t mt;
5590 	vm_paddr_t pa;
5591 	vm_pindex_t l2pindex;
5592 	int lvl;
5593 
5594 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5595 	KASSERT((va & L3C_OFFSET) == 0,
5596 	    ("pmap_enter_l3c: va is not aligned"));
5597 	KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
5598 	    ("pmap_enter_l3c: managed mapping within the clean submap"));
5599 
5600 	/*
5601 	 * If the L3 PTP is not resident, we attempt to create it here.
5602 	 */
5603 	if (!ADDR_IS_KERNEL(va)) {
5604 		/*
5605 		 * Were we given the correct L3 PTP?  If so, we can simply
5606 		 * increment its ref count.
5607 		 */
5608 		l2pindex = pmap_l2_pindex(va);
5609 		if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
5610 			(*ml3p)->ref_count += L3C_ENTRIES;
5611 		} else {
5612 retry:
5613 			/*
5614 			 * Get the L2 entry.
5615 			 */
5616 			pde = pmap_pde(pmap, va, &lvl);
5617 
5618 			/*
5619 			 * If the L2 entry is a superpage, we either abort or
5620 			 * demote depending on the given flags.
5621 			 */
5622 			if (lvl == 1) {
5623 				l2p = pmap_l1_to_l2(pde, va);
5624 				if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
5625 				    L2_BLOCK) {
5626 					if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5627 						return (KERN_FAILURE);
5628 					l3p = pmap_demote_l2_locked(pmap, l2p,
5629 					    va, lockp);
5630 					if (l3p != NULL) {
5631 						*ml3p = PTE_TO_VM_PAGE(
5632 						    pmap_load(l2p));
5633 						(*ml3p)->ref_count +=
5634 						    L3C_ENTRIES;
5635 						goto have_l3p;
5636 					}
5637 				}
5638 				/* We need to allocate an L3 PTP. */
5639 			}
5640 
5641 			/*
5642 			 * If the L3 PTP is mapped, we just increment its ref
5643 			 * count.  Otherwise, we attempt to allocate it.
5644 			 */
5645 			if (lvl == 2 && pmap_load(pde) != 0) {
5646 				*ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
5647 				(*ml3p)->ref_count += L3C_ENTRIES;
5648 			} else {
5649 				*ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
5650 				    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
5651 				if (*ml3p == NULL) {
5652 					if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5653 						return (KERN_FAILURE);
5654 
5655 					/*
5656 					 * The page table may have changed
5657 					 * while we slept.
5658 					 */
5659 					goto retry;
5660 				}
5661 				(*ml3p)->ref_count += L3C_ENTRIES - 1;
5662 			}
5663 		}
5664 		l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
5665 
5666 have_l3p:
5667 		/*
5668 		 * If bti is not the same for the whole L3C range, return
5669 		 * failure and let vm_fault() cope.  Check after L3 allocation,
5670 		 * since it could sleep.
5671 		 */
5672 		if (!pmap_bti_same(pmap, va, va + L3C_SIZE)) {
5673 			(*ml3p)->ref_count -= L3C_ENTRIES - 1;
5674 			pmap_abort_ptp(pmap, va, *ml3p);
5675 			*ml3p = NULL;
5676 			return (KERN_PROTECTION_FAILURE);
5677 		}
5678 	} else {
5679 		*ml3p = NULL;
5680 
5681 		/*
5682 		 * If the L2 entry is a superpage, we either abort or demote
5683 		 * depending on the given flags.
5684 		 */
5685 		pde = pmap_pde(kernel_pmap, va, &lvl);
5686 		if (lvl == 1) {
5687 			l2p = pmap_l1_to_l2(pde, va);
5688 			KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
5689 			    ("pmap_enter_l3c: missing L2 block"));
5690 			if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5691 				return (KERN_FAILURE);
5692 			l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
5693 		} else {
5694 			KASSERT(lvl == 2,
5695 			    ("pmap_enter_l3c: Invalid level %d", lvl));
5696 			l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
5697 			    pmap_load(pde)));
5698 		}
5699 	}
5700 	l3p = &l3p[pmap_l3_index(va)];
5701 
5702 	/*
5703 	 * If there are existing mappings, either abort or remove them.
5704 	 */
5705 	if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5706 		for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5707 			if (pmap_load(tl3p) != 0) {
5708 				if (*ml3p != NULL)
5709 					(*ml3p)->ref_count -= L3C_ENTRIES;
5710 				return (KERN_FAILURE);
5711 			}
5712 		}
5713 	} else {
5714 		/*
5715 		 * Because we increment the L3 page's reference count above,
5716 		 * it is guaranteed not to be freed here and we can pass NULL
5717 		 * instead of a valid free list.
5718 		 */
5719 		pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
5720 		    va + L3C_SIZE, NULL, lockp);
5721 	}
5722 
5723 	/*
5724 	 * Enter on the PV list if part of our managed memory.
5725 	 */
5726 	if ((l3e & ATTR_SW_MANAGED) != 0) {
5727 		if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
5728 			if (*ml3p != NULL) {
5729 				(*ml3p)->ref_count -= L3C_ENTRIES - 1;
5730 				pmap_abort_ptp(pmap, va, *ml3p);
5731 				*ml3p = NULL;
5732 			}
5733 			return (KERN_RESOURCE_SHORTAGE);
5734 		}
5735 		if ((l3e & ATTR_SW_DBM) != 0)
5736 			for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
5737 				vm_page_aflag_set(mt, PGA_WRITEABLE);
5738 	}
5739 
5740 	/*
5741 	 * Increment counters.
5742 	 */
5743 	if ((l3e & ATTR_SW_WIRED) != 0)
5744 		pmap->pm_stats.wired_count += L3C_ENTRIES;
5745 	pmap_resident_count_inc(pmap, L3C_ENTRIES);
5746 
5747 	pa = VM_PAGE_TO_PHYS(m);
5748 	KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));
5749 
5750 	/*
5751 	 * Sync the icache before the mapping is stored.
5752 	 */
5753 	if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
5754 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5755 		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);
5756 
5757 	/*
5758 	 * Map the superpage.
5759 	 */
5760 	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5761 		pmap_store(tl3p, l3e);
5762 		l3e += L3_SIZE;
5763 	}
5764 	dsb(ishst);
5765 
5766 	counter_u64_add(pmap_l3c_mappings, 1);
5767 	CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
5768 	    va, pmap);
5769 	return (KERN_SUCCESS);
5770 }
5771 
5772 /*
5773  * Maps a sequence of resident pages belonging to the same object.
5774  * The sequence begins with the given page m_start.  This page is
5775  * mapped at the given virtual address start.  Each subsequent page is
5776  * mapped at a virtual address that is offset from start by the same
5777  * amount as the page is offset from m_start within the object.  The
5778  * last page in the sequence is the page with the largest offset from
5779  * m_start that can be mapped at a virtual address less than the given
5780  * virtual address end.  Not every virtual page between start and end
5781  * is mapped; only those for which a resident page exists with the
5782  * corresponding offset from m_start are mapped.
5783  */
5784 void
pmap_enter_object(pmap_t pmap,vm_offset_t start,vm_offset_t end,vm_page_t m_start,vm_prot_t prot)5785 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
5786     vm_page_t m_start, vm_prot_t prot)
5787 {
5788 	struct rwlock *lock;
5789 	vm_offset_t va;
5790 	vm_page_t m, mpte;
5791 	vm_pindex_t diff, psize;
5792 	int rv;
5793 
5794 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
5795 
5796 	psize = atop(end - start);
5797 	mpte = NULL;
5798 	m = m_start;
5799 	lock = NULL;
5800 	PMAP_LOCK(pmap);
5801 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
5802 		va = start + ptoa(diff);
5803 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
5804 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
5805 		    ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
5806 		    KERN_SUCCESS || rv == KERN_NO_SPACE))
5807 			m = &m[L2_SIZE / PAGE_SIZE - 1];
5808 		else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
5809 		    (VM_PAGE_TO_PHYS(m) & L3C_OFFSET) == 0 &&
5810 		    vm_reserv_is_populated(m, L3C_ENTRIES) &&
5811 		    pmap_ps_enabled(pmap) &&
5812 		    ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
5813 		    &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE))
5814 			m = &m[L3C_ENTRIES - 1];
5815 		else
5816 			mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
5817 			    &lock);
5818 		m = TAILQ_NEXT(m, listq);
5819 	}
5820 	if (lock != NULL)
5821 		rw_wunlock(lock);
5822 	PMAP_UNLOCK(pmap);
5823 }
5824 
5825 /*
5826  * this code makes some *MAJOR* assumptions:
5827  * 1. Current pmap & pmap exists.
5828  * 2. Not wired.
5829  * 3. Read access.
5830  * 4. No page table pages.
5831  * but is *MUCH* faster than pmap_enter...
5832  */
5833 
5834 void
pmap_enter_quick(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)5835 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
5836 {
5837 	struct rwlock *lock;
5838 
5839 	lock = NULL;
5840 	PMAP_LOCK(pmap);
5841 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
5842 	if (lock != NULL)
5843 		rw_wunlock(lock);
5844 	PMAP_UNLOCK(pmap);
5845 }
5846 
5847 static vm_page_t
pmap_enter_quick_locked(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,vm_page_t mpte,struct rwlock ** lockp)5848 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
5849     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
5850 {
5851 	pd_entry_t *pde;
5852 	pt_entry_t *l1, *l2, *l3, l3_val;
5853 	vm_paddr_t pa;
5854 	int lvl;
5855 
5856 	KASSERT(!VA_IS_CLEANMAP(va) ||
5857 	    (m->oflags & VPO_UNMANAGED) != 0,
5858 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
5859 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5860 	PMAP_ASSERT_STAGE1(pmap);
5861 	KASSERT(ADDR_IS_CANONICAL(va),
5862 	    ("%s: Address not in canonical form: %lx", __func__, va));
5863 	l2 = NULL;
5864 
5865 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
5866 	/*
5867 	 * In the case that a page table page is not
5868 	 * resident, we are creating it here.
5869 	 */
5870 	if (!ADDR_IS_KERNEL(va)) {
5871 		vm_pindex_t l2pindex;
5872 
5873 		/*
5874 		 * Calculate pagetable page index
5875 		 */
5876 		l2pindex = pmap_l2_pindex(va);
5877 		if (mpte && (mpte->pindex == l2pindex)) {
5878 			mpte->ref_count++;
5879 		} else {
5880 			/*
5881 			 * If the page table page is mapped, we just increment
5882 			 * the hold count, and activate it.  Otherwise, we
5883 			 * attempt to allocate a page table page, passing NULL
5884 			 * instead of the PV list lock pointer because we don't
5885 			 * intend to sleep.  If this attempt fails, we don't
5886 			 * retry.  Instead, we give up.
5887 			 */
5888 			l1 = pmap_l1(pmap, va);
5889 			if (l1 != NULL && pmap_load(l1) != 0) {
5890 				if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
5891 				    L1_BLOCK)
5892 					return (NULL);
5893 				l2 = pmap_l1_to_l2(l1, va);
5894 				if (pmap_load(l2) != 0) {
5895 					if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
5896 					    L2_BLOCK)
5897 						return (NULL);
5898 					mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5899 					mpte->ref_count++;
5900 				} else {
5901 					mpte = _pmap_alloc_l3(pmap, l2pindex,
5902 					    NULL);
5903 					if (mpte == NULL)
5904 						return (mpte);
5905 				}
5906 			} else {
5907 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
5908 				if (mpte == NULL)
5909 					return (mpte);
5910 			}
5911 		}
5912 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
5913 		l3 = &l3[pmap_l3_index(va)];
5914 	} else {
5915 		mpte = NULL;
5916 		pde = pmap_pde(kernel_pmap, va, &lvl);
5917 		KASSERT(pde != NULL,
5918 		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
5919 		     va));
5920 		KASSERT(lvl == 2,
5921 		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
5922 		l3 = pmap_l2_to_l3(pde, va);
5923 	}
5924 
5925 	/*
5926 	 * Abort if a mapping already exists.
5927 	 */
5928 	if (pmap_load(l3) != 0) {
5929 		if (mpte != NULL)
5930 			mpte->ref_count--;
5931 		return (NULL);
5932 	}
5933 
5934 	/*
5935 	 * Enter on the PV list if part of our managed memory.
5936 	 */
5937 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
5938 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
5939 		if (mpte != NULL)
5940 			pmap_abort_ptp(pmap, va, mpte);
5941 		return (NULL);
5942 	}
5943 
5944 	/*
5945 	 * Increment counters
5946 	 */
5947 	pmap_resident_count_inc(pmap, 1);
5948 
5949 	pa = VM_PAGE_TO_PHYS(m);
5950 	l3_val = PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
5951 	    ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
5952 	l3_val |= pmap_pte_bti(pmap, va);
5953 	if ((prot & VM_PROT_EXECUTE) == 0 ||
5954 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5955 		l3_val |= ATTR_S1_XN;
5956 	if (!ADDR_IS_KERNEL(va))
5957 		l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5958 	else
5959 		l3_val |= ATTR_S1_UXN;
5960 	if (pmap != kernel_pmap)
5961 		l3_val |= ATTR_S1_nG;
5962 
5963 	/*
5964 	 * Now validate mapping with RO protection
5965 	 */
5966 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5967 		l3_val |= ATTR_SW_MANAGED;
5968 		l3_val &= ~ATTR_AF;
5969 	}
5970 
5971 	/* Sync icache before the mapping is stored to PTE */
5972 	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5973 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5974 		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5975 
5976 	pmap_store(l3, l3_val);
5977 	dsb(ishst);
5978 
5979 #if VM_NRESERVLEVEL > 0
5980 	/*
5981 	 * If both the PTP and the reservation are fully populated, then
5982 	 * attempt promotion.
5983 	 */
5984 	if ((mpte == NULL || mpte->ref_count == NL3PG) &&
5985 	    (m->flags & PG_FICTITIOUS) == 0 &&
5986 	    vm_reserv_level_iffullpop(m) == 0) {
5987 		if (l2 == NULL)
5988 			l2 = pmap_pde(pmap, va, &lvl);
5989 
5990 		/*
5991 		 * If promotion succeeds, then the next call to this function
5992 		 * should not be given the unmapped PTP as a hint.
5993 		 */
5994 		if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
5995 			mpte = NULL;
5996 	}
5997 #endif
5998 
5999 	return (mpte);
6000 }
6001 
6002 /*
6003  * This code maps large physical mmap regions into the
6004  * processor address space.  Note that some shortcuts
6005  * are taken, but the code works.
6006  */
6007 void
pmap_object_init_pt(pmap_t pmap,vm_offset_t addr,vm_object_t object,vm_pindex_t pindex,vm_size_t size)6008 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6009     vm_pindex_t pindex, vm_size_t size)
6010 {
6011 
6012 	VM_OBJECT_ASSERT_WLOCKED(object);
6013 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6014 	    ("pmap_object_init_pt: non-device object"));
6015 }
6016 
6017 /*
6018  *	Clear the wired attribute from the mappings for the specified range of
6019  *	addresses in the given pmap.  Every valid mapping within that range
6020  *	must have the wired attribute set.  In contrast, invalid mappings
6021  *	cannot have the wired attribute set, so they are ignored.
6022  *
6023  *	The wired attribute of the page table entry is not a hardware feature,
6024  *	so there is no need to invalidate any TLB entries.
6025  */
6026 void
pmap_unwire(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)6027 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6028 {
6029 	vm_offset_t va_next;
6030 	pd_entry_t *l0, *l1, *l2;
6031 	pt_entry_t *l3;
6032 	bool partial_l3c;
6033 
6034 	PMAP_LOCK(pmap);
6035 	for (; sva < eva; sva = va_next) {
6036 		l0 = pmap_l0(pmap, sva);
6037 		if (pmap_load(l0) == 0) {
6038 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6039 			if (va_next < sva)
6040 				va_next = eva;
6041 			continue;
6042 		}
6043 
6044 		l1 = pmap_l0_to_l1(l0, sva);
6045 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6046 		if (va_next < sva)
6047 			va_next = eva;
6048 		if (pmap_load(l1) == 0)
6049 			continue;
6050 
6051 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6052 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6053 			KASSERT(va_next <= eva,
6054 			    ("partial update of non-transparent 1G page "
6055 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
6056 			    pmap_load(l1), sva, eva, va_next));
6057 			MPASS(pmap != kernel_pmap);
6058 			MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
6059 			    ATTR_SW_WIRED)) == ATTR_SW_WIRED);
6060 			pmap_clear_bits(l1, ATTR_SW_WIRED);
6061 			pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
6062 			continue;
6063 		}
6064 
6065 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6066 		if (va_next < sva)
6067 			va_next = eva;
6068 
6069 		l2 = pmap_l1_to_l2(l1, sva);
6070 		if (pmap_load(l2) == 0)
6071 			continue;
6072 
6073 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
6074 			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
6075 				panic("pmap_unwire: l2 %#jx is missing "
6076 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
6077 
6078 			/*
6079 			 * Are we unwiring the entire large page?  If not,
6080 			 * demote the mapping and fall through.
6081 			 */
6082 			if (sva + L2_SIZE == va_next && eva >= va_next) {
6083 				pmap_clear_bits(l2, ATTR_SW_WIRED);
6084 				pmap->pm_stats.wired_count -= L2_SIZE /
6085 				    PAGE_SIZE;
6086 				continue;
6087 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
6088 				panic("pmap_unwire: demotion failed");
6089 		}
6090 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6091 		    ("pmap_unwire: Invalid l2 entry after demotion"));
6092 
6093 		if (va_next > eva)
6094 			va_next = eva;
6095 		for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
6096 		    sva != va_next; l3++, sva += L3_SIZE) {
6097 			if (pmap_load(l3) == 0)
6098 				continue;
6099 			if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
6100 				/*
6101 				 * Avoid demotion for whole-page unwiring.
6102 				 */
6103 				if ((sva & L3C_OFFSET) == 0) {
6104 					/*
6105 					 * Handle the possibility that
6106 					 * "va_next" is zero because of
6107 					 * address wraparound.
6108 					 */
6109 					partial_l3c = sva + L3C_OFFSET >
6110 					    va_next - 1;
6111 				}
6112 				if (partial_l3c)
6113 					(void)pmap_demote_l3c(pmap, l3, sva);
6114 			}
6115 			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
6116 				panic("pmap_unwire: l3 %#jx is missing "
6117 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
6118 
6119 			/*
6120 			 * ATTR_SW_WIRED must be cleared atomically.  Although
6121 			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
6122 			 * the System MMU may write to the entry concurrently.
6123 			 */
6124 			pmap_clear_bits(l3, ATTR_SW_WIRED);
6125 			pmap->pm_stats.wired_count--;
6126 		}
6127 	}
6128 	PMAP_UNLOCK(pmap);
6129 }
6130 
6131 /*
6132  * This function requires that the caller has already added one to ml3's
6133  * ref_count in anticipation of creating a 4KB page mapping.
6134  */
6135 static bool
pmap_copy_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,pt_entry_t l3e,vm_page_t ml3,struct rwlock ** lockp)6136 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
6137     vm_page_t ml3, struct rwlock **lockp)
6138 {
6139 	pt_entry_t *tl3p;
6140 
6141 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6142 	KASSERT((va & L3C_OFFSET) == 0,
6143 	    ("pmap_copy_l3c: va is not aligned"));
6144 	KASSERT((l3e & ATTR_SW_MANAGED) != 0,
6145 	    ("pmap_copy_l3c: l3e is not managed"));
6146 
6147 	/*
6148 	 * Abort if a mapping already exists.
6149 	 */
6150 	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
6151 		if (pmap_load(tl3p) != 0) {
6152 			if (ml3 != NULL)
6153 				ml3->ref_count--;
6154 			return (false);
6155 		}
6156 
6157 	if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
6158 		if (ml3 != NULL)
6159 			pmap_abort_ptp(pmap, va, ml3);
6160 		return (false);
6161 	}
6162 	ml3->ref_count += L3C_ENTRIES - 1;
6163 
6164 	/*
6165 	 * Clear the wired and accessed bits.  However, leave the dirty bit
6166 	 * unchanged because read/write superpage mappings are required to be
6167 	 * dirty.
6168 	 */
6169 	l3e &= ~(ATTR_SW_WIRED | ATTR_AF);
6170 
6171 	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6172 		pmap_store(tl3p, l3e);
6173 		l3e += L3_SIZE;
6174 	}
6175 	pmap_resident_count_inc(pmap, L3C_ENTRIES);
6176 	counter_u64_add(pmap_l3c_mappings, 1);
6177 	CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
6178 	    va, pmap);
6179 	return (true);
6180 }
6181 
6182 /*
6183  *	Copy the range specified by src_addr/len
6184  *	from the source map to the range dst_addr/len
6185  *	in the destination map.
6186  *
6187  *	This routine is only advisory and need not do anything.
6188  *
6189  *	Because the executable mappings created by this routine are copied,
6190  *	it should not have to flush the instruction cache.
6191  */
6192 void
pmap_copy(pmap_t dst_pmap,pmap_t src_pmap,vm_offset_t dst_addr,vm_size_t len,vm_offset_t src_addr)6193 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6194     vm_offset_t src_addr)
6195 {
6196 	struct rwlock *lock;
6197 	pd_entry_t *l0, *l1, *l2, srcptepaddr;
6198 	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
6199 	vm_offset_t addr, end_addr, va_next;
6200 	vm_page_t dst_m, dstmpte, srcmpte;
6201 
6202 	PMAP_ASSERT_STAGE1(dst_pmap);
6203 	PMAP_ASSERT_STAGE1(src_pmap);
6204 
6205 	if (dst_addr != src_addr)
6206 		return;
6207 	end_addr = src_addr + len;
6208 	lock = NULL;
6209 	if (dst_pmap < src_pmap) {
6210 		PMAP_LOCK(dst_pmap);
6211 		PMAP_LOCK(src_pmap);
6212 	} else {
6213 		PMAP_LOCK(src_pmap);
6214 		PMAP_LOCK(dst_pmap);
6215 	}
6216 	for (addr = src_addr; addr < end_addr; addr = va_next) {
6217 		l0 = pmap_l0(src_pmap, addr);
6218 		if (pmap_load(l0) == 0) {
6219 			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
6220 			if (va_next < addr)
6221 				va_next = end_addr;
6222 			continue;
6223 		}
6224 
6225 		va_next = (addr + L1_SIZE) & ~L1_OFFSET;
6226 		if (va_next < addr)
6227 			va_next = end_addr;
6228 		l1 = pmap_l0_to_l1(l0, addr);
6229 		if (pmap_load(l1) == 0)
6230 			continue;
6231 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6232 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6233 			KASSERT(va_next <= end_addr,
6234 			    ("partial update of non-transparent 1G page "
6235 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6236 			    pmap_load(l1), addr, end_addr, va_next));
6237 			srcptepaddr = pmap_load(l1);
6238 			l1 = pmap_l1(dst_pmap, addr);
6239 			if (l1 == NULL) {
6240 				if (_pmap_alloc_l3(dst_pmap,
6241 				    pmap_l0_pindex(addr), NULL) == NULL)
6242 					break;
6243 				l1 = pmap_l1(dst_pmap, addr);
6244 			} else {
6245 				l0 = pmap_l0(dst_pmap, addr);
6246 				dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
6247 				dst_m->ref_count++;
6248 			}
6249 			KASSERT(pmap_load(l1) == 0,
6250 			    ("1G mapping present in dst pmap "
6251 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6252 			    pmap_load(l1), addr, end_addr, va_next));
6253 			pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
6254 			pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
6255 			continue;
6256 		}
6257 
6258 		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
6259 		if (va_next < addr)
6260 			va_next = end_addr;
6261 		l2 = pmap_l1_to_l2(l1, addr);
6262 		srcptepaddr = pmap_load(l2);
6263 		if (srcptepaddr == 0)
6264 			continue;
6265 		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
6266 			/*
6267 			 * We can only virtual copy whole superpages.
6268 			 */
6269 			if ((addr & L2_OFFSET) != 0 ||
6270 			    addr + L2_SIZE > end_addr)
6271 				continue;
6272 			l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
6273 			if (l2 == NULL)
6274 				break;
6275 			if (pmap_load(l2) == 0 &&
6276 			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
6277 			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
6278 			    PMAP_ENTER_NORECLAIM, &lock))) {
6279 				/*
6280 				 * We leave the dirty bit unchanged because
6281 				 * managed read/write superpage mappings are
6282 				 * required to be dirty.  However, managed
6283 				 * superpage mappings are not required to
6284 				 * have their accessed bit set, so we clear
6285 				 * it because we don't know if this mapping
6286 				 * will be used.
6287 				 */
6288 				srcptepaddr &= ~ATTR_SW_WIRED;
6289 				if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
6290 					srcptepaddr &= ~ATTR_AF;
6291 				pmap_store(l2, srcptepaddr);
6292 				pmap_resident_count_inc(dst_pmap, L2_SIZE /
6293 				    PAGE_SIZE);
6294 				atomic_add_long(&pmap_l2_mappings, 1);
6295 			} else
6296 				pmap_abort_ptp(dst_pmap, addr, dst_m);
6297 			continue;
6298 		}
6299 		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
6300 		    ("pmap_copy: invalid L2 entry"));
6301 		srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
6302 		KASSERT(srcmpte->ref_count > 0,
6303 		    ("pmap_copy: source page table page is unused"));
6304 		if (va_next > end_addr)
6305 			va_next = end_addr;
6306 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
6307 		src_pte = &src_pte[pmap_l3_index(addr)];
6308 		dstmpte = NULL;
6309 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6310 			ptetemp = pmap_load(src_pte);
6311 
6312 			/*
6313 			 * We only virtual copy managed pages.
6314 			 */
6315 			if ((ptetemp & ATTR_SW_MANAGED) == 0)
6316 				continue;
6317 
6318 			if (dstmpte != NULL) {
6319 				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
6320 				    ("dstmpte pindex/addr mismatch"));
6321 				dstmpte->ref_count++;
6322 			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
6323 			    NULL)) == NULL)
6324 				goto out;
6325 			dst_pte = (pt_entry_t *)
6326 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
6327 			dst_pte = &dst_pte[pmap_l3_index(addr)];
6328 			if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
6329 			    L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
6330 			    va_next - 1) {
6331 				if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
6332 				    ptetemp, dstmpte, &lock))
6333 					goto out;
6334 				addr += L3C_SIZE - PAGE_SIZE;
6335 				src_pte += L3C_ENTRIES - 1;
6336 			} else if (pmap_load(dst_pte) == 0 &&
6337 			    pmap_try_insert_pv_entry(dst_pmap, addr,
6338 			    PTE_TO_VM_PAGE(ptetemp), &lock)) {
6339 				/*
6340 				 * Clear the wired, contiguous, modified, and
6341 				 * accessed bits from the destination PTE.
6342 				 * The contiguous bit is cleared because we
6343 				 * are not copying the entire L3C superpage.
6344 				 */
6345 				mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
6346 				    ATTR_AF;
6347 				nbits = 0;
6348 				if ((ptetemp & ATTR_SW_DBM) != 0)
6349 					nbits |= ATTR_S1_AP_RW_BIT;
6350 				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
6351 				pmap_resident_count_inc(dst_pmap, 1);
6352 			} else {
6353 				pmap_abort_ptp(dst_pmap, addr, dstmpte);
6354 				goto out;
6355 			}
6356 			/* Have we copied all of the valid mappings? */
6357 			if (dstmpte->ref_count >= srcmpte->ref_count)
6358 				break;
6359 		}
6360 	}
6361 out:
6362 	/*
6363 	 * XXX This barrier may not be needed because the destination pmap is
6364 	 * not active.
6365 	 */
6366 	dsb(ishst);
6367 
6368 	if (lock != NULL)
6369 		rw_wunlock(lock);
6370 	PMAP_UNLOCK(src_pmap);
6371 	PMAP_UNLOCK(dst_pmap);
6372 }
6373 
6374 int
pmap_vmspace_copy(pmap_t dst_pmap,pmap_t src_pmap)6375 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6376 {
6377 	int error;
6378 
6379 	if (dst_pmap->pm_stage != src_pmap->pm_stage)
6380 		return (EINVAL);
6381 
6382 	if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6383 		return (0);
6384 
6385 	for (;;) {
6386 		if (dst_pmap < src_pmap) {
6387 			PMAP_LOCK(dst_pmap);
6388 			PMAP_LOCK(src_pmap);
6389 		} else {
6390 			PMAP_LOCK(src_pmap);
6391 			PMAP_LOCK(dst_pmap);
6392 		}
6393 		error = pmap_bti_copy(dst_pmap, src_pmap);
6394 		/* Clean up partial copy on failure due to no memory. */
6395 		if (error == ENOMEM)
6396 			pmap_bti_deassign_all(dst_pmap);
6397 		PMAP_UNLOCK(src_pmap);
6398 		PMAP_UNLOCK(dst_pmap);
6399 		if (error != ENOMEM)
6400 			break;
6401 		vm_wait(NULL);
6402 	}
6403 	return (error);
6404 }
6405 
6406 /*
6407  *	pmap_zero_page zeros the specified hardware page by mapping
6408  *	the page into KVM and using bzero to clear its contents.
6409  */
6410 void
pmap_zero_page(vm_page_t m)6411 pmap_zero_page(vm_page_t m)
6412 {
6413 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6414 
6415 	pagezero((void *)va);
6416 }
6417 
6418 /*
6419  *	pmap_zero_page_area zeros the specified hardware page by mapping
6420  *	the page into KVM and using bzero to clear its contents.
6421  *
6422  *	off and size may not cover an area beyond a single hardware page.
6423  */
6424 void
pmap_zero_page_area(vm_page_t m,int off,int size)6425 pmap_zero_page_area(vm_page_t m, int off, int size)
6426 {
6427 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6428 
6429 	if (off == 0 && size == PAGE_SIZE)
6430 		pagezero((void *)va);
6431 	else
6432 		bzero((char *)va + off, size);
6433 }
6434 
6435 /*
6436  *	pmap_copy_page copies the specified (machine independent)
6437  *	page by mapping the page into virtual memory and using
6438  *	bcopy to copy the page, one machine dependent page at a
6439  *	time.
6440  */
6441 void
pmap_copy_page(vm_page_t msrc,vm_page_t mdst)6442 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6443 {
6444 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6445 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6446 
6447 	pagecopy((void *)src, (void *)dst);
6448 }
6449 
6450 int unmapped_buf_allowed = 1;
6451 
6452 void
pmap_copy_pages(vm_page_t ma[],vm_offset_t a_offset,vm_page_t mb[],vm_offset_t b_offset,int xfersize)6453 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6454     vm_offset_t b_offset, int xfersize)
6455 {
6456 	void *a_cp, *b_cp;
6457 	vm_page_t m_a, m_b;
6458 	vm_paddr_t p_a, p_b;
6459 	vm_offset_t a_pg_offset, b_pg_offset;
6460 	int cnt;
6461 
6462 	while (xfersize > 0) {
6463 		a_pg_offset = a_offset & PAGE_MASK;
6464 		m_a = ma[a_offset >> PAGE_SHIFT];
6465 		p_a = m_a->phys_addr;
6466 		b_pg_offset = b_offset & PAGE_MASK;
6467 		m_b = mb[b_offset >> PAGE_SHIFT];
6468 		p_b = m_b->phys_addr;
6469 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6470 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6471 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6472 			panic("!DMAP a %lx", p_a);
6473 		} else {
6474 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6475 		}
6476 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
6477 			panic("!DMAP b %lx", p_b);
6478 		} else {
6479 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
6480 		}
6481 		bcopy(a_cp, b_cp, cnt);
6482 		a_offset += cnt;
6483 		b_offset += cnt;
6484 		xfersize -= cnt;
6485 	}
6486 }
6487 
6488 vm_offset_t
pmap_quick_enter_page(vm_page_t m)6489 pmap_quick_enter_page(vm_page_t m)
6490 {
6491 
6492 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
6493 }
6494 
6495 void
pmap_quick_remove_page(vm_offset_t addr)6496 pmap_quick_remove_page(vm_offset_t addr)
6497 {
6498 }
6499 
6500 /*
6501  * Returns true if the pmap's pv is one of the first
6502  * 16 pvs linked to from this page.  This count may
6503  * be changed upwards or downwards in the future; it
6504  * is only necessary that true be returned for a small
6505  * subset of pmaps for proper page aging.
6506  */
6507 bool
pmap_page_exists_quick(pmap_t pmap,vm_page_t m)6508 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6509 {
6510 	struct md_page *pvh;
6511 	struct rwlock *lock;
6512 	pv_entry_t pv;
6513 	int loops = 0;
6514 	bool rv;
6515 
6516 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6517 	    ("pmap_page_exists_quick: page %p is not managed", m));
6518 	rv = false;
6519 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6520 	rw_rlock(lock);
6521 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6522 		if (PV_PMAP(pv) == pmap) {
6523 			rv = true;
6524 			break;
6525 		}
6526 		loops++;
6527 		if (loops >= 16)
6528 			break;
6529 	}
6530 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6531 		pvh = page_to_pvh(m);
6532 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6533 			if (PV_PMAP(pv) == pmap) {
6534 				rv = true;
6535 				break;
6536 			}
6537 			loops++;
6538 			if (loops >= 16)
6539 				break;
6540 		}
6541 	}
6542 	rw_runlock(lock);
6543 	return (rv);
6544 }
6545 
6546 /*
6547  *	pmap_page_wired_mappings:
6548  *
6549  *	Return the number of managed mappings to the given physical page
6550  *	that are wired.
6551  */
6552 int
pmap_page_wired_mappings(vm_page_t m)6553 pmap_page_wired_mappings(vm_page_t m)
6554 {
6555 	struct rwlock *lock;
6556 	struct md_page *pvh;
6557 	pmap_t pmap;
6558 	pt_entry_t *pte;
6559 	pv_entry_t pv;
6560 	int count, md_gen, pvh_gen;
6561 
6562 	if ((m->oflags & VPO_UNMANAGED) != 0)
6563 		return (0);
6564 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6565 	rw_rlock(lock);
6566 restart:
6567 	count = 0;
6568 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6569 		pmap = PV_PMAP(pv);
6570 		if (!PMAP_TRYLOCK(pmap)) {
6571 			md_gen = m->md.pv_gen;
6572 			rw_runlock(lock);
6573 			PMAP_LOCK(pmap);
6574 			rw_rlock(lock);
6575 			if (md_gen != m->md.pv_gen) {
6576 				PMAP_UNLOCK(pmap);
6577 				goto restart;
6578 			}
6579 		}
6580 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6581 		if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6582 			count++;
6583 		PMAP_UNLOCK(pmap);
6584 	}
6585 	if ((m->flags & PG_FICTITIOUS) == 0) {
6586 		pvh = page_to_pvh(m);
6587 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6588 			pmap = PV_PMAP(pv);
6589 			if (!PMAP_TRYLOCK(pmap)) {
6590 				md_gen = m->md.pv_gen;
6591 				pvh_gen = pvh->pv_gen;
6592 				rw_runlock(lock);
6593 				PMAP_LOCK(pmap);
6594 				rw_rlock(lock);
6595 				if (md_gen != m->md.pv_gen ||
6596 				    pvh_gen != pvh->pv_gen) {
6597 					PMAP_UNLOCK(pmap);
6598 					goto restart;
6599 				}
6600 			}
6601 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6602 			if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6603 				count++;
6604 			PMAP_UNLOCK(pmap);
6605 		}
6606 	}
6607 	rw_runlock(lock);
6608 	return (count);
6609 }
6610 
6611 /*
6612  * Returns true if the given page is mapped individually or as part of
6613  * a 2mpage.  Otherwise, returns false.
6614  */
6615 bool
pmap_page_is_mapped(vm_page_t m)6616 pmap_page_is_mapped(vm_page_t m)
6617 {
6618 	struct rwlock *lock;
6619 	bool rv;
6620 
6621 	if ((m->oflags & VPO_UNMANAGED) != 0)
6622 		return (false);
6623 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6624 	rw_rlock(lock);
6625 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6626 	    ((m->flags & PG_FICTITIOUS) == 0 &&
6627 	    !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
6628 	rw_runlock(lock);
6629 	return (rv);
6630 }
6631 
6632 /*
6633  * Destroy all managed, non-wired mappings in the given user-space
6634  * pmap.  This pmap cannot be active on any processor besides the
6635  * caller.
6636  *
6637  * This function cannot be applied to the kernel pmap.  Moreover, it
6638  * is not intended for general use.  It is only to be used during
6639  * process termination.  Consequently, it can be implemented in ways
6640  * that make it faster than pmap_remove().  First, it can more quickly
6641  * destroy mappings by iterating over the pmap's collection of PV
6642  * entries, rather than searching the page table.  Second, it doesn't
6643  * have to test and clear the page table entries atomically, because
6644  * no processor is currently accessing the user address space.  In
6645  * particular, a page table entry's dirty bit won't change state once
6646  * this function starts.
6647  */
6648 void
pmap_remove_pages(pmap_t pmap)6649 pmap_remove_pages(pmap_t pmap)
6650 {
6651 	pd_entry_t *pde;
6652 	pt_entry_t *pte, tpte;
6653 	struct spglist free;
6654 	struct pv_chunklist free_chunks[PMAP_MEMDOM];
6655 	vm_page_t m, ml3, mt;
6656 	pv_entry_t pv;
6657 	struct md_page *pvh;
6658 	struct pv_chunk *pc, *npc;
6659 	struct rwlock *lock;
6660 	int64_t bit;
6661 	uint64_t inuse, bitmask;
6662 	int allfree, field, i, idx, lvl;
6663 	int freed __pvused;
6664 	vm_paddr_t pa;
6665 
6666 	lock = NULL;
6667 
6668 	for (i = 0; i < PMAP_MEMDOM; i++)
6669 		TAILQ_INIT(&free_chunks[i]);
6670 	SLIST_INIT(&free);
6671 	PMAP_LOCK(pmap);
6672 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
6673 		allfree = 1;
6674 		freed = 0;
6675 		for (field = 0; field < _NPCM; field++) {
6676 			inuse = ~pc->pc_map[field] & pc_freemask[field];
6677 			while (inuse != 0) {
6678 				bit = ffsl(inuse) - 1;
6679 				bitmask = 1UL << bit;
6680 				idx = field * 64 + bit;
6681 				pv = &pc->pc_pventry[idx];
6682 				inuse &= ~bitmask;
6683 
6684 				pde = pmap_pde(pmap, pv->pv_va, &lvl);
6685 				KASSERT(pde != NULL,
6686 				    ("Attempting to remove an unmapped page"));
6687 
6688 				switch(lvl) {
6689 				case 1:
6690 					pte = pmap_l1_to_l2(pde, pv->pv_va);
6691 					tpte = pmap_load(pte);
6692 					KASSERT((tpte & ATTR_DESCR_MASK) ==
6693 					    L2_BLOCK,
6694 					    ("Attempting to remove an invalid "
6695 					    "block: %lx", tpte));
6696 					break;
6697 				case 2:
6698 					pte = pmap_l2_to_l3(pde, pv->pv_va);
6699 					tpte = pmap_load(pte);
6700 					KASSERT((tpte & ATTR_DESCR_MASK) ==
6701 					    L3_PAGE,
6702 					    ("Attempting to remove an invalid "
6703 					     "page: %lx", tpte));
6704 					break;
6705 				default:
6706 					panic(
6707 					    "Invalid page directory level: %d",
6708 					    lvl);
6709 				}
6710 
6711 				/*
6712 				 * We cannot remove wired mappings at this time.
6713 				 *
6714 				 * For L3C superpages, all of the constituent PTEs
6715 				 * should have the wired bit set, so we don't
6716 				 * check for ATTR_CONTIGUOUS here.
6717 				 */
6718 				if (tpte & ATTR_SW_WIRED) {
6719 					allfree = 0;
6720 					continue;
6721 				}
6722 
6723 				/* Mark free */
6724 				pc->pc_map[field] |= bitmask;
6725 
6726 				/*
6727 				 * Because this pmap is not active on other
6728 				 * processors, the dirty bit cannot have
6729 				 * changed state since we last loaded pte.
6730 				 */
6731 				pmap_clear(pte);
6732 
6733 				pa = PTE_TO_PHYS(tpte);
6734 
6735 				m = PHYS_TO_VM_PAGE(pa);
6736 				KASSERT(m->phys_addr == pa,
6737 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
6738 				    m, (uintmax_t)m->phys_addr,
6739 				    (uintmax_t)tpte));
6740 
6741 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
6742 				    m < &vm_page_array[vm_page_array_size],
6743 				    ("pmap_remove_pages: bad pte %#jx",
6744 				    (uintmax_t)tpte));
6745 
6746 				/*
6747 				 * Update the vm_page_t clean/reference bits.
6748 				 *
6749 				 * We don't check for ATTR_CONTIGUOUS here
6750 				 * because writeable L3C superpages are expected
6751 				 * to be dirty, i.e., every constituent PTE
6752 				 * should be dirty.
6753 				 */
6754 				if (pmap_pte_dirty(pmap, tpte)) {
6755 					switch (lvl) {
6756 					case 1:
6757 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6758 							vm_page_dirty(mt);
6759 						break;
6760 					case 2:
6761 						vm_page_dirty(m);
6762 						break;
6763 					}
6764 				}
6765 
6766 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
6767 
6768 				switch (lvl) {
6769 				case 1:
6770 					pmap_resident_count_dec(pmap,
6771 					    L2_SIZE / PAGE_SIZE);
6772 					pvh = page_to_pvh(m);
6773 					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
6774 					pvh->pv_gen++;
6775 					if (TAILQ_EMPTY(&pvh->pv_list)) {
6776 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6777 							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
6778 							    TAILQ_EMPTY(&mt->md.pv_list))
6779 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
6780 					}
6781 					ml3 = pmap_remove_pt_page(pmap,
6782 					    pv->pv_va);
6783 					if (ml3 != NULL) {
6784 						KASSERT(vm_page_any_valid(ml3),
6785 						    ("pmap_remove_pages: l3 page not promoted"));
6786 						pmap_resident_count_dec(pmap,1);
6787 						KASSERT(ml3->ref_count == NL3PG,
6788 						    ("pmap_remove_pages: l3 page ref count error"));
6789 						ml3->ref_count = 0;
6790 						pmap_add_delayed_free_list(ml3,
6791 						    &free, false);
6792 					}
6793 					break;
6794 				case 2:
6795 					pmap_resident_count_dec(pmap, 1);
6796 					TAILQ_REMOVE(&m->md.pv_list, pv,
6797 					    pv_next);
6798 					m->md.pv_gen++;
6799 					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
6800 					    TAILQ_EMPTY(&m->md.pv_list) &&
6801 					    (m->flags & PG_FICTITIOUS) == 0) {
6802 						pvh = page_to_pvh(m);
6803 						if (TAILQ_EMPTY(&pvh->pv_list))
6804 							vm_page_aflag_clear(m,
6805 							    PGA_WRITEABLE);
6806 					}
6807 					break;
6808 				}
6809 				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
6810 				    &free);
6811 				freed++;
6812 			}
6813 		}
6814 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
6815 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
6816 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
6817 		if (allfree) {
6818 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
6819 			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
6820 			    pc_list);
6821 		}
6822 	}
6823 	if (lock != NULL)
6824 		rw_wunlock(lock);
6825 	pmap_invalidate_all(pmap);
6826 	pmap_bti_deassign_all(pmap);
6827 	free_pv_chunk_batch(free_chunks);
6828 	PMAP_UNLOCK(pmap);
6829 	vm_page_free_pages_toq(&free, true);
6830 }
6831 
6832 /*
6833  * This is used to check if a page has been accessed or modified.
6834  */
6835 static bool
pmap_page_test_mappings(vm_page_t m,bool accessed,bool modified)6836 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
6837 {
6838 	struct rwlock *lock;
6839 	pv_entry_t pv;
6840 	struct md_page *pvh;
6841 	pt_entry_t l3e, mask, *pte, value;
6842 	pmap_t pmap;
6843 	int md_gen, pvh_gen;
6844 	bool rv;
6845 
6846 	rv = false;
6847 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6848 	rw_rlock(lock);
6849 restart:
6850 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6851 		pmap = PV_PMAP(pv);
6852 		PMAP_ASSERT_STAGE1(pmap);
6853 		if (!PMAP_TRYLOCK(pmap)) {
6854 			md_gen = m->md.pv_gen;
6855 			rw_runlock(lock);
6856 			PMAP_LOCK(pmap);
6857 			rw_rlock(lock);
6858 			if (md_gen != m->md.pv_gen) {
6859 				PMAP_UNLOCK(pmap);
6860 				goto restart;
6861 			}
6862 		}
6863 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6864 		mask = 0;
6865 		value = 0;
6866 		if (modified) {
6867 			mask |= ATTR_S1_AP_RW_BIT;
6868 			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
6869 		}
6870 		if (accessed) {
6871 			mask |= ATTR_AF | ATTR_DESCR_MASK;
6872 			value |= ATTR_AF | L3_PAGE;
6873 		}
6874 		l3e = pmap_load(pte);
6875 		if ((l3e & ATTR_CONTIGUOUS) != 0)
6876 			l3e = pmap_load_l3c(pte);
6877 		PMAP_UNLOCK(pmap);
6878 		rv = (l3e & mask) == value;
6879 		if (rv)
6880 			goto out;
6881 	}
6882 	if ((m->flags & PG_FICTITIOUS) == 0) {
6883 		pvh = page_to_pvh(m);
6884 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6885 			pmap = PV_PMAP(pv);
6886 			PMAP_ASSERT_STAGE1(pmap);
6887 			if (!PMAP_TRYLOCK(pmap)) {
6888 				md_gen = m->md.pv_gen;
6889 				pvh_gen = pvh->pv_gen;
6890 				rw_runlock(lock);
6891 				PMAP_LOCK(pmap);
6892 				rw_rlock(lock);
6893 				if (md_gen != m->md.pv_gen ||
6894 				    pvh_gen != pvh->pv_gen) {
6895 					PMAP_UNLOCK(pmap);
6896 					goto restart;
6897 				}
6898 			}
6899 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6900 			mask = 0;
6901 			value = 0;
6902 			if (modified) {
6903 				mask |= ATTR_S1_AP_RW_BIT;
6904 				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
6905 			}
6906 			if (accessed) {
6907 				mask |= ATTR_AF | ATTR_DESCR_MASK;
6908 				value |= ATTR_AF | L2_BLOCK;
6909 			}
6910 			rv = (pmap_load(pte) & mask) == value;
6911 			PMAP_UNLOCK(pmap);
6912 			if (rv)
6913 				goto out;
6914 		}
6915 	}
6916 out:
6917 	rw_runlock(lock);
6918 	return (rv);
6919 }
6920 
6921 /*
6922  *	pmap_is_modified:
6923  *
6924  *	Return whether or not the specified physical page was modified
6925  *	in any physical maps.
6926  */
6927 bool
pmap_is_modified(vm_page_t m)6928 pmap_is_modified(vm_page_t m)
6929 {
6930 
6931 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6932 	    ("pmap_is_modified: page %p is not managed", m));
6933 
6934 	/*
6935 	 * If the page is not busied then this check is racy.
6936 	 */
6937 	if (!pmap_page_is_write_mapped(m))
6938 		return (false);
6939 	return (pmap_page_test_mappings(m, false, true));
6940 }
6941 
6942 /*
6943  *	pmap_is_prefaultable:
6944  *
6945  *	Return whether or not the specified virtual address is eligible
6946  *	for prefault.
6947  */
6948 bool
pmap_is_prefaultable(pmap_t pmap,vm_offset_t addr)6949 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
6950 {
6951 	pd_entry_t *pde;
6952 	pt_entry_t *pte;
6953 	bool rv;
6954 	int lvl;
6955 
6956 	/*
6957 	 * Return true if and only if the L3 entry for the specified virtual
6958 	 * address is allocated but invalid.
6959 	 */
6960 	rv = false;
6961 	PMAP_LOCK(pmap);
6962 	pde = pmap_pde(pmap, addr, &lvl);
6963 	if (pde != NULL && lvl == 2) {
6964 		pte = pmap_l2_to_l3(pde, addr);
6965 		rv = pmap_load(pte) == 0;
6966 	}
6967 	PMAP_UNLOCK(pmap);
6968 	return (rv);
6969 }
6970 
6971 /*
6972  *	pmap_is_referenced:
6973  *
6974  *	Return whether or not the specified physical page was referenced
6975  *	in any physical maps.
6976  */
6977 bool
pmap_is_referenced(vm_page_t m)6978 pmap_is_referenced(vm_page_t m)
6979 {
6980 
6981 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6982 	    ("pmap_is_referenced: page %p is not managed", m));
6983 	return (pmap_page_test_mappings(m, true, false));
6984 }
6985 
6986 /*
6987  * Clear the write and modified bits in each of the given page's mappings.
6988  */
6989 void
pmap_remove_write(vm_page_t m)6990 pmap_remove_write(vm_page_t m)
6991 {
6992 	struct md_page *pvh;
6993 	pmap_t pmap;
6994 	struct rwlock *lock;
6995 	pv_entry_t next_pv, pv;
6996 	pt_entry_t oldpte, *pte, set, clear, mask, val;
6997 	vm_offset_t va;
6998 	int md_gen, pvh_gen;
6999 
7000 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7001 	    ("pmap_remove_write: page %p is not managed", m));
7002 	vm_page_assert_busied(m);
7003 
7004 	if (!pmap_page_is_write_mapped(m))
7005 		return;
7006 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7007 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7008 	rw_wlock(lock);
7009 retry:
7010 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7011 		pmap = PV_PMAP(pv);
7012 		PMAP_ASSERT_STAGE1(pmap);
7013 		if (!PMAP_TRYLOCK(pmap)) {
7014 			pvh_gen = pvh->pv_gen;
7015 			rw_wunlock(lock);
7016 			PMAP_LOCK(pmap);
7017 			rw_wlock(lock);
7018 			if (pvh_gen != pvh->pv_gen) {
7019 				PMAP_UNLOCK(pmap);
7020 				goto retry;
7021 			}
7022 		}
7023 		va = pv->pv_va;
7024 		pte = pmap_pte_exists(pmap, va, 2, __func__);
7025 		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
7026 			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
7027 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7028 		    ("inconsistent pv lock %p %p for page %p",
7029 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7030 		PMAP_UNLOCK(pmap);
7031 	}
7032 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7033 		pmap = PV_PMAP(pv);
7034 		if (!PMAP_TRYLOCK(pmap)) {
7035 			pvh_gen = pvh->pv_gen;
7036 			md_gen = m->md.pv_gen;
7037 			rw_wunlock(lock);
7038 			PMAP_LOCK(pmap);
7039 			rw_wlock(lock);
7040 			if (pvh_gen != pvh->pv_gen ||
7041 			    md_gen != m->md.pv_gen) {
7042 				PMAP_UNLOCK(pmap);
7043 				goto retry;
7044 			}
7045 		}
7046 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7047 		oldpte = pmap_load(pte);
7048 		if ((oldpte & ATTR_SW_DBM) != 0) {
7049 			if ((oldpte & ATTR_CONTIGUOUS) != 0) {
7050 				(void)pmap_demote_l3c(pmap, pte, pv->pv_va);
7051 
7052 				/*
7053 				 * The L3 entry's accessed bit may have
7054 				 * changed.
7055 				 */
7056 				oldpte = pmap_load(pte);
7057 			}
7058 			if (pmap->pm_stage == PM_STAGE1) {
7059 				set = ATTR_S1_AP_RW_BIT;
7060 				clear = 0;
7061 				mask = ATTR_S1_AP_RW_BIT;
7062 				val = ATTR_S1_AP(ATTR_S1_AP_RW);
7063 			} else {
7064 				set = 0;
7065 				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7066 				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7067 				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7068 			}
7069 			clear |= ATTR_SW_DBM;
7070 			while (!atomic_fcmpset_64(pte, &oldpte,
7071 			    (oldpte | set) & ~clear))
7072 				cpu_spinwait();
7073 
7074 			if ((oldpte & mask) == val)
7075 				vm_page_dirty(m);
7076 			pmap_invalidate_page(pmap, pv->pv_va, true);
7077 		}
7078 		PMAP_UNLOCK(pmap);
7079 	}
7080 	rw_wunlock(lock);
7081 	vm_page_aflag_clear(m, PGA_WRITEABLE);
7082 }
7083 
7084 /*
7085  *	pmap_ts_referenced:
7086  *
7087  *	Return a count of reference bits for a page, clearing those bits.
7088  *	It is not necessary for every reference bit to be cleared, but it
7089  *	is necessary that 0 only be returned when there are truly no
7090  *	reference bits set.
7091  *
7092  *	As an optimization, update the page's dirty field if a modified bit is
7093  *	found while counting reference bits.  This opportunistic update can be
7094  *	performed at low cost and can eliminate the need for some future calls
7095  *	to pmap_is_modified().  However, since this function stops after
7096  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7097  *	dirty pages.  Those dirty pages will only be detected by a future call
7098  *	to pmap_is_modified().
7099  */
7100 int
pmap_ts_referenced(vm_page_t m)7101 pmap_ts_referenced(vm_page_t m)
7102 {
7103 	struct md_page *pvh;
7104 	pv_entry_t pv, pvf;
7105 	pmap_t pmap;
7106 	struct rwlock *lock;
7107 	pt_entry_t *pte, tpte;
7108 	vm_offset_t va;
7109 	vm_paddr_t pa;
7110 	int cleared, md_gen, not_cleared, pvh_gen;
7111 	struct spglist free;
7112 
7113 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7114 	    ("pmap_ts_referenced: page %p is not managed", m));
7115 	SLIST_INIT(&free);
7116 	cleared = 0;
7117 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7118 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7119 	rw_wlock(lock);
7120 retry:
7121 	not_cleared = 0;
7122 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
7123 		goto small_mappings;
7124 	pv = pvf;
7125 	do {
7126 		if (pvf == NULL)
7127 			pvf = pv;
7128 		pmap = PV_PMAP(pv);
7129 		if (!PMAP_TRYLOCK(pmap)) {
7130 			pvh_gen = pvh->pv_gen;
7131 			rw_wunlock(lock);
7132 			PMAP_LOCK(pmap);
7133 			rw_wlock(lock);
7134 			if (pvh_gen != pvh->pv_gen) {
7135 				PMAP_UNLOCK(pmap);
7136 				goto retry;
7137 			}
7138 		}
7139 		va = pv->pv_va;
7140 		pte = pmap_pte_exists(pmap, va, 2, __func__);
7141 		tpte = pmap_load(pte);
7142 		if (pmap_pte_dirty(pmap, tpte)) {
7143 			/*
7144 			 * Although "tpte" is mapping a 2MB page, because
7145 			 * this function is called at a 4KB page granularity,
7146 			 * we only update the 4KB page under test.
7147 			 */
7148 			vm_page_dirty(m);
7149 		}
7150 		if ((tpte & ATTR_AF) != 0) {
7151 			pa = VM_PAGE_TO_PHYS(m);
7152 
7153 			/*
7154 			 * Since this reference bit is shared by 512 4KB pages,
7155 			 * it should not be cleared every time it is tested.
7156 			 * Apply a simple "hash" function on the physical page
7157 			 * number, the virtual superpage number, and the pmap
7158 			 * address to select one 4KB page out of the 512 on
7159 			 * which testing the reference bit will result in
7160 			 * clearing that reference bit.  This function is
7161 			 * designed to avoid the selection of the same 4KB page
7162 			 * for every 2MB page mapping.
7163 			 *
7164 			 * On demotion, a mapping that hasn't been referenced
7165 			 * is simply destroyed.  To avoid the possibility of a
7166 			 * subsequent page fault on a demoted wired mapping,
7167 			 * always leave its reference bit set.  Moreover,
7168 			 * since the superpage is wired, the current state of
7169 			 * its reference bit won't affect page replacement.
7170 			 */
7171 			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
7172 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
7173 			    (tpte & ATTR_SW_WIRED) == 0) {
7174 				pmap_clear_bits(pte, ATTR_AF);
7175 				pmap_invalidate_page(pmap, va, true);
7176 				cleared++;
7177 			} else
7178 				not_cleared++;
7179 		}
7180 		PMAP_UNLOCK(pmap);
7181 		/* Rotate the PV list if it has more than one entry. */
7182 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
7183 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7184 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
7185 			pvh->pv_gen++;
7186 		}
7187 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
7188 			goto out;
7189 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
7190 small_mappings:
7191 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
7192 		goto out;
7193 	pv = pvf;
7194 	do {
7195 		if (pvf == NULL)
7196 			pvf = pv;
7197 		pmap = PV_PMAP(pv);
7198 		if (!PMAP_TRYLOCK(pmap)) {
7199 			pvh_gen = pvh->pv_gen;
7200 			md_gen = m->md.pv_gen;
7201 			rw_wunlock(lock);
7202 			PMAP_LOCK(pmap);
7203 			rw_wlock(lock);
7204 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7205 				PMAP_UNLOCK(pmap);
7206 				goto retry;
7207 			}
7208 		}
7209 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7210 		tpte = pmap_load(pte);
7211 		if (pmap_pte_dirty(pmap, tpte))
7212 			vm_page_dirty(m);
7213 		if ((tpte & ATTR_AF) != 0) {
7214 			if ((tpte & ATTR_SW_WIRED) == 0) {
7215 				/*
7216 				 * Clear the accessed bit in this L3 entry
7217 				 * regardless of the contiguous bit.
7218 				 */
7219 				pmap_clear_bits(pte, ATTR_AF);
7220 				pmap_invalidate_page(pmap, pv->pv_va, true);
7221 				cleared++;
7222 			} else
7223 				not_cleared++;
7224 		} else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
7225 		    (pmap_load_l3c(pte) & ATTR_AF) != 0) {
7226 			/*
7227 			 * An L3C superpage mapping is regarded as accessed
7228 			 * until the accessed bit has been cleared in all
7229 			 * of its constituent entries.
7230 			 */
7231 			not_cleared++;
7232 		}
7233 		PMAP_UNLOCK(pmap);
7234 		/* Rotate the PV list if it has more than one entry. */
7235 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
7236 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7237 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7238 			m->md.pv_gen++;
7239 		}
7240 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
7241 	    not_cleared < PMAP_TS_REFERENCED_MAX);
7242 out:
7243 	rw_wunlock(lock);
7244 	vm_page_free_pages_toq(&free, true);
7245 	return (cleared + not_cleared);
7246 }
7247 
7248 /*
7249  *	Apply the given advice to the specified range of addresses within the
7250  *	given pmap.  Depending on the advice, clear the referenced and/or
7251  *	modified flags in each mapping and set the mapped page's dirty field.
7252  */
7253 void
pmap_advise(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,int advice)7254 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7255 {
7256 	struct rwlock *lock;
7257 	vm_offset_t va, va_next, dva;
7258 	vm_page_t m;
7259 	pd_entry_t *l0, *l1, *l2, oldl2;
7260 	pt_entry_t *l3, *dl3, oldl3;
7261 
7262 	PMAP_ASSERT_STAGE1(pmap);
7263 
7264 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
7265 		return;
7266 
7267 	PMAP_LOCK(pmap);
7268 	for (; sva < eva; sva = va_next) {
7269 		l0 = pmap_l0(pmap, sva);
7270 		if (pmap_load(l0) == 0) {
7271 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
7272 			if (va_next < sva)
7273 				va_next = eva;
7274 			continue;
7275 		}
7276 
7277 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
7278 		if (va_next < sva)
7279 			va_next = eva;
7280 		l1 = pmap_l0_to_l1(l0, sva);
7281 		if (pmap_load(l1) == 0)
7282 			continue;
7283 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
7284 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7285 			continue;
7286 		}
7287 
7288 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
7289 		if (va_next < sva)
7290 			va_next = eva;
7291 		l2 = pmap_l1_to_l2(l1, sva);
7292 		oldl2 = pmap_load(l2);
7293 		if (oldl2 == 0)
7294 			continue;
7295 		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
7296 			if ((oldl2 & ATTR_SW_MANAGED) == 0)
7297 				continue;
7298 			lock = NULL;
7299 			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
7300 				if (lock != NULL)
7301 					rw_wunlock(lock);
7302 
7303 				/*
7304 				 * The 2MB page mapping was destroyed.
7305 				 */
7306 				continue;
7307 			}
7308 
7309 			/*
7310 			 * Unless the page mappings are wired, remove the
7311 			 * mapping to a single page so that a subsequent
7312 			 * access may repromote.  Choosing the last page
7313 			 * within the address range [sva, min(va_next, eva))
7314 			 * generally results in more repromotions.  Since the
7315 			 * underlying page table page is fully populated, this
7316 			 * removal never frees a page table page.
7317 			 */
7318 			if ((oldl2 & ATTR_SW_WIRED) == 0) {
7319 				va = eva;
7320 				if (va > va_next)
7321 					va = va_next;
7322 				va -= PAGE_SIZE;
7323 				KASSERT(va >= sva,
7324 				    ("pmap_advise: no address gap"));
7325 				l3 = pmap_l2_to_l3(l2, va);
7326 				KASSERT(pmap_load(l3) != 0,
7327 				    ("pmap_advise: invalid PTE"));
7328 				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
7329 				    NULL, &lock);
7330 			}
7331 			if (lock != NULL)
7332 				rw_wunlock(lock);
7333 		}
7334 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
7335 		    ("pmap_advise: invalid L2 entry after demotion"));
7336 		if (va_next > eva)
7337 			va_next = eva;
7338 		va = va_next;
7339 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
7340 		    sva += L3_SIZE) {
7341 			oldl3 = pmap_load(l3);
7342 			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
7343 			    (ATTR_SW_MANAGED | L3_PAGE))
7344 				goto maybe_invlrng;
7345 			else if (pmap_pte_dirty(pmap, oldl3)) {
7346 				if (advice == MADV_DONTNEED) {
7347 					/*
7348 					 * Future calls to pmap_is_modified()
7349 					 * can be avoided by making the page
7350 					 * dirty now.
7351 					 */
7352 					m = PTE_TO_VM_PAGE(oldl3);
7353 					vm_page_dirty(m);
7354 				}
7355 				if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
7356 					/*
7357 					 * Unconditionally demote the L3C
7358 					 * superpage because we do not allow
7359 					 * writeable, clean superpages.
7360 					 */
7361 					(void)pmap_demote_l3c(pmap, l3, sva);
7362 
7363 					/*
7364                                          * Destroy the final mapping before the
7365                                          * next L3C boundary or va_next,
7366 					 * whichever comes first, so that a
7367 					 * subsequent access may act as a
7368 					 * repromotion trigger.
7369 					 */
7370                                         if ((oldl3 & ATTR_SW_WIRED) == 0) {
7371 						dva = MIN((sva & ~L3C_OFFSET) +
7372 						    L3C_SIZE - PAGE_SIZE,
7373 						    va_next - PAGE_SIZE);
7374 						dl3 = pmap_l2_to_l3(l2, dva);
7375 						KASSERT(pmap_load(dl3) != 0,
7376 						    ("pmap_advise: invalid PTE"));
7377 						lock = NULL;
7378 						pmap_remove_l3(pmap, dl3, dva,
7379 						    pmap_load(l2), NULL, &lock);
7380 						if (lock != NULL)
7381 							rw_wunlock(lock);
7382 					}
7383 
7384 					/*
7385 					 * The L3 entry's accessed bit may have
7386 					 * changed.
7387 					 */
7388 					oldl3 = pmap_load(l3);
7389 				}
7390 
7391 				/*
7392 				 * Check that we did not just destroy this entry so
7393 				 * we avoid corrupting the page able.
7394 				 */
7395 				if (oldl3 != 0) {
7396 					while (!atomic_fcmpset_long(l3, &oldl3,
7397 					    (oldl3 & ~ATTR_AF) |
7398 					    ATTR_S1_AP(ATTR_S1_AP_RO)))
7399 						cpu_spinwait();
7400 				}
7401 			} else if ((oldl3 & ATTR_AF) != 0) {
7402 				/*
7403 				 * Clear the accessed bit in this L3 entry
7404 				 * regardless of the contiguous bit.
7405 				 */
7406 				pmap_clear_bits(l3, ATTR_AF);
7407 			} else
7408 				goto maybe_invlrng;
7409 			if (va == va_next)
7410 				va = sva;
7411 			continue;
7412 maybe_invlrng:
7413 			if (va != va_next) {
7414 				pmap_s1_invalidate_range(pmap, va, sva, true);
7415 				va = va_next;
7416 			}
7417 		}
7418 		if (va != va_next)
7419 			pmap_s1_invalidate_range(pmap, va, sva, true);
7420 	}
7421 	PMAP_UNLOCK(pmap);
7422 }
7423 
7424 /*
7425  *	Clear the modify bits on the specified physical page.
7426  */
7427 void
pmap_clear_modify(vm_page_t m)7428 pmap_clear_modify(vm_page_t m)
7429 {
7430 	struct md_page *pvh;
7431 	struct rwlock *lock;
7432 	pmap_t pmap;
7433 	pv_entry_t next_pv, pv;
7434 	pd_entry_t *l2, oldl2;
7435 	pt_entry_t *l3, oldl3;
7436 	vm_offset_t va;
7437 	int md_gen, pvh_gen;
7438 
7439 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7440 	    ("pmap_clear_modify: page %p is not managed", m));
7441 	vm_page_assert_busied(m);
7442 
7443 	if (!pmap_page_is_write_mapped(m))
7444 		return;
7445 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7446 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7447 	rw_wlock(lock);
7448 restart:
7449 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7450 		pmap = PV_PMAP(pv);
7451 		PMAP_ASSERT_STAGE1(pmap);
7452 		if (!PMAP_TRYLOCK(pmap)) {
7453 			pvh_gen = pvh->pv_gen;
7454 			rw_wunlock(lock);
7455 			PMAP_LOCK(pmap);
7456 			rw_wlock(lock);
7457 			if (pvh_gen != pvh->pv_gen) {
7458 				PMAP_UNLOCK(pmap);
7459 				goto restart;
7460 			}
7461 		}
7462 		va = pv->pv_va;
7463 		l2 = pmap_l2(pmap, va);
7464 		oldl2 = pmap_load(l2);
7465 		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7466 		if ((oldl2 & ATTR_SW_DBM) != 0 &&
7467 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7468 		    (oldl2 & ATTR_SW_WIRED) == 0) {
7469 			/*
7470 			 * Write protect the mapping to a single page so that
7471 			 * a subsequent write access may repromote.
7472 			 */
7473 			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7474 			l3 = pmap_l2_to_l3(l2, va);
7475 			oldl3 = pmap_load(l3);
7476 			while (!atomic_fcmpset_long(l3, &oldl3,
7477 			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
7478 				cpu_spinwait();
7479 			vm_page_dirty(m);
7480 			pmap_s1_invalidate_page(pmap, va, true);
7481 		}
7482 		PMAP_UNLOCK(pmap);
7483 	}
7484 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7485 		pmap = PV_PMAP(pv);
7486 		PMAP_ASSERT_STAGE1(pmap);
7487 		if (!PMAP_TRYLOCK(pmap)) {
7488 			md_gen = m->md.pv_gen;
7489 			pvh_gen = pvh->pv_gen;
7490 			rw_wunlock(lock);
7491 			PMAP_LOCK(pmap);
7492 			rw_wlock(lock);
7493 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7494 				PMAP_UNLOCK(pmap);
7495 				goto restart;
7496 			}
7497 		}
7498 		l2 = pmap_l2(pmap, pv->pv_va);
7499 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
7500 		oldl3 = pmap_load(l3);
7501 		KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
7502 		    (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
7503 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
7504 		    ("writeable L3C superpage not dirty"));
7505 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
7506 			if ((oldl3 & ATTR_CONTIGUOUS) != 0)
7507 				(void)pmap_demote_l3c(pmap, l3, pv->pv_va);
7508 			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
7509 			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
7510 		}
7511 		PMAP_UNLOCK(pmap);
7512 	}
7513 	rw_wunlock(lock);
7514 }
7515 
7516 void *
pmap_mapbios(vm_paddr_t pa,vm_size_t size)7517 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7518 {
7519 	struct pmap_preinit_mapping *ppim;
7520 	vm_offset_t va, offset;
7521 	pd_entry_t old_l2e, *pde;
7522 	pt_entry_t *l2;
7523 	int i, lvl, l2_blocks, free_l2_count, start_idx;
7524 
7525 	if (!vm_initialized) {
7526 		/*
7527 		 * No L3 ptables so map entire L2 blocks where start VA is:
7528 		 * 	preinit_map_va + start_idx * L2_SIZE
7529 		 * There may be duplicate mappings (multiple VA -> same PA) but
7530 		 * ARM64 dcache is always PIPT so that's acceptable.
7531 		 */
7532 		 if (size == 0)
7533 			 return (NULL);
7534 
7535 		 /* Calculate how many L2 blocks are needed for the mapping */
7536 		l2_blocks = (roundup2(pa + size, L2_SIZE) -
7537 		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
7538 
7539 		offset = pa & L2_OFFSET;
7540 
7541 		if (preinit_map_va == 0)
7542 			return (NULL);
7543 
7544 		/* Map 2MiB L2 blocks from reserved VA space */
7545 
7546 		free_l2_count = 0;
7547 		start_idx = -1;
7548 		/* Find enough free contiguous VA space */
7549 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7550 			ppim = pmap_preinit_mapping + i;
7551 			if (free_l2_count > 0 && ppim->pa != 0) {
7552 				/* Not enough space here */
7553 				free_l2_count = 0;
7554 				start_idx = -1;
7555 				continue;
7556 			}
7557 
7558 			if (ppim->pa == 0) {
7559 				/* Free L2 block */
7560 				if (start_idx == -1)
7561 					start_idx = i;
7562 				free_l2_count++;
7563 				if (free_l2_count == l2_blocks)
7564 					break;
7565 			}
7566 		}
7567 		if (free_l2_count != l2_blocks)
7568 			panic("%s: too many preinit mappings", __func__);
7569 
7570 		va = preinit_map_va + (start_idx * L2_SIZE);
7571 		for (i = start_idx; i < start_idx + l2_blocks; i++) {
7572 			/* Mark entries as allocated */
7573 			ppim = pmap_preinit_mapping + i;
7574 			ppim->pa = pa;
7575 			ppim->va = va + offset;
7576 			ppim->size = size;
7577 		}
7578 
7579 		/* Map L2 blocks */
7580 		pa = rounddown2(pa, L2_SIZE);
7581 		old_l2e = 0;
7582 		for (i = 0; i < l2_blocks; i++) {
7583 			pde = pmap_pde(kernel_pmap, va, &lvl);
7584 			KASSERT(pde != NULL,
7585 			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
7586 			    va));
7587 			KASSERT(lvl == 1,
7588 			    ("pmap_mapbios: Invalid level %d", lvl));
7589 
7590 			/* Insert L2_BLOCK */
7591 			l2 = pmap_l1_to_l2(pde, va);
7592 			old_l2e |= pmap_load_store(l2,
7593 			    PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_XN |
7594 			    ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
7595 			    L2_BLOCK);
7596 
7597 			va += L2_SIZE;
7598 			pa += L2_SIZE;
7599 		}
7600 		if ((old_l2e & ATTR_DESCR_VALID) != 0)
7601 			pmap_s1_invalidate_all(kernel_pmap);
7602 		else {
7603 			/*
7604 			 * Because the old entries were invalid and the new
7605 			 * mappings are not executable, an isb is not required.
7606 			 */
7607 			dsb(ishst);
7608 		}
7609 
7610 		va = preinit_map_va + (start_idx * L2_SIZE);
7611 
7612 	} else {
7613 		/* kva_alloc may be used to map the pages */
7614 		offset = pa & PAGE_MASK;
7615 		size = round_page(offset + size);
7616 
7617 		va = kva_alloc(size);
7618 		if (va == 0)
7619 			panic("%s: Couldn't allocate KVA", __func__);
7620 
7621 		pde = pmap_pde(kernel_pmap, va, &lvl);
7622 		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
7623 
7624 		/* L3 table is linked */
7625 		va = trunc_page(va);
7626 		pa = trunc_page(pa);
7627 		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
7628 	}
7629 
7630 	return ((void *)(va + offset));
7631 }
7632 
7633 void
pmap_unmapbios(void * p,vm_size_t size)7634 pmap_unmapbios(void *p, vm_size_t size)
7635 {
7636 	struct pmap_preinit_mapping *ppim;
7637 	vm_offset_t offset, va, va_trunc;
7638 	pd_entry_t *pde;
7639 	pt_entry_t *l2;
7640 	int i, lvl, l2_blocks, block;
7641 	bool preinit_map;
7642 
7643 	va = (vm_offset_t)p;
7644 	l2_blocks =
7645 	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
7646 	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
7647 
7648 	/* Remove preinit mapping */
7649 	preinit_map = false;
7650 	block = 0;
7651 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7652 		ppim = pmap_preinit_mapping + i;
7653 		if (ppim->va == va) {
7654 			KASSERT(ppim->size == size,
7655 			    ("pmap_unmapbios: size mismatch"));
7656 			ppim->va = 0;
7657 			ppim->pa = 0;
7658 			ppim->size = 0;
7659 			preinit_map = true;
7660 			offset = block * L2_SIZE;
7661 			va_trunc = rounddown2(va, L2_SIZE) + offset;
7662 
7663 			/* Remove L2_BLOCK */
7664 			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
7665 			KASSERT(pde != NULL,
7666 			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
7667 			    va_trunc));
7668 			l2 = pmap_l1_to_l2(pde, va_trunc);
7669 			pmap_clear(l2);
7670 
7671 			if (block == (l2_blocks - 1))
7672 				break;
7673 			block++;
7674 		}
7675 	}
7676 	if (preinit_map) {
7677 		pmap_s1_invalidate_all(kernel_pmap);
7678 		return;
7679 	}
7680 
7681 	/* Unmap the pages reserved with kva_alloc. */
7682 	if (vm_initialized) {
7683 		offset = va & PAGE_MASK;
7684 		size = round_page(offset + size);
7685 		va = trunc_page(va);
7686 
7687 		/* Unmap and invalidate the pages */
7688 		pmap_kremove_device(va, size);
7689 
7690 		kva_free(va, size);
7691 	}
7692 }
7693 
7694 /*
7695  * Sets the memory attribute for the specified page.
7696  */
7697 void
pmap_page_set_memattr(vm_page_t m,vm_memattr_t ma)7698 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7699 {
7700 
7701 	m->md.pv_memattr = ma;
7702 
7703 	/*
7704 	 * If "m" is a normal page, update its direct mapping.  This update
7705 	 * can be relied upon to perform any cache operations that are
7706 	 * required for data coherence.
7707 	 */
7708 	if ((m->flags & PG_FICTITIOUS) == 0 &&
7709 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7710 	    m->md.pv_memattr) != 0)
7711 		panic("memory attribute change on the direct map failed");
7712 }
7713 
7714 /*
7715  * Changes the specified virtual address range's memory type to that given by
7716  * the parameter "mode".  The specified virtual address range must be
7717  * completely contained within either the direct map or the kernel map.  If
7718  * the virtual address range is contained within the kernel map, then the
7719  * memory type for each of the corresponding ranges of the direct map is also
7720  * changed.  (The corresponding ranges of the direct map are those ranges that
7721  * map the same physical pages as the specified virtual address range.)  These
7722  * changes to the direct map are necessary because Intel describes the
7723  * behavior of their processors as "undefined" if two or more mappings to the
7724  * same physical page have different memory types.
7725  *
7726  * Returns zero if the change completed successfully, and either EINVAL or
7727  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
7728  * of the virtual address range was not mapped, and ENOMEM is returned if
7729  * there was insufficient memory available to complete the change.  In the
7730  * latter case, the memory type may have been changed on some part of the
7731  * virtual address range or the direct map.
7732  */
7733 int
pmap_change_attr(vm_offset_t va,vm_size_t size,int mode)7734 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7735 {
7736 	int error;
7737 
7738 	PMAP_LOCK(kernel_pmap);
7739 	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
7740 	PMAP_UNLOCK(kernel_pmap);
7741 	return (error);
7742 }
7743 
7744 /*
7745  * Changes the specified virtual address range's protections to those
7746  * specified by "prot".  Like pmap_change_attr(), protections for aliases
7747  * in the direct map are updated as well.  Protections on aliasing mappings may
7748  * be a subset of the requested protections; for example, mappings in the direct
7749  * map are never executable.
7750  */
7751 int
pmap_change_prot(vm_offset_t va,vm_size_t size,vm_prot_t prot)7752 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
7753 {
7754 	int error;
7755 
7756 	/* Only supported within the kernel map. */
7757 	if (va < VM_MIN_KERNEL_ADDRESS)
7758 		return (EINVAL);
7759 
7760 	PMAP_LOCK(kernel_pmap);
7761 	error = pmap_change_props_locked(va, size, prot, -1, false);
7762 	PMAP_UNLOCK(kernel_pmap);
7763 	return (error);
7764 }
7765 
7766 static int
pmap_change_props_locked(vm_offset_t va,vm_size_t size,vm_prot_t prot,int mode,bool skip_unmapped)7767 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
7768     int mode, bool skip_unmapped)
7769 {
7770 	vm_offset_t base, offset, tmpva;
7771 	vm_size_t pte_size;
7772 	vm_paddr_t pa;
7773 	pt_entry_t pte, *ptep, *newpte;
7774 	pt_entry_t bits, mask;
7775 	int lvl, rv;
7776 
7777 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
7778 	base = trunc_page(va);
7779 	offset = va & PAGE_MASK;
7780 	size = round_page(offset + size);
7781 
7782 	if (!VIRT_IN_DMAP(base) &&
7783 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
7784 		return (EINVAL);
7785 
7786 	bits = 0;
7787 	mask = 0;
7788 	if (mode != -1) {
7789 		bits = ATTR_S1_IDX(mode);
7790 		mask = ATTR_S1_IDX_MASK;
7791 		if (mode == VM_MEMATTR_DEVICE) {
7792 			mask |= ATTR_S1_XN;
7793 			bits |= ATTR_S1_XN;
7794 		}
7795 	}
7796 	if (prot != VM_PROT_NONE) {
7797 		/* Don't mark the DMAP as executable. It never is on arm64. */
7798 		if (VIRT_IN_DMAP(base)) {
7799 			prot &= ~VM_PROT_EXECUTE;
7800 			/*
7801 			 * XXX Mark the DMAP as writable for now. We rely
7802 			 * on this in ddb & dtrace to insert breakpoint
7803 			 * instructions.
7804 			 */
7805 			prot |= VM_PROT_WRITE;
7806 		}
7807 
7808 		if ((prot & VM_PROT_WRITE) == 0) {
7809 			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
7810 		}
7811 		if ((prot & VM_PROT_EXECUTE) == 0) {
7812 			bits |= ATTR_S1_PXN;
7813 		}
7814 		bits |= ATTR_S1_UXN;
7815 		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
7816 	}
7817 
7818 	for (tmpva = base; tmpva < base + size; ) {
7819 		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
7820 		if (ptep == NULL && !skip_unmapped) {
7821 			return (EINVAL);
7822 		} else if ((ptep == NULL && skip_unmapped) ||
7823 		    (pmap_load(ptep) & mask) == bits) {
7824 			/*
7825 			 * We already have the correct attribute or there
7826 			 * is no memory mapped at this address and we are
7827 			 * skipping unmapped memory.
7828 			 */
7829 			switch (lvl) {
7830 			default:
7831 				panic("Invalid DMAP table level: %d\n", lvl);
7832 			case 1:
7833 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
7834 				break;
7835 			case 2:
7836 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
7837 				break;
7838 			case 3:
7839 				tmpva += PAGE_SIZE;
7840 				break;
7841 			}
7842 		} else {
7843 			/* We can't demote/promote this entry */
7844 			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
7845 
7846 			/*
7847 			 * Split the entry to an level 3 table, then
7848 			 * set the new attribute.
7849 			 */
7850 			switch (lvl) {
7851 			default:
7852 				panic("Invalid DMAP table level: %d\n", lvl);
7853 			case 1:
7854 				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7855 				if ((tmpva & L1_OFFSET) == 0 &&
7856 				    (base + size - tmpva) >= L1_SIZE) {
7857 					pte_size = L1_SIZE;
7858 					break;
7859 				}
7860 				newpte = pmap_demote_l1(kernel_pmap, ptep,
7861 				    tmpva & ~L1_OFFSET);
7862 				if (newpte == NULL)
7863 					return (EINVAL);
7864 				ptep = pmap_l1_to_l2(ptep, tmpva);
7865 				/* FALLTHROUGH */
7866 			case 2:
7867 				if ((tmpva & L2_OFFSET) == 0 &&
7868 				    (base + size - tmpva) >= L2_SIZE) {
7869 					pte_size = L2_SIZE;
7870 					break;
7871 				}
7872 				newpte = pmap_demote_l2(kernel_pmap, ptep,
7873 				    tmpva);
7874 				if (newpte == NULL)
7875 					return (EINVAL);
7876 				ptep = pmap_l2_to_l3(ptep, tmpva);
7877 				/* FALLTHROUGH */
7878 			case 3:
7879 				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
7880 					if ((tmpva & L3C_OFFSET) == 0 &&
7881 					    (base + size - tmpva) >= L3C_SIZE) {
7882 						pte_size = L3C_SIZE;
7883 						break;
7884 					}
7885 					if (!pmap_demote_l3c(kernel_pmap, ptep,
7886 					    tmpva))
7887 						return (EINVAL);
7888 				}
7889 				pte_size = PAGE_SIZE;
7890 				break;
7891 			}
7892 
7893 			/* Update the entry */
7894 			pte = pmap_load(ptep);
7895 			pte &= ~mask;
7896 			pte |= bits;
7897 
7898 			pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
7899 			    pte_size);
7900 
7901 			pa = PTE_TO_PHYS(pte);
7902 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
7903 				/*
7904 				 * Keep the DMAP memory in sync.
7905 				 */
7906 				rv = pmap_change_props_locked(
7907 				    PHYS_TO_DMAP(pa), pte_size,
7908 				    prot, mode, true);
7909 				if (rv != 0)
7910 					return (rv);
7911 			}
7912 
7913 			/*
7914 			 * If moving to a non-cacheable entry flush
7915 			 * the cache.
7916 			 */
7917 			if (mode == VM_MEMATTR_UNCACHEABLE)
7918 				cpu_dcache_wbinv_range((void *)tmpva, pte_size);
7919 			tmpva += pte_size;
7920 		}
7921 	}
7922 
7923 	return (0);
7924 }
7925 
7926 /*
7927  * Create an L2 table to map all addresses within an L1 mapping.
7928  */
7929 static pt_entry_t *
pmap_demote_l1(pmap_t pmap,pt_entry_t * l1,vm_offset_t va)7930 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
7931 {
7932 	pt_entry_t *l2, newl2, oldl1;
7933 	vm_offset_t tmpl1;
7934 	vm_paddr_t l2phys, phys;
7935 	vm_page_t ml2;
7936 	int i;
7937 
7938 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7939 	oldl1 = pmap_load(l1);
7940 	PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7941 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
7942 	    ("pmap_demote_l1: Demoting a non-block entry"));
7943 	KASSERT((va & L1_OFFSET) == 0,
7944 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
7945 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
7946 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
7947 	KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
7948 	    ("pmap_demote_l1: Demoting entry with no-demote flag set"));
7949 
7950 	tmpl1 = 0;
7951 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
7952 		tmpl1 = kva_alloc(PAGE_SIZE);
7953 		if (tmpl1 == 0)
7954 			return (NULL);
7955 	}
7956 
7957 	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
7958 	    NULL) {
7959 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
7960 		    " in pmap %p", va, pmap);
7961 		l2 = NULL;
7962 		goto fail;
7963 	}
7964 
7965 	l2phys = VM_PAGE_TO_PHYS(ml2);
7966 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
7967 
7968 	/* Address the range points at */
7969 	phys = PTE_TO_PHYS(oldl1);
7970 	/* The attributed from the old l1 table to be copied */
7971 	newl2 = oldl1 & ATTR_MASK;
7972 
7973 	/* Create the new entries */
7974 	for (i = 0; i < Ln_ENTRIES; i++) {
7975 		l2[i] = newl2 | phys;
7976 		phys += L2_SIZE;
7977 	}
7978 	KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
7979 	    ("Invalid l2 page (%lx != %lx)", l2[0],
7980 	    (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
7981 
7982 	if (tmpl1 != 0) {
7983 		pmap_kenter(tmpl1, PAGE_SIZE,
7984 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
7985 		    VM_MEMATTR_WRITE_BACK);
7986 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
7987 	}
7988 
7989 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
7990 
7991 fail:
7992 	if (tmpl1 != 0) {
7993 		pmap_kremove(tmpl1);
7994 		kva_free(tmpl1, PAGE_SIZE);
7995 	}
7996 
7997 	return (l2);
7998 }
7999 
8000 static void
pmap_fill_l3(pt_entry_t * firstl3,pt_entry_t newl3)8001 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8002 {
8003 	pt_entry_t *l3;
8004 
8005 	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8006 		*l3 = newl3;
8007 		newl3 += L3_SIZE;
8008 	}
8009 }
8010 
8011 static void
pmap_demote_l2_check(pt_entry_t * firstl3p __unused,pt_entry_t newl3e __unused)8012 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8013 {
8014 #ifdef INVARIANTS
8015 #ifdef DIAGNOSTIC
8016 	pt_entry_t *xl3p, *yl3p;
8017 
8018 	for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8019 	    xl3p++, newl3e += PAGE_SIZE) {
8020 		if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8021 			printf("pmap_demote_l2: xl3e %zd and newl3e map "
8022 			    "different pages: found %#lx, expected %#lx\n",
8023 			    xl3p - firstl3p, pmap_load(xl3p), newl3e);
8024 			printf("page table dump\n");
8025 			for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8026 			    yl3p++) {
8027 				printf("%zd %#lx\n", yl3p - firstl3p,
8028 				    pmap_load(yl3p));
8029 			}
8030 			panic("firstpte");
8031 		}
8032 	}
8033 #else
8034 	KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8035 	    ("pmap_demote_l2: firstl3 and newl3e map different physical"
8036 	    " addresses"));
8037 #endif
8038 #endif
8039 }
8040 
8041 static void
pmap_demote_l2_abort(pmap_t pmap,vm_offset_t va,pt_entry_t * l2,struct rwlock ** lockp)8042 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8043     struct rwlock **lockp)
8044 {
8045 	struct spglist free;
8046 
8047 	SLIST_INIT(&free);
8048 	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
8049 	    lockp);
8050 	vm_page_free_pages_toq(&free, true);
8051 }
8052 
8053 /*
8054  * Create an L3 table to map all addresses within an L2 mapping.
8055  */
8056 static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap,pt_entry_t * l2,vm_offset_t va,struct rwlock ** lockp)8057 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8058     struct rwlock **lockp)
8059 {
8060 	pt_entry_t *l3, newl3, oldl2;
8061 	vm_offset_t tmpl2;
8062 	vm_paddr_t l3phys;
8063 	vm_page_t ml3;
8064 
8065 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8066 	PMAP_ASSERT_STAGE1(pmap);
8067 	KASSERT(ADDR_IS_CANONICAL(va),
8068 	    ("%s: Address not in canonical form: %lx", __func__, va));
8069 
8070 	l3 = NULL;
8071 	oldl2 = pmap_load(l2);
8072 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8073 	    ("pmap_demote_l2: Demoting a non-block entry"));
8074 	KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8075 	    ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8076 	va &= ~L2_OFFSET;
8077 
8078 	tmpl2 = 0;
8079 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8080 		tmpl2 = kva_alloc(PAGE_SIZE);
8081 		if (tmpl2 == 0)
8082 			return (NULL);
8083 	}
8084 
8085 	/*
8086 	 * Invalidate the 2MB page mapping and return "failure" if the
8087 	 * mapping was never accessed.
8088 	 */
8089 	if ((oldl2 & ATTR_AF) == 0) {
8090 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8091 		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
8092 		pmap_demote_l2_abort(pmap, va, l2, lockp);
8093 		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
8094 		    va, pmap);
8095 		goto fail;
8096 	}
8097 
8098 	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8099 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8100 		    ("pmap_demote_l2: page table page for a wired mapping"
8101 		    " is missing"));
8102 
8103 		/*
8104 		 * If the page table page is missing and the mapping
8105 		 * is for a kernel address, the mapping must belong to
8106 		 * either the direct map or the early kernel memory.
8107 		 * Page table pages are preallocated for every other
8108 		 * part of the kernel address space, so the direct map
8109 		 * region and early kernel memory are the only parts of the
8110 		 * kernel address space that must be handled here.
8111 		 */
8112 		KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
8113 		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8114 		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
8115 
8116 		/*
8117 		 * If the 2MB page mapping belongs to the direct map
8118 		 * region of the kernel's address space, then the page
8119 		 * allocation request specifies the highest possible
8120 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
8121 		 * priority is normal.
8122 		 */
8123 		ml3 = vm_page_alloc_noobj(
8124 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8125 		    VM_ALLOC_WIRED);
8126 
8127 		/*
8128 		 * If the allocation of the new page table page fails,
8129 		 * invalidate the 2MB page mapping and return "failure".
8130 		 */
8131 		if (ml3 == NULL) {
8132 			pmap_demote_l2_abort(pmap, va, l2, lockp);
8133 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8134 			    " in pmap %p", va, pmap);
8135 			goto fail;
8136 		}
8137 		ml3->pindex = pmap_l2_pindex(va);
8138 
8139 		if (!ADDR_IS_KERNEL(va)) {
8140 			ml3->ref_count = NL3PG;
8141 			pmap_resident_count_inc(pmap, 1);
8142 		}
8143 	}
8144 	l3phys = VM_PAGE_TO_PHYS(ml3);
8145 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
8146 	newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8147 	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8148 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8149 	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8150 
8151 	/*
8152 	 * If the PTP is not leftover from an earlier promotion or it does not
8153 	 * have ATTR_AF set in every L3E, then fill it.  The new L3Es will all
8154 	 * have ATTR_AF set.
8155 	 *
8156 	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8157 	 * performs a dsb().  That dsb() ensures that the stores for filling
8158 	 * "l3" are visible before "l3" is added to the page table.
8159 	 */
8160 	if (!vm_page_all_valid(ml3))
8161 		pmap_fill_l3(l3, newl3);
8162 
8163 	pmap_demote_l2_check(l3, newl3);
8164 
8165 	/*
8166 	 * If the mapping has changed attributes, update the L3Es.
8167 	 */
8168 	if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8169 		pmap_fill_l3(l3, newl3);
8170 
8171 	/*
8172 	 * Map the temporary page so we don't lose access to the l2 table.
8173 	 */
8174 	if (tmpl2 != 0) {
8175 		pmap_kenter(tmpl2, PAGE_SIZE,
8176 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
8177 		    VM_MEMATTR_WRITE_BACK);
8178 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8179 	}
8180 
8181 	/*
8182 	 * The spare PV entries must be reserved prior to demoting the
8183 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
8184 	 * of the L2 and the PV lists will be inconsistent, which can result
8185 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8186 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8187 	 * PV entry for the 2MB page mapping that is being demoted.
8188 	 */
8189 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
8190 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8191 
8192 	/*
8193 	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8194 	 * the 2MB page mapping.
8195 	 */
8196 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8197 
8198 	/*
8199 	 * Demote the PV entry.
8200 	 */
8201 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
8202 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8203 
8204 	atomic_add_long(&pmap_l2_demotions, 1);
8205 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8206 	    " in pmap %p %lx", va, pmap, l3[0]);
8207 
8208 fail:
8209 	if (tmpl2 != 0) {
8210 		pmap_kremove(tmpl2);
8211 		kva_free(tmpl2, PAGE_SIZE);
8212 	}
8213 
8214 	return (l3);
8215 
8216 }
8217 
8218 static pt_entry_t *
pmap_demote_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)8219 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8220 {
8221 	struct rwlock *lock;
8222 	pt_entry_t *l3;
8223 
8224 	lock = NULL;
8225 	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8226 	if (lock != NULL)
8227 		rw_wunlock(lock);
8228 	return (l3);
8229 }
8230 
8231 /*
8232  * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8233  */
8234 static bool
pmap_demote_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va)8235 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8236 {
8237 	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8238 	vm_offset_t tmpl3;
8239 	register_t intr;
8240 
8241 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8242 	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8243 	    sizeof(pt_entry_t)) - 1));
8244 	l3c_end = l3c_start + L3C_ENTRIES;
8245 	tmpl3 = 0;
8246 	if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8247 	    (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8248 		tmpl3 = kva_alloc(PAGE_SIZE);
8249 		if (tmpl3 == 0)
8250 			return (false);
8251 		pmap_kenter(tmpl3, PAGE_SIZE,
8252 		    DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
8253 		    VM_MEMATTR_WRITE_BACK);
8254 		l3c_start = (pt_entry_t *)(tmpl3 +
8255 		    ((vm_offset_t)l3c_start & PAGE_MASK));
8256 		l3c_end = (pt_entry_t *)(tmpl3 +
8257 		    ((vm_offset_t)l3c_end & PAGE_MASK));
8258 	}
8259 	mask = 0;
8260 	nbits = ATTR_DESCR_VALID;
8261 	intr = intr_disable();
8262 
8263 	/*
8264 	 * Break the mappings.
8265 	 */
8266 	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8267 		/*
8268 		 * Clear the mapping's contiguous and valid bits, but leave
8269 		 * the rest of the entry unchanged, so that a lockless,
8270 		 * concurrent pmap_kextract() can still lookup the physical
8271 		 * address.
8272 		 */
8273 		l3e = pmap_load(tl3p);
8274 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8275 		    ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
8276 		KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8277 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8278 		    ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
8279 		while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
8280 		    ATTR_DESCR_VALID)))
8281 			cpu_spinwait();
8282 
8283 		/*
8284 		 * Hardware accessed and dirty bit maintenance might only
8285 		 * update a single L3 entry, so we must combine the accessed
8286 		 * and dirty bits from this entire set of contiguous L3
8287 		 * entries.
8288 		 */
8289 		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8290 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8291 			mask = ATTR_S1_AP_RW_BIT;
8292 		nbits |= l3e & ATTR_AF;
8293 	}
8294 	if ((nbits & ATTR_AF) != 0) {
8295 		pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
8296 		    ~L3C_OFFSET, true);
8297 	}
8298 
8299 	/*
8300 	 * Remake the mappings, updating the accessed and dirty bits.
8301 	 */
8302 	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8303 		l3e = pmap_load(tl3p);
8304 		while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
8305 			cpu_spinwait();
8306 	}
8307 	dsb(ishst);
8308 
8309 	intr_restore(intr);
8310 	if (tmpl3 != 0) {
8311 		pmap_kremove(tmpl3);
8312 		kva_free(tmpl3, PAGE_SIZE);
8313 	}
8314 	counter_u64_add(pmap_l3c_demotions, 1);
8315 	CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
8316 	    va, pmap);
8317 	return (true);
8318 }
8319 
8320 /*
8321  * Accumulate the accessed and dirty bits within a L3C superpage and
8322  * return the specified PTE with them applied correctly.
8323  */
8324 static pt_entry_t
pmap_load_l3c(pt_entry_t * l3p)8325 pmap_load_l3c(pt_entry_t *l3p)
8326 {
8327 	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8328 
8329 	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8330 	    sizeof(pt_entry_t)) - 1));
8331 	l3c_end = l3c_start + L3C_ENTRIES;
8332 	mask = 0;
8333 	nbits = 0;
8334 	/* Iterate over each mapping in the superpage. */
8335 	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8336 		l3e = pmap_load(tl3p);
8337 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8338 		    ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
8339 		/* Update mask if the current page has its dirty bit set. */
8340 		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8341 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8342 			mask = ATTR_S1_AP_RW_BIT;
8343 		/* Update nbits if the accessed bit is set. */
8344 		nbits |= l3e & ATTR_AF;
8345 	}
8346 	return ((pmap_load(l3p) & ~mask) | nbits);
8347 }
8348 
8349 /*
8350  * Perform the pmap work for mincore(2).  If the page is not both referenced and
8351  * modified by this pmap, returns its physical address so that the caller can
8352  * find other mappings.
8353  */
8354 int
pmap_mincore(pmap_t pmap,vm_offset_t addr,vm_paddr_t * pap)8355 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
8356 {
8357 	pt_entry_t *pte, tpte;
8358 	vm_paddr_t mask, pa;
8359 	int lvl, val;
8360 	bool managed;
8361 
8362 	PMAP_ASSERT_STAGE1(pmap);
8363 	PMAP_LOCK(pmap);
8364 	pte = pmap_pte(pmap, addr, &lvl);
8365 	if (pte != NULL) {
8366 		tpte = pmap_load(pte);
8367 
8368 		switch (lvl) {
8369 		case 3:
8370 			mask = L3_OFFSET;
8371 			break;
8372 		case 2:
8373 			mask = L2_OFFSET;
8374 			break;
8375 		case 1:
8376 			mask = L1_OFFSET;
8377 			break;
8378 		default:
8379 			panic("pmap_mincore: invalid level %d", lvl);
8380 		}
8381 
8382 		managed = (tpte & ATTR_SW_MANAGED) != 0;
8383 		val = MINCORE_INCORE;
8384 		if (lvl != 3)
8385 			val |= MINCORE_PSIND(3 - lvl);
8386 		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
8387 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
8388 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8389 		if ((tpte & ATTR_AF) == ATTR_AF)
8390 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8391 
8392 		pa = PTE_TO_PHYS(tpte) | (addr & mask);
8393 	} else {
8394 		managed = false;
8395 		val = 0;
8396 	}
8397 
8398 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8399 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
8400 		*pap = pa;
8401 	}
8402 	PMAP_UNLOCK(pmap);
8403 	return (val);
8404 }
8405 
8406 /*
8407  * Garbage collect every ASID that is neither active on a processor nor
8408  * reserved.
8409  */
8410 static void
pmap_reset_asid_set(pmap_t pmap)8411 pmap_reset_asid_set(pmap_t pmap)
8412 {
8413 	pmap_t curpmap;
8414 	int asid, cpuid, epoch;
8415 	struct asid_set *set;
8416 	enum pmap_stage stage;
8417 
8418 	set = pmap->pm_asid_set;
8419 	stage = pmap->pm_stage;
8420 
8421 	set = pmap->pm_asid_set;
8422 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8423 	mtx_assert(&set->asid_set_mutex, MA_OWNED);
8424 
8425 	/*
8426 	 * Ensure that the store to asid_epoch is globally visible before the
8427 	 * loads from pc_curpmap are performed.
8428 	 */
8429 	epoch = set->asid_epoch + 1;
8430 	if (epoch == INT_MAX)
8431 		epoch = 0;
8432 	set->asid_epoch = epoch;
8433 	dsb(ishst);
8434 	if (stage == PM_STAGE1) {
8435 		__asm __volatile("tlbi vmalle1is");
8436 	} else {
8437 		KASSERT(pmap_clean_stage2_tlbi != NULL,
8438 		    ("%s: Unset stage 2 tlb invalidation callback\n",
8439 		    __func__));
8440 		pmap_clean_stage2_tlbi();
8441 	}
8442 	dsb(ish);
8443 	bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
8444 	    set->asid_set_size - 1);
8445 	CPU_FOREACH(cpuid) {
8446 		if (cpuid == curcpu)
8447 			continue;
8448 		if (stage == PM_STAGE1) {
8449 			curpmap = pcpu_find(cpuid)->pc_curpmap;
8450 			PMAP_ASSERT_STAGE1(pmap);
8451 		} else {
8452 			curpmap = pcpu_find(cpuid)->pc_curvmpmap;
8453 			if (curpmap == NULL)
8454 				continue;
8455 			PMAP_ASSERT_STAGE2(pmap);
8456 		}
8457 		KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
8458 		asid = COOKIE_TO_ASID(curpmap->pm_cookie);
8459 		if (asid == -1)
8460 			continue;
8461 		bit_set(set->asid_set, asid);
8462 		curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
8463 	}
8464 }
8465 
8466 /*
8467  * Allocate a new ASID for the specified pmap.
8468  */
8469 static void
pmap_alloc_asid(pmap_t pmap)8470 pmap_alloc_asid(pmap_t pmap)
8471 {
8472 	struct asid_set *set;
8473 	int new_asid;
8474 
8475 	set = pmap->pm_asid_set;
8476 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8477 
8478 	mtx_lock_spin(&set->asid_set_mutex);
8479 
8480 	/*
8481 	 * While this processor was waiting to acquire the asid set mutex,
8482 	 * pmap_reset_asid_set() running on another processor might have
8483 	 * updated this pmap's cookie to the current epoch.  In which case, we
8484 	 * don't need to allocate a new ASID.
8485 	 */
8486 	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
8487 		goto out;
8488 
8489 	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
8490 	    &new_asid);
8491 	if (new_asid == -1) {
8492 		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8493 		    set->asid_next, &new_asid);
8494 		if (new_asid == -1) {
8495 			pmap_reset_asid_set(pmap);
8496 			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8497 			    set->asid_set_size, &new_asid);
8498 			KASSERT(new_asid != -1, ("ASID allocation failure"));
8499 		}
8500 	}
8501 	bit_set(set->asid_set, new_asid);
8502 	set->asid_next = new_asid + 1;
8503 	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
8504 out:
8505 	mtx_unlock_spin(&set->asid_set_mutex);
8506 }
8507 
8508 static uint64_t __read_mostly ttbr_flags;
8509 
8510 /*
8511  * Compute the value that should be stored in ttbr0 to activate the specified
8512  * pmap.  This value may change from time to time.
8513  */
8514 uint64_t
pmap_to_ttbr0(pmap_t pmap)8515 pmap_to_ttbr0(pmap_t pmap)
8516 {
8517 	uint64_t ttbr;
8518 
8519 	ttbr = pmap->pm_ttbr;
8520 	ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
8521 	ttbr |= ttbr_flags;
8522 
8523 	return (ttbr);
8524 }
8525 
8526 static void
pmap_set_cnp(void * arg)8527 pmap_set_cnp(void *arg)
8528 {
8529 	uint64_t ttbr0, ttbr1;
8530 	u_int cpuid;
8531 
8532 	cpuid = *(u_int *)arg;
8533 	if (cpuid == curcpu) {
8534 		/*
8535 		 * Set the flags while all CPUs are handling the
8536 		 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls
8537 		 * to pmap_to_ttbr0 after this will have the CnP flag set.
8538 		 * The dsb after invalidating the TLB will act as a barrier
8539 		 * to ensure all CPUs can observe this change.
8540 		 */
8541 		ttbr_flags |= TTBR_CnP;
8542 	}
8543 
8544 	ttbr0 = READ_SPECIALREG(ttbr0_el1);
8545 	ttbr0 |= TTBR_CnP;
8546 
8547 	ttbr1 = READ_SPECIALREG(ttbr1_el1);
8548 	ttbr1 |= TTBR_CnP;
8549 
8550 	/* Update ttbr{0,1}_el1 with the CnP flag */
8551 	WRITE_SPECIALREG(ttbr0_el1, ttbr0);
8552 	WRITE_SPECIALREG(ttbr1_el1, ttbr1);
8553 	isb();
8554 	__asm __volatile("tlbi vmalle1is");
8555 	dsb(ish);
8556 	isb();
8557 }
8558 
8559 /*
8560  * Defer enabling some features until we have read the ID registers to know
8561  * if they are supported on all CPUs.
8562  */
8563 static void
pmap_init_mp(void * dummy __unused)8564 pmap_init_mp(void *dummy __unused)
8565 {
8566 	uint64_t reg;
8567 
8568 	if (get_kernel_reg(ID_AA64PFR1_EL1, &reg)) {
8569 		if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
8570 			if (bootverbose)
8571 				printf("Enabling BTI\n");
8572 			pmap_bti_support = true;
8573 
8574 			pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
8575 			    sizeof(struct rs_el), NULL, NULL, NULL, NULL,
8576 			    UMA_ALIGN_PTR, 0);
8577 		}
8578 	}
8579 }
8580 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
8581 
8582 /*
8583  * Defer enabling CnP until we have read the ID registers to know if it's
8584  * supported on all CPUs.
8585  */
8586 static void
pmap_init_cnp(void * dummy __unused)8587 pmap_init_cnp(void *dummy __unused)
8588 {
8589 	uint64_t reg;
8590 	u_int cpuid;
8591 
8592 	if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
8593 		return;
8594 
8595 	if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
8596 		if (bootverbose)
8597 			printf("Enabling CnP\n");
8598 		cpuid = curcpu;
8599 		smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
8600 	}
8601 
8602 }
8603 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
8604 
8605 static bool
pmap_activate_int(pmap_t pmap)8606 pmap_activate_int(pmap_t pmap)
8607 {
8608 	struct asid_set *set;
8609 	int epoch;
8610 
8611 	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
8612 	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
8613 
8614 	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
8615 	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
8616 		/*
8617 		 * Handle the possibility that the old thread was preempted
8618 		 * after an "ic" or "tlbi" instruction but before it performed
8619 		 * a "dsb" instruction.  If the old thread migrates to a new
8620 		 * processor, its completion of a "dsb" instruction on that
8621 		 * new processor does not guarantee that the "ic" or "tlbi"
8622 		 * instructions performed on the old processor have completed.
8623 		 */
8624 		dsb(ish);
8625 		return (false);
8626 	}
8627 
8628 	set = pmap->pm_asid_set;
8629 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8630 
8631 	/*
8632 	 * Ensure that the store to curpmap is globally visible before the
8633 	 * load from asid_epoch is performed.
8634 	 */
8635 	if (pmap->pm_stage == PM_STAGE1)
8636 		PCPU_SET(curpmap, pmap);
8637 	else
8638 		PCPU_SET(curvmpmap, pmap);
8639 	dsb(ish);
8640 	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
8641 	if (epoch >= 0 && epoch != set->asid_epoch)
8642 		pmap_alloc_asid(pmap);
8643 
8644 	if (pmap->pm_stage == PM_STAGE1) {
8645 		set_ttbr0(pmap_to_ttbr0(pmap));
8646 		if (PCPU_GET(bcast_tlbi_workaround) != 0)
8647 			invalidate_local_icache();
8648 	}
8649 	return (true);
8650 }
8651 
8652 void
pmap_activate_vm(pmap_t pmap)8653 pmap_activate_vm(pmap_t pmap)
8654 {
8655 
8656 	PMAP_ASSERT_STAGE2(pmap);
8657 
8658 	(void)pmap_activate_int(pmap);
8659 }
8660 
8661 void
pmap_activate(struct thread * td)8662 pmap_activate(struct thread *td)
8663 {
8664 	pmap_t	pmap;
8665 
8666 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
8667 	PMAP_ASSERT_STAGE1(pmap);
8668 	critical_enter();
8669 	(void)pmap_activate_int(pmap);
8670 	critical_exit();
8671 }
8672 
8673 /*
8674  * Activate the thread we are switching to.
8675  * To simplify the assembly in cpu_throw return the new threads pcb.
8676  */
8677 struct pcb *
pmap_switch(struct thread * new)8678 pmap_switch(struct thread *new)
8679 {
8680 	pcpu_bp_harden bp_harden;
8681 	struct pcb *pcb;
8682 
8683 	/* Store the new curthread */
8684 	PCPU_SET(curthread, new);
8685 
8686 	/* And the new pcb */
8687 	pcb = new->td_pcb;
8688 	PCPU_SET(curpcb, pcb);
8689 
8690 	/*
8691 	 * TODO: We may need to flush the cache here if switching
8692 	 * to a user process.
8693 	 */
8694 
8695 	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
8696 		/*
8697 		 * Stop userspace from training the branch predictor against
8698 		 * other processes. This will call into a CPU specific
8699 		 * function that clears the branch predictor state.
8700 		 */
8701 		bp_harden = PCPU_GET(bp_harden);
8702 		if (bp_harden != NULL)
8703 			bp_harden();
8704 	}
8705 
8706 	return (pcb);
8707 }
8708 
8709 void
pmap_sync_icache(pmap_t pmap,vm_offset_t va,vm_size_t sz)8710 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
8711 {
8712 
8713 	PMAP_ASSERT_STAGE1(pmap);
8714 	KASSERT(ADDR_IS_CANONICAL(va),
8715 	    ("%s: Address not in canonical form: %lx", __func__, va));
8716 
8717 	if (ADDR_IS_KERNEL(va)) {
8718 		cpu_icache_sync_range((void *)va, sz);
8719 	} else {
8720 		u_int len, offset;
8721 		vm_paddr_t pa;
8722 
8723 		/* Find the length of data in this page to flush */
8724 		offset = va & PAGE_MASK;
8725 		len = imin(PAGE_SIZE - offset, sz);
8726 
8727 		while (sz != 0) {
8728 			/* Extract the physical address & find it in the DMAP */
8729 			pa = pmap_extract(pmap, va);
8730 			if (pa != 0)
8731 				cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
8732 				    len);
8733 
8734 			/* Move to the next page */
8735 			sz -= len;
8736 			va += len;
8737 			/* Set the length for the next iteration */
8738 			len = imin(PAGE_SIZE, sz);
8739 		}
8740 	}
8741 }
8742 
8743 static int
pmap_stage2_fault(pmap_t pmap,uint64_t esr,uint64_t far)8744 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
8745 {
8746 	pd_entry_t *pdep;
8747 	pt_entry_t *ptep, pte;
8748 	int rv, lvl, dfsc;
8749 
8750 	PMAP_ASSERT_STAGE2(pmap);
8751 	rv = KERN_FAILURE;
8752 
8753 	/* Data and insn aborts use same encoding for FSC field. */
8754 	dfsc = esr & ISS_DATA_DFSC_MASK;
8755 	switch (dfsc) {
8756 	case ISS_DATA_DFSC_TF_L0:
8757 	case ISS_DATA_DFSC_TF_L1:
8758 	case ISS_DATA_DFSC_TF_L2:
8759 	case ISS_DATA_DFSC_TF_L3:
8760 		PMAP_LOCK(pmap);
8761 		pdep = pmap_pde(pmap, far, &lvl);
8762 		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
8763 			PMAP_UNLOCK(pmap);
8764 			break;
8765 		}
8766 
8767 		switch (lvl) {
8768 		case 0:
8769 			ptep = pmap_l0_to_l1(pdep, far);
8770 			break;
8771 		case 1:
8772 			ptep = pmap_l1_to_l2(pdep, far);
8773 			break;
8774 		case 2:
8775 			ptep = pmap_l2_to_l3(pdep, far);
8776 			break;
8777 		default:
8778 			panic("%s: Invalid pde level %d", __func__,lvl);
8779 		}
8780 		goto fault_exec;
8781 
8782 	case ISS_DATA_DFSC_AFF_L1:
8783 	case ISS_DATA_DFSC_AFF_L2:
8784 	case ISS_DATA_DFSC_AFF_L3:
8785 		PMAP_LOCK(pmap);
8786 		ptep = pmap_pte(pmap, far, &lvl);
8787 fault_exec:
8788 		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
8789 			if (icache_vmid) {
8790 				pmap_invalidate_vpipt_icache();
8791 			} else {
8792 				/*
8793 				 * If accessing an executable page invalidate
8794 				 * the I-cache so it will be valid when we
8795 				 * continue execution in the guest. The D-cache
8796 				 * is assumed to already be clean to the Point
8797 				 * of Coherency.
8798 				 */
8799 				if ((pte & ATTR_S2_XN_MASK) !=
8800 				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
8801 					invalidate_icache();
8802 				}
8803 			}
8804 			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
8805 			rv = KERN_SUCCESS;
8806 		}
8807 		PMAP_UNLOCK(pmap);
8808 		break;
8809 	}
8810 
8811 	return (rv);
8812 }
8813 
8814 int
pmap_fault(pmap_t pmap,uint64_t esr,uint64_t far)8815 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
8816 {
8817 	pt_entry_t pte, *ptep;
8818 	register_t intr;
8819 	uint64_t ec, par;
8820 	int lvl, rv;
8821 
8822 	rv = KERN_FAILURE;
8823 
8824 	ec = ESR_ELx_EXCEPTION(esr);
8825 	switch (ec) {
8826 	case EXCP_INSN_ABORT_L:
8827 	case EXCP_INSN_ABORT:
8828 	case EXCP_DATA_ABORT_L:
8829 	case EXCP_DATA_ABORT:
8830 		break;
8831 	default:
8832 		return (rv);
8833 	}
8834 
8835 	if (pmap->pm_stage == PM_STAGE2)
8836 		return (pmap_stage2_fault(pmap, esr, far));
8837 
8838 	/* Data and insn aborts use same encoding for FSC field. */
8839 	switch (esr & ISS_DATA_DFSC_MASK) {
8840 	case ISS_DATA_DFSC_AFF_L1:
8841 	case ISS_DATA_DFSC_AFF_L2:
8842 	case ISS_DATA_DFSC_AFF_L3:
8843 		PMAP_LOCK(pmap);
8844 		ptep = pmap_pte(pmap, far, &lvl);
8845 		if (ptep != NULL) {
8846 			pmap_set_bits(ptep, ATTR_AF);
8847 			rv = KERN_SUCCESS;
8848 			/*
8849 			 * XXXMJ as an optimization we could mark the entry
8850 			 * dirty if this is a write fault.
8851 			 */
8852 		}
8853 		PMAP_UNLOCK(pmap);
8854 		break;
8855 	case ISS_DATA_DFSC_PF_L1:
8856 	case ISS_DATA_DFSC_PF_L2:
8857 	case ISS_DATA_DFSC_PF_L3:
8858 		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
8859 		    (esr & ISS_DATA_WnR) == 0)
8860 			return (rv);
8861 		PMAP_LOCK(pmap);
8862 		ptep = pmap_pte(pmap, far, &lvl);
8863 		if (ptep != NULL &&
8864 		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
8865 			if ((pte & ATTR_S1_AP_RW_BIT) ==
8866 			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
8867 				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
8868 				pmap_s1_invalidate_page(pmap, far, true);
8869 			}
8870 			rv = KERN_SUCCESS;
8871 		}
8872 		PMAP_UNLOCK(pmap);
8873 		break;
8874 	case ISS_DATA_DFSC_TF_L0:
8875 	case ISS_DATA_DFSC_TF_L1:
8876 	case ISS_DATA_DFSC_TF_L2:
8877 	case ISS_DATA_DFSC_TF_L3:
8878 		/*
8879 		 * Retry the translation.  A break-before-make sequence can
8880 		 * produce a transient fault.
8881 		 */
8882 		if (pmap == kernel_pmap) {
8883 			/*
8884 			 * The translation fault may have occurred within a
8885 			 * critical section.  Therefore, we must check the
8886 			 * address without acquiring the kernel pmap's lock.
8887 			 */
8888 			if (pmap_klookup(far, NULL))
8889 				rv = KERN_SUCCESS;
8890 		} else {
8891 			PMAP_LOCK(pmap);
8892 			/* Ask the MMU to check the address. */
8893 			intr = intr_disable();
8894 			par = arm64_address_translate_s1e0r(far);
8895 			intr_restore(intr);
8896 			PMAP_UNLOCK(pmap);
8897 
8898 			/*
8899 			 * If the translation was successful, then we can
8900 			 * return success to the trap handler.
8901 			 */
8902 			if (PAR_SUCCESS(par))
8903 				rv = KERN_SUCCESS;
8904 		}
8905 		break;
8906 	}
8907 
8908 	return (rv);
8909 }
8910 
8911 /*
8912  *	Increase the starting virtual address of the given mapping if a
8913  *	different alignment might result in more superpage mappings.
8914  */
8915 void
pmap_align_superpage(vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t size)8916 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
8917     vm_offset_t *addr, vm_size_t size)
8918 {
8919 	vm_offset_t superpage_offset;
8920 
8921 	if (size < L2_SIZE)
8922 		return;
8923 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
8924 		offset += ptoa(object->pg_color);
8925 	superpage_offset = offset & L2_OFFSET;
8926 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
8927 	    (*addr & L2_OFFSET) == superpage_offset)
8928 		return;
8929 	if ((*addr & L2_OFFSET) < superpage_offset)
8930 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
8931 	else
8932 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
8933 }
8934 
8935 /**
8936  * Get the kernel virtual address of a set of physical pages. If there are
8937  * physical addresses not covered by the DMAP perform a transient mapping
8938  * that will be removed when calling pmap_unmap_io_transient.
8939  *
8940  * \param page        The pages the caller wishes to obtain the virtual
8941  *                    address on the kernel memory map.
8942  * \param vaddr       On return contains the kernel virtual memory address
8943  *                    of the pages passed in the page parameter.
8944  * \param count       Number of pages passed in.
8945  * \param can_fault   true if the thread using the mapped pages can take
8946  *                    page faults, false otherwise.
8947  *
8948  * \returns true if the caller must call pmap_unmap_io_transient when
8949  *          finished or false otherwise.
8950  *
8951  */
8952 bool
pmap_map_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,bool can_fault)8953 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
8954     bool can_fault)
8955 {
8956 	vm_paddr_t paddr;
8957 	bool needs_mapping;
8958 	int error __diagused, i;
8959 
8960 	/*
8961 	 * Allocate any KVA space that we need, this is done in a separate
8962 	 * loop to prevent calling vmem_alloc while pinned.
8963 	 */
8964 	needs_mapping = false;
8965 	for (i = 0; i < count; i++) {
8966 		paddr = VM_PAGE_TO_PHYS(page[i]);
8967 		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
8968 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
8969 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
8970 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
8971 			needs_mapping = true;
8972 		} else {
8973 			vaddr[i] = PHYS_TO_DMAP(paddr);
8974 		}
8975 	}
8976 
8977 	/* Exit early if everything is covered by the DMAP */
8978 	if (!needs_mapping)
8979 		return (false);
8980 
8981 	if (!can_fault)
8982 		sched_pin();
8983 	for (i = 0; i < count; i++) {
8984 		paddr = VM_PAGE_TO_PHYS(page[i]);
8985 		if (!PHYS_IN_DMAP(paddr)) {
8986 			panic(
8987 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
8988 		}
8989 	}
8990 
8991 	return (needs_mapping);
8992 }
8993 
8994 void
pmap_unmap_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,bool can_fault)8995 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
8996     bool can_fault)
8997 {
8998 	vm_paddr_t paddr;
8999 	int i;
9000 
9001 	if (!can_fault)
9002 		sched_unpin();
9003 	for (i = 0; i < count; i++) {
9004 		paddr = VM_PAGE_TO_PHYS(page[i]);
9005 		if (!PHYS_IN_DMAP(paddr)) {
9006 			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
9007 		}
9008 	}
9009 }
9010 
9011 bool
pmap_is_valid_memattr(pmap_t pmap __unused,vm_memattr_t mode)9012 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
9013 {
9014 
9015 	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
9016 }
9017 
9018 static void *
bti_dup_range(void * ctx __unused,void * data)9019 bti_dup_range(void *ctx __unused, void *data)
9020 {
9021 	struct rs_el *node, *new_node;
9022 
9023 	new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9024 	if (new_node == NULL)
9025 		return (NULL);
9026 	node = data;
9027 	memcpy(new_node, node, sizeof(*node));
9028 	return (new_node);
9029 }
9030 
9031 static void
bti_free_range(void * ctx __unused,void * node)9032 bti_free_range(void *ctx __unused, void *node)
9033 {
9034 
9035 	uma_zfree(pmap_bti_ranges_zone, node);
9036 }
9037 
9038 static int
pmap_bti_assign(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9039 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9040 {
9041 	struct rs_el *rs;
9042 	int error;
9043 
9044 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9045 	PMAP_ASSERT_STAGE1(pmap);
9046 	MPASS(pmap->pm_bti != NULL);
9047 	rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9048 	if (rs == NULL)
9049 		return (ENOMEM);
9050 	error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
9051 	if (error != 0)
9052 		uma_zfree(pmap_bti_ranges_zone, rs);
9053 	return (error);
9054 }
9055 
9056 static void
pmap_bti_deassign_all(pmap_t pmap)9057 pmap_bti_deassign_all(pmap_t pmap)
9058 {
9059 
9060 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9061 	if (pmap->pm_bti != NULL)
9062 		rangeset_remove_all(pmap->pm_bti);
9063 }
9064 
9065 static bool
pmap_bti_same(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9066 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9067 {
9068 	struct rs_el *prev_rs, *rs;
9069 	vm_offset_t va;
9070 
9071 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9072 	KASSERT(ADDR_IS_CANONICAL(sva),
9073 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
9074 	KASSERT(ADDR_IS_CANONICAL(eva),
9075 	    ("%s: End address not in canonical form: %lx", __func__, eva));
9076 
9077 	if (pmap->pm_bti == NULL || ADDR_IS_KERNEL(sva))
9078 		return (true);
9079 	MPASS(!ADDR_IS_KERNEL(eva));
9080 	for (va = sva; va < eva; prev_rs = rs) {
9081 		rs = rangeset_lookup(pmap->pm_bti, va);
9082 		if (va == sva)
9083 			prev_rs = rs;
9084 		else if ((rs == NULL) ^ (prev_rs == NULL))
9085 			return (false);
9086 		if (rs == NULL) {
9087 			va += PAGE_SIZE;
9088 			continue;
9089 		}
9090 		va = rs->re_end;
9091 	}
9092 	return (true);
9093 }
9094 
9095 static pt_entry_t
pmap_pte_bti(pmap_t pmap,vm_offset_t va)9096 pmap_pte_bti(pmap_t pmap, vm_offset_t va)
9097 {
9098 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9099 	MPASS(ADDR_IS_CANONICAL(va));
9100 
9101 	if (pmap->pm_stage != PM_STAGE1)
9102 		return (0);
9103 	if (pmap == kernel_pmap)
9104 		return (ATTR_KERN_GP);
9105 	if (pmap->pm_bti != NULL && rangeset_lookup(pmap->pm_bti, va) != NULL)
9106 		return (ATTR_S1_GP);
9107 	return (0);
9108 }
9109 
9110 static void
pmap_bti_on_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9111 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9112 {
9113 
9114 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9115 	if (pmap->pm_bti != NULL)
9116 		rangeset_remove(pmap->pm_bti, sva, eva);
9117 }
9118 
9119 static int
pmap_bti_copy(pmap_t dst_pmap,pmap_t src_pmap)9120 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
9121 {
9122 
9123 	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
9124 	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
9125 	MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
9126 	MPASS(src_pmap->pm_bti != NULL);
9127 	MPASS(dst_pmap->pm_bti != NULL);
9128 	if (src_pmap->pm_bti->rs_data_ctx == NULL)
9129 		return (0);
9130 	return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
9131 }
9132 
9133 static void
pmap_bti_update_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool set)9134 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
9135 {
9136 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9137 	PMAP_ASSERT_STAGE1(pmap);
9138 
9139 	pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
9140 	    true);
9141 }
9142 
9143 int
pmap_bti_set(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9144 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9145 {
9146 	int error;
9147 
9148 	if (pmap->pm_bti == NULL)
9149 		return (0);
9150 	if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
9151 		return (EINVAL);
9152 	if (pmap->pm_stage != PM_STAGE1)
9153 		return (EINVAL);
9154 	if (eva <= sva || ADDR_IS_KERNEL(eva))
9155 		return (EFAULT);
9156 
9157 	sva = trunc_page(sva);
9158 	eva = round_page(eva);
9159 	for (;;) {
9160 		PMAP_LOCK(pmap);
9161 		error = pmap_bti_assign(pmap, sva, eva);
9162 		if (error == 0)
9163 			pmap_bti_update_range(pmap, sva, eva, true);
9164 		PMAP_UNLOCK(pmap);
9165 		if (error != ENOMEM)
9166 			break;
9167 		vm_wait(NULL);
9168 	}
9169 	return (error);
9170 }
9171 
9172 #if defined(KASAN) || defined(KMSAN)
9173 static pd_entry_t	*pmap_san_early_l2;
9174 
9175 #define	SAN_BOOTSTRAP_L2_SIZE	(1 * L2_SIZE)
9176 #define	SAN_BOOTSTRAP_SIZE	(2 * PAGE_SIZE)
9177 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_l2(void)9178 pmap_san_enter_bootstrap_alloc_l2(void)
9179 {
9180 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
9181 	static size_t offset = 0;
9182 	vm_offset_t addr;
9183 
9184 	if (offset + L2_SIZE > sizeof(bootstrap_data)) {
9185 		panic("%s: out of memory for the bootstrap shadow map L2 entries",
9186 		    __func__);
9187 	}
9188 
9189 	addr = (uintptr_t)&bootstrap_data[offset];
9190 	offset += L2_SIZE;
9191 	return (addr);
9192 }
9193 
9194 /*
9195  * SAN L1 + L2 pages, maybe L3 entries later?
9196  */
9197 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_pages(int npages)9198 pmap_san_enter_bootstrap_alloc_pages(int npages)
9199 {
9200 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
9201 	static size_t offset = 0;
9202 	vm_offset_t addr;
9203 
9204 	if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
9205 		panic("%s: out of memory for the bootstrap shadow map",
9206 		    __func__);
9207 	}
9208 
9209 	addr = (uintptr_t)&bootstrap_data[offset];
9210 	offset += (npages * PAGE_SIZE);
9211 	return (addr);
9212 }
9213 
9214 static void __nosanitizeaddress
pmap_san_enter_bootstrap(void)9215 pmap_san_enter_bootstrap(void)
9216 {
9217 	vm_offset_t freemempos;
9218 
9219 	/* L1, L2 */
9220 	freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
9221 	bs_state.freemempos = freemempos;
9222 	bs_state.va = KASAN_MIN_ADDRESS;
9223 	pmap_bootstrap_l1_table(&bs_state);
9224 	pmap_san_early_l2 = bs_state.l2;
9225 }
9226 
9227 static vm_page_t
pmap_san_enter_alloc_l3(void)9228 pmap_san_enter_alloc_l3(void)
9229 {
9230 	vm_page_t m;
9231 
9232 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
9233 	    VM_ALLOC_ZERO);
9234 	if (m == NULL)
9235 		panic("%s: no memory to grow shadow map", __func__);
9236 	return (m);
9237 }
9238 
9239 static vm_page_t
pmap_san_enter_alloc_l2(void)9240 pmap_san_enter_alloc_l2(void)
9241 {
9242 	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
9243 	    Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
9244 }
9245 
9246 void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)9247 pmap_san_enter(vm_offset_t va)
9248 {
9249 	pd_entry_t *l1, *l2;
9250 	pt_entry_t *l3;
9251 	vm_page_t m;
9252 
9253 	if (virtual_avail == 0) {
9254 		vm_offset_t block;
9255 		int slot;
9256 		bool first;
9257 
9258 		/* Temporary shadow map prior to pmap_bootstrap(). */
9259 		first = pmap_san_early_l2 == NULL;
9260 		if (first)
9261 			pmap_san_enter_bootstrap();
9262 
9263 		l2 = pmap_san_early_l2;
9264 		slot = pmap_l2_index(va);
9265 
9266 		if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
9267 			MPASS(first);
9268 			block = pmap_san_enter_bootstrap_alloc_l2();
9269 			pmap_store(&l2[slot],
9270 			    PHYS_TO_PTE(pmap_early_vtophys(block)) |
9271 			    PMAP_SAN_PTE_BITS | L2_BLOCK);
9272 			dmb(ishst);
9273 		}
9274 
9275 		return;
9276 	}
9277 
9278 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
9279 	l1 = pmap_l1(kernel_pmap, va);
9280 	MPASS(l1 != NULL);
9281 	if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
9282 		m = pmap_san_enter_alloc_l3();
9283 		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
9284 	}
9285 	l2 = pmap_l1_to_l2(l1, va);
9286 	if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
9287 		m = pmap_san_enter_alloc_l2();
9288 		if (m != NULL) {
9289 			pmap_store(l2, VM_PAGE_TO_PTE(m) |
9290 			    PMAP_SAN_PTE_BITS | L2_BLOCK);
9291 		} else {
9292 			m = pmap_san_enter_alloc_l3();
9293 			pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
9294 		}
9295 		dmb(ishst);
9296 	}
9297 	if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
9298 		return;
9299 	l3 = pmap_l2_to_l3(l2, va);
9300 	if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
9301 		return;
9302 	m = pmap_san_enter_alloc_l3();
9303 	pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
9304 	dmb(ishst);
9305 }
9306 #endif /* KASAN || KMSAN */
9307 
9308 /*
9309  * Track a range of the kernel's virtual address space that is contiguous
9310  * in various mapping attributes.
9311  */
9312 struct pmap_kernel_map_range {
9313 	vm_offset_t sva;
9314 	pt_entry_t attrs;
9315 	int l3pages;
9316 	int l3contig;
9317 	int l2blocks;
9318 	int l1blocks;
9319 };
9320 
9321 static void
sysctl_kmaps_dump(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t eva)9322 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
9323     vm_offset_t eva)
9324 {
9325 	const char *mode;
9326 	int index;
9327 
9328 	if (eva <= range->sva)
9329 		return;
9330 
9331 	index = range->attrs & ATTR_S1_IDX_MASK;
9332 	switch (index) {
9333 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
9334 		mode = "DEV-NP";
9335 		break;
9336 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
9337 		mode = "DEV";
9338 		break;
9339 	case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
9340 		mode = "UC";
9341 		break;
9342 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
9343 		mode = "WB";
9344 		break;
9345 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
9346 		mode = "WT";
9347 		break;
9348 	default:
9349 		printf(
9350 		    "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
9351 		    __func__, index, range->sva, eva);
9352 		mode = "??";
9353 		break;
9354 	}
9355 
9356 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d\n",
9357 	    range->sva, eva,
9358 	    (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
9359 	    (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
9360 	    (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
9361 	    (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
9362 	    (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
9363 	    mode, range->l1blocks, range->l2blocks, range->l3contig,
9364 	    range->l3pages);
9365 
9366 	/* Reset to sentinel value. */
9367 	range->sva = 0xfffffffffffffffful;
9368 }
9369 
9370 /*
9371  * Determine whether the attributes specified by a page table entry match those
9372  * being tracked by the current range.
9373  */
9374 static bool
sysctl_kmaps_match(struct pmap_kernel_map_range * range,pt_entry_t attrs)9375 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
9376 {
9377 
9378 	return (range->attrs == attrs);
9379 }
9380 
9381 static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range * range,vm_offset_t va,pt_entry_t attrs)9382 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
9383     pt_entry_t attrs)
9384 {
9385 
9386 	memset(range, 0, sizeof(*range));
9387 	range->sva = va;
9388 	range->attrs = attrs;
9389 }
9390 
9391 /* Get the block/page attributes that correspond to the table attributes */
9392 static pt_entry_t
sysctl_kmaps_table_attrs(pd_entry_t table)9393 sysctl_kmaps_table_attrs(pd_entry_t table)
9394 {
9395 	pt_entry_t attrs;
9396 
9397 	attrs = 0;
9398 	if ((table & TATTR_UXN_TABLE) != 0)
9399 		attrs |= ATTR_S1_UXN;
9400 	if ((table & TATTR_PXN_TABLE) != 0)
9401 		attrs |= ATTR_S1_PXN;
9402 	if ((table & TATTR_AP_TABLE_RO) != 0)
9403 		attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
9404 
9405 	return (attrs);
9406 }
9407 
9408 /* Read the block/page attributes we care about */
9409 static pt_entry_t
sysctl_kmaps_block_attrs(pt_entry_t block)9410 sysctl_kmaps_block_attrs(pt_entry_t block)
9411 {
9412 	return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
9413 	    ATTR_S1_GP));
9414 }
9415 
9416 /*
9417  * Given a leaf PTE, derive the mapping's attributes.  If they do not match
9418  * those of the current run, dump the address range and its attributes, and
9419  * begin a new run.
9420  */
9421 static void
sysctl_kmaps_check(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t va,pd_entry_t l0e,pd_entry_t l1e,pd_entry_t l2e,pt_entry_t l3e)9422 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
9423     vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
9424     pt_entry_t l3e)
9425 {
9426 	pt_entry_t attrs;
9427 
9428 	attrs = sysctl_kmaps_table_attrs(l0e);
9429 
9430 	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9431 		attrs |= sysctl_kmaps_block_attrs(l1e);
9432 		goto done;
9433 	}
9434 	attrs |= sysctl_kmaps_table_attrs(l1e);
9435 
9436 	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9437 		attrs |= sysctl_kmaps_block_attrs(l2e);
9438 		goto done;
9439 	}
9440 	attrs |= sysctl_kmaps_table_attrs(l2e);
9441 	attrs |= sysctl_kmaps_block_attrs(l3e);
9442 
9443 done:
9444 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
9445 		sysctl_kmaps_dump(sb, range, va);
9446 		sysctl_kmaps_reinit(range, va, attrs);
9447 	}
9448 }
9449 
9450 static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)9451 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
9452 {
9453 	struct pmap_kernel_map_range range;
9454 	struct sbuf sbuf, *sb;
9455 	pd_entry_t l0e, *l1, l1e, *l2, l2e;
9456 	pt_entry_t *l3, l3e;
9457 	vm_offset_t sva;
9458 	vm_paddr_t pa;
9459 	int error, i, j, k, l;
9460 
9461 	error = sysctl_wire_old_buffer(req, 0);
9462 	if (error != 0)
9463 		return (error);
9464 	sb = &sbuf;
9465 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
9466 
9467 	/* Sentinel value. */
9468 	range.sva = 0xfffffffffffffffful;
9469 
9470 	/*
9471 	 * Iterate over the kernel page tables without holding the kernel pmap
9472 	 * lock.  Kernel page table pages are never freed, so at worst we will
9473 	 * observe inconsistencies in the output.
9474 	 */
9475 	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
9476 	    i++) {
9477 		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
9478 			sbuf_printf(sb, "\nDirect map:\n");
9479 		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
9480 			sbuf_printf(sb, "\nKernel map:\n");
9481 #ifdef KASAN
9482 		else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
9483 			sbuf_printf(sb, "\nKASAN shadow map:\n");
9484 #endif
9485 #ifdef KMSAN
9486 		else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
9487 			sbuf_printf(sb, "\nKMSAN shadow map:\n");
9488 		else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
9489 			sbuf_printf(sb, "\nKMSAN origin map:\n");
9490 #endif
9491 
9492 		l0e = kernel_pmap->pm_l0[i];
9493 		if ((l0e & ATTR_DESCR_VALID) == 0) {
9494 			sysctl_kmaps_dump(sb, &range, sva);
9495 			sva += L0_SIZE;
9496 			continue;
9497 		}
9498 		pa = PTE_TO_PHYS(l0e);
9499 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9500 
9501 		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
9502 			l1e = l1[j];
9503 			if ((l1e & ATTR_DESCR_VALID) == 0) {
9504 				sysctl_kmaps_dump(sb, &range, sva);
9505 				sva += L1_SIZE;
9506 				continue;
9507 			}
9508 			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
9509 				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
9510 				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
9511 				    0, 0);
9512 				range.l1blocks++;
9513 				sva += L1_SIZE;
9514 				continue;
9515 			}
9516 			pa = PTE_TO_PHYS(l1e);
9517 			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9518 
9519 			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
9520 				l2e = l2[k];
9521 				if ((l2e & ATTR_DESCR_VALID) == 0) {
9522 					sysctl_kmaps_dump(sb, &range, sva);
9523 					sva += L2_SIZE;
9524 					continue;
9525 				}
9526 				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
9527 					sysctl_kmaps_check(sb, &range, sva,
9528 					    l0e, l1e, l2e, 0);
9529 					range.l2blocks++;
9530 					sva += L2_SIZE;
9531 					continue;
9532 				}
9533 				pa = PTE_TO_PHYS(l2e);
9534 				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
9535 
9536 				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
9537 				    l++, sva += L3_SIZE) {
9538 					l3e = l3[l];
9539 					if ((l3e & ATTR_DESCR_VALID) == 0) {
9540 						sysctl_kmaps_dump(sb, &range,
9541 						    sva);
9542 						continue;
9543 					}
9544 					sysctl_kmaps_check(sb, &range, sva,
9545 					    l0e, l1e, l2e, l3e);
9546 					if ((l3e & ATTR_CONTIGUOUS) != 0)
9547 						range.l3contig +=
9548 						    l % L3C_ENTRIES == 0 ?
9549 						    1 : 0;
9550 					else
9551 						range.l3pages++;
9552 				}
9553 			}
9554 		}
9555 	}
9556 
9557 	error = sbuf_finish(sb);
9558 	sbuf_delete(sb);
9559 	return (error);
9560 }
9561 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
9562     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
9563     NULL, 0, sysctl_kmaps, "A",
9564     "Dump kernel address layout");
9565