xref: /freebsd/sys/arm64/arm64/pmap.c (revision 5d4545a2)
1 /*-
2  * Copyright (c) 1991 Regents of the University of California.
3  * All rights reserved.
4  * Copyright (c) 1994 John S. Dyson
5  * All rights reserved.
6  * Copyright (c) 1994 David Greenman
7  * All rights reserved.
8  * Copyright (c) 2003 Peter Wemm
9  * All rights reserved.
10  * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11  * All rights reserved.
12  * Copyright (c) 2014 Andrew Turner
13  * All rights reserved.
14  * Copyright (c) 2014-2016 The FreeBSD Foundation
15  * All rights reserved.
16  *
17  * This code is derived from software contributed to Berkeley by
18  * the Systems Programming Group of the University of Utah Computer
19  * Science Department and William Jolitz of UUNET Technologies Inc.
20  *
21  * This software was developed by Andrew Turner under sponsorship from
22  * the FreeBSD Foundation.
23  *
24  * Redistribution and use in source and binary forms, with or without
25  * modification, are permitted provided that the following conditions
26  * are met:
27  * 1. Redistributions of source code must retain the above copyright
28  *    notice, this list of conditions and the following disclaimer.
29  * 2. Redistributions in binary form must reproduce the above copyright
30  *    notice, this list of conditions and the following disclaimer in the
31  *    documentation and/or other materials provided with the distribution.
32  * 3. All advertising materials mentioning features or use of this software
33  *    must display the following acknowledgement:
34  *	This product includes software developed by the University of
35  *	California, Berkeley and its contributors.
36  * 4. Neither the name of the University nor the names of its contributors
37  *    may be used to endorse or promote products derived from this software
38  *    without specific prior written permission.
39  *
40  * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43  * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50  * SUCH DAMAGE.
51  */
52 /*-
53  * Copyright (c) 2003 Networks Associates Technology, Inc.
54  * All rights reserved.
55  *
56  * This software was developed for the FreeBSD Project by Jake Burkholder,
57  * Safeport Network Services, and Network Associates Laboratories, the
58  * Security Research Division of Network Associates, Inc. under
59  * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60  * CHATS research program.
61  *
62  * Redistribution and use in source and binary forms, with or without
63  * modification, are permitted provided that the following conditions
64  * are met:
65  * 1. Redistributions of source code must retain the above copyright
66  *    notice, this list of conditions and the following disclaimer.
67  * 2. Redistributions in binary form must reproduce the above copyright
68  *    notice, this list of conditions and the following disclaimer in the
69  *    documentation and/or other materials provided with the distribution.
70  *
71  * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72  * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73  * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74  * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75  * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76  * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77  * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78  * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79  * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80  * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81  * SUCH DAMAGE.
82  */
83 
84 #include <sys/cdefs.h>
85 /*
86  *	Manages physical address maps.
87  *
88  *	Since the information managed by this module is
89  *	also stored by the logical address mapping module,
90  *	this module may throw away valid virtual-to-physical
91  *	mappings at almost any time.  However, invalidations
92  *	of virtual-to-physical mappings must be done as
93  *	requested.
94  *
95  *	In order to cope with hardware architectures which
96  *	make virtual-to-physical map invalidates expensive,
97  *	this module may delay invalidate or reduced protection
98  *	operations until such time as they are actually
99  *	necessary.  This module is given full information as
100  *	to which processors are currently using which maps,
101  *	and to when physical maps must be made correct.
102  */
103 
104 #include "opt_vm.h"
105 
106 #include <sys/param.h>
107 #include <sys/asan.h>
108 #include <sys/bitstring.h>
109 #include <sys/bus.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/ktr.h>
113 #include <sys/limits.h>
114 #include <sys/lock.h>
115 #include <sys/malloc.h>
116 #include <sys/mman.h>
117 #include <sys/msan.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/physmem.h>
121 #include <sys/proc.h>
122 #include <sys/rangeset.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132 
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147 
148 #include <machine/asan.h>
149 #include <machine/machdep.h>
150 #include <machine/md_var.h>
151 #include <machine/pcb.h>
152 
153 #ifdef NUMA
154 #define	PMAP_MEMDOM	MAXMEMDOM
155 #else
156 #define	PMAP_MEMDOM	1
157 #endif
158 
159 #define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
160 #define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)
161 
162 #define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
163 #define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
164 #define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
165 #define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))
166 
167 #define	NUL0E		L0_ENTRIES
168 #define	NUL1E		(NUL0E * NL1PG)
169 #define	NUL2E		(NUL1E * NL2PG)
170 
171 #ifdef PV_STATS
172 #define PV_STAT(x)	do { x ; } while (0)
173 #define __pvused
174 #else
175 #define PV_STAT(x)	do { } while (0)
176 #define __pvused	__unused
177 #endif
178 
179 #define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
180 #define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
181 #define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
182 
183 #ifdef __ARM_FEATURE_BTI_DEFAULT
184 #define	ATTR_KERN_GP		ATTR_S1_GP
185 #else
186 #define	ATTR_KERN_GP		0
187 #endif
188 #define	PMAP_SAN_PTE_BITS	(ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | \
189 	ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
190 
191 struct pmap_large_md_page {
192 	struct rwlock   pv_lock;
193 	struct md_page  pv_page;
194 	/* Pad to a power of 2, see pmap_init_pv_table(). */
195 	int		pv_pad[2];
196 };
197 
198 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
199 #define pv_dummy pv_dummy_large.pv_page
200 __read_mostly static struct pmap_large_md_page *pv_table;
201 
202 static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)203 _pa_to_pmdp(vm_paddr_t pa)
204 {
205 	struct vm_phys_seg *seg;
206 
207 	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
208 		return ((struct pmap_large_md_page *)seg->md_first +
209 		    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
210 	return (NULL);
211 }
212 
213 static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)214 pa_to_pmdp(vm_paddr_t pa)
215 {
216 	struct pmap_large_md_page *pvd;
217 
218 	pvd = _pa_to_pmdp(pa);
219 	if (pvd == NULL)
220 		panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
221 	return (pvd);
222 }
223 
224 static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)225 page_to_pmdp(vm_page_t m)
226 {
227 	struct vm_phys_seg *seg;
228 
229 	seg = &vm_phys_segs[m->segind];
230 	return ((struct pmap_large_md_page *)seg->md_first +
231 	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
232 }
233 
234 #define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
235 #define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))
236 
237 #define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
238 	struct pmap_large_md_page *_pvd;			\
239 	struct rwlock *_lock;					\
240 	_pvd = _pa_to_pmdp(pa);					\
241 	if (__predict_false(_pvd == NULL))			\
242 		_lock = &pv_dummy_large.pv_lock;		\
243 	else							\
244 		_lock = &(_pvd->pv_lock);			\
245 	_lock;							\
246 })
247 
248 static struct rwlock *
VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)249 VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
250 {
251 	if ((m->flags & PG_FICTITIOUS) == 0)
252 		return (&page_to_pmdp(m)->pv_lock);
253 	else
254 		return (&pv_dummy_large.pv_lock);
255 }
256 
257 #define	CHANGE_PV_LIST_LOCK(lockp, new_lock)	do {	\
258 	struct rwlock **_lockp = (lockp);		\
259 	struct rwlock *_new_lock = (new_lock);		\
260 							\
261 	if (_new_lock != *_lockp) {			\
262 		if (*_lockp != NULL)			\
263 			rw_wunlock(*_lockp);		\
264 		*_lockp = _new_lock;			\
265 		rw_wlock(*_lockp);			\
266 	}						\
267 } while (0)
268 
269 #define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)		\
270 			CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))
271 
272 #define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
273 			CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))
274 
275 #define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
276 	struct rwlock **_lockp = (lockp);		\
277 							\
278 	if (*_lockp != NULL) {				\
279 		rw_wunlock(*_lockp);			\
280 		*_lockp = NULL;				\
281 	}						\
282 } while (0)
283 
284 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
285 #define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
286 
287 /*
288  * The presence of this flag indicates that the mapping is writeable.
289  * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
290  * it is dirty.  This flag may only be set on managed mappings.
291  *
292  * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
293  * as a software managed bit.
294  */
295 #define	ATTR_SW_DBM	ATTR_DBM
296 
297 struct pmap kernel_pmap_store;
298 
299 /* Used for mapping ACPI memory before VM is initialized */
300 #define	PMAP_PREINIT_MAPPING_COUNT	32
301 #define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
302 static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
303 static int vm_initialized = 0;		/* No need to use pre-init maps when set */
304 
305 /*
306  * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
307  * Always map entire L2 block for simplicity.
308  * VA of L2 block = preinit_map_va + i * L2_SIZE
309  */
310 static struct pmap_preinit_mapping {
311 	vm_paddr_t	pa;
312 	vm_offset_t	va;
313 	vm_size_t	size;
314 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
315 
316 vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
317 vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
318 vm_offset_t kernel_vm_end = 0;
319 
320 /*
321  * Data for the pv entry allocation mechanism.
322  */
323 #ifdef NUMA
324 static __inline int
pc_to_domain(struct pv_chunk * pc)325 pc_to_domain(struct pv_chunk *pc)
326 {
327 	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
328 }
329 #else
330 static __inline int
pc_to_domain(struct pv_chunk * pc __unused)331 pc_to_domain(struct pv_chunk *pc __unused)
332 {
333 	return (0);
334 }
335 #endif
336 
337 struct pv_chunks_list {
338 	struct mtx pvc_lock;
339 	TAILQ_HEAD(pch, pv_chunk) pvc_list;
340 	int active_reclaims;
341 } __aligned(CACHE_LINE_SIZE);
342 
343 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
344 
345 vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
346 vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
347 vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */
348 
349 extern pt_entry_t pagetable_l0_ttbr1[];
350 
351 #define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
352 static vm_paddr_t physmap[PHYSMAP_SIZE];
353 static u_int physmap_idx;
354 
355 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
356     "VM/pmap parameters");
357 
358 #if PAGE_SIZE == PAGE_SIZE_4K
359 #define	L1_BLOCKS_SUPPORTED	1
360 #else
361 /* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
362 #define	L1_BLOCKS_SUPPORTED	0
363 #endif
364 
365 #define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)
366 
367 /*
368  * This ASID allocator uses a bit vector ("asid_set") to remember which ASIDs
369  * that it has currently allocated to a pmap, a cursor ("asid_next") to
370  * optimize its search for a free ASID in the bit vector, and an epoch number
371  * ("asid_epoch") to indicate when it has reclaimed all previously allocated
372  * ASIDs that are not currently active on a processor.
373  *
374  * The current epoch number is always in the range [0, INT_MAX).  Negative
375  * numbers and INT_MAX are reserved for special cases that are described
376  * below.
377  */
378 struct asid_set {
379 	int asid_bits;
380 	bitstr_t *asid_set;
381 	int asid_set_size;
382 	int asid_next;
383 	int asid_epoch;
384 	struct mtx asid_set_mutex;
385 };
386 
387 static struct asid_set asids;
388 static struct asid_set vmids;
389 
390 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
391     "ASID allocator");
392 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
393     "The number of bits in an ASID");
394 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
395     "The last allocated ASID plus one");
396 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
397     "The current epoch number");
398 
399 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
400 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
401     "The number of bits in an VMID");
402 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
403     "The last allocated VMID plus one");
404 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
405     "The current epoch number");
406 
407 void (*pmap_clean_stage2_tlbi)(void);
408 void (*pmap_invalidate_vpipt_icache)(void);
409 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
410 void (*pmap_stage2_invalidate_all)(uint64_t);
411 
412 /*
413  * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
414  * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
415  * dynamically allocated ASIDs have a non-negative epoch number.
416  *
417  * An invalid ASID is represented by -1.
418  *
419  * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
420  * which indicates that an ASID should never be allocated to the pmap, and
421  * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
422  * allocated when the pmap is next activated.
423  */
424 #define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
425 					    ((u_long)(epoch) << 32)))
426 #define	COOKIE_TO_ASID(cookie)		((int)(cookie))
427 #define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
428 
429 #define	TLBI_VA_SHIFT			12
430 #define	TLBI_VA_MASK			((1ul << 44) - 1)
431 #define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
432 
433 static int __read_frequently superpages_enabled = 1;
434 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
435     CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
436     "Are large page mappings enabled?");
437 
438 /*
439  * True when Branch Target Identification should be used by userspace. This
440  * allows pmap to mark pages as guarded with ATTR_S1_GP.
441  */
442 __read_mostly static bool pmap_bti_support = false;
443 
444 /*
445  * Internal flags for pmap_enter()'s helper functions.
446  */
447 #define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
448 #define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */
449 
450 TAILQ_HEAD(pv_chunklist, pv_chunk);
451 
452 static void	free_pv_chunk(struct pv_chunk *pc);
453 static void	free_pv_chunk_batch(struct pv_chunklist *batch);
454 static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
455 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
456 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
457 static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
458 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
459 		    vm_offset_t va);
460 
461 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
462 static bool pmap_activate_int(pmap_t pmap);
463 static void pmap_alloc_asid(pmap_t pmap);
464 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
465     vm_prot_t prot, int mode, bool skip_unmapped);
466 static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
467     pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
468 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
469 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
470     vm_offset_t va, struct rwlock **lockp);
471 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
472 static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va);
473 static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
474 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
475     vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
476 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
477     u_int flags, vm_page_t m, struct rwlock **lockp);
478 static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
479     vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
480 static bool pmap_every_pte_zero(vm_paddr_t pa);
481 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
482     bool all_l3e_AF_set);
483 static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
484 static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
485     vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
486 static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
487     struct rwlock **lockp);
488 static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
489 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
490     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
491 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
492     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
493 static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
494     vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
495     struct rwlock **lockp);
496 static void pmap_reset_asid_set(pmap_t pmap);
497 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
498     vm_page_t m, struct rwlock **lockp);
499 
500 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
501 		struct rwlock **lockp);
502 
503 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
504     struct spglist *free);
505 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
506 static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
507     vm_offset_t va, vm_size_t size);
508 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
509 
510 static uma_zone_t pmap_bti_ranges_zone;
511 static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
512     pt_entry_t *pte);
513 static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
514 static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
515 static void *bti_dup_range(void *ctx, void *data);
516 static void bti_free_range(void *ctx, void *node);
517 static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
518 static void pmap_bti_deassign_all(pmap_t pmap);
519 
520 /*
521  * These load the old table data and store the new value.
522  * They need to be atomic as the System MMU may write to the table at
523  * the same time as the CPU.
524  */
525 #define	pmap_clear(table)		atomic_store_64(table, 0)
526 #define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
527 #define	pmap_load(table)		(*table)
528 #define	pmap_load_clear(table)		atomic_swap_64(table, 0)
529 #define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
530 #define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
531 #define	pmap_store(table, entry)	atomic_store_64(table, entry)
532 
533 /********************/
534 /* Inline functions */
535 /********************/
536 
537 static __inline void
pagecopy(void * s,void * d)538 pagecopy(void *s, void *d)
539 {
540 
541 	memcpy(d, s, PAGE_SIZE);
542 }
543 
544 static __inline pd_entry_t *
pmap_l0(pmap_t pmap,vm_offset_t va)545 pmap_l0(pmap_t pmap, vm_offset_t va)
546 {
547 
548 	return (&pmap->pm_l0[pmap_l0_index(va)]);
549 }
550 
551 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t * l0,vm_offset_t va)552 pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
553 {
554 	pd_entry_t *l1;
555 
556 	l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
557 	return (&l1[pmap_l1_index(va)]);
558 }
559 
560 static __inline pd_entry_t *
pmap_l1(pmap_t pmap,vm_offset_t va)561 pmap_l1(pmap_t pmap, vm_offset_t va)
562 {
563 	pd_entry_t *l0;
564 
565 	l0 = pmap_l0(pmap, va);
566 	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
567 		return (NULL);
568 
569 	return (pmap_l0_to_l1(l0, va));
570 }
571 
572 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t * l1p,vm_offset_t va)573 pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
574 {
575 	pd_entry_t l1, *l2p;
576 
577 	l1 = pmap_load(l1p);
578 
579 	KASSERT(ADDR_IS_CANONICAL(va),
580 	    ("%s: Address not in canonical form: %lx", __func__, va));
581 	/*
582 	 * The valid bit may be clear if pmap_update_entry() is concurrently
583 	 * modifying the entry, so for KVA only the entry type may be checked.
584 	 */
585 	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
586 	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
587 	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
588 	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
589 	l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
590 	return (&l2p[pmap_l2_index(va)]);
591 }
592 
593 static __inline pd_entry_t *
pmap_l2(pmap_t pmap,vm_offset_t va)594 pmap_l2(pmap_t pmap, vm_offset_t va)
595 {
596 	pd_entry_t *l1;
597 
598 	l1 = pmap_l1(pmap, va);
599 	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
600 		return (NULL);
601 
602 	return (pmap_l1_to_l2(l1, va));
603 }
604 
605 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t * l2p,vm_offset_t va)606 pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
607 {
608 	pd_entry_t l2;
609 	pt_entry_t *l3p;
610 
611 	l2 = pmap_load(l2p);
612 
613 	KASSERT(ADDR_IS_CANONICAL(va),
614 	    ("%s: Address not in canonical form: %lx", __func__, va));
615 	/*
616 	 * The valid bit may be clear if pmap_update_entry() is concurrently
617 	 * modifying the entry, so for KVA only the entry type may be checked.
618 	 */
619 	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
620 	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
621 	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
622 	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
623 	l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
624 	return (&l3p[pmap_l3_index(va)]);
625 }
626 
627 /*
628  * Returns the lowest valid pde for a given virtual address.
629  * The next level may or may not point to a valid page or block.
630  */
631 static __inline pd_entry_t *
pmap_pde(pmap_t pmap,vm_offset_t va,int * level)632 pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
633 {
634 	pd_entry_t *l0, *l1, *l2, desc;
635 
636 	l0 = pmap_l0(pmap, va);
637 	desc = pmap_load(l0) & ATTR_DESCR_MASK;
638 	if (desc != L0_TABLE) {
639 		*level = -1;
640 		return (NULL);
641 	}
642 
643 	l1 = pmap_l0_to_l1(l0, va);
644 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
645 	if (desc != L1_TABLE) {
646 		*level = 0;
647 		return (l0);
648 	}
649 
650 	l2 = pmap_l1_to_l2(l1, va);
651 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
652 	if (desc != L2_TABLE) {
653 		*level = 1;
654 		return (l1);
655 	}
656 
657 	*level = 2;
658 	return (l2);
659 }
660 
661 /*
662  * Returns the lowest valid pte block or table entry for a given virtual
663  * address. If there are no valid entries return NULL and set the level to
664  * the first invalid level.
665  */
666 static __inline pt_entry_t *
pmap_pte(pmap_t pmap,vm_offset_t va,int * level)667 pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
668 {
669 	pd_entry_t *l1, *l2, desc;
670 	pt_entry_t *l3;
671 
672 	l1 = pmap_l1(pmap, va);
673 	if (l1 == NULL) {
674 		*level = 0;
675 		return (NULL);
676 	}
677 	desc = pmap_load(l1) & ATTR_DESCR_MASK;
678 	if (desc == L1_BLOCK) {
679 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
680 		*level = 1;
681 		return (l1);
682 	}
683 
684 	if (desc != L1_TABLE) {
685 		*level = 1;
686 		return (NULL);
687 	}
688 
689 	l2 = pmap_l1_to_l2(l1, va);
690 	desc = pmap_load(l2) & ATTR_DESCR_MASK;
691 	if (desc == L2_BLOCK) {
692 		*level = 2;
693 		return (l2);
694 	}
695 
696 	if (desc != L2_TABLE) {
697 		*level = 2;
698 		return (NULL);
699 	}
700 
701 	*level = 3;
702 	l3 = pmap_l2_to_l3(l2, va);
703 	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
704 		return (NULL);
705 
706 	return (l3);
707 }
708 
709 /*
710  * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
711  * level that maps the specified virtual address, then a pointer to that entry
712  * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
713  * and a diagnostic message is provided, in which case this function panics.
714  */
715 static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap,vm_offset_t va,int level,const char * diag)716 pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
717 {
718 	pd_entry_t *l0p, *l1p, *l2p;
719 	pt_entry_t desc, *l3p;
720 	int walk_level __diagused;
721 
722 	KASSERT(level >= 0 && level < 4,
723 	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
724 	    level));
725 	l0p = pmap_l0(pmap, va);
726 	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
727 	if (desc == L0_TABLE && level > 0) {
728 		l1p = pmap_l0_to_l1(l0p, va);
729 		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
730 		if (desc == L1_BLOCK && level == 1) {
731 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
732 			return (l1p);
733 		}
734 		if (desc == L1_TABLE && level > 1) {
735 			l2p = pmap_l1_to_l2(l1p, va);
736 			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
737 			if (desc == L2_BLOCK && level == 2)
738 				return (l2p);
739 			else if (desc == L2_TABLE && level > 2) {
740 				l3p = pmap_l2_to_l3(l2p, va);
741 				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
742 				if (desc == L3_PAGE && level == 3)
743 					return (l3p);
744 				else
745 					walk_level = 3;
746 			} else
747 				walk_level = 2;
748 		} else
749 			walk_level = 1;
750 	} else
751 		walk_level = 0;
752 	KASSERT(diag == NULL,
753 	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
754 	    diag, va, level, desc, walk_level));
755 	return (NULL);
756 }
757 
758 bool
pmap_ps_enabled(pmap_t pmap)759 pmap_ps_enabled(pmap_t pmap)
760 {
761 	/*
762 	 * Promotion requires a hypervisor call when the kernel is running
763 	 * in EL1. To stop this disable superpage support on non-stage 1
764 	 * pmaps for now.
765 	 */
766 	if (pmap->pm_stage != PM_STAGE1)
767 		return (false);
768 
769 #ifdef KMSAN
770 	/*
771 	 * The break-before-make in pmap_update_entry() results in a situation
772 	 * where a CPU may call into the KMSAN runtime while the entry is
773 	 * invalid.  If the entry is used to map the current thread structure,
774 	 * then the runtime will attempt to access unmapped memory.  Avoid this
775 	 * by simply disabling superpage promotion for the kernel map.
776 	 */
777 	if (pmap == kernel_pmap)
778 		return (false);
779 #endif
780 
781 	return (superpages_enabled != 0);
782 }
783 
784 bool
pmap_get_tables(pmap_t pmap,vm_offset_t va,pd_entry_t ** l0,pd_entry_t ** l1,pd_entry_t ** l2,pt_entry_t ** l3)785 pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
786     pd_entry_t **l2, pt_entry_t **l3)
787 {
788 	pd_entry_t *l0p, *l1p, *l2p;
789 
790 	if (pmap->pm_l0 == NULL)
791 		return (false);
792 
793 	l0p = pmap_l0(pmap, va);
794 	*l0 = l0p;
795 
796 	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
797 		return (false);
798 
799 	l1p = pmap_l0_to_l1(l0p, va);
800 	*l1 = l1p;
801 
802 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
803 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
804 		*l2 = NULL;
805 		*l3 = NULL;
806 		return (true);
807 	}
808 
809 	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
810 		return (false);
811 
812 	l2p = pmap_l1_to_l2(l1p, va);
813 	*l2 = l2p;
814 
815 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
816 		*l3 = NULL;
817 		return (true);
818 	}
819 
820 	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
821 		return (false);
822 
823 	*l3 = pmap_l2_to_l3(l2p, va);
824 
825 	return (true);
826 }
827 
828 static __inline int
pmap_l3_valid(pt_entry_t l3)829 pmap_l3_valid(pt_entry_t l3)
830 {
831 
832 	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
833 }
834 
835 CTASSERT(L1_BLOCK == L2_BLOCK);
836 
837 static pt_entry_t
pmap_pte_memattr(pmap_t pmap,vm_memattr_t memattr)838 pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
839 {
840 	pt_entry_t val;
841 
842 	if (pmap->pm_stage == PM_STAGE1) {
843 		val = ATTR_S1_IDX(memattr);
844 		if (memattr == VM_MEMATTR_DEVICE)
845 			val |= ATTR_S1_XN;
846 		return (val);
847 	}
848 
849 	val = 0;
850 
851 	switch (memattr) {
852 	case VM_MEMATTR_DEVICE:
853 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
854 		    ATTR_S2_XN(ATTR_S2_XN_ALL));
855 	case VM_MEMATTR_UNCACHEABLE:
856 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
857 	case VM_MEMATTR_WRITE_BACK:
858 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
859 	case VM_MEMATTR_WRITE_THROUGH:
860 		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
861 	default:
862 		panic("%s: invalid memory attribute %x", __func__, memattr);
863 	}
864 }
865 
866 static pt_entry_t
pmap_pte_prot(pmap_t pmap,vm_prot_t prot)867 pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
868 {
869 	pt_entry_t val;
870 
871 	val = 0;
872 	if (pmap->pm_stage == PM_STAGE1) {
873 		if ((prot & VM_PROT_EXECUTE) == 0)
874 			val |= ATTR_S1_XN;
875 		if ((prot & VM_PROT_WRITE) == 0)
876 			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
877 	} else {
878 		if ((prot & VM_PROT_WRITE) != 0)
879 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
880 		if ((prot & VM_PROT_READ) != 0)
881 			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
882 		if ((prot & VM_PROT_EXECUTE) == 0)
883 			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
884 	}
885 
886 	return (val);
887 }
888 
889 /*
890  * Checks if the PTE is dirty.
891  */
892 static inline int
pmap_pte_dirty(pmap_t pmap,pt_entry_t pte)893 pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
894 {
895 
896 	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
897 
898 	if (pmap->pm_stage == PM_STAGE1) {
899 		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
900 		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
901 
902 		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
903 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
904 	}
905 
906 	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
907 	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
908 }
909 
910 static __inline void
pmap_resident_count_inc(pmap_t pmap,int count)911 pmap_resident_count_inc(pmap_t pmap, int count)
912 {
913 
914 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
915 	pmap->pm_stats.resident_count += count;
916 }
917 
918 static __inline void
pmap_resident_count_dec(pmap_t pmap,int count)919 pmap_resident_count_dec(pmap_t pmap, int count)
920 {
921 
922 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
923 	KASSERT(pmap->pm_stats.resident_count >= count,
924 	    ("pmap %p resident count underflow %ld %d", pmap,
925 	    pmap->pm_stats.resident_count, count));
926 	pmap->pm_stats.resident_count -= count;
927 }
928 
929 static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)930 pmap_early_vtophys(vm_offset_t va)
931 {
932 	vm_paddr_t pa_page;
933 
934 	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
935 	return (pa_page | (va & PAR_LOW_MASK));
936 }
937 
938 /* State of the bootstrapped DMAP page tables */
939 struct pmap_bootstrap_state {
940 	pt_entry_t	*l1;
941 	pt_entry_t	*l2;
942 	pt_entry_t	*l3;
943 	vm_offset_t	freemempos;
944 	vm_offset_t	va;
945 	vm_paddr_t	pa;
946 	pt_entry_t	table_attrs;
947 	u_int		l0_slot;
948 	u_int		l1_slot;
949 	u_int		l2_slot;
950 	bool		dmap_valid;
951 };
952 
953 /* The bootstrap state */
954 static struct pmap_bootstrap_state bs_state = {
955 	.l1 = NULL,
956 	.l2 = NULL,
957 	.l3 = NULL,
958 	.table_attrs = TATTR_PXN_TABLE,
959 	.l0_slot = L0_ENTRIES,
960 	.l1_slot = Ln_ENTRIES,
961 	.l2_slot = Ln_ENTRIES,
962 	.dmap_valid = false,
963 };
964 
965 static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state * state)966 pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
967 {
968 	vm_paddr_t l1_pa;
969 	pd_entry_t l0e;
970 	u_int l0_slot;
971 
972 	/* Link the level 0 table to a level 1 table */
973 	l0_slot = pmap_l0_index(state->va);
974 	if (l0_slot != state->l0_slot) {
975 		/*
976 		 * Make sure we move from a low address to high address
977 		 * before the DMAP region is ready. This ensures we never
978 		 * modify an existing mapping until we can map from a
979 		 * physical address to a virtual address.
980 		 */
981 		MPASS(state->l0_slot < l0_slot ||
982 		    state->l0_slot == L0_ENTRIES ||
983 		    state->dmap_valid);
984 
985 		/* Reset lower levels */
986 		state->l2 = NULL;
987 		state->l3 = NULL;
988 		state->l1_slot = Ln_ENTRIES;
989 		state->l2_slot = Ln_ENTRIES;
990 
991 		/* Check the existing L0 entry */
992 		state->l0_slot = l0_slot;
993 		if (state->dmap_valid) {
994 			l0e = pagetable_l0_ttbr1[l0_slot];
995 			if ((l0e & ATTR_DESCR_VALID) != 0) {
996 				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
997 				l1_pa = PTE_TO_PHYS(l0e);
998 				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
999 				return;
1000 			}
1001 		}
1002 
1003 		/* Create a new L0 table entry */
1004 		state->l1 = (pt_entry_t *)state->freemempos;
1005 		memset(state->l1, 0, PAGE_SIZE);
1006 		state->freemempos += PAGE_SIZE;
1007 
1008 		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
1009 		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
1010 		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
1011 		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
1012 		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
1013 	}
1014 	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
1015 }
1016 
1017 static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state * state)1018 pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
1019 {
1020 	vm_paddr_t l2_pa;
1021 	pd_entry_t l1e;
1022 	u_int l1_slot;
1023 
1024 	/* Make sure there is a valid L0 -> L1 table */
1025 	pmap_bootstrap_l0_table(state);
1026 
1027 	/* Link the level 1 table to a level 2 table */
1028 	l1_slot = pmap_l1_index(state->va);
1029 	if (l1_slot != state->l1_slot) {
1030 		/* See pmap_bootstrap_l0_table for a description */
1031 		MPASS(state->l1_slot < l1_slot ||
1032 		    state->l1_slot == Ln_ENTRIES ||
1033 		    state->dmap_valid);
1034 
1035 		/* Reset lower levels */
1036 		state->l3 = NULL;
1037 		state->l2_slot = Ln_ENTRIES;
1038 
1039 		/* Check the existing L1 entry */
1040 		state->l1_slot = l1_slot;
1041 		if (state->dmap_valid) {
1042 			l1e = state->l1[l1_slot];
1043 			if ((l1e & ATTR_DESCR_VALID) != 0) {
1044 				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
1045 				l2_pa = PTE_TO_PHYS(l1e);
1046 				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
1047 				return;
1048 			}
1049 		}
1050 
1051 		/* Create a new L1 table entry */
1052 		state->l2 = (pt_entry_t *)state->freemempos;
1053 		memset(state->l2, 0, PAGE_SIZE);
1054 		state->freemempos += PAGE_SIZE;
1055 
1056 		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1057 		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1058 		MPASS(state->l1[l1_slot] == 0);
1059 		pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
1060 		    state->table_attrs | L1_TABLE);
1061 	}
1062 	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1063 }
1064 
1065 static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state * state)1066 pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1067 {
1068 	vm_paddr_t l3_pa;
1069 	pd_entry_t l2e;
1070 	u_int l2_slot;
1071 
1072 	/* Make sure there is a valid L1 -> L2 table */
1073 	pmap_bootstrap_l1_table(state);
1074 
1075 	/* Link the level 2 table to a level 3 table */
1076 	l2_slot = pmap_l2_index(state->va);
1077 	if (l2_slot != state->l2_slot) {
1078 		/* See pmap_bootstrap_l0_table for a description */
1079 		MPASS(state->l2_slot < l2_slot ||
1080 		    state->l2_slot == Ln_ENTRIES ||
1081 		    state->dmap_valid);
1082 
1083 		/* Check the existing L2 entry */
1084 		state->l2_slot = l2_slot;
1085 		if (state->dmap_valid) {
1086 			l2e = state->l2[l2_slot];
1087 			if ((l2e & ATTR_DESCR_VALID) != 0) {
1088 				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1089 				l3_pa = PTE_TO_PHYS(l2e);
1090 				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1091 				return;
1092 			}
1093 		}
1094 
1095 		/* Create a new L2 table entry */
1096 		state->l3 = (pt_entry_t *)state->freemempos;
1097 		memset(state->l3, 0, PAGE_SIZE);
1098 		state->freemempos += PAGE_SIZE;
1099 
1100 		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1101 		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1102 		MPASS(state->l2[l2_slot] == 0);
1103 		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
1104 		    state->table_attrs | L2_TABLE);
1105 	}
1106 	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1107 }
1108 
1109 static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state * state,int i)1110 pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1111 {
1112 	pt_entry_t contig;
1113 	u_int l2_slot;
1114 	bool first;
1115 
1116 	if ((physmap[i + 1] - state->pa) < L2_SIZE)
1117 		return;
1118 
1119 	/* Make sure there is a valid L1 table */
1120 	pmap_bootstrap_l1_table(state);
1121 
1122 	MPASS((state->va & L2_OFFSET) == 0);
1123 	for (first = true, contig = 0;
1124 	    state->va < DMAP_MAX_ADDRESS &&
1125 	    (physmap[i + 1] - state->pa) >= L2_SIZE;
1126 	    state->va += L2_SIZE, state->pa += L2_SIZE) {
1127 		/*
1128 		 * Stop if we are about to walk off the end of what the
1129 		 * current L1 slot can address.
1130 		 */
1131 		if (!first && (state->pa & L1_OFFSET) == 0)
1132 			break;
1133 
1134 		/*
1135 		 * If we have an aligned, contiguous chunk of L2C_ENTRIES
1136 		 * L2 blocks, set the contiguous bit within each PTE so that
1137 		 * the chunk can be cached using only one TLB entry.
1138 		 */
1139 		if ((state->pa & L2C_OFFSET) == 0) {
1140 			if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
1141 			    physmap[i + 1] - state->pa >= L2C_SIZE) {
1142 				contig = ATTR_CONTIGUOUS;
1143 			} else {
1144 				contig = 0;
1145 			}
1146 		}
1147 
1148 		first = false;
1149 		l2_slot = pmap_l2_index(state->va);
1150 		MPASS((state->pa & L2_OFFSET) == 0);
1151 		MPASS(state->l2[l2_slot] == 0);
1152 		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
1153 		    ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1154 		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
1155 	}
1156 	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1157 }
1158 
1159 static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state * state,int i)1160 pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1161 {
1162 	pt_entry_t contig;
1163 	u_int l3_slot;
1164 	bool first;
1165 
1166 	if (physmap[i + 1] - state->pa < L3_SIZE)
1167 		return;
1168 
1169 	/* Make sure there is a valid L2 table */
1170 	pmap_bootstrap_l2_table(state);
1171 
1172 	MPASS((state->va & L3_OFFSET) == 0);
1173 	for (first = true, contig = 0;
1174 	    state->va < DMAP_MAX_ADDRESS &&
1175 	    physmap[i + 1] - state->pa >= L3_SIZE;
1176 	    state->va += L3_SIZE, state->pa += L3_SIZE) {
1177 		/*
1178 		 * Stop if we are about to walk off the end of what the
1179 		 * current L2 slot can address.
1180 		 */
1181 		if (!first && (state->pa & L2_OFFSET) == 0)
1182 			break;
1183 
1184 		/*
1185 		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
1186 		 * L3 pages, set the contiguous bit within each PTE so that
1187 		 * the chunk can be cached using only one TLB entry.
1188 		 */
1189 		if ((state->pa & L3C_OFFSET) == 0) {
1190 			if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
1191 			    physmap[i + 1] - state->pa >= L3C_SIZE) {
1192 				contig = ATTR_CONTIGUOUS;
1193 			} else {
1194 				contig = 0;
1195 			}
1196 		}
1197 
1198 		first = false;
1199 		l3_slot = pmap_l3_index(state->va);
1200 		MPASS((state->pa & L3_OFFSET) == 0);
1201 		MPASS(state->l3[l3_slot] == 0);
1202 		pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
1203 		    ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1204 		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
1205 	}
1206 	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1207 }
1208 
1209 static void
pmap_bootstrap_dmap(vm_paddr_t min_pa)1210 pmap_bootstrap_dmap(vm_paddr_t min_pa)
1211 {
1212 	int i;
1213 
1214 	dmap_phys_base = min_pa & ~L1_OFFSET;
1215 	dmap_phys_max = 0;
1216 	dmap_max_addr = 0;
1217 
1218 	for (i = 0; i < (physmap_idx * 2); i += 2) {
1219 		bs_state.pa = physmap[i] & ~L3_OFFSET;
1220 		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1221 
1222 		/* Create L3 mappings at the start of the region */
1223 		if ((bs_state.pa & L2_OFFSET) != 0)
1224 			pmap_bootstrap_l3_page(&bs_state, i);
1225 		MPASS(bs_state.pa <= physmap[i + 1]);
1226 
1227 		if (L1_BLOCKS_SUPPORTED) {
1228 			/* Create L2 mappings at the start of the region */
1229 			if ((bs_state.pa & L1_OFFSET) != 0)
1230 				pmap_bootstrap_l2_block(&bs_state, i);
1231 			MPASS(bs_state.pa <= physmap[i + 1]);
1232 
1233 			/* Create the main L1 block mappings */
1234 			for (; bs_state.va < DMAP_MAX_ADDRESS &&
1235 			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1236 			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1237 				/* Make sure there is a valid L1 table */
1238 				pmap_bootstrap_l0_table(&bs_state);
1239 				MPASS((bs_state.pa & L1_OFFSET) == 0);
1240 				pmap_store(
1241 				    &bs_state.l1[pmap_l1_index(bs_state.va)],
1242 				    PHYS_TO_PTE(bs_state.pa) | ATTR_DEFAULT |
1243 				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1244 				    ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
1245 			}
1246 			MPASS(bs_state.pa <= physmap[i + 1]);
1247 
1248 			/* Create L2 mappings at the end of the region */
1249 			pmap_bootstrap_l2_block(&bs_state, i);
1250 		} else {
1251 			while (bs_state.va < DMAP_MAX_ADDRESS &&
1252 			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1253 				pmap_bootstrap_l2_block(&bs_state, i);
1254 			}
1255 		}
1256 		MPASS(bs_state.pa <= physmap[i + 1]);
1257 
1258 		/* Create L3 mappings at the end of the region */
1259 		pmap_bootstrap_l3_page(&bs_state, i);
1260 		MPASS(bs_state.pa == physmap[i + 1]);
1261 
1262 		if (bs_state.pa > dmap_phys_max) {
1263 			dmap_phys_max = bs_state.pa;
1264 			dmap_max_addr = bs_state.va;
1265 		}
1266 	}
1267 
1268 	cpu_tlb_flushID();
1269 }
1270 
1271 static void
pmap_bootstrap_l2(vm_offset_t va)1272 pmap_bootstrap_l2(vm_offset_t va)
1273 {
1274 	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1275 
1276 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1277 	bs_state.va = va;
1278 
1279 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1280 		pmap_bootstrap_l1_table(&bs_state);
1281 }
1282 
1283 static void
pmap_bootstrap_l3(vm_offset_t va)1284 pmap_bootstrap_l3(vm_offset_t va)
1285 {
1286 	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1287 
1288 	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages*/
1289 	bs_state.va = va;
1290 
1291 	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1292 		pmap_bootstrap_l2_table(&bs_state);
1293 }
1294 
1295 /*
1296  *	Bootstrap the system enough to run with virtual memory.
1297  */
1298 void
pmap_bootstrap(vm_size_t kernlen)1299 pmap_bootstrap(vm_size_t kernlen)
1300 {
1301 	vm_offset_t dpcpu, msgbufpv;
1302 	vm_paddr_t start_pa, pa, min_pa;
1303 	int i;
1304 
1305 	/* Verify that the ASID is set through TTBR0. */
1306 	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
1307 	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1308 
1309 	/* Set this early so we can use the pagetable walking functions */
1310 	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1311 	PMAP_LOCK_INIT(kernel_pmap);
1312 	kernel_pmap->pm_l0_paddr =
1313 	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1314 	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1315 	vm_radix_init(&kernel_pmap->pm_root);
1316 	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1317 	kernel_pmap->pm_stage = PM_STAGE1;
1318 	kernel_pmap->pm_levels = 4;
1319 	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1320 	kernel_pmap->pm_asid_set = &asids;
1321 
1322 	/* Assume the address we were loaded to is a valid physical address */
1323 	min_pa = pmap_early_vtophys(KERNBASE);
1324 
1325 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1326 	physmap_idx /= 2;
1327 
1328 	/*
1329 	 * Find the minimum physical address. physmap is sorted,
1330 	 * but may contain empty ranges.
1331 	 */
1332 	for (i = 0; i < physmap_idx * 2; i += 2) {
1333 		if (physmap[i] == physmap[i + 1])
1334 			continue;
1335 		if (physmap[i] <= min_pa)
1336 			min_pa = physmap[i];
1337 	}
1338 
1339 	bs_state.freemempos = KERNBASE + kernlen;
1340 	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1341 
1342 	/* Create a direct map region early so we can use it for pa -> va */
1343 	pmap_bootstrap_dmap(min_pa);
1344 	bs_state.dmap_valid = true;
1345 	/*
1346 	 * We only use PXN when we know nothing will be executed from it, e.g.
1347 	 * the DMAP region.
1348 	 */
1349 	bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1350 
1351 	start_pa = pa = pmap_early_vtophys(KERNBASE);
1352 
1353 	/*
1354 	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that the
1355 	 * loader allocated the first and only l2 page table page used to map
1356 	 * the kernel, preloaded files and module metadata.
1357 	 */
1358 	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1359 	/* And the l3 tables for the early devmap */
1360 	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1361 
1362 	cpu_tlb_flushID();
1363 
1364 #define alloc_pages(var, np)						\
1365 	(var) = bs_state.freemempos;					\
1366 	bs_state.freemempos += (np * PAGE_SIZE);			\
1367 	memset((char *)(var), 0, ((np) * PAGE_SIZE));
1368 
1369 	/* Allocate dynamic per-cpu area. */
1370 	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1371 	dpcpu_init((void *)dpcpu, 0);
1372 
1373 	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1374 	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1375 	msgbufp = (void *)msgbufpv;
1376 
1377 	/* Reserve some VA space for early BIOS/ACPI mapping */
1378 	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1379 
1380 	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1381 	virtual_avail = roundup2(virtual_avail, L1_SIZE);
1382 	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1383 	kernel_vm_end = virtual_avail;
1384 
1385 	pa = pmap_early_vtophys(bs_state.freemempos);
1386 
1387 	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1388 
1389 	cpu_tlb_flushID();
1390 }
1391 
1392 #if defined(KASAN) || defined(KMSAN)
1393 static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa,vm_paddr_t end_pa,vm_offset_t * vap,vm_offset_t eva)1394 pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1395     vm_offset_t *vap, vm_offset_t eva)
1396 {
1397 	vm_paddr_t pa;
1398 	vm_offset_t va;
1399 	pd_entry_t *l2;
1400 
1401 	va = *vap;
1402 	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1403 	for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
1404 		l2 = pmap_l2(kernel_pmap, va);
1405 
1406 		/*
1407 		 * KASAN stack checking results in us having already allocated
1408 		 * part of our shadow map, so we can just skip those segments.
1409 		 */
1410 		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1411 			pa += L2_SIZE;
1412 			continue;
1413 		}
1414 
1415 		bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
1416 		physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
1417 		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
1418 	}
1419 	*vap = va;
1420 }
1421 
1422 /*
1423  * Finish constructing the initial shadow map:
1424  * - Count how many pages from KERNBASE to virtual_avail (scaled for
1425  *   shadow map)
1426  * - Map that entire range using L2 superpages.
1427  */
1428 static void
pmap_bootstrap_san1(vm_offset_t va,int scale)1429 pmap_bootstrap_san1(vm_offset_t va, int scale)
1430 {
1431 	vm_offset_t eva;
1432 	vm_paddr_t kernstart;
1433 	int i;
1434 
1435 	kernstart = pmap_early_vtophys(KERNBASE);
1436 
1437 	/*
1438 	 * Rebuild physmap one more time, we may have excluded more regions from
1439 	 * allocation since pmap_bootstrap().
1440 	 */
1441 	physmap_idx = physmem_avail(physmap, nitems(physmap));
1442 	physmap_idx /= 2;
1443 
1444 	eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1445 
1446 	/*
1447 	 * Find a slot in the physmap large enough for what we needed.  We try to put
1448 	 * the shadow map as high up as we can to avoid depleting the lower 4GB in case
1449 	 * it's needed for, e.g., an xhci controller that can only do 32-bit DMA.
1450 	 */
1451 	for (i = (physmap_idx * 2) - 2; i >= 0; i -= 2) {
1452 		vm_paddr_t plow, phigh;
1453 
1454 		/* L2 mappings must be backed by memory that is L2-aligned */
1455 		plow = roundup2(physmap[i], L2_SIZE);
1456 		phigh = physmap[i + 1];
1457 		if (plow >= phigh)
1458 			continue;
1459 		if (kernstart >= plow && kernstart < phigh)
1460 			phigh = kernstart;
1461 		if (phigh - plow >= L2_SIZE) {
1462 			pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1463 			if (va >= eva)
1464 				break;
1465 		}
1466 	}
1467 	if (i < 0)
1468 		panic("Could not find phys region for shadow map");
1469 
1470 	/*
1471 	 * Done. We should now have a valid shadow address mapped for all KVA
1472 	 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1473 	 * shadow accesses by the sanitizer runtime will succeed for this range.
1474 	 * When the kernel virtual address range is later expanded, as will
1475 	 * happen in vm_mem_init(), the shadow map will be grown as well. This
1476 	 * is handled by pmap_san_enter().
1477 	 */
1478 }
1479 
1480 void
pmap_bootstrap_san(void)1481 pmap_bootstrap_san(void)
1482 {
1483 #ifdef KASAN
1484 	pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
1485 #else
1486 	static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1487 	static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1488 	pd_entry_t *l0, *l1;
1489 
1490 	if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
1491 		panic("initial kernel map is too large");
1492 
1493 	l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
1494 	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1495 	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
1496 	l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
1497 	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1498 	    pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
1499 	pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);
1500 
1501 	l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
1502 	pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1503 	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
1504 	l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
1505 	pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1506 	    pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
1507 	pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
1508 #endif
1509 }
1510 #endif
1511 
1512 /*
1513  *	Initialize a vm_page's machine-dependent fields.
1514  */
1515 void
pmap_page_init(vm_page_t m)1516 pmap_page_init(vm_page_t m)
1517 {
1518 
1519 	TAILQ_INIT(&m->md.pv_list);
1520 	m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1521 }
1522 
1523 static void
pmap_init_asids(struct asid_set * set,int bits)1524 pmap_init_asids(struct asid_set *set, int bits)
1525 {
1526 	int i;
1527 
1528 	set->asid_bits = bits;
1529 
1530 	/*
1531 	 * We may be too early in the overall initialization process to use
1532 	 * bit_alloc().
1533 	 */
1534 	set->asid_set_size = 1 << set->asid_bits;
1535 	set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1536 	    M_WAITOK | M_ZERO);
1537 	for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1538 		bit_set(set->asid_set, i);
1539 	set->asid_next = ASID_FIRST_AVAILABLE;
1540 	mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1541 }
1542 
1543 static void
pmap_init_pv_table(void)1544 pmap_init_pv_table(void)
1545 {
1546 	struct vm_phys_seg *seg, *next_seg;
1547 	struct pmap_large_md_page *pvd;
1548 	vm_size_t s;
1549 	int domain, i, j, pages;
1550 
1551 	/*
1552 	 * We strongly depend on the size being a power of two, so the assert
1553 	 * is overzealous. However, should the struct be resized to a
1554 	 * different power of two, the code below needs to be revisited.
1555 	 */
1556 	CTASSERT((sizeof(*pvd) == 64));
1557 
1558 	/*
1559 	 * Calculate the size of the array.
1560 	 */
1561 	s = 0;
1562 	for (i = 0; i < vm_phys_nsegs; i++) {
1563 		seg = &vm_phys_segs[i];
1564 		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1565 		    pmap_l2_pindex(seg->start);
1566 		s += round_page(pages * sizeof(*pvd));
1567 	}
1568 	pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1569 	if (pv_table == NULL)
1570 		panic("%s: kva_alloc failed\n", __func__);
1571 
1572 	/*
1573 	 * Iterate physical segments to allocate domain-local memory for PV
1574 	 * list headers.
1575 	 */
1576 	pvd = pv_table;
1577 	for (i = 0; i < vm_phys_nsegs; i++) {
1578 		seg = &vm_phys_segs[i];
1579 		pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1580 		    pmap_l2_pindex(seg->start);
1581 		domain = seg->domain;
1582 
1583 		s = round_page(pages * sizeof(*pvd));
1584 
1585 		for (j = 0; j < s; j += PAGE_SIZE) {
1586 			vm_page_t m = vm_page_alloc_noobj_domain(domain,
1587 			    VM_ALLOC_ZERO);
1588 			if (m == NULL)
1589 				panic("failed to allocate PV table page");
1590 			pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1591 		}
1592 
1593 		for (j = 0; j < s / sizeof(*pvd); j++) {
1594 			rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1595 			TAILQ_INIT(&pvd->pv_page.pv_list);
1596 			pvd++;
1597 		}
1598 	}
1599 	pvd = &pv_dummy_large;
1600 	memset(pvd, 0, sizeof(*pvd));
1601 	rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1602 	TAILQ_INIT(&pvd->pv_page.pv_list);
1603 
1604 	/*
1605 	 * Set pointers from vm_phys_segs to pv_table.
1606 	 */
1607 	for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1608 		seg = &vm_phys_segs[i];
1609 		seg->md_first = pvd;
1610 		pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1611 		    pmap_l2_pindex(seg->start);
1612 
1613 		/*
1614 		 * If there is a following segment, and the final
1615 		 * superpage of this segment and the initial superpage
1616 		 * of the next segment are the same then adjust the
1617 		 * pv_table entry for that next segment down by one so
1618 		 * that the pv_table entries will be shared.
1619 		 */
1620 		if (i + 1 < vm_phys_nsegs) {
1621 			next_seg = &vm_phys_segs[i + 1];
1622 			if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1623 			    pmap_l2_pindex(next_seg->start)) {
1624 				pvd--;
1625 			}
1626 		}
1627 	}
1628 }
1629 
1630 /*
1631  *	Initialize the pmap module.
1632  *
1633  *	Called by vm_mem_init(), to initialize any structures that the pmap
1634  *	system needs to map virtual memory.
1635  */
1636 void
pmap_init(void)1637 pmap_init(void)
1638 {
1639 	uint64_t mmfr1;
1640 	int i, vmid_bits;
1641 
1642 	/*
1643 	 * Are large page mappings enabled?
1644 	 */
1645 	TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1646 	if (superpages_enabled) {
1647 		KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1648 		    ("pmap_init: can't assign to pagesizes[1]"));
1649 		pagesizes[1] = L2_SIZE;
1650 		if (L1_BLOCKS_SUPPORTED) {
1651 			KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1652 			    ("pmap_init: can't assign to pagesizes[2]"));
1653 			pagesizes[2] = L1_SIZE;
1654 		}
1655 	}
1656 
1657 	/*
1658 	 * Initialize the ASID allocator.
1659 	 */
1660 	pmap_init_asids(&asids,
1661 	    (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1662 
1663 	if (has_hyp()) {
1664 		mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1665 		vmid_bits = 8;
1666 
1667 		if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1668 		    ID_AA64MMFR1_VMIDBits_16)
1669 			vmid_bits = 16;
1670 		pmap_init_asids(&vmids, vmid_bits);
1671 	}
1672 
1673 	/*
1674 	 * Initialize pv chunk lists.
1675 	 */
1676 	for (i = 0; i < PMAP_MEMDOM; i++) {
1677 		mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1678 		    MTX_DEF);
1679 		TAILQ_INIT(&pv_chunks[i].pvc_list);
1680 	}
1681 	pmap_init_pv_table();
1682 
1683 	vm_initialized = 1;
1684 }
1685 
1686 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1687     "L1 (1GB/64GB) page mapping counters");
1688 
1689 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
1690 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
1691     &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");
1692 
1693 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1694     "L2C (32MB/1GB) page mapping counters");
1695 
1696 static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
1697 SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
1698     &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");
1699 
1700 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1701     "2MB page mapping counters");
1702 
1703 static u_long pmap_l2_demotions;
1704 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1705     &pmap_l2_demotions, 0, "2MB page demotions");
1706 
1707 static u_long pmap_l2_mappings;
1708 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1709     &pmap_l2_mappings, 0, "2MB page mappings");
1710 
1711 static u_long pmap_l2_p_failures;
1712 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1713     &pmap_l2_p_failures, 0, "2MB page promotion failures");
1714 
1715 static u_long pmap_l2_promotions;
1716 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1717     &pmap_l2_promotions, 0, "2MB page promotions");
1718 
1719 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1720     "L3C (64KB/2MB) page mapping counters");
1721 
1722 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
1723 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
1724     &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");
1725 
1726 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
1727 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
1728     &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");
1729 
1730 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
1731 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
1732     &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");
1733 
1734 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
1735 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
1736     &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1737 
1738 /*
1739  * If the given value for "final_only" is false, then any cached intermediate-
1740  * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1741  * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1742  * Otherwise, just the cached final-level entry is invalidated.
1743  */
1744 static __inline void
pmap_s1_invalidate_kernel(uint64_t r,bool final_only)1745 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1746 {
1747 	if (final_only)
1748 		__asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1749 	else
1750 		__asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1751 }
1752 
1753 static __inline void
pmap_s1_invalidate_user(uint64_t r,bool final_only)1754 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1755 {
1756 	if (final_only)
1757 		__asm __volatile("tlbi vale1is, %0" : : "r" (r));
1758 	else
1759 		__asm __volatile("tlbi vae1is, %0" : : "r" (r));
1760 }
1761 
1762 /*
1763  * Invalidates any cached final- and optionally intermediate-level TLB entries
1764  * for the specified virtual address in the given virtual address space.
1765  */
1766 static __inline void
pmap_s1_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1767 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1768 {
1769 	uint64_t r;
1770 
1771 	PMAP_ASSERT_STAGE1(pmap);
1772 
1773 	dsb(ishst);
1774 	r = TLBI_VA(va);
1775 	if (pmap == kernel_pmap) {
1776 		pmap_s1_invalidate_kernel(r, final_only);
1777 	} else {
1778 		r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1779 		pmap_s1_invalidate_user(r, final_only);
1780 	}
1781 	dsb(ish);
1782 	isb();
1783 }
1784 
1785 static __inline void
pmap_s2_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1786 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1787 {
1788 	PMAP_ASSERT_STAGE2(pmap);
1789 	MPASS(pmap_stage2_invalidate_range != NULL);
1790 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1791 	    final_only);
1792 }
1793 
1794 static __inline void
pmap_invalidate_page(pmap_t pmap,vm_offset_t va,bool final_only)1795 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1796 {
1797 	if (pmap->pm_stage == PM_STAGE1)
1798 		pmap_s1_invalidate_page(pmap, va, final_only);
1799 	else
1800 		pmap_s2_invalidate_page(pmap, va, final_only);
1801 }
1802 
1803 /*
1804  * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK
1805  * mappings.  Otherwise, use stride L3_SIZE.
1806  */
1807 static __inline void
pmap_s1_invalidate_strided(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_offset_t stride,bool final_only)1808 pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1809     vm_offset_t stride, bool final_only)
1810 {
1811 	uint64_t end, r, start;
1812 
1813 	PMAP_ASSERT_STAGE1(pmap);
1814 
1815 	dsb(ishst);
1816 	if (pmap == kernel_pmap) {
1817 		start = TLBI_VA(sva);
1818 		end = TLBI_VA(eva);
1819 		for (r = start; r < end; r += TLBI_VA(stride))
1820 			pmap_s1_invalidate_kernel(r, final_only);
1821 	} else {
1822 		start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1823 		start |= TLBI_VA(sva);
1824 		end |= TLBI_VA(eva);
1825 		for (r = start; r < end; r += TLBI_VA(stride))
1826 			pmap_s1_invalidate_user(r, final_only);
1827 	}
1828 	dsb(ish);
1829 	isb();
1830 }
1831 
1832 /*
1833  * Invalidates any cached final- and optionally intermediate-level TLB entries
1834  * for the specified virtual address range in the given virtual address space.
1835  */
1836 static __inline void
pmap_s1_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)1837 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1838     bool final_only)
1839 {
1840 	pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only);
1841 }
1842 
1843 static __inline void
pmap_s2_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)1844 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1845     bool final_only)
1846 {
1847 	PMAP_ASSERT_STAGE2(pmap);
1848 	MPASS(pmap_stage2_invalidate_range != NULL);
1849 	pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1850 }
1851 
1852 static __inline void
pmap_invalidate_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool final_only)1853 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1854     bool final_only)
1855 {
1856 	if (pmap->pm_stage == PM_STAGE1)
1857 		pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1858 	else
1859 		pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1860 }
1861 
1862 /*
1863  * Invalidates all cached intermediate- and final-level TLB entries for the
1864  * given virtual address space.
1865  */
1866 static __inline void
pmap_s1_invalidate_all(pmap_t pmap)1867 pmap_s1_invalidate_all(pmap_t pmap)
1868 {
1869 	uint64_t r;
1870 
1871 	PMAP_ASSERT_STAGE1(pmap);
1872 
1873 	dsb(ishst);
1874 	if (pmap == kernel_pmap) {
1875 		__asm __volatile("tlbi vmalle1is");
1876 	} else {
1877 		r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1878 		__asm __volatile("tlbi aside1is, %0" : : "r" (r));
1879 	}
1880 	dsb(ish);
1881 	isb();
1882 }
1883 
1884 static __inline void
pmap_s2_invalidate_all(pmap_t pmap)1885 pmap_s2_invalidate_all(pmap_t pmap)
1886 {
1887 	PMAP_ASSERT_STAGE2(pmap);
1888 	MPASS(pmap_stage2_invalidate_all != NULL);
1889 	pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1890 }
1891 
1892 static __inline void
pmap_invalidate_all(pmap_t pmap)1893 pmap_invalidate_all(pmap_t pmap)
1894 {
1895 	if (pmap->pm_stage == PM_STAGE1)
1896 		pmap_s1_invalidate_all(pmap);
1897 	else
1898 		pmap_s2_invalidate_all(pmap);
1899 }
1900 
1901 /*
1902  *	Routine:	pmap_extract
1903  *	Function:
1904  *		Extract the physical page address associated
1905  *		with the given map/virtual_address pair.
1906  */
1907 vm_paddr_t
pmap_extract(pmap_t pmap,vm_offset_t va)1908 pmap_extract(pmap_t pmap, vm_offset_t va)
1909 {
1910 	pt_entry_t *pte, tpte;
1911 	vm_paddr_t pa;
1912 	int lvl;
1913 
1914 	pa = 0;
1915 	PMAP_LOCK(pmap);
1916 	/*
1917 	 * Find the block or page map for this virtual address. pmap_pte
1918 	 * will return either a valid block/page entry, or NULL.
1919 	 */
1920 	pte = pmap_pte(pmap, va, &lvl);
1921 	if (pte != NULL) {
1922 		tpte = pmap_load(pte);
1923 		pa = PTE_TO_PHYS(tpte);
1924 		switch(lvl) {
1925 		case 1:
1926 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
1927 			KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1928 			    ("pmap_extract: Invalid L1 pte found: %lx",
1929 			    tpte & ATTR_DESCR_MASK));
1930 			pa |= (va & L1_OFFSET);
1931 			break;
1932 		case 2:
1933 			KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1934 			    ("pmap_extract: Invalid L2 pte found: %lx",
1935 			    tpte & ATTR_DESCR_MASK));
1936 			pa |= (va & L2_OFFSET);
1937 			break;
1938 		case 3:
1939 			KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1940 			    ("pmap_extract: Invalid L3 pte found: %lx",
1941 			    tpte & ATTR_DESCR_MASK));
1942 			pa |= (va & L3_OFFSET);
1943 			break;
1944 		}
1945 	}
1946 	PMAP_UNLOCK(pmap);
1947 	return (pa);
1948 }
1949 
1950 /*
1951  *	Routine:	pmap_extract_and_hold
1952  *	Function:
1953  *		Atomically extract and hold the physical page
1954  *		with the given pmap and virtual address pair
1955  *		if that mapping permits the given protection.
1956  */
1957 vm_page_t
pmap_extract_and_hold(pmap_t pmap,vm_offset_t va,vm_prot_t prot)1958 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1959 {
1960 	pt_entry_t *pte, tpte;
1961 	vm_offset_t off;
1962 	vm_page_t m;
1963 	int lvl;
1964 	bool use;
1965 
1966 	m = NULL;
1967 	PMAP_LOCK(pmap);
1968 	pte = pmap_pte(pmap, va, &lvl);
1969 	if (pte != NULL) {
1970 		tpte = pmap_load(pte);
1971 
1972 		KASSERT(lvl > 0 && lvl <= 3,
1973 		    ("pmap_extract_and_hold: Invalid level %d", lvl));
1974 		/*
1975 		 * Check that the pte is either a L3 page, or a L1 or L2 block
1976 		 * entry. We can assume L1_BLOCK == L2_BLOCK.
1977 		 */
1978 		KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1979 		    (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1980 		    ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1981 		     tpte & ATTR_DESCR_MASK));
1982 
1983 		use = false;
1984 		if ((prot & VM_PROT_WRITE) == 0)
1985 			use = true;
1986 		else if (pmap->pm_stage == PM_STAGE1 &&
1987 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1988 			use = true;
1989 		else if (pmap->pm_stage == PM_STAGE2 &&
1990 		    ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1991 		     ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1992 			use = true;
1993 
1994 		if (use) {
1995 			switch (lvl) {
1996 			case 1:
1997 				off = va & L1_OFFSET;
1998 				break;
1999 			case 2:
2000 				off = va & L2_OFFSET;
2001 				break;
2002 			case 3:
2003 			default:
2004 				off = 0;
2005 			}
2006 			m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
2007 			if (m != NULL && !vm_page_wire_mapped(m))
2008 				m = NULL;
2009 		}
2010 	}
2011 	PMAP_UNLOCK(pmap);
2012 	return (m);
2013 }
2014 
2015 /*
2016  * Walks the page tables to translate a kernel virtual address to a
2017  * physical address. Returns true if the kva is valid and stores the
2018  * physical address in pa if it is not NULL.
2019  *
2020  * See the comment above data_abort() for the rationale for specifying
2021  * NO_PERTHREAD_SSP here.
2022  */
2023 bool NO_PERTHREAD_SSP
pmap_klookup(vm_offset_t va,vm_paddr_t * pa)2024 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
2025 {
2026 	pt_entry_t *pte, tpte;
2027 	register_t intr;
2028 	uint64_t par;
2029 
2030 	/*
2031 	 * Disable interrupts so we don't get interrupted between asking
2032 	 * for address translation, and getting the result back.
2033 	 */
2034 	intr = intr_disable();
2035 	par = arm64_address_translate_s1e1r(va);
2036 	intr_restore(intr);
2037 
2038 	if (PAR_SUCCESS(par)) {
2039 		if (pa != NULL)
2040 			*pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2041 		return (true);
2042 	}
2043 
2044 	/*
2045 	 * Fall back to walking the page table. The address translation
2046 	 * instruction may fail when the page is in a break-before-make
2047 	 * sequence. As we only clear the valid bit in said sequence we
2048 	 * can walk the page table to find the physical address.
2049 	 */
2050 
2051 	pte = pmap_l1(kernel_pmap, va);
2052 	if (pte == NULL)
2053 		return (false);
2054 
2055 	/*
2056 	 * A concurrent pmap_update_entry() will clear the entry's valid bit
2057 	 * but leave the rest of the entry unchanged.  Therefore, we treat a
2058 	 * non-zero entry as being valid, and we ignore the valid bit when
2059 	 * determining whether the entry maps a block, page, or table.
2060 	 */
2061 	tpte = pmap_load(pte);
2062 	if (tpte == 0)
2063 		return (false);
2064 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2065 		if (pa != NULL)
2066 			*pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2067 		return (true);
2068 	}
2069 	pte = pmap_l1_to_l2(&tpte, va);
2070 	tpte = pmap_load(pte);
2071 	if (tpte == 0)
2072 		return (false);
2073 	if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2074 		if (pa != NULL)
2075 			*pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2076 		return (true);
2077 	}
2078 	pte = pmap_l2_to_l3(&tpte, va);
2079 	tpte = pmap_load(pte);
2080 	if (tpte == 0)
2081 		return (false);
2082 	if (pa != NULL)
2083 		*pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2084 	return (true);
2085 }
2086 
2087 /*
2088  *	Routine:	pmap_kextract
2089  *	Function:
2090  *		Extract the physical page address associated with the given kernel
2091  *		virtual address.
2092  */
2093 vm_paddr_t
pmap_kextract(vm_offset_t va)2094 pmap_kextract(vm_offset_t va)
2095 {
2096 	vm_paddr_t pa;
2097 
2098 	if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2099 		return (DMAP_TO_PHYS(va));
2100 
2101 	if (pmap_klookup(va, &pa) == false)
2102 		return (0);
2103 	return (pa);
2104 }
2105 
2106 /***************************************************
2107  * Low level mapping routines.....
2108  ***************************************************/
2109 
2110 void
pmap_kenter(vm_offset_t sva,vm_size_t size,vm_paddr_t pa,int mode)2111 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
2112 {
2113 	pd_entry_t *pde;
2114 	pt_entry_t attr, old_l3e, *pte;
2115 	vm_offset_t va;
2116 	vm_page_t mpte;
2117 	int error, lvl;
2118 
2119 	KASSERT((pa & L3_OFFSET) == 0,
2120 	    ("pmap_kenter: Invalid physical address"));
2121 	KASSERT((sva & L3_OFFSET) == 0,
2122 	    ("pmap_kenter: Invalid virtual address"));
2123 	KASSERT((size & PAGE_MASK) == 0,
2124 	    ("pmap_kenter: Mapping is not page-sized"));
2125 
2126 	attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2127 	    ATTR_KERN_GP | ATTR_S1_IDX(mode);
2128 	old_l3e = 0;
2129 	va = sva;
2130 	while (size != 0) {
2131 		pde = pmap_pde(kernel_pmap, va, &lvl);
2132 		KASSERT(pde != NULL,
2133 		    ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
2134 		KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2135 
2136 		/*
2137 		 * If we have an aligned, contiguous chunk of L2_SIZE, try
2138 		 * to create an L2_BLOCK mapping.
2139 		 */
2140 		if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
2141 		    (pa & L2_OFFSET) == 0 && vm_initialized) {
2142 			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
2143 			KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
2144 			    ("pmap_kenter: Unexpected mapping"));
2145 			PMAP_LOCK(kernel_pmap);
2146 			error = pmap_insert_pt_page(kernel_pmap, mpte, false,
2147 			    false);
2148 			if (error == 0) {
2149 				attr &= ~ATTR_CONTIGUOUS;
2150 
2151 				/*
2152 				 * Although the page table page "mpte" should
2153 				 * be devoid of mappings, the TLB might hold
2154 				 * intermediate entries that reference it, so
2155 				 * we perform a single-page invalidation.
2156 				 */
2157 				pmap_update_entry(kernel_pmap, pde,
2158 				    PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
2159 				    PAGE_SIZE);
2160 			}
2161 			PMAP_UNLOCK(kernel_pmap);
2162 			if (error == 0) {
2163 				va += L2_SIZE;
2164 				pa += L2_SIZE;
2165 				size -= L2_SIZE;
2166 				continue;
2167 			}
2168 		}
2169 
2170 		/*
2171 		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
2172 		 * L3 pages, set the contiguous bit within each PTE so that
2173 		 * the chunk can be cached using only one TLB entry.
2174 		 */
2175 		if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
2176 			if (size >= L3C_SIZE)
2177 				attr |= ATTR_CONTIGUOUS;
2178 			else
2179 				attr &= ~ATTR_CONTIGUOUS;
2180 		}
2181 
2182 		pte = pmap_l2_to_l3(pde, va);
2183 		old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
2184 		    L3_PAGE);
2185 
2186 		va += PAGE_SIZE;
2187 		pa += PAGE_SIZE;
2188 		size -= PAGE_SIZE;
2189 	}
2190 	if ((old_l3e & ATTR_DESCR_VALID) != 0)
2191 		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2192 	else {
2193 		/*
2194 		 * Because the old entries were invalid and the new mappings
2195 		 * are not executable, an isb is not required.
2196 		 */
2197 		dsb(ishst);
2198 	}
2199 }
2200 
2201 void
pmap_kenter_device(vm_offset_t sva,vm_size_t size,vm_paddr_t pa)2202 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2203 {
2204 
2205 	pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2206 }
2207 
2208 /*
2209  * Remove a page from the kernel pagetables.
2210  */
2211 void
pmap_kremove(vm_offset_t va)2212 pmap_kremove(vm_offset_t va)
2213 {
2214 	pt_entry_t *pte;
2215 
2216 	pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2217 	KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2218 	    ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2219 	pmap_clear(pte);
2220 	pmap_s1_invalidate_page(kernel_pmap, va, true);
2221 }
2222 
2223 /*
2224  * Remove the specified range of mappings from the kernel address space.
2225  *
2226  * Should only be applied to mappings that were created by pmap_kenter() or
2227  * pmap_kenter_device().  Nothing about this function is actually specific
2228  * to device mappings.
2229  */
2230 void
pmap_kremove_device(vm_offset_t sva,vm_size_t size)2231 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2232 {
2233 	pt_entry_t *ptep, *ptep_end;
2234 	vm_offset_t va;
2235 	int lvl;
2236 
2237 	KASSERT((sva & L3_OFFSET) == 0,
2238 	    ("pmap_kremove_device: Invalid virtual address"));
2239 	KASSERT((size & PAGE_MASK) == 0,
2240 	    ("pmap_kremove_device: Mapping is not page-sized"));
2241 
2242 	va = sva;
2243 	while (size != 0) {
2244 		ptep = pmap_pte(kernel_pmap, va, &lvl);
2245 		KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2246 		switch (lvl) {
2247 		case 2:
2248 			KASSERT((va & L2_OFFSET) == 0,
2249 			    ("Unaligned virtual address"));
2250 			KASSERT(size >= L2_SIZE, ("Insufficient size"));
2251 
2252 			if (va != sva) {
2253 				pmap_s1_invalidate_range(kernel_pmap, sva, va,
2254 				    true);
2255 			}
2256 			pmap_clear(ptep);
2257 			pmap_s1_invalidate_page(kernel_pmap, va, true);
2258 			PMAP_LOCK(kernel_pmap);
2259 			pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2260 			PMAP_UNLOCK(kernel_pmap);
2261 
2262 			va += L2_SIZE;
2263 			sva = va;
2264 			size -= L2_SIZE;
2265 			break;
2266 		case 3:
2267 			if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2268 				KASSERT((va & L3C_OFFSET) == 0,
2269 				    ("Unaligned L3C virtual address"));
2270 				KASSERT(size >= L3C_SIZE,
2271 				    ("Insufficient L3C size"));
2272 
2273 				ptep_end = ptep + L3C_ENTRIES;
2274 				for (; ptep < ptep_end; ptep++)
2275 					pmap_clear(ptep);
2276 
2277 				va += L3C_SIZE;
2278 				size -= L3C_SIZE;
2279 				break;
2280 			}
2281 			pmap_clear(ptep);
2282 
2283 			va += PAGE_SIZE;
2284 			size -= PAGE_SIZE;
2285 			break;
2286 		default:
2287 			__assert_unreachable();
2288 			break;
2289 		}
2290 	}
2291 	if (va != sva)
2292 		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2293 }
2294 
2295 /*
2296  *	Used to map a range of physical addresses into kernel
2297  *	virtual address space.
2298  *
2299  *	The value passed in '*virt' is a suggested virtual address for
2300  *	the mapping. Architectures which can support a direct-mapped
2301  *	physical to virtual region can return the appropriate address
2302  *	within that region, leaving '*virt' unchanged. Other
2303  *	architectures should map the pages starting at '*virt' and
2304  *	update '*virt' with the first usable address after the mapped
2305  *	region.
2306  */
2307 vm_offset_t
pmap_map(vm_offset_t * virt,vm_paddr_t start,vm_paddr_t end,int prot)2308 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2309 {
2310 	return PHYS_TO_DMAP(start);
2311 }
2312 
2313 /*
2314  * Add a list of wired pages to the kva
2315  * this routine is only used for temporary
2316  * kernel mappings that do not need to have
2317  * page modification or references recorded.
2318  * Note that old mappings are simply written
2319  * over.  The page *must* be wired.
2320  * Note: SMP coherent.  Uses a ranged shootdown IPI.
2321  */
2322 void
pmap_qenter(vm_offset_t sva,vm_page_t * ma,int count)2323 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2324 {
2325 	pd_entry_t *pde;
2326 	pt_entry_t attr, old_l3e, *pte;
2327 	vm_offset_t va;
2328 	vm_page_t m;
2329 	int i, lvl;
2330 
2331 	old_l3e = 0;
2332 	va = sva;
2333 	for (i = 0; i < count; i++) {
2334 		pde = pmap_pde(kernel_pmap, va, &lvl);
2335 		KASSERT(pde != NULL,
2336 		    ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2337 		KASSERT(lvl == 2,
2338 		    ("pmap_qenter: Invalid level %d", lvl));
2339 
2340 		m = ma[i];
2341 		attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2342 		    ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2343 		pte = pmap_l2_to_l3(pde, va);
2344 		old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2345 
2346 		va += L3_SIZE;
2347 	}
2348 	if ((old_l3e & ATTR_DESCR_VALID) != 0)
2349 		pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2350 	else {
2351 		/*
2352 		 * Because the old entries were invalid and the new mappings
2353 		 * are not executable, an isb is not required.
2354 		 */
2355 		dsb(ishst);
2356 	}
2357 }
2358 
2359 /*
2360  * This routine tears out page mappings from the
2361  * kernel -- it is meant only for temporary mappings.
2362  */
2363 void
pmap_qremove(vm_offset_t sva,int count)2364 pmap_qremove(vm_offset_t sva, int count)
2365 {
2366 	pt_entry_t *pte;
2367 	vm_offset_t va;
2368 
2369 	KASSERT(ADDR_IS_CANONICAL(sva),
2370 	    ("%s: Address not in canonical form: %lx", __func__, sva));
2371 	KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2372 
2373 	va = sva;
2374 	while (count-- > 0) {
2375 		pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2376 		if (pte != NULL) {
2377 			pmap_clear(pte);
2378 		}
2379 
2380 		va += PAGE_SIZE;
2381 	}
2382 	pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2383 }
2384 
2385 /***************************************************
2386  * Page table page management routines.....
2387  ***************************************************/
2388 /*
2389  * Schedule the specified unused page table page to be freed.  Specifically,
2390  * add the page to the specified list of pages that will be released to the
2391  * physical memory manager after the TLB has been updated.
2392  */
2393 static __inline void
pmap_add_delayed_free_list(vm_page_t m,struct spglist * free,bool set_PG_ZERO)2394 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2395 {
2396 
2397 	if (set_PG_ZERO)
2398 		m->flags |= PG_ZERO;
2399 	else
2400 		m->flags &= ~PG_ZERO;
2401 	SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2402 }
2403 
2404 /*
2405  * Decrements a page table page's reference count, which is used to record the
2406  * number of valid page table entries within the page.  If the reference count
2407  * drops to zero, then the page table page is unmapped.  Returns true if the
2408  * page table page was unmapped and false otherwise.
2409  */
2410 static inline bool
pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)2411 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2412 {
2413 
2414 	--m->ref_count;
2415 	if (m->ref_count == 0) {
2416 		_pmap_unwire_l3(pmap, va, m, free);
2417 		return (true);
2418 	} else
2419 		return (false);
2420 }
2421 
2422 static void
_pmap_unwire_l3(pmap_t pmap,vm_offset_t va,vm_page_t m,struct spglist * free)2423 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2424 {
2425 
2426 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2427 	/*
2428 	 * unmap the page table page
2429 	 */
2430 	if (m->pindex >= (NUL2E + NUL1E)) {
2431 		/* l1 page */
2432 		pd_entry_t *l0;
2433 
2434 		l0 = pmap_l0(pmap, va);
2435 		pmap_clear(l0);
2436 	} else if (m->pindex >= NUL2E) {
2437 		/* l2 page */
2438 		pd_entry_t *l1;
2439 
2440 		l1 = pmap_l1(pmap, va);
2441 		pmap_clear(l1);
2442 	} else {
2443 		/* l3 page */
2444 		pd_entry_t *l2;
2445 
2446 		l2 = pmap_l2(pmap, va);
2447 		pmap_clear(l2);
2448 	}
2449 	pmap_resident_count_dec(pmap, 1);
2450 	if (m->pindex < NUL2E) {
2451 		/* We just released an l3, unhold the matching l2 */
2452 		pd_entry_t *l1, tl1;
2453 		vm_page_t l2pg;
2454 
2455 		l1 = pmap_l1(pmap, va);
2456 		tl1 = pmap_load(l1);
2457 		l2pg = PTE_TO_VM_PAGE(tl1);
2458 		pmap_unwire_l3(pmap, va, l2pg, free);
2459 	} else if (m->pindex < (NUL2E + NUL1E)) {
2460 		/* We just released an l2, unhold the matching l1 */
2461 		pd_entry_t *l0, tl0;
2462 		vm_page_t l1pg;
2463 
2464 		l0 = pmap_l0(pmap, va);
2465 		tl0 = pmap_load(l0);
2466 		l1pg = PTE_TO_VM_PAGE(tl0);
2467 		pmap_unwire_l3(pmap, va, l1pg, free);
2468 	}
2469 	pmap_invalidate_page(pmap, va, false);
2470 
2471 	/*
2472 	 * Put page on a list so that it is released after
2473 	 * *ALL* TLB shootdown is done
2474 	 */
2475 	pmap_add_delayed_free_list(m, free, true);
2476 }
2477 
2478 /*
2479  * After removing a page table entry, this routine is used to
2480  * conditionally free the page, and manage the reference count.
2481  */
2482 static int
pmap_unuse_pt(pmap_t pmap,vm_offset_t va,pd_entry_t ptepde,struct spglist * free)2483 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2484     struct spglist *free)
2485 {
2486 	vm_page_t mpte;
2487 
2488 	KASSERT(ADDR_IS_CANONICAL(va),
2489 	    ("%s: Address not in canonical form: %lx", __func__, va));
2490 	if (ADDR_IS_KERNEL(va))
2491 		return (0);
2492 	KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2493 	mpte = PTE_TO_VM_PAGE(ptepde);
2494 	return (pmap_unwire_l3(pmap, va, mpte, free));
2495 }
2496 
2497 /*
2498  * Release a page table page reference after a failed attempt to create a
2499  * mapping.
2500  */
2501 static void
pmap_abort_ptp(pmap_t pmap,vm_offset_t va,vm_page_t mpte)2502 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2503 {
2504 	struct spglist free;
2505 
2506 	SLIST_INIT(&free);
2507 	if (pmap_unwire_l3(pmap, va, mpte, &free))
2508 		vm_page_free_pages_toq(&free, true);
2509 }
2510 
2511 void
pmap_pinit0(pmap_t pmap)2512 pmap_pinit0(pmap_t pmap)
2513 {
2514 
2515 	PMAP_LOCK_INIT(pmap);
2516 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2517 	pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2518 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2519 	TAILQ_INIT(&pmap->pm_pvchunk);
2520 	vm_radix_init(&pmap->pm_root);
2521 	pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2522 	pmap->pm_stage = PM_STAGE1;
2523 	pmap->pm_levels = 4;
2524 	pmap->pm_ttbr = pmap->pm_l0_paddr;
2525 	pmap->pm_asid_set = &asids;
2526 	pmap->pm_bti = NULL;
2527 
2528 	PCPU_SET(curpmap, pmap);
2529 }
2530 
2531 int
pmap_pinit_stage(pmap_t pmap,enum pmap_stage stage,int levels)2532 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2533 {
2534 	vm_page_t m;
2535 
2536 	/*
2537 	 * allocate the l0 page
2538 	 */
2539 	m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2540 	    VM_ALLOC_ZERO);
2541 	pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2542 	pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2543 
2544 	TAILQ_INIT(&pmap->pm_pvchunk);
2545 	vm_radix_init(&pmap->pm_root);
2546 	bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2547 	pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2548 
2549 	MPASS(levels == 3 || levels == 4);
2550 	pmap->pm_levels = levels;
2551 	pmap->pm_stage = stage;
2552 	pmap->pm_bti = NULL;
2553 	switch (stage) {
2554 	case PM_STAGE1:
2555 		pmap->pm_asid_set = &asids;
2556 		if (pmap_bti_support) {
2557 			pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
2558 			    M_ZERO | M_WAITOK);
2559 			rangeset_init(pmap->pm_bti, bti_dup_range,
2560 			    bti_free_range, pmap, M_NOWAIT);
2561 		}
2562 		break;
2563 	case PM_STAGE2:
2564 		pmap->pm_asid_set = &vmids;
2565 		break;
2566 	default:
2567 		panic("%s: Invalid pmap type %d", __func__, stage);
2568 		break;
2569 	}
2570 
2571 	/* XXX Temporarily disable deferred ASID allocation. */
2572 	pmap_alloc_asid(pmap);
2573 
2574 	/*
2575 	 * Allocate the level 1 entry to use as the root. This will increase
2576 	 * the refcount on the level 1 page so it won't be removed until
2577 	 * pmap_release() is called.
2578 	 */
2579 	if (pmap->pm_levels == 3) {
2580 		PMAP_LOCK(pmap);
2581 		m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2582 		PMAP_UNLOCK(pmap);
2583 	}
2584 	pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2585 
2586 	return (1);
2587 }
2588 
2589 int
pmap_pinit(pmap_t pmap)2590 pmap_pinit(pmap_t pmap)
2591 {
2592 
2593 	return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2594 }
2595 
2596 /*
2597  * This routine is called if the desired page table page does not exist.
2598  *
2599  * If page table page allocation fails, this routine may sleep before
2600  * returning NULL.  It sleeps only if a lock pointer was given.
2601  *
2602  * Note: If a page allocation fails at page table level two or three,
2603  * one or two pages may be held during the wait, only to be released
2604  * afterwards.  This conservative approach is easily argued to avoid
2605  * race conditions.
2606  */
2607 static vm_page_t
_pmap_alloc_l3(pmap_t pmap,vm_pindex_t ptepindex,struct rwlock ** lockp)2608 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2609 {
2610 	vm_page_t m, l1pg, l2pg;
2611 
2612 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2613 
2614 	/*
2615 	 * Allocate a page table page.
2616 	 */
2617 	if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2618 		if (lockp != NULL) {
2619 			RELEASE_PV_LIST_LOCK(lockp);
2620 			PMAP_UNLOCK(pmap);
2621 			vm_wait(NULL);
2622 			PMAP_LOCK(pmap);
2623 		}
2624 
2625 		/*
2626 		 * Indicate the need to retry.  While waiting, the page table
2627 		 * page may have been allocated.
2628 		 */
2629 		return (NULL);
2630 	}
2631 	m->pindex = ptepindex;
2632 
2633 	/*
2634 	 * Because of AArch64's weak memory consistency model, we must have a
2635 	 * barrier here to ensure that the stores for zeroing "m", whether by
2636 	 * pmap_zero_page() or an earlier function, are visible before adding
2637 	 * "m" to the page table.  Otherwise, a page table walk by another
2638 	 * processor's MMU could see the mapping to "m" and a stale, non-zero
2639 	 * PTE within "m".
2640 	 */
2641 	dmb(ishst);
2642 
2643 	/*
2644 	 * Map the pagetable page into the process address space, if
2645 	 * it isn't already there.
2646 	 */
2647 
2648 	if (ptepindex >= (NUL2E + NUL1E)) {
2649 		pd_entry_t *l0p, l0e;
2650 		vm_pindex_t l0index;
2651 
2652 		l0index = ptepindex - (NUL2E + NUL1E);
2653 		l0p = &pmap->pm_l0[l0index];
2654 		KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2655 		    ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2656 		l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2657 
2658 		/*
2659 		 * Mark all kernel memory as not accessible from userspace
2660 		 * and userspace memory as not executable from the kernel.
2661 		 * This has been done for the bootstrap L0 entries in
2662 		 * locore.S.
2663 		 */
2664 		if (pmap == kernel_pmap)
2665 			l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2666 		else
2667 			l0e |= TATTR_PXN_TABLE;
2668 		pmap_store(l0p, l0e);
2669 	} else if (ptepindex >= NUL2E) {
2670 		vm_pindex_t l0index, l1index;
2671 		pd_entry_t *l0, *l1;
2672 		pd_entry_t tl0;
2673 
2674 		l1index = ptepindex - NUL2E;
2675 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2676 
2677 		l0 = &pmap->pm_l0[l0index];
2678 		tl0 = pmap_load(l0);
2679 		if (tl0 == 0) {
2680 			/* recurse for allocating page dir */
2681 			if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2682 			    lockp) == NULL) {
2683 				vm_page_unwire_noq(m);
2684 				vm_page_free_zero(m);
2685 				return (NULL);
2686 			}
2687 		} else {
2688 			l1pg = PTE_TO_VM_PAGE(tl0);
2689 			l1pg->ref_count++;
2690 		}
2691 
2692 		l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2693 		l1 = &l1[ptepindex & Ln_ADDR_MASK];
2694 		KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2695 		    ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2696 		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2697 	} else {
2698 		vm_pindex_t l0index, l1index;
2699 		pd_entry_t *l0, *l1, *l2;
2700 		pd_entry_t tl0, tl1;
2701 
2702 		l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2703 		l0index = l1index >> Ln_ENTRIES_SHIFT;
2704 
2705 		l0 = &pmap->pm_l0[l0index];
2706 		tl0 = pmap_load(l0);
2707 		if (tl0 == 0) {
2708 			/* recurse for allocating page dir */
2709 			if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2710 			    lockp) == NULL) {
2711 				vm_page_unwire_noq(m);
2712 				vm_page_free_zero(m);
2713 				return (NULL);
2714 			}
2715 			tl0 = pmap_load(l0);
2716 			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2717 			l1 = &l1[l1index & Ln_ADDR_MASK];
2718 		} else {
2719 			l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2720 			l1 = &l1[l1index & Ln_ADDR_MASK];
2721 			tl1 = pmap_load(l1);
2722 			if (tl1 == 0) {
2723 				/* recurse for allocating page dir */
2724 				if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2725 				    lockp) == NULL) {
2726 					vm_page_unwire_noq(m);
2727 					vm_page_free_zero(m);
2728 					return (NULL);
2729 				}
2730 			} else {
2731 				l2pg = PTE_TO_VM_PAGE(tl1);
2732 				l2pg->ref_count++;
2733 			}
2734 		}
2735 
2736 		l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2737 		l2 = &l2[ptepindex & Ln_ADDR_MASK];
2738 		KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2739 		    ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2740 		pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
2741 	}
2742 
2743 	pmap_resident_count_inc(pmap, 1);
2744 
2745 	return (m);
2746 }
2747 
2748 static pd_entry_t *
pmap_alloc_l2(pmap_t pmap,vm_offset_t va,vm_page_t * l2pgp,struct rwlock ** lockp)2749 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2750     struct rwlock **lockp)
2751 {
2752 	pd_entry_t *l1, *l2;
2753 	vm_page_t l2pg;
2754 	vm_pindex_t l2pindex;
2755 
2756 	KASSERT(ADDR_IS_CANONICAL(va),
2757 	    ("%s: Address not in canonical form: %lx", __func__, va));
2758 
2759 retry:
2760 	l1 = pmap_l1(pmap, va);
2761 	if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2762 		l2 = pmap_l1_to_l2(l1, va);
2763 		if (!ADDR_IS_KERNEL(va)) {
2764 			/* Add a reference to the L2 page. */
2765 			l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
2766 			l2pg->ref_count++;
2767 		} else
2768 			l2pg = NULL;
2769 	} else if (!ADDR_IS_KERNEL(va)) {
2770 		/* Allocate a L2 page. */
2771 		l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2772 		l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2773 		if (l2pg == NULL) {
2774 			if (lockp != NULL)
2775 				goto retry;
2776 			else
2777 				return (NULL);
2778 		}
2779 		l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2780 		l2 = &l2[pmap_l2_index(va)];
2781 	} else
2782 		panic("pmap_alloc_l2: missing page table page for va %#lx",
2783 		    va);
2784 	*l2pgp = l2pg;
2785 	return (l2);
2786 }
2787 
2788 static vm_page_t
pmap_alloc_l3(pmap_t pmap,vm_offset_t va,struct rwlock ** lockp)2789 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2790 {
2791 	vm_pindex_t ptepindex;
2792 	pd_entry_t *pde, tpde;
2793 #ifdef INVARIANTS
2794 	pt_entry_t *pte;
2795 #endif
2796 	vm_page_t m;
2797 	int lvl;
2798 
2799 	/*
2800 	 * Calculate pagetable page index
2801 	 */
2802 	ptepindex = pmap_l2_pindex(va);
2803 retry:
2804 	/*
2805 	 * Get the page directory entry
2806 	 */
2807 	pde = pmap_pde(pmap, va, &lvl);
2808 
2809 	/*
2810 	 * If the page table page is mapped, we just increment the hold count,
2811 	 * and activate it. If we get a level 2 pde it will point to a level 3
2812 	 * table.
2813 	 */
2814 	switch (lvl) {
2815 	case -1:
2816 		break;
2817 	case 0:
2818 #ifdef INVARIANTS
2819 		pte = pmap_l0_to_l1(pde, va);
2820 		KASSERT(pmap_load(pte) == 0,
2821 		    ("pmap_alloc_l3: TODO: l0 superpages"));
2822 #endif
2823 		break;
2824 	case 1:
2825 #ifdef INVARIANTS
2826 		pte = pmap_l1_to_l2(pde, va);
2827 		KASSERT(pmap_load(pte) == 0,
2828 		    ("pmap_alloc_l3: TODO: l1 superpages"));
2829 #endif
2830 		break;
2831 	case 2:
2832 		tpde = pmap_load(pde);
2833 		if (tpde != 0) {
2834 			m = PTE_TO_VM_PAGE(tpde);
2835 			m->ref_count++;
2836 			return (m);
2837 		}
2838 		break;
2839 	default:
2840 		panic("pmap_alloc_l3: Invalid level %d", lvl);
2841 	}
2842 
2843 	/*
2844 	 * Here if the pte page isn't mapped, or if it has been deallocated.
2845 	 */
2846 	m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2847 	if (m == NULL && lockp != NULL)
2848 		goto retry;
2849 
2850 	return (m);
2851 }
2852 
2853 /***************************************************
2854  * Pmap allocation/deallocation routines.
2855  ***************************************************/
2856 
2857 /*
2858  * Release any resources held by the given physical map.
2859  * Called when a pmap initialized by pmap_pinit is being released.
2860  * Should only be called if the map contains no valid mappings.
2861  */
2862 void
pmap_release(pmap_t pmap)2863 pmap_release(pmap_t pmap)
2864 {
2865 	bool rv __diagused;
2866 	struct spglist freelist;
2867 	struct asid_set *set;
2868 	vm_page_t m;
2869 	int asid;
2870 
2871 	if (pmap->pm_levels != 4) {
2872 		PMAP_ASSERT_STAGE2(pmap);
2873 		KASSERT(pmap->pm_stats.resident_count == 1,
2874 		    ("pmap_release: pmap resident count %ld != 0",
2875 		    pmap->pm_stats.resident_count));
2876 		KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2877 		    ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2878 
2879 		SLIST_INIT(&freelist);
2880 		m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2881 		PMAP_LOCK(pmap);
2882 		rv = pmap_unwire_l3(pmap, 0, m, &freelist);
2883 		PMAP_UNLOCK(pmap);
2884 		MPASS(rv == true);
2885 		vm_page_free_pages_toq(&freelist, true);
2886 	}
2887 
2888 	KASSERT(pmap->pm_stats.resident_count == 0,
2889 	    ("pmap_release: pmap resident count %ld != 0",
2890 	    pmap->pm_stats.resident_count));
2891 	KASSERT(vm_radix_is_empty(&pmap->pm_root),
2892 	    ("pmap_release: pmap has reserved page table page(s)"));
2893 
2894 	set = pmap->pm_asid_set;
2895 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2896 
2897 	/*
2898 	 * Allow the ASID to be reused. In stage 2 VMIDs we don't invalidate
2899 	 * the entries when removing them so rely on a later tlb invalidation.
2900 	 * this will happen when updating the VMID generation. Because of this
2901 	 * we don't reuse VMIDs within a generation.
2902 	 */
2903 	if (pmap->pm_stage == PM_STAGE1) {
2904 		mtx_lock_spin(&set->asid_set_mutex);
2905 		if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2906 			asid = COOKIE_TO_ASID(pmap->pm_cookie);
2907 			KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2908 			    asid < set->asid_set_size,
2909 			    ("pmap_release: pmap cookie has out-of-range asid"));
2910 			bit_clear(set->asid_set, asid);
2911 		}
2912 		mtx_unlock_spin(&set->asid_set_mutex);
2913 
2914 		if (pmap->pm_bti != NULL) {
2915 			rangeset_fini(pmap->pm_bti);
2916 			free(pmap->pm_bti, M_DEVBUF);
2917 		}
2918 	}
2919 
2920 	m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2921 	vm_page_unwire_noq(m);
2922 	vm_page_free_zero(m);
2923 }
2924 
2925 static int
kvm_size(SYSCTL_HANDLER_ARGS)2926 kvm_size(SYSCTL_HANDLER_ARGS)
2927 {
2928 	unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2929 
2930 	return sysctl_handle_long(oidp, &ksize, 0, req);
2931 }
2932 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2933     0, 0, kvm_size, "LU",
2934     "Size of KVM");
2935 
2936 static int
kvm_free(SYSCTL_HANDLER_ARGS)2937 kvm_free(SYSCTL_HANDLER_ARGS)
2938 {
2939 	unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2940 
2941 	return sysctl_handle_long(oidp, &kfree, 0, req);
2942 }
2943 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2944     0, 0, kvm_free, "LU",
2945     "Amount of KVM free");
2946 
2947 /*
2948  * grow the number of kernel page table entries, if needed
2949  */
2950 void
pmap_growkernel(vm_offset_t addr)2951 pmap_growkernel(vm_offset_t addr)
2952 {
2953 	vm_page_t nkpg;
2954 	pd_entry_t *l0, *l1, *l2;
2955 
2956 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2957 
2958 	addr = roundup2(addr, L2_SIZE);
2959 	if (addr - 1 >= vm_map_max(kernel_map))
2960 		addr = vm_map_max(kernel_map);
2961 	if (kernel_vm_end < addr) {
2962 		kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2963 		kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2964 	}
2965 	while (kernel_vm_end < addr) {
2966 		l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2967 		KASSERT(pmap_load(l0) != 0,
2968 		    ("pmap_growkernel: No level 0 kernel entry"));
2969 
2970 		l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2971 		if (pmap_load(l1) == 0) {
2972 			/* We need a new PDP entry */
2973 			nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2974 			    VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2975 			if (nkpg == NULL)
2976 				panic("pmap_growkernel: no memory to grow kernel");
2977 			nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2978 			/* See the dmb() in _pmap_alloc_l3(). */
2979 			dmb(ishst);
2980 			pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
2981 			continue; /* try again */
2982 		}
2983 		l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2984 		if (pmap_load(l2) != 0) {
2985 			kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2986 			if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2987 				kernel_vm_end = vm_map_max(kernel_map);
2988 				break;
2989 			}
2990 			continue;
2991 		}
2992 
2993 		nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2994 		    VM_ALLOC_ZERO);
2995 		if (nkpg == NULL)
2996 			panic("pmap_growkernel: no memory to grow kernel");
2997 		nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2998 		/* See the dmb() in _pmap_alloc_l3(). */
2999 		dmb(ishst);
3000 		pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
3001 
3002 		kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3003 		if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3004 			kernel_vm_end = vm_map_max(kernel_map);
3005 			break;
3006 		}
3007 	}
3008 }
3009 
3010 /***************************************************
3011  * page management routines.
3012  ***************************************************/
3013 
3014 static const uint64_t pc_freemask[_NPCM] = {
3015 	[0 ... _NPCM - 2] = PC_FREEN,
3016 	[_NPCM - 1] = PC_FREEL
3017 };
3018 
3019 #ifdef PV_STATS
3020 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
3021 
3022 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
3023 	"Current number of pv entry chunks");
3024 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
3025 	"Current number of pv entry chunks allocated");
3026 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
3027 	"Current number of pv entry chunks frees");
3028 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
3029 	"Number of times tried to get a chunk page but failed.");
3030 
3031 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
3032 static int pv_entry_spare;
3033 
3034 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
3035 	"Current number of pv entry frees");
3036 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
3037 	"Current number of pv entry allocs");
3038 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
3039 	"Current number of pv entries");
3040 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3041 	"Current number of spare pv entries");
3042 #endif
3043 
3044 /*
3045  * We are in a serious low memory condition.  Resort to
3046  * drastic measures to free some pages so we can allocate
3047  * another pv entry chunk.
3048  *
3049  * Returns NULL if PV entries were reclaimed from the specified pmap.
3050  *
3051  * We do not, however, unmap 2mpages because subsequent accesses will
3052  * allocate per-page pv entries until repromotion occurs, thereby
3053  * exacerbating the shortage of free pv entries.
3054  */
3055 static vm_page_t
reclaim_pv_chunk_domain(pmap_t locked_pmap,struct rwlock ** lockp,int domain)3056 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3057 {
3058 	struct pv_chunks_list *pvc;
3059 	struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3060 	struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3061 	struct md_page *pvh;
3062 	pd_entry_t *pde;
3063 	pmap_t next_pmap, pmap;
3064 	pt_entry_t *pte, tpte;
3065 	pv_entry_t pv;
3066 	vm_offset_t va;
3067 	vm_page_t m, m_pc;
3068 	struct spglist free;
3069 	uint64_t inuse;
3070 	int bit, field, freed, lvl;
3071 
3072 	PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3073 	KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3074 
3075 	pmap = NULL;
3076 	m_pc = NULL;
3077 	SLIST_INIT(&free);
3078 	bzero(&pc_marker_b, sizeof(pc_marker_b));
3079 	bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3080 	pc_marker = (struct pv_chunk *)&pc_marker_b;
3081 	pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3082 
3083 	pvc = &pv_chunks[domain];
3084 	mtx_lock(&pvc->pvc_lock);
3085 	pvc->active_reclaims++;
3086 	TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3087 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3088 	while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3089 	    SLIST_EMPTY(&free)) {
3090 		next_pmap = pc->pc_pmap;
3091 		if (next_pmap == NULL) {
3092 			/*
3093 			 * The next chunk is a marker.  However, it is
3094 			 * not our marker, so active_reclaims must be
3095 			 * > 1.  Consequently, the next_chunk code
3096 			 * will not rotate the pv_chunks list.
3097 			 */
3098 			goto next_chunk;
3099 		}
3100 		mtx_unlock(&pvc->pvc_lock);
3101 
3102 		/*
3103 		 * A pv_chunk can only be removed from the pc_lru list
3104 		 * when both pvc->pvc_lock is owned and the
3105 		 * corresponding pmap is locked.
3106 		 */
3107 		if (pmap != next_pmap) {
3108 			if (pmap != NULL && pmap != locked_pmap)
3109 				PMAP_UNLOCK(pmap);
3110 			pmap = next_pmap;
3111 			/* Avoid deadlock and lock recursion. */
3112 			if (pmap > locked_pmap) {
3113 				RELEASE_PV_LIST_LOCK(lockp);
3114 				PMAP_LOCK(pmap);
3115 				mtx_lock(&pvc->pvc_lock);
3116 				continue;
3117 			} else if (pmap != locked_pmap) {
3118 				if (PMAP_TRYLOCK(pmap)) {
3119 					mtx_lock(&pvc->pvc_lock);
3120 					continue;
3121 				} else {
3122 					pmap = NULL; /* pmap is not locked */
3123 					mtx_lock(&pvc->pvc_lock);
3124 					pc = TAILQ_NEXT(pc_marker, pc_lru);
3125 					if (pc == NULL ||
3126 					    pc->pc_pmap != next_pmap)
3127 						continue;
3128 					goto next_chunk;
3129 				}
3130 			}
3131 		}
3132 
3133 		/*
3134 		 * Destroy every non-wired, 4 KB page mapping in the chunk.
3135 		 */
3136 		freed = 0;
3137 		for (field = 0; field < _NPCM; field++) {
3138 			for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3139 			    inuse != 0; inuse &= ~(1UL << bit)) {
3140 				bit = ffsl(inuse) - 1;
3141 				pv = &pc->pc_pventry[field * 64 + bit];
3142 				va = pv->pv_va;
3143 				pde = pmap_pde(pmap, va, &lvl);
3144 				if (lvl != 2)
3145 					continue;
3146 				pte = pmap_l2_to_l3(pde, va);
3147 				tpte = pmap_load(pte);
3148 				if ((tpte & ATTR_SW_WIRED) != 0)
3149 					continue;
3150 				if ((tpte & ATTR_CONTIGUOUS) != 0)
3151 					(void)pmap_demote_l3c(pmap, pte, va);
3152 				tpte = pmap_load_clear(pte);
3153 				m = PTE_TO_VM_PAGE(tpte);
3154 				if (pmap_pte_dirty(pmap, tpte))
3155 					vm_page_dirty(m);
3156 				if ((tpte & ATTR_AF) != 0) {
3157 					pmap_s1_invalidate_page(pmap, va, true);
3158 					vm_page_aflag_set(m, PGA_REFERENCED);
3159 				}
3160 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3161 				TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3162 				m->md.pv_gen++;
3163 				if (TAILQ_EMPTY(&m->md.pv_list) &&
3164 				    (m->flags & PG_FICTITIOUS) == 0) {
3165 					pvh = page_to_pvh(m);
3166 					if (TAILQ_EMPTY(&pvh->pv_list)) {
3167 						vm_page_aflag_clear(m,
3168 						    PGA_WRITEABLE);
3169 					}
3170 				}
3171 				pc->pc_map[field] |= 1UL << bit;
3172 				pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3173 				freed++;
3174 			}
3175 		}
3176 		if (freed == 0) {
3177 			mtx_lock(&pvc->pvc_lock);
3178 			goto next_chunk;
3179 		}
3180 		/* Every freed mapping is for a 4 KB page. */
3181 		pmap_resident_count_dec(pmap, freed);
3182 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3183 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3184 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3185 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3186 		if (pc_is_free(pc)) {
3187 			PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3188 			PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3189 			PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3190 			/* Entire chunk is free; return it. */
3191 			m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3192 			dump_drop_page(m_pc->phys_addr);
3193 			mtx_lock(&pvc->pvc_lock);
3194 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3195 			break;
3196 		}
3197 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3198 		mtx_lock(&pvc->pvc_lock);
3199 		/* One freed pv entry in locked_pmap is sufficient. */
3200 		if (pmap == locked_pmap)
3201 			break;
3202 
3203 next_chunk:
3204 		TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3205 		TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3206 		if (pvc->active_reclaims == 1 && pmap != NULL) {
3207 			/*
3208 			 * Rotate the pv chunks list so that we do not
3209 			 * scan the same pv chunks that could not be
3210 			 * freed (because they contained a wired
3211 			 * and/or superpage mapping) on every
3212 			 * invocation of reclaim_pv_chunk().
3213 			 */
3214 			while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3215 				MPASS(pc->pc_pmap != NULL);
3216 				TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3217 				TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3218 			}
3219 		}
3220 	}
3221 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3222 	TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3223 	pvc->active_reclaims--;
3224 	mtx_unlock(&pvc->pvc_lock);
3225 	if (pmap != NULL && pmap != locked_pmap)
3226 		PMAP_UNLOCK(pmap);
3227 	if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3228 		m_pc = SLIST_FIRST(&free);
3229 		SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3230 		/* Recycle a freed page table page. */
3231 		m_pc->ref_count = 1;
3232 	}
3233 	vm_page_free_pages_toq(&free, true);
3234 	return (m_pc);
3235 }
3236 
3237 static vm_page_t
reclaim_pv_chunk(pmap_t locked_pmap,struct rwlock ** lockp)3238 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3239 {
3240 	vm_page_t m;
3241 	int i, domain;
3242 
3243 	domain = PCPU_GET(domain);
3244 	for (i = 0; i < vm_ndomains; i++) {
3245 		m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3246 		if (m != NULL)
3247 			break;
3248 		domain = (domain + 1) % vm_ndomains;
3249 	}
3250 
3251 	return (m);
3252 }
3253 
3254 /*
3255  * free the pv_entry back to the free list
3256  */
3257 static void
free_pv_entry(pmap_t pmap,pv_entry_t pv)3258 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3259 {
3260 	struct pv_chunk *pc;
3261 	int idx, field, bit;
3262 
3263 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3264 	PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3265 	PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3266 	PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3267 	pc = pv_to_chunk(pv);
3268 	idx = pv - &pc->pc_pventry[0];
3269 	field = idx / 64;
3270 	bit = idx % 64;
3271 	pc->pc_map[field] |= 1ul << bit;
3272 	if (!pc_is_free(pc)) {
3273 		/* 98% of the time, pc is already at the head of the list. */
3274 		if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3275 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3276 			TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3277 		}
3278 		return;
3279 	}
3280 	TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3281 	free_pv_chunk(pc);
3282 }
3283 
3284 static void
free_pv_chunk_dequeued(struct pv_chunk * pc)3285 free_pv_chunk_dequeued(struct pv_chunk *pc)
3286 {
3287 	vm_page_t m;
3288 
3289 	PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3290 	PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3291 	PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3292 	/* entire chunk is free, return it */
3293 	m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3294 	dump_drop_page(m->phys_addr);
3295 	vm_page_unwire_noq(m);
3296 	vm_page_free(m);
3297 }
3298 
3299 static void
free_pv_chunk(struct pv_chunk * pc)3300 free_pv_chunk(struct pv_chunk *pc)
3301 {
3302 	struct pv_chunks_list *pvc;
3303 
3304 	pvc = &pv_chunks[pc_to_domain(pc)];
3305 	mtx_lock(&pvc->pvc_lock);
3306 	TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3307 	mtx_unlock(&pvc->pvc_lock);
3308 	free_pv_chunk_dequeued(pc);
3309 }
3310 
3311 static void
free_pv_chunk_batch(struct pv_chunklist * batch)3312 free_pv_chunk_batch(struct pv_chunklist *batch)
3313 {
3314 	struct pv_chunks_list *pvc;
3315 	struct pv_chunk *pc, *npc;
3316 	int i;
3317 
3318 	for (i = 0; i < vm_ndomains; i++) {
3319 		if (TAILQ_EMPTY(&batch[i]))
3320 			continue;
3321 		pvc = &pv_chunks[i];
3322 		mtx_lock(&pvc->pvc_lock);
3323 		TAILQ_FOREACH(pc, &batch[i], pc_list) {
3324 			TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3325 		}
3326 		mtx_unlock(&pvc->pvc_lock);
3327 	}
3328 
3329 	for (i = 0; i < vm_ndomains; i++) {
3330 		TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3331 			free_pv_chunk_dequeued(pc);
3332 		}
3333 	}
3334 }
3335 
3336 /*
3337  * Returns a new PV entry, allocating a new PV chunk from the system when
3338  * needed.  If this PV chunk allocation fails and a PV list lock pointer was
3339  * given, a PV chunk is reclaimed from an arbitrary pmap.  Otherwise, NULL is
3340  * returned.
3341  *
3342  * The given PV list lock may be released.
3343  */
3344 static pv_entry_t
get_pv_entry(pmap_t pmap,struct rwlock ** lockp)3345 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3346 {
3347 	struct pv_chunks_list *pvc;
3348 	int bit, field;
3349 	pv_entry_t pv;
3350 	struct pv_chunk *pc;
3351 	vm_page_t m;
3352 
3353 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3354 	PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3355 retry:
3356 	pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3357 	if (pc != NULL) {
3358 		for (field = 0; field < _NPCM; field++) {
3359 			if (pc->pc_map[field]) {
3360 				bit = ffsl(pc->pc_map[field]) - 1;
3361 				break;
3362 			}
3363 		}
3364 		if (field < _NPCM) {
3365 			pv = &pc->pc_pventry[field * 64 + bit];
3366 			pc->pc_map[field] &= ~(1ul << bit);
3367 			/* If this was the last item, move it to tail */
3368 			if (pc_is_full(pc)) {
3369 				TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3370 				TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3371 				    pc_list);
3372 			}
3373 			PV_STAT(atomic_add_long(&pv_entry_count, 1));
3374 			PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3375 			return (pv);
3376 		}
3377 	}
3378 	/* No free items, allocate another chunk */
3379 	m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3380 	if (m == NULL) {
3381 		if (lockp == NULL) {
3382 			PV_STAT(pc_chunk_tryfail++);
3383 			return (NULL);
3384 		}
3385 		m = reclaim_pv_chunk(pmap, lockp);
3386 		if (m == NULL)
3387 			goto retry;
3388 	}
3389 	PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3390 	PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3391 	dump_add_page(m->phys_addr);
3392 	pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3393 	pc->pc_pmap = pmap;
3394 	memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3395 	pc->pc_map[0] &= ~1ul;		/* preallocated bit 0 */
3396 	pvc = &pv_chunks[vm_page_domain(m)];
3397 	mtx_lock(&pvc->pvc_lock);
3398 	TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3399 	mtx_unlock(&pvc->pvc_lock);
3400 	pv = &pc->pc_pventry[0];
3401 	TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3402 	PV_STAT(atomic_add_long(&pv_entry_count, 1));
3403 	PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3404 	return (pv);
3405 }
3406 
3407 /*
3408  * Ensure that the number of spare PV entries in the specified pmap meets or
3409  * exceeds the given count, "needed".
3410  *
3411  * The given PV list lock may be released.
3412  */
3413 static void
reserve_pv_entries(pmap_t pmap,int needed,struct rwlock ** lockp)3414 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3415 {
3416 	struct pv_chunks_list *pvc;
3417 	struct pch new_tail[PMAP_MEMDOM];
3418 	struct pv_chunk *pc;
3419 	vm_page_t m;
3420 	int avail, free, i;
3421 	bool reclaimed;
3422 
3423 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3424 	KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3425 
3426 	/*
3427 	 * Newly allocated PV chunks must be stored in a private list until
3428 	 * the required number of PV chunks have been allocated.  Otherwise,
3429 	 * reclaim_pv_chunk() could recycle one of these chunks.  In
3430 	 * contrast, these chunks must be added to the pmap upon allocation.
3431 	 */
3432 	for (i = 0; i < PMAP_MEMDOM; i++)
3433 		TAILQ_INIT(&new_tail[i]);
3434 retry:
3435 	avail = 0;
3436 	TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3437 		bit_count((bitstr_t *)pc->pc_map, 0,
3438 		    sizeof(pc->pc_map) * NBBY, &free);
3439 		if (free == 0)
3440 			break;
3441 		avail += free;
3442 		if (avail >= needed)
3443 			break;
3444 	}
3445 	for (reclaimed = false; avail < needed; avail += _NPCPV) {
3446 		m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3447 		if (m == NULL) {
3448 			m = reclaim_pv_chunk(pmap, lockp);
3449 			if (m == NULL)
3450 				goto retry;
3451 			reclaimed = true;
3452 		}
3453 		PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3454 		PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3455 		dump_add_page(m->phys_addr);
3456 		pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3457 		pc->pc_pmap = pmap;
3458 		memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3459 		TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3460 		TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3461 		PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3462 
3463 		/*
3464 		 * The reclaim might have freed a chunk from the current pmap.
3465 		 * If that chunk contained available entries, we need to
3466 		 * re-count the number of available entries.
3467 		 */
3468 		if (reclaimed)
3469 			goto retry;
3470 	}
3471 	for (i = 0; i < vm_ndomains; i++) {
3472 		if (TAILQ_EMPTY(&new_tail[i]))
3473 			continue;
3474 		pvc = &pv_chunks[i];
3475 		mtx_lock(&pvc->pvc_lock);
3476 		TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3477 		mtx_unlock(&pvc->pvc_lock);
3478 	}
3479 }
3480 
3481 /*
3482  * First find and then remove the pv entry for the specified pmap and virtual
3483  * address from the specified pv list.  Returns the pv entry if found and NULL
3484  * otherwise.  This operation can be performed on pv lists for either 4KB or
3485  * 2MB page mappings.
3486  */
3487 static __inline pv_entry_t
pmap_pvh_remove(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3488 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3489 {
3490 	pv_entry_t pv;
3491 
3492 	TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3493 		if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3494 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3495 			pvh->pv_gen++;
3496 			break;
3497 		}
3498 	}
3499 	return (pv);
3500 }
3501 
3502 /*
3503  * After demotion from a 2MB page mapping to 512 4KB page mappings,
3504  * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3505  * entries for each of the 4KB page mappings.
3506  */
3507 static void
pmap_pv_demote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)3508 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3509     struct rwlock **lockp)
3510 {
3511 	struct md_page *pvh;
3512 	struct pv_chunk *pc;
3513 	pv_entry_t pv;
3514 	vm_offset_t va_last;
3515 	vm_page_t m;
3516 	int bit, field;
3517 
3518 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3519 	KASSERT((va & L2_OFFSET) == 0,
3520 	    ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3521 	KASSERT((pa & L2_OFFSET) == 0,
3522 	    ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3523 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3524 
3525 	/*
3526 	 * Transfer the 2mpage's pv entry for this mapping to the first
3527 	 * page's pv list.  Once this transfer begins, the pv list lock
3528 	 * must not be released until the last pv entry is reinstantiated.
3529 	 */
3530 	pvh = pa_to_pvh(pa);
3531 	pv = pmap_pvh_remove(pvh, pmap, va);
3532 	KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3533 	m = PHYS_TO_VM_PAGE(pa);
3534 	TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3535 	m->md.pv_gen++;
3536 	/* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3537 	PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3538 	va_last = va + L2_SIZE - PAGE_SIZE;
3539 	for (;;) {
3540 		pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3541 		KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3542 		for (field = 0; field < _NPCM; field++) {
3543 			while (pc->pc_map[field]) {
3544 				bit = ffsl(pc->pc_map[field]) - 1;
3545 				pc->pc_map[field] &= ~(1ul << bit);
3546 				pv = &pc->pc_pventry[field * 64 + bit];
3547 				va += PAGE_SIZE;
3548 				pv->pv_va = va;
3549 				m++;
3550 				KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3551 			    ("pmap_pv_demote_l2: page %p is not managed", m));
3552 				TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3553 				m->md.pv_gen++;
3554 				if (va == va_last)
3555 					goto out;
3556 			}
3557 		}
3558 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3559 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3560 	}
3561 out:
3562 	if (pc_is_full(pc)) {
3563 		TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3564 		TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3565 	}
3566 	PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3567 	PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3568 }
3569 
3570 /*
3571  * First find and then destroy the pv entry for the specified pmap and virtual
3572  * address.  This operation can be performed on pv lists for either 4KB or 2MB
3573  * page mappings.
3574  */
3575 static void
pmap_pvh_free(struct md_page * pvh,pmap_t pmap,vm_offset_t va)3576 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3577 {
3578 	pv_entry_t pv;
3579 
3580 	pv = pmap_pvh_remove(pvh, pmap, va);
3581 	KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3582 	free_pv_entry(pmap, pv);
3583 }
3584 
3585 /*
3586  * Conditionally create the PV entry for a 4KB page mapping if the required
3587  * memory can be allocated without resorting to reclamation.
3588  */
3589 static bool
pmap_try_insert_pv_entry(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)3590 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3591     struct rwlock **lockp)
3592 {
3593 	pv_entry_t pv;
3594 
3595 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3596 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3597 	if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3598 		pv->pv_va = va;
3599 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3600 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3601 		m->md.pv_gen++;
3602 		return (true);
3603 	} else
3604 		return (false);
3605 }
3606 
3607 /*
3608  * Create the PV entry for a 2MB page mapping.  Always returns true unless the
3609  * flag PMAP_ENTER_NORECLAIM is specified.  If that flag is specified, returns
3610  * false if the PV entry cannot be allocated without resorting to reclamation.
3611  */
3612 static bool
pmap_pv_insert_l2(pmap_t pmap,vm_offset_t va,pd_entry_t l2e,u_int flags,struct rwlock ** lockp)3613 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3614     struct rwlock **lockp)
3615 {
3616 	struct md_page *pvh;
3617 	pv_entry_t pv;
3618 	vm_paddr_t pa;
3619 
3620 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3621 	/* Pass NULL instead of the lock pointer to disable reclamation. */
3622 	if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3623 	    NULL : lockp)) == NULL)
3624 		return (false);
3625 	pv->pv_va = va;
3626 	pa = PTE_TO_PHYS(l2e);
3627 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3628 	pvh = pa_to_pvh(pa);
3629 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3630 	pvh->pv_gen++;
3631 	return (true);
3632 }
3633 
3634 /*
3635  * Conditionally creates the PV entries for a L3C superpage mapping if
3636  * the required memory can be allocated without resorting to reclamation.
3637  */
3638 static bool
pmap_pv_insert_l3c(pmap_t pmap,vm_offset_t va,vm_page_t m,struct rwlock ** lockp)3639 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3640     struct rwlock **lockp)
3641 {
3642 	pv_entry_t pv;
3643 	vm_offset_t tva;
3644 	vm_paddr_t pa __diagused;
3645 	vm_page_t mt;
3646 
3647 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3648 	KASSERT((va & L3C_OFFSET) == 0,
3649 	    ("pmap_pv_insert_l3c: va is not aligned"));
3650 	pa = VM_PAGE_TO_PHYS(m);
3651 	KASSERT((pa & L3C_OFFSET) == 0,
3652 	    ("pmap_pv_insert_l3c: pa is not aligned"));
3653 	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3654 	for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3655 		/* Pass NULL instead of lockp to disable reclamation. */
3656 		pv = get_pv_entry(pmap, NULL);
3657 		if (__predict_false(pv == NULL)) {
3658 			while (tva > va) {
3659 				mt--;
3660 				tva -= L3_SIZE;
3661 				pmap_pvh_free(&mt->md, pmap, tva);
3662 			}
3663 			return (false);
3664 		}
3665 		pv->pv_va = tva;
3666 		TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3667 		mt->md.pv_gen++;
3668 	}
3669 	return (true);
3670 }
3671 
3672 static void
pmap_remove_kernel_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)3673 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3674 {
3675 	pt_entry_t newl2, oldl2 __diagused;
3676 	vm_page_t ml3;
3677 	vm_paddr_t ml3pa;
3678 
3679 	KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3680 	KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3681 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3682 
3683 	ml3 = pmap_remove_pt_page(pmap, va);
3684 	if (ml3 == NULL)
3685 		panic("pmap_remove_kernel_l2: Missing pt page");
3686 
3687 	ml3pa = VM_PAGE_TO_PHYS(ml3);
3688 	newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3689 
3690 	/*
3691 	 * If this page table page was unmapped by a promotion, then it
3692 	 * contains valid mappings.  Zero it to invalidate those mappings.
3693 	 */
3694 	if (vm_page_any_valid(ml3))
3695 		pagezero((void *)PHYS_TO_DMAP(ml3pa));
3696 
3697 	/*
3698 	 * Demote the mapping.  The caller must have already invalidated the
3699 	 * mapping (i.e., the "break" in break-before-make).
3700 	 */
3701 	oldl2 = pmap_load_store(l2, newl2);
3702 	KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3703 	    __func__, l2, oldl2));
3704 }
3705 
3706 /*
3707  * pmap_remove_l2: Do the things to unmap a level 2 superpage.
3708  */
3709 static int
pmap_remove_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pd_entry_t l1e,struct spglist * free,struct rwlock ** lockp)3710 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3711     pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3712 {
3713 	struct md_page *pvh;
3714 	pt_entry_t old_l2;
3715 	vm_page_t m, ml3, mt;
3716 
3717 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3718 	KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3719 	old_l2 = pmap_load_clear(l2);
3720 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3721 	    ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3722 
3723 	/*
3724 	 * Since a promotion must break the 4KB page mappings before making
3725 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3726 	 */
3727 	pmap_s1_invalidate_page(pmap, sva, true);
3728 
3729 	if (old_l2 & ATTR_SW_WIRED)
3730 		pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3731 	pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3732 	if (old_l2 & ATTR_SW_MANAGED) {
3733 		m = PTE_TO_VM_PAGE(old_l2);
3734 		pvh = page_to_pvh(m);
3735 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3736 		pmap_pvh_free(pvh, pmap, sva);
3737 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3738 			if (pmap_pte_dirty(pmap, old_l2))
3739 				vm_page_dirty(mt);
3740 			if (old_l2 & ATTR_AF)
3741 				vm_page_aflag_set(mt, PGA_REFERENCED);
3742 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3743 			    TAILQ_EMPTY(&pvh->pv_list))
3744 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3745 		}
3746 	}
3747 	if (pmap == kernel_pmap) {
3748 		pmap_remove_kernel_l2(pmap, l2, sva);
3749 	} else {
3750 		ml3 = pmap_remove_pt_page(pmap, sva);
3751 		if (ml3 != NULL) {
3752 			KASSERT(vm_page_any_valid(ml3),
3753 			    ("pmap_remove_l2: l3 page not promoted"));
3754 			pmap_resident_count_dec(pmap, 1);
3755 			KASSERT(ml3->ref_count == NL3PG,
3756 			    ("pmap_remove_l2: l3 page ref count error"));
3757 			ml3->ref_count = 0;
3758 			pmap_add_delayed_free_list(ml3, free, false);
3759 		}
3760 	}
3761 	return (pmap_unuse_pt(pmap, sva, l1e, free));
3762 }
3763 
3764 /*
3765  * pmap_remove_l3: do the things to unmap a page in a process
3766  */
3767 static int
pmap_remove_l3(pmap_t pmap,pt_entry_t * l3,vm_offset_t va,pd_entry_t l2e,struct spglist * free,struct rwlock ** lockp)3768 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3769     pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3770 {
3771 	struct md_page *pvh;
3772 	pt_entry_t old_l3;
3773 	vm_page_t m;
3774 
3775 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3776 	old_l3 = pmap_load(l3);
3777 	if ((old_l3 & ATTR_CONTIGUOUS) != 0)
3778 		(void)pmap_demote_l3c(pmap, l3, va);
3779 	old_l3 = pmap_load_clear(l3);
3780 	pmap_s1_invalidate_page(pmap, va, true);
3781 	if (old_l3 & ATTR_SW_WIRED)
3782 		pmap->pm_stats.wired_count -= 1;
3783 	pmap_resident_count_dec(pmap, 1);
3784 	if (old_l3 & ATTR_SW_MANAGED) {
3785 		m = PTE_TO_VM_PAGE(old_l3);
3786 		if (pmap_pte_dirty(pmap, old_l3))
3787 			vm_page_dirty(m);
3788 		if (old_l3 & ATTR_AF)
3789 			vm_page_aflag_set(m, PGA_REFERENCED);
3790 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3791 		pmap_pvh_free(&m->md, pmap, va);
3792 		if (TAILQ_EMPTY(&m->md.pv_list) &&
3793 		    (m->flags & PG_FICTITIOUS) == 0) {
3794 			pvh = page_to_pvh(m);
3795 			if (TAILQ_EMPTY(&pvh->pv_list))
3796 				vm_page_aflag_clear(m, PGA_WRITEABLE);
3797 		}
3798 	}
3799 	return (pmap_unuse_pt(pmap, va, l2e, free));
3800 }
3801 
3802 /*
3803  * Removes the specified L3C superpage mapping.  Requests TLB invalidations
3804  * to be performed by the caller through the returned "*vap". Returns true
3805  * if the level 3 table "ml3" was unmapped and added to the spglist "free".
3806  * Otherwise, returns false.
3807  */
3808 static bool
pmap_remove_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,vm_offset_t * vap,vm_offset_t va_next,vm_page_t ml3,struct spglist * free,struct rwlock ** lockp)3809 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
3810     vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
3811     struct rwlock **lockp)
3812 {
3813 	struct md_page *pvh;
3814 	struct rwlock *new_lock;
3815 	pt_entry_t first_l3e, l3e, *tl3p;
3816 	vm_offset_t tva;
3817 	vm_page_t m, mt;
3818 
3819 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3820 	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
3821 	    0, ("pmap_remove_l3c: l3p is not aligned"));
3822 	KASSERT((va & L3C_OFFSET) == 0,
3823 	    ("pmap_remove_l3c: va is not aligned"));
3824 
3825 	/*
3826 	 * Hardware accessed and dirty bit maintenance might only update a
3827 	 * single L3 entry, so we must combine the accessed and dirty bits
3828 	 * from this entire set of contiguous L3 entries.
3829 	 */
3830 	first_l3e = pmap_load_clear(l3p);
3831 	for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
3832 		l3e = pmap_load_clear(tl3p);
3833 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
3834 		    ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
3835 		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
3836 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
3837 			first_l3e &= ~ATTR_S1_AP_RW_BIT;
3838 		first_l3e |= l3e & ATTR_AF;
3839 	}
3840 	if ((first_l3e & ATTR_SW_WIRED) != 0)
3841 		pmap->pm_stats.wired_count -= L3C_ENTRIES;
3842 	pmap_resident_count_dec(pmap, L3C_ENTRIES);
3843 	if ((first_l3e & ATTR_SW_MANAGED) != 0) {
3844 		m = PTE_TO_VM_PAGE(first_l3e);
3845 		new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3846 		if (new_lock != *lockp) {
3847 			if (*lockp != NULL) {
3848 				/*
3849 				 * Pending TLB invalidations must be
3850 				 * performed before the PV list lock is
3851 				 * released.  Otherwise, a concurrent
3852 				 * pmap_remove_all() on a physical page
3853 				 * could return while a stale TLB entry
3854 				 * still provides access to that page.
3855 				 */
3856 				if (*vap != va_next) {
3857 					pmap_invalidate_range(pmap, *vap, va,
3858 					    true);
3859 					*vap = va_next;
3860 				}
3861 				rw_wunlock(*lockp);
3862 			}
3863 			*lockp = new_lock;
3864 			rw_wlock(*lockp);
3865 		}
3866 		pvh = page_to_pvh(m);
3867 		for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
3868 		    L3_SIZE) {
3869 			if (pmap_pte_dirty(pmap, first_l3e))
3870 				vm_page_dirty(mt);
3871 			if ((first_l3e & ATTR_AF) != 0)
3872 				vm_page_aflag_set(mt, PGA_REFERENCED);
3873 			pmap_pvh_free(&mt->md, pmap, tva);
3874 			if (TAILQ_EMPTY(&mt->md.pv_list) &&
3875 			    TAILQ_EMPTY(&pvh->pv_list))
3876 				vm_page_aflag_clear(mt, PGA_WRITEABLE);
3877 		}
3878 	}
3879 	if (*vap == va_next)
3880 		*vap = va;
3881 	if (ml3 != NULL) {
3882 		ml3->ref_count -= L3C_ENTRIES;
3883 		if (ml3->ref_count == 0) {
3884 			_pmap_unwire_l3(pmap, va, ml3, free);
3885 			return (true);
3886 		}
3887 	}
3888 	return (false);
3889 }
3890 
3891 /*
3892  * Remove the specified range of addresses from the L3 page table that is
3893  * identified by the given L2 entry.
3894  */
3895 static void
pmap_remove_l3_range(pmap_t pmap,pd_entry_t l2e,vm_offset_t sva,vm_offset_t eva,struct spglist * free,struct rwlock ** lockp)3896 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3897     vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3898 {
3899 	struct md_page *pvh;
3900 	struct rwlock *new_lock;
3901 	pt_entry_t *l3, old_l3;
3902 	vm_offset_t va;
3903 	vm_page_t l3pg, m;
3904 
3905 	KASSERT(ADDR_IS_CANONICAL(sva),
3906 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
3907 	KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3908 	    ("%s: End address not in canonical form: %lx", __func__, eva));
3909 
3910 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3911 	KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3912 	    ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3913 	l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
3914 	va = eva;
3915 	for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3916 		old_l3 = pmap_load(l3);
3917 		if (!pmap_l3_valid(old_l3)) {
3918 			if (va != eva) {
3919 				pmap_invalidate_range(pmap, va, sva, true);
3920 				va = eva;
3921 			}
3922 			continue;
3923 		}
3924 		if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
3925 			/*
3926 			 * Is this entire set of contiguous L3 entries being
3927 			 * removed?  Handle the possibility that "eva" is zero
3928 			 * because of address wraparound.
3929 			 */
3930 			if ((sva & L3C_OFFSET) == 0 &&
3931 			    sva + L3C_OFFSET <= eva - 1) {
3932 				if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
3933 				    l3pg, free, lockp)) {
3934 					/* The L3 table was unmapped. */
3935 					sva += L3C_SIZE;
3936 					break;
3937 				}
3938 				l3 += L3C_ENTRIES - 1;
3939 				sva += L3C_SIZE - L3_SIZE;
3940 				continue;
3941 			}
3942 
3943 			(void)pmap_demote_l3c(pmap, l3, sva);
3944 		}
3945 		old_l3 = pmap_load_clear(l3);
3946 		if ((old_l3 & ATTR_SW_WIRED) != 0)
3947 			pmap->pm_stats.wired_count--;
3948 		pmap_resident_count_dec(pmap, 1);
3949 		if ((old_l3 & ATTR_SW_MANAGED) != 0) {
3950 			m = PTE_TO_VM_PAGE(old_l3);
3951 			if (pmap_pte_dirty(pmap, old_l3))
3952 				vm_page_dirty(m);
3953 			if ((old_l3 & ATTR_AF) != 0)
3954 				vm_page_aflag_set(m, PGA_REFERENCED);
3955 			new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3956 			if (new_lock != *lockp) {
3957 				if (*lockp != NULL) {
3958 					/*
3959 					 * Pending TLB invalidations must be
3960 					 * performed before the PV list lock is
3961 					 * released.  Otherwise, a concurrent
3962 					 * pmap_remove_all() on a physical page
3963 					 * could return while a stale TLB entry
3964 					 * still provides access to that page.
3965 					 */
3966 					if (va != eva) {
3967 						pmap_invalidate_range(pmap, va,
3968 						    sva, true);
3969 						va = eva;
3970 					}
3971 					rw_wunlock(*lockp);
3972 				}
3973 				*lockp = new_lock;
3974 				rw_wlock(*lockp);
3975 			}
3976 			pmap_pvh_free(&m->md, pmap, sva);
3977 			if (TAILQ_EMPTY(&m->md.pv_list) &&
3978 			    (m->flags & PG_FICTITIOUS) == 0) {
3979 				pvh = page_to_pvh(m);
3980 				if (TAILQ_EMPTY(&pvh->pv_list))
3981 					vm_page_aflag_clear(m, PGA_WRITEABLE);
3982 			}
3983 		}
3984 		if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3985 			/*
3986 			 * _pmap_unwire_l3() has already invalidated the TLB
3987 			 * entries at all levels for "sva".  So, we need not
3988 			 * perform "sva += L3_SIZE;" here.  Moreover, we need
3989 			 * not perform "va = sva;" if "sva" is at the start
3990 			 * of a new valid range consisting of a single page.
3991 			 */
3992 			break;
3993 		}
3994 		if (va == eva)
3995 			va = sva;
3996 	}
3997 	if (va != eva)
3998 		pmap_invalidate_range(pmap, va, sva, true);
3999 }
4000 
4001 static void
pmap_remove1(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool map_delete)4002 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
4003 {
4004 	struct rwlock *lock;
4005 	vm_offset_t va_next;
4006 	pd_entry_t *l0, *l1, *l2;
4007 	pt_entry_t l3_paddr;
4008 	struct spglist free;
4009 
4010 	/*
4011 	 * Perform an unsynchronized read.  This is, however, safe.
4012 	 */
4013 	if (pmap->pm_stats.resident_count == 0)
4014 		return;
4015 
4016 	SLIST_INIT(&free);
4017 
4018 	PMAP_LOCK(pmap);
4019 	if (map_delete)
4020 		pmap_bti_on_remove(pmap, sva, eva);
4021 
4022 	lock = NULL;
4023 	for (; sva < eva; sva = va_next) {
4024 		if (pmap->pm_stats.resident_count == 0)
4025 			break;
4026 
4027 		l0 = pmap_l0(pmap, sva);
4028 		if (pmap_load(l0) == 0) {
4029 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4030 			if (va_next < sva)
4031 				va_next = eva;
4032 			continue;
4033 		}
4034 
4035 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4036 		if (va_next < sva)
4037 			va_next = eva;
4038 		l1 = pmap_l0_to_l1(l0, sva);
4039 		if (pmap_load(l1) == 0)
4040 			continue;
4041 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4042 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4043 			KASSERT(va_next <= eva,
4044 			    ("partial update of non-transparent 1G page "
4045 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4046 			    pmap_load(l1), sva, eva, va_next));
4047 			MPASS(pmap != kernel_pmap);
4048 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4049 			pmap_clear(l1);
4050 			pmap_s1_invalidate_page(pmap, sva, true);
4051 			pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
4052 			pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
4053 			continue;
4054 		}
4055 
4056 		/*
4057 		 * Calculate index for next page table.
4058 		 */
4059 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4060 		if (va_next < sva)
4061 			va_next = eva;
4062 
4063 		l2 = pmap_l1_to_l2(l1, sva);
4064 		if (l2 == NULL)
4065 			continue;
4066 
4067 		l3_paddr = pmap_load(l2);
4068 
4069 		if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4070 			if (sva + L2_SIZE == va_next && eva >= va_next) {
4071 				pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
4072 				    &free, &lock);
4073 				continue;
4074 			} else if (pmap_demote_l2_locked(pmap, l2, sva,
4075 			    &lock) == NULL)
4076 				continue;
4077 			l3_paddr = pmap_load(l2);
4078 		}
4079 
4080 		/*
4081 		 * Weed out invalid mappings.
4082 		 */
4083 		if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
4084 			continue;
4085 
4086 		/*
4087 		 * Limit our scan to either the end of the va represented
4088 		 * by the current page table page, or to the end of the
4089 		 * range being removed.
4090 		 */
4091 		if (va_next > eva)
4092 			va_next = eva;
4093 
4094 		pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
4095 		    &lock);
4096 	}
4097 	if (lock != NULL)
4098 		rw_wunlock(lock);
4099 	PMAP_UNLOCK(pmap);
4100 	vm_page_free_pages_toq(&free, true);
4101 }
4102 
4103 /*
4104  *	Remove the given range of addresses from the specified map.
4105  *
4106  *	It is assumed that the start and end are properly
4107  *	rounded to the page size.
4108  */
4109 void
pmap_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4110 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4111 {
4112 	pmap_remove1(pmap, sva, eva, false);
4113 }
4114 
4115 /*
4116  *	Remove the given range of addresses as part of a logical unmap
4117  *	operation. This has the effect of calling pmap_remove(), but
4118  *	also clears any metadata that should persist for the lifetime
4119  *	of a logical mapping.
4120  */
4121 void
pmap_map_delete(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)4122 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4123 {
4124 	pmap_remove1(pmap, sva, eva, true);
4125 }
4126 
4127 /*
4128  *	Routine:	pmap_remove_all
4129  *	Function:
4130  *		Removes this physical page from
4131  *		all physical maps in which it resides.
4132  *		Reflects back modify bits to the pager.
4133  *
4134  *	Notes:
4135  *		Original versions of this routine were very
4136  *		inefficient because they iteratively called
4137  *		pmap_remove (slow...)
4138  */
4139 
4140 void
pmap_remove_all(vm_page_t m)4141 pmap_remove_all(vm_page_t m)
4142 {
4143 	struct md_page *pvh;
4144 	pv_entry_t pv;
4145 	pmap_t pmap;
4146 	struct rwlock *lock;
4147 	pd_entry_t *pde, tpde;
4148 	pt_entry_t *pte, tpte;
4149 	vm_offset_t va;
4150 	struct spglist free;
4151 	int lvl, pvh_gen, md_gen;
4152 
4153 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4154 	    ("pmap_remove_all: page %p is not managed", m));
4155 	SLIST_INIT(&free);
4156 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4157 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4158 	rw_wlock(lock);
4159 retry:
4160 	while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4161 		pmap = PV_PMAP(pv);
4162 		if (!PMAP_TRYLOCK(pmap)) {
4163 			pvh_gen = pvh->pv_gen;
4164 			rw_wunlock(lock);
4165 			PMAP_LOCK(pmap);
4166 			rw_wlock(lock);
4167 			if (pvh_gen != pvh->pv_gen) {
4168 				PMAP_UNLOCK(pmap);
4169 				goto retry;
4170 			}
4171 		}
4172 		va = pv->pv_va;
4173 		pte = pmap_pte_exists(pmap, va, 2, __func__);
4174 		pmap_demote_l2_locked(pmap, pte, va, &lock);
4175 		PMAP_UNLOCK(pmap);
4176 	}
4177 	while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4178 		pmap = PV_PMAP(pv);
4179 		if (!PMAP_TRYLOCK(pmap)) {
4180 			pvh_gen = pvh->pv_gen;
4181 			md_gen = m->md.pv_gen;
4182 			rw_wunlock(lock);
4183 			PMAP_LOCK(pmap);
4184 			rw_wlock(lock);
4185 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4186 				PMAP_UNLOCK(pmap);
4187 				goto retry;
4188 			}
4189 		}
4190 		pmap_resident_count_dec(pmap, 1);
4191 
4192 		pde = pmap_pde(pmap, pv->pv_va, &lvl);
4193 		KASSERT(pde != NULL,
4194 		    ("pmap_remove_all: no page directory entry found"));
4195 		KASSERT(lvl == 2,
4196 		    ("pmap_remove_all: invalid pde level %d", lvl));
4197 		tpde = pmap_load(pde);
4198 
4199 		pte = pmap_l2_to_l3(pde, pv->pv_va);
4200 		tpte = pmap_load(pte);
4201 		if ((tpte & ATTR_CONTIGUOUS) != 0)
4202 			(void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4203 		tpte = pmap_load_clear(pte);
4204 		if (tpte & ATTR_SW_WIRED)
4205 			pmap->pm_stats.wired_count--;
4206 		if ((tpte & ATTR_AF) != 0) {
4207 			pmap_invalidate_page(pmap, pv->pv_va, true);
4208 			vm_page_aflag_set(m, PGA_REFERENCED);
4209 		}
4210 
4211 		/*
4212 		 * Update the vm_page_t clean and reference bits.
4213 		 */
4214 		if (pmap_pte_dirty(pmap, tpte))
4215 			vm_page_dirty(m);
4216 		pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4217 		TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4218 		m->md.pv_gen++;
4219 		free_pv_entry(pmap, pv);
4220 		PMAP_UNLOCK(pmap);
4221 	}
4222 	vm_page_aflag_clear(m, PGA_WRITEABLE);
4223 	rw_wunlock(lock);
4224 	vm_page_free_pages_toq(&free, true);
4225 }
4226 
4227 /*
4228  * Masks and sets bits in a level 2 page table entries in the specified pmap
4229  */
4230 static void
pmap_protect_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t sva,pt_entry_t mask,pt_entry_t nbits)4231 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4232     pt_entry_t nbits)
4233 {
4234 	pd_entry_t old_l2;
4235 	vm_page_t m, mt;
4236 
4237 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4238 	PMAP_ASSERT_STAGE1(pmap);
4239 	KASSERT((sva & L2_OFFSET) == 0,
4240 	    ("pmap_protect_l2: sva is not 2mpage aligned"));
4241 	old_l2 = pmap_load(l2);
4242 	KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4243 	    ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4244 
4245 	/*
4246 	 * Return if the L2 entry already has the desired access restrictions
4247 	 * in place.
4248 	 */
4249 	if ((old_l2 & mask) == nbits)
4250 		return;
4251 
4252 	while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4253 		cpu_spinwait();
4254 
4255 	/*
4256 	 * When a dirty read/write superpage mapping is write protected,
4257 	 * update the dirty field of each of the superpage's constituent 4KB
4258 	 * pages.
4259 	 */
4260 	if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4261 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4262 	    pmap_pte_dirty(pmap, old_l2)) {
4263 		m = PTE_TO_VM_PAGE(old_l2);
4264 		for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4265 			vm_page_dirty(mt);
4266 	}
4267 
4268 	/*
4269 	 * Since a promotion must break the 4KB page mappings before making
4270 	 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4271 	 */
4272 	pmap_s1_invalidate_page(pmap, sva, true);
4273 }
4274 
4275 /*
4276  * Masks and sets bits in the specified L3C superpage mapping.
4277  *
4278  * Requests TLB invalidations to be performed by the caller through the
4279  * returned "*vap".
4280  */
4281 static void
pmap_mask_set_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,vm_offset_t * vap,vm_offset_t va_next,pt_entry_t mask,pt_entry_t nbits)4282 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4283     vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4284 {
4285 	pt_entry_t l3e, *tl3p;
4286 	vm_page_t m, mt;
4287 	bool dirty;
4288 
4289 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4290 	KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4291 	    0, ("pmap_mask_set_l3c: l3p is not aligned"));
4292 	KASSERT((va & L3C_OFFSET) == 0,
4293 	    ("pmap_mask_set_l3c: va is not aligned"));
4294 	dirty = false;
4295 	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4296 		l3e = pmap_load(tl3p);
4297 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4298 		    ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4299 		while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4300 			cpu_spinwait();
4301 		if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4302 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4303 			dirty = true;
4304 	}
4305 
4306 	/*
4307 	 * When a dirty read/write superpage mapping is write protected,
4308 	 * update the dirty field of each of the superpage's constituent 4KB
4309 	 * pages.
4310 	 */
4311 	if ((l3e & ATTR_SW_MANAGED) != 0 &&
4312 	    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4313 	    dirty) {
4314 		m = PTE_TO_VM_PAGE(pmap_load(l3p));
4315 		for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4316 			vm_page_dirty(mt);
4317 	}
4318 
4319 	if (*vap == va_next)
4320 		*vap = va;
4321 }
4322 
4323 /*
4324  * Masks and sets bits in last level page table entries in the specified
4325  * pmap and range
4326  */
4327 static void
pmap_mask_set_locked(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t mask,pt_entry_t nbits,bool invalidate)4328 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4329     pt_entry_t nbits, bool invalidate)
4330 {
4331 	vm_offset_t va, va_next;
4332 	pd_entry_t *l0, *l1, *l2;
4333 	pt_entry_t *l3p, l3;
4334 
4335 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4336 	for (; sva < eva; sva = va_next) {
4337 		l0 = pmap_l0(pmap, sva);
4338 		if (pmap_load(l0) == 0) {
4339 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4340 			if (va_next < sva)
4341 				va_next = eva;
4342 			continue;
4343 		}
4344 
4345 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4346 		if (va_next < sva)
4347 			va_next = eva;
4348 		l1 = pmap_l0_to_l1(l0, sva);
4349 		if (pmap_load(l1) == 0)
4350 			continue;
4351 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4352 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4353 			KASSERT(va_next <= eva,
4354 			    ("partial update of non-transparent 1G page "
4355 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4356 			    pmap_load(l1), sva, eva, va_next));
4357 			MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4358 			if ((pmap_load(l1) & mask) != nbits) {
4359 				pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4360 				if (invalidate)
4361 					pmap_s1_invalidate_page(pmap, sva, true);
4362 			}
4363 			continue;
4364 		}
4365 
4366 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4367 		if (va_next < sva)
4368 			va_next = eva;
4369 
4370 		l2 = pmap_l1_to_l2(l1, sva);
4371 		if (pmap_load(l2) == 0)
4372 			continue;
4373 
4374 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4375 			if (sva + L2_SIZE == va_next && eva >= va_next) {
4376 				pmap_protect_l2(pmap, l2, sva, mask, nbits);
4377 				continue;
4378 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
4379 				continue;
4380 		}
4381 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4382 		    ("pmap_protect: Invalid L2 entry after demotion"));
4383 
4384 		if (va_next > eva)
4385 			va_next = eva;
4386 
4387 		va = va_next;
4388 		for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4389 		    sva += L3_SIZE) {
4390 			l3 = pmap_load(l3p);
4391 
4392 			/*
4393 			 * Go to the next L3 entry if the current one is
4394 			 * invalid or already has the desired access
4395 			 * restrictions in place.  (The latter case occurs
4396 			 * frequently.  For example, in a "buildworld"
4397 			 * workload, almost 1 out of 4 L3 entries already
4398 			 * have the desired restrictions.)
4399 			 */
4400 			if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4401 				if (va != va_next) {
4402 					if (invalidate)
4403 						pmap_s1_invalidate_range(pmap,
4404 						    va, sva, true);
4405 					va = va_next;
4406 				}
4407 				if ((l3 & ATTR_CONTIGUOUS) != 0) {
4408 					l3p += L3C_ENTRIES - 1;
4409 					sva += L3C_SIZE - L3_SIZE;
4410 				}
4411 				continue;
4412 			}
4413 
4414 			if ((l3 & ATTR_CONTIGUOUS) != 0) {
4415 				/*
4416 				 * Is this entire set of contiguous L3 entries
4417 				 * being protected?  Handle the possibility
4418 				 * that "va_next" is zero because of address
4419 				 * wraparound.
4420 				 */
4421 				if ((sva & L3C_OFFSET) == 0 &&
4422 				    sva + L3C_OFFSET <= va_next - 1) {
4423 					pmap_mask_set_l3c(pmap, l3p, sva, &va,
4424 					    va_next, mask, nbits);
4425 					l3p += L3C_ENTRIES - 1;
4426 					sva += L3C_SIZE - L3_SIZE;
4427 					continue;
4428 				}
4429 
4430 				(void)pmap_demote_l3c(pmap, l3p, sva);
4431 
4432 				/*
4433 				 * The L3 entry's accessed bit may have changed.
4434 				 */
4435 				l3 = pmap_load(l3p);
4436 			}
4437 			while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4438 			    nbits))
4439 				cpu_spinwait();
4440 
4441 			/*
4442 			 * When a dirty read/write mapping is write protected,
4443 			 * update the page's dirty field.
4444 			 */
4445 			if ((l3 & ATTR_SW_MANAGED) != 0 &&
4446 			    (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4447 			    pmap_pte_dirty(pmap, l3))
4448 				vm_page_dirty(PTE_TO_VM_PAGE(l3));
4449 
4450 			if (va == va_next)
4451 				va = sva;
4452 		}
4453 		if (va != va_next && invalidate)
4454 			pmap_s1_invalidate_range(pmap, va, sva, true);
4455 	}
4456 }
4457 
4458 static void
pmap_mask_set(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t mask,pt_entry_t nbits,bool invalidate)4459 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4460     pt_entry_t nbits, bool invalidate)
4461 {
4462 	PMAP_LOCK(pmap);
4463 	pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4464 	PMAP_UNLOCK(pmap);
4465 }
4466 
4467 /*
4468  *	Set the physical protection on the
4469  *	specified range of this map as requested.
4470  */
4471 void
pmap_protect(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,vm_prot_t prot)4472 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4473 {
4474 	pt_entry_t mask, nbits;
4475 
4476 	PMAP_ASSERT_STAGE1(pmap);
4477 	KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4478 	if (prot == VM_PROT_NONE) {
4479 		pmap_remove(pmap, sva, eva);
4480 		return;
4481 	}
4482 
4483 	mask = nbits = 0;
4484 	if ((prot & VM_PROT_WRITE) == 0) {
4485 		mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4486 		nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4487 	}
4488 	if ((prot & VM_PROT_EXECUTE) == 0) {
4489 		mask |= ATTR_S1_XN;
4490 		nbits |= ATTR_S1_XN;
4491 	}
4492 	if (pmap == kernel_pmap) {
4493 		mask |= ATTR_KERN_GP;
4494 		nbits |= ATTR_KERN_GP;
4495 	}
4496 	if (mask == 0)
4497 		return;
4498 
4499 	pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4500 }
4501 
4502 void
pmap_disable_promotion(vm_offset_t sva,vm_size_t size)4503 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4504 {
4505 
4506 	MPASS((sva & L3_OFFSET) == 0);
4507 	MPASS(((sva + size) & L3_OFFSET) == 0);
4508 
4509 	pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4510 	    ATTR_SW_NO_PROMOTE, false);
4511 }
4512 
4513 /*
4514  * Inserts the specified page table page into the specified pmap's collection
4515  * of idle page table pages.  Each of a pmap's page table pages is responsible
4516  * for mapping a distinct range of virtual addresses.  The pmap's collection is
4517  * ordered by this virtual address range.
4518  *
4519  * If "promoted" is false, then the page table page "mpte" must be zero filled;
4520  * "mpte"'s valid field will be set to 0.
4521  *
4522  * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4523  * contain valid mappings with identical attributes except for ATTR_AF;
4524  * "mpte"'s valid field will be set to 1.
4525  *
4526  * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4527  * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4528  * field will be set to VM_PAGE_BITS_ALL.
4529  */
4530 static __inline int
pmap_insert_pt_page(pmap_t pmap,vm_page_t mpte,bool promoted,bool all_l3e_AF_set)4531 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4532     bool all_l3e_AF_set)
4533 {
4534 
4535 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4536 	KASSERT(promoted || !all_l3e_AF_set,
4537 	    ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4538 	mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4539 	return (vm_radix_insert(&pmap->pm_root, mpte));
4540 }
4541 
4542 /*
4543  * Removes the page table page mapping the specified virtual address from the
4544  * specified pmap's collection of idle page table pages, and returns it.
4545  * Otherwise, returns NULL if there is no page table page corresponding to the
4546  * specified virtual address.
4547  */
4548 static __inline vm_page_t
pmap_remove_pt_page(pmap_t pmap,vm_offset_t va)4549 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4550 {
4551 
4552 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4553 	return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4554 }
4555 
4556 /*
4557  * Performs a break-before-make update of a pmap entry. This is needed when
4558  * either promoting or demoting pages to ensure the TLB doesn't get into an
4559  * inconsistent state.
4560  */
4561 static void
pmap_update_entry(pmap_t pmap,pd_entry_t * ptep,pd_entry_t newpte,vm_offset_t va,vm_size_t size)4562 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4563     vm_offset_t va, vm_size_t size)
4564 {
4565 	register_t intr;
4566 
4567 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4568 	KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4569 	    ("%s: Updating non-promote pte", __func__));
4570 
4571 	/*
4572 	 * Ensure we don't get switched out with the page table in an
4573 	 * inconsistent state. We also need to ensure no interrupts fire
4574 	 * as they may make use of an address we are about to invalidate.
4575 	 */
4576 	intr = intr_disable();
4577 
4578 	/*
4579 	 * Clear the old mapping's valid bit, but leave the rest of the entry
4580 	 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4581 	 * lookup the physical address.
4582 	 */
4583 	pmap_clear_bits(ptep, ATTR_DESCR_VALID);
4584 
4585 	/*
4586 	 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4587 	 * be cached, so we invalidate intermediate entries as well as final
4588 	 * entries.
4589 	 */
4590 	pmap_s1_invalidate_range(pmap, va, va + size, false);
4591 
4592 	/* Create the new mapping */
4593 	pmap_store(ptep, newpte);
4594 	dsb(ishst);
4595 
4596 	intr_restore(intr);
4597 }
4598 
4599 /*
4600  * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping.
4601  */
4602 static void
pmap_update_strided(pmap_t pmap,pd_entry_t * ptep,pd_entry_t * ptep_end,pd_entry_t newpte,vm_offset_t va,vm_offset_t stride,vm_size_t size)4603 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end,
4604     pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size)
4605 {
4606 	pd_entry_t *lip;
4607 	register_t intr;
4608 
4609 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4610 	KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4611 	    ("%s: Updating non-promote pte", __func__));
4612 
4613 	/*
4614 	 * Ensure we don't get switched out with the page table in an
4615 	 * inconsistent state. We also need to ensure no interrupts fire
4616 	 * as they may make use of an address we are about to invalidate.
4617 	 */
4618 	intr = intr_disable();
4619 
4620 	/*
4621 	 * Clear the old mapping's valid bits, but leave the rest of each
4622 	 * entry unchanged, so that a lockless, concurrent pmap_kextract() can
4623 	 * still lookup the physical address.
4624 	 */
4625 	for (lip = ptep; lip < ptep_end; lip++)
4626 		pmap_clear_bits(lip, ATTR_DESCR_VALID);
4627 
4628 	/* Only final entries are changing. */
4629 	pmap_s1_invalidate_strided(pmap, va, va + size, stride, true);
4630 
4631 	/* Create the new mapping. */
4632 	for (lip = ptep; lip < ptep_end; lip++) {
4633 		pmap_store(lip, newpte);
4634 		newpte += stride;
4635 	}
4636 	dsb(ishst);
4637 
4638 	intr_restore(intr);
4639 }
4640 
4641 #if VM_NRESERVLEVEL > 0
4642 /*
4643  * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4644  * replace the many pv entries for the 4KB page mappings by a single pv entry
4645  * for the 2MB page mapping.
4646  */
4647 static void
pmap_pv_promote_l2(pmap_t pmap,vm_offset_t va,vm_paddr_t pa,struct rwlock ** lockp)4648 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4649     struct rwlock **lockp)
4650 {
4651 	struct md_page *pvh;
4652 	pv_entry_t pv;
4653 	vm_offset_t va_last;
4654 	vm_page_t m;
4655 
4656 	KASSERT((pa & L2_OFFSET) == 0,
4657 	    ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4658 	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4659 
4660 	/*
4661 	 * Transfer the first page's pv entry for this mapping to the 2mpage's
4662 	 * pv list.  Aside from avoiding the cost of a call to get_pv_entry(),
4663 	 * a transfer avoids the possibility that get_pv_entry() calls
4664 	 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4665 	 * mappings that is being promoted.
4666 	 */
4667 	m = PHYS_TO_VM_PAGE(pa);
4668 	va = va & ~L2_OFFSET;
4669 	pv = pmap_pvh_remove(&m->md, pmap, va);
4670 	KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4671 	pvh = page_to_pvh(m);
4672 	TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4673 	pvh->pv_gen++;
4674 	/* Free the remaining NPTEPG - 1 pv entries. */
4675 	va_last = va + L2_SIZE - PAGE_SIZE;
4676 	do {
4677 		m++;
4678 		va += PAGE_SIZE;
4679 		pmap_pvh_free(&m->md, pmap, va);
4680 	} while (va < va_last);
4681 }
4682 
4683 /*
4684  * Tries to promote the 512, contiguous 4KB page mappings that are within a
4685  * single level 2 table entry to a single 2MB page mapping.  For promotion
4686  * to occur, two conditions must be met: (1) the 4KB page mappings must map
4687  * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4688  * identical characteristics.
4689  */
4690 static bool
pmap_promote_l2(pmap_t pmap,pd_entry_t * l2,vm_offset_t va,vm_page_t mpte,struct rwlock ** lockp)4691 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4692     struct rwlock **lockp)
4693 {
4694 	pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4695 
4696 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4697 
4698 	/*
4699 	 * Currently, this function only supports promotion on stage 1 pmaps
4700 	 * because it tests stage 1 specific fields and performs a break-
4701 	 * before-make sequence that is incorrect for stage 2 pmaps.
4702 	 */
4703 	if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4704 		return (false);
4705 
4706 	/*
4707 	 * Examine the first L3E in the specified PTP.  Abort if this L3E is
4708 	 * ineligible for promotion...
4709 	 */
4710 	firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
4711 	newl2 = pmap_load(firstl3);
4712 	if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4713 		return (false);
4714 	/* ... is not the first physical page within an L2 block */
4715 	if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
4716 	    ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
4717 		atomic_add_long(&pmap_l2_p_failures, 1);
4718 		CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4719 		    " in pmap %p", va, pmap);
4720 		return (false);
4721 	}
4722 
4723 	/*
4724 	 * Both here and in the below "for" loop, to allow for repromotion
4725 	 * after MADV_FREE, conditionally write protect a clean L3E before
4726 	 * possibly aborting the promotion due to other L3E attributes.  Why?
4727 	 * Suppose that MADV_FREE is applied to a part of a superpage, the
4728 	 * address range [S, E).  pmap_advise() will demote the superpage
4729 	 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4730 	 * set AP_RO and clear AF in the L3Es for the rest of [S, E).  Later,
4731 	 * imagine that the memory in [S, E) is recycled, but the last 4KB
4732 	 * page in [S, E) is not the last to be rewritten, or simply accessed.
4733 	 * In other words, there is still a 4KB page in [S, E), call it P,
4734 	 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4735 	 * Unless we write protect P before aborting the promotion, if and
4736 	 * when P is finally rewritten, there won't be a page fault to trigger
4737 	 * repromotion.
4738 	 */
4739 setl2:
4740 	if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4741 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4742 		/*
4743 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4744 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4745 		 */
4746 		if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4747 			goto setl2;
4748 		newl2 &= ~ATTR_SW_DBM;
4749 		CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
4750 		    " in pmap %p", va & ~L2_OFFSET, pmap);
4751 	}
4752 
4753 	/*
4754 	 * Examine each of the other L3Es in the specified PTP.  Abort if this
4755 	 * L3E maps an unexpected 4KB physical page or does not have identical
4756 	 * characteristics to the first L3E.  If ATTR_AF is not set in every
4757 	 * PTE, then request that the PTP be refilled on demotion.
4758 	 */
4759 	all_l3e_AF = newl2 & ATTR_AF;
4760 	pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
4761 	    + L2_SIZE - PAGE_SIZE;
4762 	for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4763 		oldl3 = pmap_load(l3);
4764 		if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4765 			atomic_add_long(&pmap_l2_p_failures, 1);
4766 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4767 			    " in pmap %p", va, pmap);
4768 			return (false);
4769 		}
4770 setl3:
4771 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4772 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4773 			/*
4774 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4775 			 * set, ATTR_SW_DBM can be cleared without a TLB
4776 			 * invalidation.
4777 			 */
4778 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4779 			    ~ATTR_SW_DBM))
4780 				goto setl3;
4781 			oldl3 &= ~ATTR_SW_DBM;
4782 		}
4783 		if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
4784 			atomic_add_long(&pmap_l2_p_failures, 1);
4785 			CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4786 			    " in pmap %p", va, pmap);
4787 			return (false);
4788 		}
4789 		all_l3e_AF &= oldl3;
4790 		pa -= PAGE_SIZE;
4791 	}
4792 
4793 	/*
4794 	 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4795 	 * mapping, so that promotions triggered by speculative mappings,
4796 	 * such as pmap_enter_quick(), don't automatically mark the
4797 	 * underlying pages as referenced.
4798 	 */
4799 	newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
4800 
4801 	/*
4802 	 * Save the page table page in its current state until the L2
4803 	 * mapping the superpage is demoted by pmap_demote_l2() or
4804 	 * destroyed by pmap_remove_l3().
4805 	 */
4806 	if (mpte == NULL)
4807 		mpte = PTE_TO_VM_PAGE(pmap_load(l2));
4808 	KASSERT(mpte >= vm_page_array &&
4809 	    mpte < &vm_page_array[vm_page_array_size],
4810 	    ("pmap_promote_l2: page table page is out of range"));
4811 	KASSERT(mpte->pindex == pmap_l2_pindex(va),
4812 	    ("pmap_promote_l2: page table page's pindex is wrong"));
4813 	if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
4814 		atomic_add_long(&pmap_l2_p_failures, 1);
4815 		CTR2(KTR_PMAP,
4816 		    "pmap_promote_l2: failure for va %#lx in pmap %p", va,
4817 		    pmap);
4818 		return (false);
4819 	}
4820 
4821 	if ((newl2 & ATTR_SW_MANAGED) != 0)
4822 		pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
4823 
4824 	pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
4825 
4826 	atomic_add_long(&pmap_l2_promotions, 1);
4827 	CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
4828 	    pmap);
4829 	return (true);
4830 }
4831 
4832 /*
4833  * Tries to promote an aligned, contiguous set of base page mappings to a
4834  * single L3C page mapping.  For promotion to occur, two conditions must be
4835  * met: (1) the base page mappings must map aligned, contiguous physical
4836  * memory and (2) the base page mappings must have identical characteristics
4837  * except for the accessed flag.
4838  */
4839 static bool
pmap_promote_l3c(pmap_t pmap,pd_entry_t * l3p,vm_offset_t va)4840 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
4841 {
4842 	pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
4843 
4844 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4845 
4846 	/*
4847 	 * Currently, this function only supports promotion on stage 1 pmaps
4848 	 * because it tests stage 1 specific fields and performs a break-
4849 	 * before-make sequence that is incorrect for stage 2 pmaps.
4850 	 */
4851 	if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4852 		return (false);
4853 
4854 	/*
4855 	 * Compute the address of the first L3 entry in the superpage
4856 	 * candidate.
4857 	 */
4858 	l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
4859 	    sizeof(pt_entry_t)) - 1));
4860 
4861 	firstl3c = pmap_load(l3p);
4862 
4863 	/*
4864 	 * Examine the first L3 entry. Abort if this L3E is ineligible for
4865 	 * promotion...
4866 	 */
4867 	if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
4868 		return (false);
4869 	/* ...is not properly aligned... */
4870 	if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
4871 	    (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
4872 		counter_u64_add(pmap_l3c_p_failures, 1);
4873 		CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4874 		    " in pmap %p", va, pmap);
4875 		return (false);
4876 	}
4877 
4878 	/*
4879 	 * If the first L3 entry is a clean read-write mapping, convert it
4880 	 * to a read-only mapping.  See pmap_promote_l2() for the rationale.
4881 	 */
4882 set_first:
4883 	if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4884 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4885 		/*
4886 		 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4887 		 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4888 		 */
4889 		if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
4890 			goto set_first;
4891 		firstl3c &= ~ATTR_SW_DBM;
4892 		CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4893 		    " in pmap %p", va & ~L3C_OFFSET, pmap);
4894 	}
4895 
4896 	/*
4897 	 * Check that the rest of the L3 entries are compatible with the first,
4898 	 * and convert clean read-write mappings to read-only mappings.
4899 	 */
4900 	all_l3e_AF = firstl3c & ATTR_AF;
4901 	pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
4902 	    L3C_SIZE - PAGE_SIZE;
4903 	for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
4904 		oldl3 = pmap_load(l3);
4905 		if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4906 			counter_u64_add(pmap_l3c_p_failures, 1);
4907 			CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4908 			    " in pmap %p", va, pmap);
4909 			return (false);
4910 		}
4911 set_l3:
4912 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4913 		    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4914 			/*
4915 			 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4916 			 * set, ATTR_SW_DBM can be cleared without a TLB
4917 			 * invalidation.
4918 			 */
4919 			if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4920 			    ~ATTR_SW_DBM))
4921 				goto set_l3;
4922 			oldl3 &= ~ATTR_SW_DBM;
4923 			CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4924 			    " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
4925 			    (va & ~L3C_OFFSET), pmap);
4926 		}
4927 		if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
4928 			counter_u64_add(pmap_l3c_p_failures, 1);
4929 			CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4930 			    " in pmap %p", va, pmap);
4931 			return (false);
4932 		}
4933 		all_l3e_AF &= oldl3;
4934 		pa -= PAGE_SIZE;
4935 	}
4936 
4937 	/*
4938 	 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4939 	 * mapping, so that promotions triggered by speculative mappings,
4940 	 * such as pmap_enter_quick(), don't automatically mark the
4941 	 * underlying pages as referenced.
4942 	 */
4943 	firstl3c &= ~ATTR_AF | all_l3e_AF;
4944 
4945 	/*
4946 	 * Remake the mappings with the contiguous bit set.
4947 	 */
4948 	pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c |
4949 	    ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE);
4950 
4951 	counter_u64_add(pmap_l3c_promotions, 1);
4952 	CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
4953 	    pmap);
4954 	return (true);
4955 }
4956 #endif /* VM_NRESERVLEVEL > 0 */
4957 
4958 static int
pmap_enter_largepage(pmap_t pmap,vm_offset_t va,pt_entry_t pte,int flags,int psind)4959 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
4960     int psind)
4961 {
4962 	pd_entry_t *l0p, *l1p, *l2p, newpte, origpte;
4963 	vm_page_t mp;
4964 
4965 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4966 	KASSERT(psind > 0 && psind < MAXPAGESIZES,
4967 	    ("psind %d unexpected", psind));
4968 	KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0,
4969 	    ("unaligned phys address %#lx pte %#lx psind %d",
4970 	    PTE_TO_PHYS(pte), pte, psind));
4971 
4972 restart:
4973 	newpte = pte;
4974 	if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
4975 		return (KERN_PROTECTION_FAILURE);
4976 	if (psind == 2) {
4977 		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4978 
4979 		l0p = pmap_l0(pmap, va);
4980 		if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
4981 			mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
4982 			if (mp == NULL) {
4983 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4984 					return (KERN_RESOURCE_SHORTAGE);
4985 				PMAP_UNLOCK(pmap);
4986 				vm_wait(NULL);
4987 				PMAP_LOCK(pmap);
4988 				goto restart;
4989 			}
4990 			l1p = pmap_l0_to_l1(l0p, va);
4991 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4992 			origpte = pmap_load(l1p);
4993 		} else {
4994 			l1p = pmap_l0_to_l1(l0p, va);
4995 			KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4996 			origpte = pmap_load(l1p);
4997 			if ((origpte & ATTR_DESCR_VALID) == 0) {
4998 				mp = PTE_TO_VM_PAGE(pmap_load(l0p));
4999 				mp->ref_count++;
5000 			}
5001 		}
5002 		KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
5003 		    (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
5004 		    (origpte & ATTR_DESCR_VALID) == 0,
5005 		    ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
5006 		    va, origpte, newpte));
5007 		pmap_store(l1p, newpte);
5008 	} else /* (psind == 1) */ {
5009 		l2p = pmap_l2(pmap, va);
5010 		if (l2p == NULL) {
5011 			mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
5012 			if (mp == NULL) {
5013 				if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5014 					return (KERN_RESOURCE_SHORTAGE);
5015 				PMAP_UNLOCK(pmap);
5016 				vm_wait(NULL);
5017 				PMAP_LOCK(pmap);
5018 				goto restart;
5019 			}
5020 			l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5021 			l2p = &l2p[pmap_l2_index(va)];
5022 			origpte = pmap_load(l2p);
5023 		} else {
5024 			l1p = pmap_l1(pmap, va);
5025 			origpte = pmap_load(l2p);
5026 			if ((origpte & ATTR_DESCR_VALID) == 0) {
5027 				mp = PTE_TO_VM_PAGE(pmap_load(l1p));
5028 				mp->ref_count++;
5029 			}
5030 		}
5031 		KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5032 		    ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
5033 		    PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5034 		    ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
5035 		    va, origpte, newpte));
5036 		pmap_store(l2p, newpte);
5037 	}
5038 	dsb(ishst);
5039 
5040 	if ((origpte & ATTR_DESCR_VALID) == 0)
5041 		pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
5042 	if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
5043 		pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
5044 	else if ((newpte & ATTR_SW_WIRED) == 0 &&
5045 	    (origpte & ATTR_SW_WIRED) != 0)
5046 		pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
5047 
5048 	return (KERN_SUCCESS);
5049 }
5050 
5051 /*
5052  *	Insert the given physical page (p) at
5053  *	the specified virtual address (v) in the
5054  *	target physical map with the protection requested.
5055  *
5056  *	If specified, the page will be wired down, meaning
5057  *	that the related pte can not be reclaimed.
5058  *
5059  *	NB:  This is the only routine which MAY NOT lazy-evaluate
5060  *	or lose information.  That is, this routine must actually
5061  *	insert this page into the given map NOW.
5062  */
5063 int
pmap_enter(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,u_int flags,int8_t psind)5064 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5065     u_int flags, int8_t psind)
5066 {
5067 	struct rwlock *lock;
5068 	pd_entry_t *pde;
5069 	pt_entry_t new_l3, orig_l3;
5070 	pt_entry_t *l2, *l3;
5071 	pv_entry_t pv;
5072 	vm_paddr_t opa, pa;
5073 	vm_page_t mpte, om;
5074 	bool nosleep;
5075 	int lvl, rv;
5076 
5077 	KASSERT(ADDR_IS_CANONICAL(va),
5078 	    ("%s: Address not in canonical form: %lx", __func__, va));
5079 
5080 	va = trunc_page(va);
5081 	if ((m->oflags & VPO_UNMANAGED) == 0)
5082 		VM_PAGE_OBJECT_BUSY_ASSERT(m);
5083 	pa = VM_PAGE_TO_PHYS(m);
5084 	new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_DEFAULT | L3_PAGE);
5085 	new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5086 	new_l3 |= pmap_pte_prot(pmap, prot);
5087 	if ((flags & PMAP_ENTER_WIRED) != 0)
5088 		new_l3 |= ATTR_SW_WIRED;
5089 	if (pmap->pm_stage == PM_STAGE1) {
5090 		if (!ADDR_IS_KERNEL(va))
5091 			new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5092 		else
5093 			new_l3 |= ATTR_S1_UXN;
5094 		if (pmap != kernel_pmap)
5095 			new_l3 |= ATTR_S1_nG;
5096 	} else {
5097 		/*
5098 		 * Clear the access flag on executable mappings, this will be
5099 		 * set later when the page is accessed. The fault handler is
5100 		 * required to invalidate the I-cache.
5101 		 *
5102 		 * TODO: Switch to the valid flag to allow hardware management
5103 		 * of the access flag. Much of the pmap code assumes the
5104 		 * valid flag is set and fails to destroy the old page tables
5105 		 * correctly if it is clear.
5106 		 */
5107 		if (prot & VM_PROT_EXECUTE)
5108 			new_l3 &= ~ATTR_AF;
5109 	}
5110 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5111 		new_l3 |= ATTR_SW_MANAGED;
5112 		if ((prot & VM_PROT_WRITE) != 0) {
5113 			new_l3 |= ATTR_SW_DBM;
5114 			if ((flags & VM_PROT_WRITE) == 0) {
5115 				if (pmap->pm_stage == PM_STAGE1)
5116 					new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5117 				else
5118 					new_l3 &=
5119 					    ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5120 			}
5121 		}
5122 	}
5123 
5124 	CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5125 
5126 	lock = NULL;
5127 	PMAP_LOCK(pmap);
5128 	if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5129 		KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5130 		    ("managed largepage va %#lx flags %#x", va, flags));
5131 		new_l3 &= ~L3_PAGE;
5132 		if (psind == 2) {
5133 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5134 			new_l3 |= L1_BLOCK;
5135 		} else /* (psind == 1) */
5136 			new_l3 |= L2_BLOCK;
5137 		rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5138 		goto out;
5139 	}
5140 	if (psind == 1) {
5141 		/* Assert the required virtual and physical alignment. */
5142 		KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5143 		KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5144 		rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5145 		    flags, m, &lock);
5146 		goto out;
5147 	}
5148 	mpte = NULL;
5149 
5150 	/*
5151 	 * In the case that a page table page is not
5152 	 * resident, we are creating it here.
5153 	 */
5154 retry:
5155 	pde = pmap_pde(pmap, va, &lvl);
5156 	if (pde != NULL && lvl == 2) {
5157 		l3 = pmap_l2_to_l3(pde, va);
5158 		if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
5159 			mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5160 			mpte->ref_count++;
5161 		}
5162 		goto havel3;
5163 	} else if (pde != NULL && lvl == 1) {
5164 		l2 = pmap_l1_to_l2(pde, va);
5165 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5166 		    (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5167 			l3 = &l3[pmap_l3_index(va)];
5168 			if (!ADDR_IS_KERNEL(va)) {
5169 				mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5170 				mpte->ref_count++;
5171 			}
5172 			goto havel3;
5173 		}
5174 		/* We need to allocate an L3 table. */
5175 	}
5176 	if (!ADDR_IS_KERNEL(va)) {
5177 		nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5178 
5179 		/*
5180 		 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5181 		 * to handle the possibility that a superpage mapping for "va"
5182 		 * was created while we slept.
5183 		 */
5184 		mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5185 		    nosleep ? NULL : &lock);
5186 		if (mpte == NULL && nosleep) {
5187 			CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5188 			rv = KERN_RESOURCE_SHORTAGE;
5189 			goto out;
5190 		}
5191 		goto retry;
5192 	} else
5193 		panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5194 
5195 havel3:
5196 	orig_l3 = pmap_load(l3);
5197 	opa = PTE_TO_PHYS(orig_l3);
5198 	pv = NULL;
5199 	new_l3 |= pmap_pte_bti(pmap, va);
5200 
5201 	/*
5202 	 * Is the specified virtual address already mapped?
5203 	 */
5204 	if (pmap_l3_valid(orig_l3)) {
5205 		/*
5206 		 * Wiring change, just update stats. We don't worry about
5207 		 * wiring PT pages as they remain resident as long as there
5208 		 * are valid mappings in them. Hence, if a user page is wired,
5209 		 * the PT page will be also.
5210 		 */
5211 		if ((flags & PMAP_ENTER_WIRED) != 0 &&
5212 		    (orig_l3 & ATTR_SW_WIRED) == 0)
5213 			pmap->pm_stats.wired_count++;
5214 		else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5215 		    (orig_l3 & ATTR_SW_WIRED) != 0)
5216 			pmap->pm_stats.wired_count--;
5217 
5218 		/*
5219 		 * Remove the extra PT page reference.
5220 		 */
5221 		if (mpte != NULL) {
5222 			mpte->ref_count--;
5223 			KASSERT(mpte->ref_count > 0,
5224 			    ("pmap_enter: missing reference to page table page,"
5225 			     " va: 0x%lx", va));
5226 		}
5227 
5228 		/*
5229 		 * Has the physical page changed?
5230 		 */
5231 		if (opa == pa) {
5232 			/*
5233 			 * No, might be a protection or wiring change.
5234 			 */
5235 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5236 			    (new_l3 & ATTR_SW_DBM) != 0)
5237 				vm_page_aflag_set(m, PGA_WRITEABLE);
5238 			goto validate;
5239 		}
5240 
5241 		/*
5242 		 * The physical page has changed.  Temporarily invalidate
5243 		 * the mapping.
5244 		 */
5245 		if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5246 			(void)pmap_demote_l3c(pmap, l3, va);
5247 		orig_l3 = pmap_load_clear(l3);
5248 		KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5249 		    ("pmap_enter: unexpected pa update for %#lx", va));
5250 		if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5251 			om = PHYS_TO_VM_PAGE(opa);
5252 
5253 			/*
5254 			 * The pmap lock is sufficient to synchronize with
5255 			 * concurrent calls to pmap_page_test_mappings() and
5256 			 * pmap_ts_referenced().
5257 			 */
5258 			if (pmap_pte_dirty(pmap, orig_l3))
5259 				vm_page_dirty(om);
5260 			if ((orig_l3 & ATTR_AF) != 0) {
5261 				pmap_invalidate_page(pmap, va, true);
5262 				vm_page_aflag_set(om, PGA_REFERENCED);
5263 			}
5264 			CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
5265 			pv = pmap_pvh_remove(&om->md, pmap, va);
5266 			if ((m->oflags & VPO_UNMANAGED) != 0)
5267 				free_pv_entry(pmap, pv);
5268 			if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5269 			    TAILQ_EMPTY(&om->md.pv_list) &&
5270 			    ((om->flags & PG_FICTITIOUS) != 0 ||
5271 			    TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5272 				vm_page_aflag_clear(om, PGA_WRITEABLE);
5273 		} else {
5274 			KASSERT((orig_l3 & ATTR_AF) != 0,
5275 			    ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5276 			pmap_invalidate_page(pmap, va, true);
5277 		}
5278 		orig_l3 = 0;
5279 	} else {
5280 		/*
5281 		 * Increment the counters.
5282 		 */
5283 		if ((new_l3 & ATTR_SW_WIRED) != 0)
5284 			pmap->pm_stats.wired_count++;
5285 		pmap_resident_count_inc(pmap, 1);
5286 	}
5287 	/*
5288 	 * Enter on the PV list if part of our managed memory.
5289 	 */
5290 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5291 		if (pv == NULL) {
5292 			pv = get_pv_entry(pmap, &lock);
5293 			pv->pv_va = va;
5294 		}
5295 		CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5296 		TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5297 		m->md.pv_gen++;
5298 		if ((new_l3 & ATTR_SW_DBM) != 0)
5299 			vm_page_aflag_set(m, PGA_WRITEABLE);
5300 	}
5301 
5302 validate:
5303 	if (pmap->pm_stage == PM_STAGE1) {
5304 		/*
5305 		 * Sync icache if exec permission and attribute
5306 		 * VM_MEMATTR_WRITE_BACK is set. Do it now, before the mapping
5307 		 * is stored and made valid for hardware table walk. If done
5308 		 * later, then other can access this page before caches are
5309 		 * properly synced. Don't do it for kernel memory which is
5310 		 * mapped with exec permission even if the memory isn't going
5311 		 * to hold executable code. The only time when icache sync is
5312 		 * needed is after kernel module is loaded and the relocation
5313 		 * info is processed. And it's done in elf_cpu_load_file().
5314 		*/
5315 		if ((prot & VM_PROT_EXECUTE) &&  pmap != kernel_pmap &&
5316 		    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5317 		    (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5318 			PMAP_ASSERT_STAGE1(pmap);
5319 			cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
5320 			    PAGE_SIZE);
5321 		}
5322 	} else {
5323 		cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5324 	}
5325 
5326 	/*
5327 	 * Update the L3 entry
5328 	 */
5329 	if (pmap_l3_valid(orig_l3)) {
5330 		KASSERT(opa == pa, ("pmap_enter: invalid update"));
5331 		if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5332 			/* same PA, different attributes */
5333 			if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5334 				(void)pmap_demote_l3c(pmap, l3, va);
5335 			orig_l3 = pmap_load_store(l3, new_l3);
5336 			pmap_invalidate_page(pmap, va, true);
5337 			if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5338 			    pmap_pte_dirty(pmap, orig_l3))
5339 				vm_page_dirty(m);
5340 		} else {
5341 			/*
5342 			 * orig_l3 == new_l3
5343 			 * This can happens if multiple threads simultaneously
5344 			 * access not yet mapped page. This bad for performance
5345 			 * since this can cause full demotion-NOP-promotion
5346 			 * cycle.
5347 			 * Another possible reasons are:
5348 			 * - VM and pmap memory layout are diverged
5349 			 * - tlb flush is missing somewhere and CPU doesn't see
5350 			 *   actual mapping.
5351 			 */
5352 			CTR4(KTR_PMAP, "%s: already mapped page - "
5353 			    "pmap %p va 0x%#lx pte 0x%lx",
5354 			    __func__, pmap, va, new_l3);
5355 		}
5356 	} else {
5357 		/* New mapping */
5358 		pmap_store(l3, new_l3);
5359 		dsb(ishst);
5360 	}
5361 
5362 #if VM_NRESERVLEVEL > 0
5363 	/*
5364 	 * First, attempt L3C promotion, if the virtual and physical addresses
5365 	 * are aligned with each other and an underlying reservation has the
5366 	 * neighboring L3 pages allocated.  The first condition is simply an
5367 	 * optimization that recognizes some eventual promotion failures early
5368 	 * at a lower run-time cost.  Then, if both the page table page and
5369 	 * the reservation are fully populated, attempt L2 promotion.
5370 	 */
5371 	if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5372 	    (m->flags & PG_FICTITIOUS) == 0 &&
5373 	    vm_reserv_is_populated(m, L3C_ENTRIES) &&
5374 	    pmap_promote_l3c(pmap, l3, va) &&
5375 	    (mpte == NULL || mpte->ref_count == NL3PG) &&
5376 	    vm_reserv_level_iffullpop(m) == 0)
5377 		(void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5378 #endif
5379 
5380 	rv = KERN_SUCCESS;
5381 out:
5382 	if (lock != NULL)
5383 		rw_wunlock(lock);
5384 	PMAP_UNLOCK(pmap);
5385 	return (rv);
5386 }
5387 
5388 /*
5389  * Tries to create a read- and/or execute-only L2 page mapping.  Returns
5390  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
5391  * value.  See pmap_enter_l2() for the possible error values when "no sleep",
5392  * "no replace", and "no reclaim" are specified.
5393  */
5394 static int
pmap_enter_l2_rx(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,struct rwlock ** lockp)5395 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5396     struct rwlock **lockp)
5397 {
5398 	pd_entry_t new_l2;
5399 
5400 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5401 	PMAP_ASSERT_STAGE1(pmap);
5402 	KASSERT(ADDR_IS_CANONICAL(va),
5403 	    ("%s: Address not in canonical form: %lx", __func__, va));
5404 
5405 	new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5406 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5407 	    L2_BLOCK);
5408 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5409 		new_l2 |= ATTR_SW_MANAGED;
5410 		new_l2 &= ~ATTR_AF;
5411 	}
5412 	if ((prot & VM_PROT_EXECUTE) == 0 ||
5413 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5414 		new_l2 |= ATTR_S1_XN;
5415 	if (!ADDR_IS_KERNEL(va))
5416 		new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5417 	else
5418 		new_l2 |= ATTR_S1_UXN;
5419 	if (pmap != kernel_pmap)
5420 		new_l2 |= ATTR_S1_nG;
5421 	return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5422 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5423 }
5424 
5425 /*
5426  * Returns true if every page table entry in the specified page table is
5427  * zero.
5428  */
5429 static bool
pmap_every_pte_zero(vm_paddr_t pa)5430 pmap_every_pte_zero(vm_paddr_t pa)
5431 {
5432 	pt_entry_t *pt_end, *pte;
5433 
5434 	KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5435 	pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
5436 	for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5437 		if (*pte != 0)
5438 			return (false);
5439 	}
5440 	return (true);
5441 }
5442 
5443 /*
5444  * Tries to create the specified L2 page mapping.  Returns KERN_SUCCESS if
5445  * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
5446  * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
5447  * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
5448  * within the L2 virtual address range starting at the specified virtual
5449  * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
5450  * L2 page mapping already exists at the specified virtual address.  Returns
5451  * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
5452  * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
5453  * and a PV entry allocation failed.
5454  */
5455 static int
pmap_enter_l2(pmap_t pmap,vm_offset_t va,pd_entry_t new_l2,u_int flags,vm_page_t m,struct rwlock ** lockp)5456 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
5457     vm_page_t m, struct rwlock **lockp)
5458 {
5459 	struct spglist free;
5460 	pd_entry_t *l2, old_l2;
5461 	vm_page_t l2pg, mt;
5462 	vm_page_t uwptpg;
5463 
5464 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5465 	KASSERT(ADDR_IS_CANONICAL(va),
5466 	    ("%s: Address not in canonical form: %lx", __func__, va));
5467 
5468 	if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
5469 	    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
5470 		CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
5471 		    va, pmap);
5472 		return (KERN_RESOURCE_SHORTAGE);
5473 	}
5474 
5475 	/*
5476 	 * If bti is not the same for the whole l2 range, return failure
5477 	 * and let vm_fault() cope.  Check after l2 allocation, since
5478 	 * it could sleep.
5479 	 */
5480 	if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) {
5481 		KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
5482 		pmap_abort_ptp(pmap, va, l2pg);
5483 		return (KERN_PROTECTION_FAILURE);
5484 	}
5485 
5486 	/*
5487 	 * If there are existing mappings, either abort or remove them.
5488 	 */
5489 	if ((old_l2 = pmap_load(l2)) != 0) {
5490 		KASSERT(l2pg == NULL || l2pg->ref_count > 1,
5491 		    ("pmap_enter_l2: l2pg's ref count is too low"));
5492 		if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5493 			if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5494 				if (l2pg != NULL)
5495 					l2pg->ref_count--;
5496 				CTR2(KTR_PMAP,
5497 				    "pmap_enter_l2: no space for va %#lx"
5498 				    " in pmap %p", va, pmap);
5499 				return (KERN_NO_SPACE);
5500 			} else if (!ADDR_IS_KERNEL(va) ||
5501 			    !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
5502 				if (l2pg != NULL)
5503 					l2pg->ref_count--;
5504 				CTR2(KTR_PMAP,
5505 				    "pmap_enter_l2: failure for va %#lx"
5506 				    " in pmap %p", va, pmap);
5507 				return (KERN_FAILURE);
5508 			}
5509 		}
5510 		SLIST_INIT(&free);
5511 		if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
5512 			(void)pmap_remove_l2(pmap, l2, va,
5513 			    pmap_load(pmap_l1(pmap, va)), &free, lockp);
5514 		else
5515 			pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
5516 			    &free, lockp);
5517 		if (!ADDR_IS_KERNEL(va)) {
5518 			vm_page_free_pages_toq(&free, true);
5519 			KASSERT(pmap_load(l2) == 0,
5520 			    ("pmap_enter_l2: non-zero L2 entry %p", l2));
5521 		} else {
5522 			KASSERT(SLIST_EMPTY(&free),
5523 			    ("pmap_enter_l2: freed kernel page table page"));
5524 
5525 			/*
5526 			 * Both pmap_remove_l2() and pmap_remove_l3_range()
5527 			 * will leave the kernel page table page zero filled.
5528 			 * Nonetheless, the TLB could have an intermediate
5529 			 * entry for the kernel page table page, so request
5530 			 * an invalidation at all levels after clearing
5531 			 * the L2_TABLE entry.
5532 			 */
5533 			mt = PTE_TO_VM_PAGE(pmap_load(l2));
5534 			if (pmap_insert_pt_page(pmap, mt, false, false))
5535 				panic("pmap_enter_l2: trie insert failed");
5536 			pmap_clear(l2);
5537 			pmap_s1_invalidate_page(pmap, va, false);
5538 		}
5539 	}
5540 
5541 	/*
5542 	 * Allocate leaf ptpage for wired userspace pages.
5543 	 */
5544 	uwptpg = NULL;
5545 	if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
5546 		uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5547 		if (uwptpg == NULL) {
5548 			pmap_abort_ptp(pmap, va, l2pg);
5549 			return (KERN_RESOURCE_SHORTAGE);
5550 		}
5551 		uwptpg->pindex = pmap_l2_pindex(va);
5552 		if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
5553 			vm_page_unwire_noq(uwptpg);
5554 			vm_page_free(uwptpg);
5555 			pmap_abort_ptp(pmap, va, l2pg);
5556 			return (KERN_RESOURCE_SHORTAGE);
5557 		}
5558 		pmap_resident_count_inc(pmap, 1);
5559 		uwptpg->ref_count = NL3PG;
5560 	}
5561 	if ((new_l2 & ATTR_SW_MANAGED) != 0) {
5562 		/*
5563 		 * Abort this mapping if its PV entry could not be created.
5564 		 */
5565 		if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
5566 			if (l2pg != NULL)
5567 				pmap_abort_ptp(pmap, va, l2pg);
5568 			if (uwptpg != NULL) {
5569 				mt = pmap_remove_pt_page(pmap, va);
5570 				KASSERT(mt == uwptpg,
5571 				    ("removed pt page %p, expected %p", mt,
5572 				    uwptpg));
5573 				pmap_resident_count_dec(pmap, 1);
5574 				uwptpg->ref_count = 1;
5575 				vm_page_unwire_noq(uwptpg);
5576 				vm_page_free(uwptpg);
5577 			}
5578 			CTR2(KTR_PMAP,
5579 			    "pmap_enter_l2: failure for va %#lx in pmap %p",
5580 			    va, pmap);
5581 			return (KERN_RESOURCE_SHORTAGE);
5582 		}
5583 		if ((new_l2 & ATTR_SW_DBM) != 0)
5584 			for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5585 				vm_page_aflag_set(mt, PGA_WRITEABLE);
5586 	}
5587 
5588 	/*
5589 	 * Increment counters.
5590 	 */
5591 	if ((new_l2 & ATTR_SW_WIRED) != 0)
5592 		pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
5593 	pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
5594 
5595 	/*
5596 	 * Conditionally sync the icache.  See pmap_enter() for details.
5597 	 */
5598 	if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
5599 	    PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
5600 	    pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
5601 		cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
5602 		    L2_SIZE);
5603 	}
5604 
5605 	/*
5606 	 * Map the superpage.
5607 	 */
5608 	pmap_store(l2, new_l2);
5609 	dsb(ishst);
5610 
5611 	atomic_add_long(&pmap_l2_mappings, 1);
5612 	CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
5613 	    va, pmap);
5614 
5615 	return (KERN_SUCCESS);
5616 }
5617 
5618 /*
5619  * Tries to create a read- and/or execute-only L3C page mapping.  Returns
5620  * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
5621  * value.
5622  */
5623 static int
pmap_enter_l3c_rx(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_page_t * ml3p,vm_prot_t prot,struct rwlock ** lockp)5624 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
5625     vm_prot_t prot, struct rwlock **lockp)
5626 {
5627 	pt_entry_t l3e;
5628 
5629 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5630 	PMAP_ASSERT_STAGE1(pmap);
5631 	KASSERT(ADDR_IS_CANONICAL(va),
5632 	    ("%s: Address not in canonical form: %lx", __func__, va));
5633 
5634 	l3e = VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5635 	    ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5636 	    ATTR_CONTIGUOUS | L3_PAGE;
5637 	if ((m->oflags & VPO_UNMANAGED) == 0) {
5638 		l3e |= ATTR_SW_MANAGED;
5639 		l3e &= ~ATTR_AF;
5640 	}
5641 	if ((prot & VM_PROT_EXECUTE) == 0 ||
5642 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
5643 		l3e |= ATTR_S1_XN;
5644 	if (!ADDR_IS_KERNEL(va))
5645 		l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5646 	else
5647 		l3e |= ATTR_S1_UXN;
5648 	if (pmap != kernel_pmap)
5649 		l3e |= ATTR_S1_nG;
5650 	return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
5651 	    PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
5652 }
5653 
5654 static int
pmap_enter_l3c(pmap_t pmap,vm_offset_t va,pt_entry_t l3e,u_int flags,vm_page_t m,vm_page_t * ml3p,struct rwlock ** lockp)5655 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
5656     vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
5657 {
5658 	pd_entry_t *l2p, *pde;
5659 	pt_entry_t *l3p, *tl3p;
5660 	vm_page_t mt;
5661 	vm_paddr_t pa;
5662 	vm_pindex_t l2pindex;
5663 	int lvl;
5664 
5665 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5666 	KASSERT((va & L3C_OFFSET) == 0,
5667 	    ("pmap_enter_l3c: va is not aligned"));
5668 	KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
5669 	    ("pmap_enter_l3c: managed mapping within the clean submap"));
5670 
5671 	/*
5672 	 * If the L3 PTP is not resident, we attempt to create it here.
5673 	 */
5674 	if (!ADDR_IS_KERNEL(va)) {
5675 		/*
5676 		 * Were we given the correct L3 PTP?  If so, we can simply
5677 		 * increment its ref count.
5678 		 */
5679 		l2pindex = pmap_l2_pindex(va);
5680 		if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
5681 			(*ml3p)->ref_count += L3C_ENTRIES;
5682 		} else {
5683 retry:
5684 			/*
5685 			 * Get the L2 entry.
5686 			 */
5687 			pde = pmap_pde(pmap, va, &lvl);
5688 
5689 			/*
5690 			 * If the L2 entry is a superpage, we either abort or
5691 			 * demote depending on the given flags.
5692 			 */
5693 			if (lvl == 1) {
5694 				l2p = pmap_l1_to_l2(pde, va);
5695 				if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
5696 				    L2_BLOCK) {
5697 					if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5698 						return (KERN_FAILURE);
5699 					l3p = pmap_demote_l2_locked(pmap, l2p,
5700 					    va, lockp);
5701 					if (l3p != NULL) {
5702 						*ml3p = PTE_TO_VM_PAGE(
5703 						    pmap_load(l2p));
5704 						(*ml3p)->ref_count +=
5705 						    L3C_ENTRIES;
5706 						goto have_l3p;
5707 					}
5708 				}
5709 				/* We need to allocate an L3 PTP. */
5710 			}
5711 
5712 			/*
5713 			 * If the L3 PTP is mapped, we just increment its ref
5714 			 * count.  Otherwise, we attempt to allocate it.
5715 			 */
5716 			if (lvl == 2 && pmap_load(pde) != 0) {
5717 				*ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
5718 				(*ml3p)->ref_count += L3C_ENTRIES;
5719 			} else {
5720 				*ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
5721 				    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
5722 				if (*ml3p == NULL) {
5723 					if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5724 						return (KERN_FAILURE);
5725 
5726 					/*
5727 					 * The page table may have changed
5728 					 * while we slept.
5729 					 */
5730 					goto retry;
5731 				}
5732 				(*ml3p)->ref_count += L3C_ENTRIES - 1;
5733 			}
5734 		}
5735 		l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
5736 	} else {
5737 		*ml3p = NULL;
5738 
5739 		/*
5740 		 * If the L2 entry is a superpage, we either abort or demote
5741 		 * depending on the given flags.
5742 		 */
5743 		pde = pmap_pde(kernel_pmap, va, &lvl);
5744 		if (lvl == 1) {
5745 			l2p = pmap_l1_to_l2(pde, va);
5746 			KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
5747 			    ("pmap_enter_l3c: missing L2 block"));
5748 			if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5749 				return (KERN_FAILURE);
5750 			l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
5751 		} else {
5752 			KASSERT(lvl == 2,
5753 			    ("pmap_enter_l3c: Invalid level %d", lvl));
5754 			l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
5755 			    pmap_load(pde)));
5756 		}
5757 	}
5758 have_l3p:
5759 	l3p = &l3p[pmap_l3_index(va)];
5760 
5761 	/*
5762 	 * If bti is not the same for the whole L3C range, return failure
5763 	 * and let vm_fault() cope.  Check after L3 allocation, since
5764 	 * it could sleep.
5765 	 */
5766 	if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) {
5767 		KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP"));
5768 		(*ml3p)->ref_count -= L3C_ENTRIES - 1;
5769 		pmap_abort_ptp(pmap, va, *ml3p);
5770 		*ml3p = NULL;
5771 		return (KERN_PROTECTION_FAILURE);
5772 	}
5773 
5774 	/*
5775 	 * If there are existing mappings, either abort or remove them.
5776 	 */
5777 	if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5778 		for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5779 			if (pmap_load(tl3p) != 0) {
5780 				if (*ml3p != NULL)
5781 					(*ml3p)->ref_count -= L3C_ENTRIES;
5782 				return (KERN_FAILURE);
5783 			}
5784 		}
5785 	} else {
5786 		/*
5787 		 * Because we increment the L3 page's reference count above,
5788 		 * it is guaranteed not to be freed here and we can pass NULL
5789 		 * instead of a valid free list.
5790 		 */
5791 		pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
5792 		    va + L3C_SIZE, NULL, lockp);
5793 	}
5794 
5795 	/*
5796 	 * Enter on the PV list if part of our managed memory.
5797 	 */
5798 	if ((l3e & ATTR_SW_MANAGED) != 0) {
5799 		if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
5800 			if (*ml3p != NULL) {
5801 				(*ml3p)->ref_count -= L3C_ENTRIES - 1;
5802 				pmap_abort_ptp(pmap, va, *ml3p);
5803 				*ml3p = NULL;
5804 			}
5805 			return (KERN_RESOURCE_SHORTAGE);
5806 		}
5807 		if ((l3e & ATTR_SW_DBM) != 0)
5808 			for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
5809 				vm_page_aflag_set(mt, PGA_WRITEABLE);
5810 	}
5811 
5812 	/*
5813 	 * Increment counters.
5814 	 */
5815 	if ((l3e & ATTR_SW_WIRED) != 0)
5816 		pmap->pm_stats.wired_count += L3C_ENTRIES;
5817 	pmap_resident_count_inc(pmap, L3C_ENTRIES);
5818 
5819 	pa = VM_PAGE_TO_PHYS(m);
5820 	KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));
5821 
5822 	/*
5823 	 * Sync the icache before the mapping is stored.
5824 	 */
5825 	if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
5826 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5827 		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);
5828 
5829 	/*
5830 	 * Map the superpage.
5831 	 */
5832 	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5833 		pmap_store(tl3p, l3e);
5834 		l3e += L3_SIZE;
5835 	}
5836 	dsb(ishst);
5837 
5838 	counter_u64_add(pmap_l3c_mappings, 1);
5839 	CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
5840 	    va, pmap);
5841 	return (KERN_SUCCESS);
5842 }
5843 
5844 /*
5845  * Maps a sequence of resident pages belonging to the same object.
5846  * The sequence begins with the given page m_start.  This page is
5847  * mapped at the given virtual address start.  Each subsequent page is
5848  * mapped at a virtual address that is offset from start by the same
5849  * amount as the page is offset from m_start within the object.  The
5850  * last page in the sequence is the page with the largest offset from
5851  * m_start that can be mapped at a virtual address less than the given
5852  * virtual address end.  Not every virtual page between start and end
5853  * is mapped; only those for which a resident page exists with the
5854  * corresponding offset from m_start are mapped.
5855  */
5856 void
pmap_enter_object(pmap_t pmap,vm_offset_t start,vm_offset_t end,vm_page_t m_start,vm_prot_t prot)5857 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
5858     vm_page_t m_start, vm_prot_t prot)
5859 {
5860 	struct rwlock *lock;
5861 	vm_offset_t va;
5862 	vm_page_t m, mpte;
5863 	vm_pindex_t diff, psize;
5864 	int rv;
5865 
5866 	VM_OBJECT_ASSERT_LOCKED(m_start->object);
5867 
5868 	psize = atop(end - start);
5869 	mpte = NULL;
5870 	m = m_start;
5871 	lock = NULL;
5872 	PMAP_LOCK(pmap);
5873 	while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
5874 		va = start + ptoa(diff);
5875 		if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
5876 		    m->psind == 1 && pmap_ps_enabled(pmap) &&
5877 		    ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
5878 		    KERN_SUCCESS || rv == KERN_NO_SPACE))
5879 			m = &m[L2_SIZE / PAGE_SIZE - 1];
5880 		else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
5881 		    (VM_PAGE_TO_PHYS(m) & L3C_OFFSET) == 0 &&
5882 		    vm_reserv_is_populated(m, L3C_ENTRIES) &&
5883 		    pmap_ps_enabled(pmap) &&
5884 		    ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
5885 		    &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE))
5886 			m = &m[L3C_ENTRIES - 1];
5887 		else {
5888 			/*
5889 			 * In general, if a superpage mapping were possible,
5890 			 * it would have been created above.  That said, if
5891 			 * start and end are not superpage aligned, then
5892 			 * promotion might be possible at the ends of [start,
5893 			 * end).  However, in practice, those promotion
5894 			 * attempts are so unlikely to succeed that they are
5895 			 * not worth trying.
5896 			 */
5897 			mpte = pmap_enter_quick_locked(pmap, va, m, prot |
5898 			    VM_PROT_NO_PROMOTE, mpte, &lock);
5899 		}
5900 		m = TAILQ_NEXT(m, listq);
5901 	}
5902 	if (lock != NULL)
5903 		rw_wunlock(lock);
5904 	PMAP_UNLOCK(pmap);
5905 }
5906 
5907 /*
5908  * this code makes some *MAJOR* assumptions:
5909  * 1. Current pmap & pmap exists.
5910  * 2. Not wired.
5911  * 3. Read access.
5912  * 4. No page table pages.
5913  * but is *MUCH* faster than pmap_enter...
5914  */
5915 
5916 void
pmap_enter_quick(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot)5917 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
5918 {
5919 	struct rwlock *lock;
5920 
5921 	lock = NULL;
5922 	PMAP_LOCK(pmap);
5923 	(void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
5924 	if (lock != NULL)
5925 		rw_wunlock(lock);
5926 	PMAP_UNLOCK(pmap);
5927 }
5928 
5929 static vm_page_t
pmap_enter_quick_locked(pmap_t pmap,vm_offset_t va,vm_page_t m,vm_prot_t prot,vm_page_t mpte,struct rwlock ** lockp)5930 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
5931     vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
5932 {
5933 	pt_entry_t *l1, *l2, *l3, l3_val;
5934 	vm_paddr_t pa;
5935 	int lvl;
5936 
5937 	KASSERT(!VA_IS_CLEANMAP(va) ||
5938 	    (m->oflags & VPO_UNMANAGED) != 0,
5939 	    ("pmap_enter_quick_locked: managed mapping within the clean submap"));
5940 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5941 	PMAP_ASSERT_STAGE1(pmap);
5942 	KASSERT(ADDR_IS_CANONICAL(va),
5943 	    ("%s: Address not in canonical form: %lx", __func__, va));
5944 	l2 = NULL;
5945 
5946 	CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
5947 	/*
5948 	 * In the case that a page table page is not
5949 	 * resident, we are creating it here.
5950 	 */
5951 	if (!ADDR_IS_KERNEL(va)) {
5952 		vm_pindex_t l2pindex;
5953 
5954 		/*
5955 		 * Calculate pagetable page index
5956 		 */
5957 		l2pindex = pmap_l2_pindex(va);
5958 		if (mpte && (mpte->pindex == l2pindex)) {
5959 			mpte->ref_count++;
5960 		} else {
5961 			/*
5962 			 * If the page table page is mapped, we just increment
5963 			 * the hold count, and activate it.  Otherwise, we
5964 			 * attempt to allocate a page table page, passing NULL
5965 			 * instead of the PV list lock pointer because we don't
5966 			 * intend to sleep.  If this attempt fails, we don't
5967 			 * retry.  Instead, we give up.
5968 			 */
5969 			l1 = pmap_l1(pmap, va);
5970 			if (l1 != NULL && pmap_load(l1) != 0) {
5971 				if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
5972 				    L1_BLOCK)
5973 					return (NULL);
5974 				l2 = pmap_l1_to_l2(l1, va);
5975 				if (pmap_load(l2) != 0) {
5976 					if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
5977 					    L2_BLOCK)
5978 						return (NULL);
5979 					mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5980 					mpte->ref_count++;
5981 				} else {
5982 					mpte = _pmap_alloc_l3(pmap, l2pindex,
5983 					    NULL);
5984 					if (mpte == NULL)
5985 						return (mpte);
5986 				}
5987 			} else {
5988 				mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
5989 				if (mpte == NULL)
5990 					return (mpte);
5991 			}
5992 		}
5993 		l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
5994 		l3 = &l3[pmap_l3_index(va)];
5995 	} else {
5996 		mpte = NULL;
5997 		l2 = pmap_pde(kernel_pmap, va, &lvl);
5998 		KASSERT(l2 != NULL,
5999 		    ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
6000 		     va));
6001 		KASSERT(lvl == 2,
6002 		    ("pmap_enter_quick_locked: Invalid level %d", lvl));
6003 		l3 = pmap_l2_to_l3(l2, va);
6004 	}
6005 
6006 	/*
6007 	 * Abort if a mapping already exists.
6008 	 */
6009 	if (pmap_load(l3) != 0) {
6010 		if (mpte != NULL)
6011 			mpte->ref_count--;
6012 		return (NULL);
6013 	}
6014 
6015 	/*
6016 	 * Enter on the PV list if part of our managed memory.
6017 	 */
6018 	if ((m->oflags & VPO_UNMANAGED) == 0 &&
6019 	    !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
6020 		if (mpte != NULL)
6021 			pmap_abort_ptp(pmap, va, mpte);
6022 		return (NULL);
6023 	}
6024 
6025 	/*
6026 	 * Increment counters
6027 	 */
6028 	pmap_resident_count_inc(pmap, 1);
6029 
6030 	pa = VM_PAGE_TO_PHYS(m);
6031 	l3_val = PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
6032 	    ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
6033 	l3_val |= pmap_pte_bti(pmap, va);
6034 	if ((prot & VM_PROT_EXECUTE) == 0 ||
6035 	    m->md.pv_memattr == VM_MEMATTR_DEVICE)
6036 		l3_val |= ATTR_S1_XN;
6037 	if (!ADDR_IS_KERNEL(va))
6038 		l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
6039 	else
6040 		l3_val |= ATTR_S1_UXN;
6041 	if (pmap != kernel_pmap)
6042 		l3_val |= ATTR_S1_nG;
6043 
6044 	/*
6045 	 * Now validate mapping with RO protection
6046 	 */
6047 	if ((m->oflags & VPO_UNMANAGED) == 0) {
6048 		l3_val |= ATTR_SW_MANAGED;
6049 		l3_val &= ~ATTR_AF;
6050 	}
6051 
6052 	/* Sync icache before the mapping is stored to PTE */
6053 	if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
6054 	    m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
6055 		cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
6056 
6057 	pmap_store(l3, l3_val);
6058 	dsb(ishst);
6059 
6060 #if VM_NRESERVLEVEL > 0
6061 	/*
6062 	 * First, attempt L3C promotion, if the virtual and physical addresses
6063 	 * are aligned with each other and an underlying reservation has the
6064 	 * neighboring L3 pages allocated.  The first condition is simply an
6065 	 * optimization that recognizes some eventual promotion failures early
6066 	 * at a lower run-time cost.  Then, attempt L2 promotion, if both the
6067 	 * PTP and the reservation are fully populated.
6068 	 */
6069 	if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
6070 	    (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
6071 	    (m->flags & PG_FICTITIOUS) == 0 &&
6072 	    vm_reserv_is_populated(m, L3C_ENTRIES) &&
6073 	    pmap_promote_l3c(pmap, l3, va) &&
6074 	    (mpte == NULL || mpte->ref_count == NL3PG) &&
6075 	    vm_reserv_level_iffullpop(m) == 0) {
6076 		if (l2 == NULL)
6077 			l2 = pmap_pde(pmap, va, &lvl);
6078 
6079 		/*
6080 		 * If promotion succeeds, then the next call to this function
6081 		 * should not be given the unmapped PTP as a hint.
6082 		 */
6083 		if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
6084 			mpte = NULL;
6085 	}
6086 #endif
6087 
6088 	return (mpte);
6089 }
6090 
6091 /*
6092  * This code maps large physical mmap regions into the
6093  * processor address space.  Note that some shortcuts
6094  * are taken, but the code works.
6095  */
6096 void
pmap_object_init_pt(pmap_t pmap,vm_offset_t addr,vm_object_t object,vm_pindex_t pindex,vm_size_t size)6097 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6098     vm_pindex_t pindex, vm_size_t size)
6099 {
6100 
6101 	VM_OBJECT_ASSERT_WLOCKED(object);
6102 	KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6103 	    ("pmap_object_init_pt: non-device object"));
6104 }
6105 
6106 /*
6107  *	Clear the wired attribute from the mappings for the specified range of
6108  *	addresses in the given pmap.  Every valid mapping within that range
6109  *	must have the wired attribute set.  In contrast, invalid mappings
6110  *	cannot have the wired attribute set, so they are ignored.
6111  *
6112  *	The wired attribute of the page table entry is not a hardware feature,
6113  *	so there is no need to invalidate any TLB entries.
6114  */
6115 void
pmap_unwire(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)6116 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6117 {
6118 	vm_offset_t va_next;
6119 	pd_entry_t *l0, *l1, *l2;
6120 	pt_entry_t *l3;
6121 	bool partial_l3c;
6122 
6123 	PMAP_LOCK(pmap);
6124 	for (; sva < eva; sva = va_next) {
6125 		l0 = pmap_l0(pmap, sva);
6126 		if (pmap_load(l0) == 0) {
6127 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6128 			if (va_next < sva)
6129 				va_next = eva;
6130 			continue;
6131 		}
6132 
6133 		l1 = pmap_l0_to_l1(l0, sva);
6134 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6135 		if (va_next < sva)
6136 			va_next = eva;
6137 		if (pmap_load(l1) == 0)
6138 			continue;
6139 
6140 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6141 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6142 			KASSERT(va_next <= eva,
6143 			    ("partial update of non-transparent 1G page "
6144 			    "l1 %#lx sva %#lx eva %#lx va_next %#lx",
6145 			    pmap_load(l1), sva, eva, va_next));
6146 			MPASS(pmap != kernel_pmap);
6147 			MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
6148 			    ATTR_SW_WIRED)) == ATTR_SW_WIRED);
6149 			pmap_clear_bits(l1, ATTR_SW_WIRED);
6150 			pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
6151 			continue;
6152 		}
6153 
6154 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6155 		if (va_next < sva)
6156 			va_next = eva;
6157 
6158 		l2 = pmap_l1_to_l2(l1, sva);
6159 		if (pmap_load(l2) == 0)
6160 			continue;
6161 
6162 		if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
6163 			if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
6164 				panic("pmap_unwire: l2 %#jx is missing "
6165 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
6166 
6167 			/*
6168 			 * Are we unwiring the entire large page?  If not,
6169 			 * demote the mapping and fall through.
6170 			 */
6171 			if (sva + L2_SIZE == va_next && eva >= va_next) {
6172 				pmap_clear_bits(l2, ATTR_SW_WIRED);
6173 				pmap->pm_stats.wired_count -= L2_SIZE /
6174 				    PAGE_SIZE;
6175 				continue;
6176 			} else if (pmap_demote_l2(pmap, l2, sva) == NULL)
6177 				panic("pmap_unwire: demotion failed");
6178 		}
6179 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6180 		    ("pmap_unwire: Invalid l2 entry after demotion"));
6181 
6182 		if (va_next > eva)
6183 			va_next = eva;
6184 		for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
6185 		    sva != va_next; l3++, sva += L3_SIZE) {
6186 			if (pmap_load(l3) == 0)
6187 				continue;
6188 			if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
6189 				/*
6190 				 * Avoid demotion for whole-page unwiring.
6191 				 */
6192 				if ((sva & L3C_OFFSET) == 0) {
6193 					/*
6194 					 * Handle the possibility that
6195 					 * "va_next" is zero because of
6196 					 * address wraparound.
6197 					 */
6198 					partial_l3c = sva + L3C_OFFSET >
6199 					    va_next - 1;
6200 				}
6201 				if (partial_l3c)
6202 					(void)pmap_demote_l3c(pmap, l3, sva);
6203 			}
6204 			if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
6205 				panic("pmap_unwire: l3 %#jx is missing "
6206 				    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
6207 
6208 			/*
6209 			 * ATTR_SW_WIRED must be cleared atomically.  Although
6210 			 * the pmap lock synchronizes access to ATTR_SW_WIRED,
6211 			 * the System MMU may write to the entry concurrently.
6212 			 */
6213 			pmap_clear_bits(l3, ATTR_SW_WIRED);
6214 			pmap->pm_stats.wired_count--;
6215 		}
6216 	}
6217 	PMAP_UNLOCK(pmap);
6218 }
6219 
6220 /*
6221  * This function requires that the caller has already added one to ml3's
6222  * ref_count in anticipation of creating a 4KB page mapping.
6223  */
6224 static bool
pmap_copy_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va,pt_entry_t l3e,vm_page_t ml3,struct rwlock ** lockp)6225 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
6226     vm_page_t ml3, struct rwlock **lockp)
6227 {
6228 	pt_entry_t *tl3p;
6229 
6230 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6231 	KASSERT((va & L3C_OFFSET) == 0,
6232 	    ("pmap_copy_l3c: va is not aligned"));
6233 	KASSERT((l3e & ATTR_SW_MANAGED) != 0,
6234 	    ("pmap_copy_l3c: l3e is not managed"));
6235 
6236 	/*
6237 	 * Abort if a mapping already exists.
6238 	 */
6239 	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
6240 		if (pmap_load(tl3p) != 0) {
6241 			if (ml3 != NULL)
6242 				ml3->ref_count--;
6243 			return (false);
6244 		}
6245 
6246 	if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
6247 		if (ml3 != NULL)
6248 			pmap_abort_ptp(pmap, va, ml3);
6249 		return (false);
6250 	}
6251 	ml3->ref_count += L3C_ENTRIES - 1;
6252 
6253 	/*
6254 	 * Clear the wired and accessed bits.  However, leave the dirty bit
6255 	 * unchanged because read/write superpage mappings are required to be
6256 	 * dirty.
6257 	 */
6258 	l3e &= ~(ATTR_SW_WIRED | ATTR_AF);
6259 
6260 	for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6261 		pmap_store(tl3p, l3e);
6262 		l3e += L3_SIZE;
6263 	}
6264 	pmap_resident_count_inc(pmap, L3C_ENTRIES);
6265 	counter_u64_add(pmap_l3c_mappings, 1);
6266 	CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
6267 	    va, pmap);
6268 	return (true);
6269 }
6270 
6271 /*
6272  *	Copy the range specified by src_addr/len
6273  *	from the source map to the range dst_addr/len
6274  *	in the destination map.
6275  *
6276  *	This routine is only advisory and need not do anything.
6277  *
6278  *	Because the executable mappings created by this routine are copied,
6279  *	it should not have to flush the instruction cache.
6280  */
6281 void
pmap_copy(pmap_t dst_pmap,pmap_t src_pmap,vm_offset_t dst_addr,vm_size_t len,vm_offset_t src_addr)6282 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6283     vm_offset_t src_addr)
6284 {
6285 	struct rwlock *lock;
6286 	pd_entry_t *l0, *l1, *l2, srcptepaddr;
6287 	pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
6288 	vm_offset_t addr, end_addr, va_next;
6289 	vm_page_t dst_m, dstmpte, srcmpte;
6290 
6291 	PMAP_ASSERT_STAGE1(dst_pmap);
6292 	PMAP_ASSERT_STAGE1(src_pmap);
6293 
6294 	if (dst_addr != src_addr)
6295 		return;
6296 	end_addr = src_addr + len;
6297 	lock = NULL;
6298 	if (dst_pmap < src_pmap) {
6299 		PMAP_LOCK(dst_pmap);
6300 		PMAP_LOCK(src_pmap);
6301 	} else {
6302 		PMAP_LOCK(src_pmap);
6303 		PMAP_LOCK(dst_pmap);
6304 	}
6305 	for (addr = src_addr; addr < end_addr; addr = va_next) {
6306 		l0 = pmap_l0(src_pmap, addr);
6307 		if (pmap_load(l0) == 0) {
6308 			va_next = (addr + L0_SIZE) & ~L0_OFFSET;
6309 			if (va_next < addr)
6310 				va_next = end_addr;
6311 			continue;
6312 		}
6313 
6314 		va_next = (addr + L1_SIZE) & ~L1_OFFSET;
6315 		if (va_next < addr)
6316 			va_next = end_addr;
6317 		l1 = pmap_l0_to_l1(l0, addr);
6318 		if (pmap_load(l1) == 0)
6319 			continue;
6320 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6321 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6322 			KASSERT(va_next <= end_addr,
6323 			    ("partial update of non-transparent 1G page "
6324 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6325 			    pmap_load(l1), addr, end_addr, va_next));
6326 			srcptepaddr = pmap_load(l1);
6327 			l1 = pmap_l1(dst_pmap, addr);
6328 			if (l1 == NULL) {
6329 				if (_pmap_alloc_l3(dst_pmap,
6330 				    pmap_l0_pindex(addr), NULL) == NULL)
6331 					break;
6332 				l1 = pmap_l1(dst_pmap, addr);
6333 			} else {
6334 				l0 = pmap_l0(dst_pmap, addr);
6335 				dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
6336 				dst_m->ref_count++;
6337 			}
6338 			KASSERT(pmap_load(l1) == 0,
6339 			    ("1G mapping present in dst pmap "
6340 			    "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6341 			    pmap_load(l1), addr, end_addr, va_next));
6342 			pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
6343 			pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
6344 			continue;
6345 		}
6346 
6347 		va_next = (addr + L2_SIZE) & ~L2_OFFSET;
6348 		if (va_next < addr)
6349 			va_next = end_addr;
6350 		l2 = pmap_l1_to_l2(l1, addr);
6351 		srcptepaddr = pmap_load(l2);
6352 		if (srcptepaddr == 0)
6353 			continue;
6354 		if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
6355 			/*
6356 			 * We can only virtual copy whole superpages.
6357 			 */
6358 			if ((addr & L2_OFFSET) != 0 ||
6359 			    addr + L2_SIZE > end_addr)
6360 				continue;
6361 			l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
6362 			if (l2 == NULL)
6363 				break;
6364 			if (pmap_load(l2) == 0 &&
6365 			    ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
6366 			    pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
6367 			    PMAP_ENTER_NORECLAIM, &lock))) {
6368 				/*
6369 				 * We leave the dirty bit unchanged because
6370 				 * managed read/write superpage mappings are
6371 				 * required to be dirty.  However, managed
6372 				 * superpage mappings are not required to
6373 				 * have their accessed bit set, so we clear
6374 				 * it because we don't know if this mapping
6375 				 * will be used.
6376 				 */
6377 				srcptepaddr &= ~ATTR_SW_WIRED;
6378 				if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
6379 					srcptepaddr &= ~ATTR_AF;
6380 				pmap_store(l2, srcptepaddr);
6381 				pmap_resident_count_inc(dst_pmap, L2_SIZE /
6382 				    PAGE_SIZE);
6383 				atomic_add_long(&pmap_l2_mappings, 1);
6384 			} else
6385 				pmap_abort_ptp(dst_pmap, addr, dst_m);
6386 			continue;
6387 		}
6388 		KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
6389 		    ("pmap_copy: invalid L2 entry"));
6390 		srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
6391 		KASSERT(srcmpte->ref_count > 0,
6392 		    ("pmap_copy: source page table page is unused"));
6393 		if (va_next > end_addr)
6394 			va_next = end_addr;
6395 		src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
6396 		src_pte = &src_pte[pmap_l3_index(addr)];
6397 		dstmpte = NULL;
6398 		for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6399 			ptetemp = pmap_load(src_pte);
6400 
6401 			/*
6402 			 * We only virtual copy managed pages.
6403 			 */
6404 			if ((ptetemp & ATTR_SW_MANAGED) == 0)
6405 				continue;
6406 
6407 			if (dstmpte != NULL) {
6408 				KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
6409 				    ("dstmpte pindex/addr mismatch"));
6410 				dstmpte->ref_count++;
6411 			} else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
6412 			    NULL)) == NULL)
6413 				goto out;
6414 			dst_pte = (pt_entry_t *)
6415 			    PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
6416 			dst_pte = &dst_pte[pmap_l3_index(addr)];
6417 			if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
6418 			    L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
6419 			    va_next - 1) {
6420 				if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
6421 				    ptetemp, dstmpte, &lock))
6422 					goto out;
6423 				addr += L3C_SIZE - PAGE_SIZE;
6424 				src_pte += L3C_ENTRIES - 1;
6425 			} else if (pmap_load(dst_pte) == 0 &&
6426 			    pmap_try_insert_pv_entry(dst_pmap, addr,
6427 			    PTE_TO_VM_PAGE(ptetemp), &lock)) {
6428 				/*
6429 				 * Clear the wired, contiguous, modified, and
6430 				 * accessed bits from the destination PTE.
6431 				 * The contiguous bit is cleared because we
6432 				 * are not copying the entire L3C superpage.
6433 				 */
6434 				mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
6435 				    ATTR_AF;
6436 				nbits = 0;
6437 				if ((ptetemp & ATTR_SW_DBM) != 0)
6438 					nbits |= ATTR_S1_AP_RW_BIT;
6439 				pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
6440 				pmap_resident_count_inc(dst_pmap, 1);
6441 			} else {
6442 				pmap_abort_ptp(dst_pmap, addr, dstmpte);
6443 				goto out;
6444 			}
6445 			/* Have we copied all of the valid mappings? */
6446 			if (dstmpte->ref_count >= srcmpte->ref_count)
6447 				break;
6448 		}
6449 	}
6450 out:
6451 	/*
6452 	 * XXX This barrier may not be needed because the destination pmap is
6453 	 * not active.
6454 	 */
6455 	dsb(ishst);
6456 
6457 	if (lock != NULL)
6458 		rw_wunlock(lock);
6459 	PMAP_UNLOCK(src_pmap);
6460 	PMAP_UNLOCK(dst_pmap);
6461 }
6462 
6463 int
pmap_vmspace_copy(pmap_t dst_pmap,pmap_t src_pmap)6464 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6465 {
6466 	int error;
6467 
6468 	if (dst_pmap->pm_stage != src_pmap->pm_stage)
6469 		return (EINVAL);
6470 
6471 	if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6472 		return (0);
6473 
6474 	for (;;) {
6475 		if (dst_pmap < src_pmap) {
6476 			PMAP_LOCK(dst_pmap);
6477 			PMAP_LOCK(src_pmap);
6478 		} else {
6479 			PMAP_LOCK(src_pmap);
6480 			PMAP_LOCK(dst_pmap);
6481 		}
6482 		error = pmap_bti_copy(dst_pmap, src_pmap);
6483 		/* Clean up partial copy on failure due to no memory. */
6484 		if (error == ENOMEM)
6485 			pmap_bti_deassign_all(dst_pmap);
6486 		PMAP_UNLOCK(src_pmap);
6487 		PMAP_UNLOCK(dst_pmap);
6488 		if (error != ENOMEM)
6489 			break;
6490 		vm_wait(NULL);
6491 	}
6492 	return (error);
6493 }
6494 
6495 /*
6496  *	pmap_zero_page zeros the specified hardware page by mapping
6497  *	the page into KVM and using bzero to clear its contents.
6498  */
6499 void
pmap_zero_page(vm_page_t m)6500 pmap_zero_page(vm_page_t m)
6501 {
6502 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6503 
6504 	pagezero((void *)va);
6505 }
6506 
6507 /*
6508  *	pmap_zero_page_area zeros the specified hardware page by mapping
6509  *	the page into KVM and using bzero to clear its contents.
6510  *
6511  *	off and size may not cover an area beyond a single hardware page.
6512  */
6513 void
pmap_zero_page_area(vm_page_t m,int off,int size)6514 pmap_zero_page_area(vm_page_t m, int off, int size)
6515 {
6516 	vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6517 
6518 	if (off == 0 && size == PAGE_SIZE)
6519 		pagezero((void *)va);
6520 	else
6521 		bzero((char *)va + off, size);
6522 }
6523 
6524 /*
6525  *	pmap_copy_page copies the specified (machine independent)
6526  *	page by mapping the page into virtual memory and using
6527  *	bcopy to copy the page, one machine dependent page at a
6528  *	time.
6529  */
6530 void
pmap_copy_page(vm_page_t msrc,vm_page_t mdst)6531 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6532 {
6533 	vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6534 	vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6535 
6536 	pagecopy((void *)src, (void *)dst);
6537 }
6538 
6539 int unmapped_buf_allowed = 1;
6540 
6541 void
pmap_copy_pages(vm_page_t ma[],vm_offset_t a_offset,vm_page_t mb[],vm_offset_t b_offset,int xfersize)6542 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6543     vm_offset_t b_offset, int xfersize)
6544 {
6545 	void *a_cp, *b_cp;
6546 	vm_page_t m_a, m_b;
6547 	vm_paddr_t p_a, p_b;
6548 	vm_offset_t a_pg_offset, b_pg_offset;
6549 	int cnt;
6550 
6551 	while (xfersize > 0) {
6552 		a_pg_offset = a_offset & PAGE_MASK;
6553 		m_a = ma[a_offset >> PAGE_SHIFT];
6554 		p_a = m_a->phys_addr;
6555 		b_pg_offset = b_offset & PAGE_MASK;
6556 		m_b = mb[b_offset >> PAGE_SHIFT];
6557 		p_b = m_b->phys_addr;
6558 		cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6559 		cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6560 		if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6561 			panic("!DMAP a %lx", p_a);
6562 		} else {
6563 			a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6564 		}
6565 		if (__predict_false(!PHYS_IN_DMAP(p_b))) {
6566 			panic("!DMAP b %lx", p_b);
6567 		} else {
6568 			b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
6569 		}
6570 		bcopy(a_cp, b_cp, cnt);
6571 		a_offset += cnt;
6572 		b_offset += cnt;
6573 		xfersize -= cnt;
6574 	}
6575 }
6576 
6577 vm_offset_t
pmap_quick_enter_page(vm_page_t m)6578 pmap_quick_enter_page(vm_page_t m)
6579 {
6580 
6581 	return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
6582 }
6583 
6584 void
pmap_quick_remove_page(vm_offset_t addr)6585 pmap_quick_remove_page(vm_offset_t addr)
6586 {
6587 }
6588 
6589 /*
6590  * Returns true if the pmap's pv is one of the first
6591  * 16 pvs linked to from this page.  This count may
6592  * be changed upwards or downwards in the future; it
6593  * is only necessary that true be returned for a small
6594  * subset of pmaps for proper page aging.
6595  */
6596 bool
pmap_page_exists_quick(pmap_t pmap,vm_page_t m)6597 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6598 {
6599 	struct md_page *pvh;
6600 	struct rwlock *lock;
6601 	pv_entry_t pv;
6602 	int loops = 0;
6603 	bool rv;
6604 
6605 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6606 	    ("pmap_page_exists_quick: page %p is not managed", m));
6607 	rv = false;
6608 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6609 	rw_rlock(lock);
6610 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6611 		if (PV_PMAP(pv) == pmap) {
6612 			rv = true;
6613 			break;
6614 		}
6615 		loops++;
6616 		if (loops >= 16)
6617 			break;
6618 	}
6619 	if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6620 		pvh = page_to_pvh(m);
6621 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6622 			if (PV_PMAP(pv) == pmap) {
6623 				rv = true;
6624 				break;
6625 			}
6626 			loops++;
6627 			if (loops >= 16)
6628 				break;
6629 		}
6630 	}
6631 	rw_runlock(lock);
6632 	return (rv);
6633 }
6634 
6635 /*
6636  *	pmap_page_wired_mappings:
6637  *
6638  *	Return the number of managed mappings to the given physical page
6639  *	that are wired.
6640  */
6641 int
pmap_page_wired_mappings(vm_page_t m)6642 pmap_page_wired_mappings(vm_page_t m)
6643 {
6644 	struct rwlock *lock;
6645 	struct md_page *pvh;
6646 	pmap_t pmap;
6647 	pt_entry_t *pte;
6648 	pv_entry_t pv;
6649 	int count, md_gen, pvh_gen;
6650 
6651 	if ((m->oflags & VPO_UNMANAGED) != 0)
6652 		return (0);
6653 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6654 	rw_rlock(lock);
6655 restart:
6656 	count = 0;
6657 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6658 		pmap = PV_PMAP(pv);
6659 		if (!PMAP_TRYLOCK(pmap)) {
6660 			md_gen = m->md.pv_gen;
6661 			rw_runlock(lock);
6662 			PMAP_LOCK(pmap);
6663 			rw_rlock(lock);
6664 			if (md_gen != m->md.pv_gen) {
6665 				PMAP_UNLOCK(pmap);
6666 				goto restart;
6667 			}
6668 		}
6669 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6670 		if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6671 			count++;
6672 		PMAP_UNLOCK(pmap);
6673 	}
6674 	if ((m->flags & PG_FICTITIOUS) == 0) {
6675 		pvh = page_to_pvh(m);
6676 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6677 			pmap = PV_PMAP(pv);
6678 			if (!PMAP_TRYLOCK(pmap)) {
6679 				md_gen = m->md.pv_gen;
6680 				pvh_gen = pvh->pv_gen;
6681 				rw_runlock(lock);
6682 				PMAP_LOCK(pmap);
6683 				rw_rlock(lock);
6684 				if (md_gen != m->md.pv_gen ||
6685 				    pvh_gen != pvh->pv_gen) {
6686 					PMAP_UNLOCK(pmap);
6687 					goto restart;
6688 				}
6689 			}
6690 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6691 			if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6692 				count++;
6693 			PMAP_UNLOCK(pmap);
6694 		}
6695 	}
6696 	rw_runlock(lock);
6697 	return (count);
6698 }
6699 
6700 /*
6701  * Returns true if the given page is mapped individually or as part of
6702  * a 2mpage.  Otherwise, returns false.
6703  */
6704 bool
pmap_page_is_mapped(vm_page_t m)6705 pmap_page_is_mapped(vm_page_t m)
6706 {
6707 	struct rwlock *lock;
6708 	bool rv;
6709 
6710 	if ((m->oflags & VPO_UNMANAGED) != 0)
6711 		return (false);
6712 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6713 	rw_rlock(lock);
6714 	rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6715 	    ((m->flags & PG_FICTITIOUS) == 0 &&
6716 	    !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
6717 	rw_runlock(lock);
6718 	return (rv);
6719 }
6720 
6721 /*
6722  * Destroy all managed, non-wired mappings in the given user-space
6723  * pmap.  This pmap cannot be active on any processor besides the
6724  * caller.
6725  *
6726  * This function cannot be applied to the kernel pmap.  Moreover, it
6727  * is not intended for general use.  It is only to be used during
6728  * process termination.  Consequently, it can be implemented in ways
6729  * that make it faster than pmap_remove().  First, it can more quickly
6730  * destroy mappings by iterating over the pmap's collection of PV
6731  * entries, rather than searching the page table.  Second, it doesn't
6732  * have to test and clear the page table entries atomically, because
6733  * no processor is currently accessing the user address space.  In
6734  * particular, a page table entry's dirty bit won't change state once
6735  * this function starts.
6736  */
6737 void
pmap_remove_pages(pmap_t pmap)6738 pmap_remove_pages(pmap_t pmap)
6739 {
6740 	pd_entry_t *pde;
6741 	pt_entry_t *pte, tpte;
6742 	struct spglist free;
6743 	struct pv_chunklist free_chunks[PMAP_MEMDOM];
6744 	vm_page_t m, ml3, mt;
6745 	pv_entry_t pv;
6746 	struct md_page *pvh;
6747 	struct pv_chunk *pc, *npc;
6748 	struct rwlock *lock;
6749 	int64_t bit;
6750 	uint64_t inuse, bitmask;
6751 	int allfree, field, i, idx, lvl;
6752 	int freed __pvused;
6753 	vm_paddr_t pa;
6754 
6755 	lock = NULL;
6756 
6757 	for (i = 0; i < PMAP_MEMDOM; i++)
6758 		TAILQ_INIT(&free_chunks[i]);
6759 	SLIST_INIT(&free);
6760 	PMAP_LOCK(pmap);
6761 	TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
6762 		allfree = 1;
6763 		freed = 0;
6764 		for (field = 0; field < _NPCM; field++) {
6765 			inuse = ~pc->pc_map[field] & pc_freemask[field];
6766 			while (inuse != 0) {
6767 				bit = ffsl(inuse) - 1;
6768 				bitmask = 1UL << bit;
6769 				idx = field * 64 + bit;
6770 				pv = &pc->pc_pventry[idx];
6771 				inuse &= ~bitmask;
6772 
6773 				pde = pmap_pde(pmap, pv->pv_va, &lvl);
6774 				KASSERT(pde != NULL,
6775 				    ("Attempting to remove an unmapped page"));
6776 
6777 				switch(lvl) {
6778 				case 1:
6779 					pte = pmap_l1_to_l2(pde, pv->pv_va);
6780 					tpte = pmap_load(pte);
6781 					KASSERT((tpte & ATTR_DESCR_MASK) ==
6782 					    L2_BLOCK,
6783 					    ("Attempting to remove an invalid "
6784 					    "block: %lx", tpte));
6785 					break;
6786 				case 2:
6787 					pte = pmap_l2_to_l3(pde, pv->pv_va);
6788 					tpte = pmap_load(pte);
6789 					KASSERT((tpte & ATTR_DESCR_MASK) ==
6790 					    L3_PAGE,
6791 					    ("Attempting to remove an invalid "
6792 					     "page: %lx", tpte));
6793 					break;
6794 				default:
6795 					panic(
6796 					    "Invalid page directory level: %d",
6797 					    lvl);
6798 				}
6799 
6800 				/*
6801 				 * We cannot remove wired mappings at this time.
6802 				 *
6803 				 * For L3C superpages, all of the constituent PTEs
6804 				 * should have the wired bit set, so we don't
6805 				 * check for ATTR_CONTIGUOUS here.
6806 				 */
6807 				if (tpte & ATTR_SW_WIRED) {
6808 					allfree = 0;
6809 					continue;
6810 				}
6811 
6812 				/* Mark free */
6813 				pc->pc_map[field] |= bitmask;
6814 
6815 				/*
6816 				 * Because this pmap is not active on other
6817 				 * processors, the dirty bit cannot have
6818 				 * changed state since we last loaded pte.
6819 				 */
6820 				pmap_clear(pte);
6821 
6822 				pa = PTE_TO_PHYS(tpte);
6823 
6824 				m = PHYS_TO_VM_PAGE(pa);
6825 				KASSERT(m->phys_addr == pa,
6826 				    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
6827 				    m, (uintmax_t)m->phys_addr,
6828 				    (uintmax_t)tpte));
6829 
6830 				KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
6831 				    m < &vm_page_array[vm_page_array_size],
6832 				    ("pmap_remove_pages: bad pte %#jx",
6833 				    (uintmax_t)tpte));
6834 
6835 				/*
6836 				 * Update the vm_page_t clean/reference bits.
6837 				 *
6838 				 * We don't check for ATTR_CONTIGUOUS here
6839 				 * because writeable L3C superpages are expected
6840 				 * to be dirty, i.e., every constituent PTE
6841 				 * should be dirty.
6842 				 */
6843 				if (pmap_pte_dirty(pmap, tpte)) {
6844 					switch (lvl) {
6845 					case 1:
6846 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6847 							vm_page_dirty(mt);
6848 						break;
6849 					case 2:
6850 						vm_page_dirty(m);
6851 						break;
6852 					}
6853 				}
6854 
6855 				CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
6856 
6857 				switch (lvl) {
6858 				case 1:
6859 					pmap_resident_count_dec(pmap,
6860 					    L2_SIZE / PAGE_SIZE);
6861 					pvh = page_to_pvh(m);
6862 					TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
6863 					pvh->pv_gen++;
6864 					if (TAILQ_EMPTY(&pvh->pv_list)) {
6865 						for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6866 							if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
6867 							    TAILQ_EMPTY(&mt->md.pv_list))
6868 								vm_page_aflag_clear(mt, PGA_WRITEABLE);
6869 					}
6870 					ml3 = pmap_remove_pt_page(pmap,
6871 					    pv->pv_va);
6872 					if (ml3 != NULL) {
6873 						KASSERT(vm_page_any_valid(ml3),
6874 						    ("pmap_remove_pages: l3 page not promoted"));
6875 						pmap_resident_count_dec(pmap,1);
6876 						KASSERT(ml3->ref_count == NL3PG,
6877 						    ("pmap_remove_pages: l3 page ref count error"));
6878 						ml3->ref_count = 0;
6879 						pmap_add_delayed_free_list(ml3,
6880 						    &free, false);
6881 					}
6882 					break;
6883 				case 2:
6884 					pmap_resident_count_dec(pmap, 1);
6885 					TAILQ_REMOVE(&m->md.pv_list, pv,
6886 					    pv_next);
6887 					m->md.pv_gen++;
6888 					if ((m->a.flags & PGA_WRITEABLE) != 0 &&
6889 					    TAILQ_EMPTY(&m->md.pv_list) &&
6890 					    (m->flags & PG_FICTITIOUS) == 0) {
6891 						pvh = page_to_pvh(m);
6892 						if (TAILQ_EMPTY(&pvh->pv_list))
6893 							vm_page_aflag_clear(m,
6894 							    PGA_WRITEABLE);
6895 					}
6896 					break;
6897 				}
6898 				pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
6899 				    &free);
6900 				freed++;
6901 			}
6902 		}
6903 		PV_STAT(atomic_add_long(&pv_entry_frees, freed));
6904 		PV_STAT(atomic_add_int(&pv_entry_spare, freed));
6905 		PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
6906 		if (allfree) {
6907 			TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
6908 			TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
6909 			    pc_list);
6910 		}
6911 	}
6912 	if (lock != NULL)
6913 		rw_wunlock(lock);
6914 	pmap_invalidate_all(pmap);
6915 	pmap_bti_deassign_all(pmap);
6916 	free_pv_chunk_batch(free_chunks);
6917 	PMAP_UNLOCK(pmap);
6918 	vm_page_free_pages_toq(&free, true);
6919 }
6920 
6921 /*
6922  * This is used to check if a page has been accessed or modified.
6923  */
6924 static bool
pmap_page_test_mappings(vm_page_t m,bool accessed,bool modified)6925 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
6926 {
6927 	struct rwlock *lock;
6928 	pv_entry_t pv;
6929 	struct md_page *pvh;
6930 	pt_entry_t l3e, mask, *pte, value;
6931 	pmap_t pmap;
6932 	int md_gen, pvh_gen;
6933 	bool rv;
6934 
6935 	rv = false;
6936 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6937 	rw_rlock(lock);
6938 restart:
6939 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6940 		pmap = PV_PMAP(pv);
6941 		PMAP_ASSERT_STAGE1(pmap);
6942 		if (!PMAP_TRYLOCK(pmap)) {
6943 			md_gen = m->md.pv_gen;
6944 			rw_runlock(lock);
6945 			PMAP_LOCK(pmap);
6946 			rw_rlock(lock);
6947 			if (md_gen != m->md.pv_gen) {
6948 				PMAP_UNLOCK(pmap);
6949 				goto restart;
6950 			}
6951 		}
6952 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6953 		mask = 0;
6954 		value = 0;
6955 		if (modified) {
6956 			mask |= ATTR_S1_AP_RW_BIT;
6957 			value |= ATTR_S1_AP(ATTR_S1_AP_RW);
6958 		}
6959 		if (accessed) {
6960 			mask |= ATTR_AF | ATTR_DESCR_MASK;
6961 			value |= ATTR_AF | L3_PAGE;
6962 		}
6963 		l3e = pmap_load(pte);
6964 		if ((l3e & ATTR_CONTIGUOUS) != 0)
6965 			l3e = pmap_load_l3c(pte);
6966 		PMAP_UNLOCK(pmap);
6967 		rv = (l3e & mask) == value;
6968 		if (rv)
6969 			goto out;
6970 	}
6971 	if ((m->flags & PG_FICTITIOUS) == 0) {
6972 		pvh = page_to_pvh(m);
6973 		TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6974 			pmap = PV_PMAP(pv);
6975 			PMAP_ASSERT_STAGE1(pmap);
6976 			if (!PMAP_TRYLOCK(pmap)) {
6977 				md_gen = m->md.pv_gen;
6978 				pvh_gen = pvh->pv_gen;
6979 				rw_runlock(lock);
6980 				PMAP_LOCK(pmap);
6981 				rw_rlock(lock);
6982 				if (md_gen != m->md.pv_gen ||
6983 				    pvh_gen != pvh->pv_gen) {
6984 					PMAP_UNLOCK(pmap);
6985 					goto restart;
6986 				}
6987 			}
6988 			pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6989 			mask = 0;
6990 			value = 0;
6991 			if (modified) {
6992 				mask |= ATTR_S1_AP_RW_BIT;
6993 				value |= ATTR_S1_AP(ATTR_S1_AP_RW);
6994 			}
6995 			if (accessed) {
6996 				mask |= ATTR_AF | ATTR_DESCR_MASK;
6997 				value |= ATTR_AF | L2_BLOCK;
6998 			}
6999 			rv = (pmap_load(pte) & mask) == value;
7000 			PMAP_UNLOCK(pmap);
7001 			if (rv)
7002 				goto out;
7003 		}
7004 	}
7005 out:
7006 	rw_runlock(lock);
7007 	return (rv);
7008 }
7009 
7010 /*
7011  *	pmap_is_modified:
7012  *
7013  *	Return whether or not the specified physical page was modified
7014  *	in any physical maps.
7015  */
7016 bool
pmap_is_modified(vm_page_t m)7017 pmap_is_modified(vm_page_t m)
7018 {
7019 
7020 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7021 	    ("pmap_is_modified: page %p is not managed", m));
7022 
7023 	/*
7024 	 * If the page is not busied then this check is racy.
7025 	 */
7026 	if (!pmap_page_is_write_mapped(m))
7027 		return (false);
7028 	return (pmap_page_test_mappings(m, false, true));
7029 }
7030 
7031 /*
7032  *	pmap_is_prefaultable:
7033  *
7034  *	Return whether or not the specified virtual address is eligible
7035  *	for prefault.
7036  */
7037 bool
pmap_is_prefaultable(pmap_t pmap,vm_offset_t addr)7038 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
7039 {
7040 	pd_entry_t *pde;
7041 	pt_entry_t *pte;
7042 	bool rv;
7043 	int lvl;
7044 
7045 	/*
7046 	 * Return true if and only if the L3 entry for the specified virtual
7047 	 * address is allocated but invalid.
7048 	 */
7049 	rv = false;
7050 	PMAP_LOCK(pmap);
7051 	pde = pmap_pde(pmap, addr, &lvl);
7052 	if (pde != NULL && lvl == 2) {
7053 		pte = pmap_l2_to_l3(pde, addr);
7054 		rv = pmap_load(pte) == 0;
7055 	}
7056 	PMAP_UNLOCK(pmap);
7057 	return (rv);
7058 }
7059 
7060 /*
7061  *	pmap_is_referenced:
7062  *
7063  *	Return whether or not the specified physical page was referenced
7064  *	in any physical maps.
7065  */
7066 bool
pmap_is_referenced(vm_page_t m)7067 pmap_is_referenced(vm_page_t m)
7068 {
7069 
7070 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7071 	    ("pmap_is_referenced: page %p is not managed", m));
7072 	return (pmap_page_test_mappings(m, true, false));
7073 }
7074 
7075 /*
7076  * Clear the write and modified bits in each of the given page's mappings.
7077  */
7078 void
pmap_remove_write(vm_page_t m)7079 pmap_remove_write(vm_page_t m)
7080 {
7081 	struct md_page *pvh;
7082 	pmap_t pmap;
7083 	struct rwlock *lock;
7084 	pv_entry_t next_pv, pv;
7085 	pt_entry_t oldpte, *pte, set, clear, mask, val;
7086 	vm_offset_t va;
7087 	int md_gen, pvh_gen;
7088 
7089 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7090 	    ("pmap_remove_write: page %p is not managed", m));
7091 	vm_page_assert_busied(m);
7092 
7093 	if (!pmap_page_is_write_mapped(m))
7094 		return;
7095 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7096 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7097 	rw_wlock(lock);
7098 retry:
7099 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7100 		pmap = PV_PMAP(pv);
7101 		PMAP_ASSERT_STAGE1(pmap);
7102 		if (!PMAP_TRYLOCK(pmap)) {
7103 			pvh_gen = pvh->pv_gen;
7104 			rw_wunlock(lock);
7105 			PMAP_LOCK(pmap);
7106 			rw_wlock(lock);
7107 			if (pvh_gen != pvh->pv_gen) {
7108 				PMAP_UNLOCK(pmap);
7109 				goto retry;
7110 			}
7111 		}
7112 		va = pv->pv_va;
7113 		pte = pmap_pte_exists(pmap, va, 2, __func__);
7114 		if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
7115 			(void)pmap_demote_l2_locked(pmap, pte, va, &lock);
7116 		KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7117 		    ("inconsistent pv lock %p %p for page %p",
7118 		    lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7119 		PMAP_UNLOCK(pmap);
7120 	}
7121 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7122 		pmap = PV_PMAP(pv);
7123 		if (!PMAP_TRYLOCK(pmap)) {
7124 			pvh_gen = pvh->pv_gen;
7125 			md_gen = m->md.pv_gen;
7126 			rw_wunlock(lock);
7127 			PMAP_LOCK(pmap);
7128 			rw_wlock(lock);
7129 			if (pvh_gen != pvh->pv_gen ||
7130 			    md_gen != m->md.pv_gen) {
7131 				PMAP_UNLOCK(pmap);
7132 				goto retry;
7133 			}
7134 		}
7135 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7136 		oldpte = pmap_load(pte);
7137 		if ((oldpte & ATTR_SW_DBM) != 0) {
7138 			if ((oldpte & ATTR_CONTIGUOUS) != 0) {
7139 				(void)pmap_demote_l3c(pmap, pte, pv->pv_va);
7140 
7141 				/*
7142 				 * The L3 entry's accessed bit may have
7143 				 * changed.
7144 				 */
7145 				oldpte = pmap_load(pte);
7146 			}
7147 			if (pmap->pm_stage == PM_STAGE1) {
7148 				set = ATTR_S1_AP_RW_BIT;
7149 				clear = 0;
7150 				mask = ATTR_S1_AP_RW_BIT;
7151 				val = ATTR_S1_AP(ATTR_S1_AP_RW);
7152 			} else {
7153 				set = 0;
7154 				clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7155 				mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7156 				val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7157 			}
7158 			clear |= ATTR_SW_DBM;
7159 			while (!atomic_fcmpset_64(pte, &oldpte,
7160 			    (oldpte | set) & ~clear))
7161 				cpu_spinwait();
7162 
7163 			if ((oldpte & mask) == val)
7164 				vm_page_dirty(m);
7165 			pmap_invalidate_page(pmap, pv->pv_va, true);
7166 		}
7167 		PMAP_UNLOCK(pmap);
7168 	}
7169 	rw_wunlock(lock);
7170 	vm_page_aflag_clear(m, PGA_WRITEABLE);
7171 }
7172 
7173 /*
7174  *	pmap_ts_referenced:
7175  *
7176  *	Return a count of reference bits for a page, clearing those bits.
7177  *	It is not necessary for every reference bit to be cleared, but it
7178  *	is necessary that 0 only be returned when there are truly no
7179  *	reference bits set.
7180  *
7181  *	As an optimization, update the page's dirty field if a modified bit is
7182  *	found while counting reference bits.  This opportunistic update can be
7183  *	performed at low cost and can eliminate the need for some future calls
7184  *	to pmap_is_modified().  However, since this function stops after
7185  *	finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7186  *	dirty pages.  Those dirty pages will only be detected by a future call
7187  *	to pmap_is_modified().
7188  */
7189 int
pmap_ts_referenced(vm_page_t m)7190 pmap_ts_referenced(vm_page_t m)
7191 {
7192 	struct md_page *pvh;
7193 	pv_entry_t pv, pvf;
7194 	pmap_t pmap;
7195 	struct rwlock *lock;
7196 	pt_entry_t *pte, tpte;
7197 	vm_offset_t va;
7198 	vm_paddr_t pa;
7199 	int cleared, md_gen, not_cleared, pvh_gen;
7200 	struct spglist free;
7201 
7202 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7203 	    ("pmap_ts_referenced: page %p is not managed", m));
7204 	SLIST_INIT(&free);
7205 	cleared = 0;
7206 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7207 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7208 	rw_wlock(lock);
7209 retry:
7210 	not_cleared = 0;
7211 	if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
7212 		goto small_mappings;
7213 	pv = pvf;
7214 	do {
7215 		if (pvf == NULL)
7216 			pvf = pv;
7217 		pmap = PV_PMAP(pv);
7218 		if (!PMAP_TRYLOCK(pmap)) {
7219 			pvh_gen = pvh->pv_gen;
7220 			rw_wunlock(lock);
7221 			PMAP_LOCK(pmap);
7222 			rw_wlock(lock);
7223 			if (pvh_gen != pvh->pv_gen) {
7224 				PMAP_UNLOCK(pmap);
7225 				goto retry;
7226 			}
7227 		}
7228 		va = pv->pv_va;
7229 		pte = pmap_pte_exists(pmap, va, 2, __func__);
7230 		tpte = pmap_load(pte);
7231 		if (pmap_pte_dirty(pmap, tpte)) {
7232 			/*
7233 			 * Although "tpte" is mapping a 2MB page, because
7234 			 * this function is called at a 4KB page granularity,
7235 			 * we only update the 4KB page under test.
7236 			 */
7237 			vm_page_dirty(m);
7238 		}
7239 		if ((tpte & ATTR_AF) != 0) {
7240 			pa = VM_PAGE_TO_PHYS(m);
7241 
7242 			/*
7243 			 * Since this reference bit is shared by 512 4KB pages,
7244 			 * it should not be cleared every time it is tested.
7245 			 * Apply a simple "hash" function on the physical page
7246 			 * number, the virtual superpage number, and the pmap
7247 			 * address to select one 4KB page out of the 512 on
7248 			 * which testing the reference bit will result in
7249 			 * clearing that reference bit.  This function is
7250 			 * designed to avoid the selection of the same 4KB page
7251 			 * for every 2MB page mapping.
7252 			 *
7253 			 * On demotion, a mapping that hasn't been referenced
7254 			 * is simply destroyed.  To avoid the possibility of a
7255 			 * subsequent page fault on a demoted wired mapping,
7256 			 * always leave its reference bit set.  Moreover,
7257 			 * since the superpage is wired, the current state of
7258 			 * its reference bit won't affect page replacement.
7259 			 */
7260 			if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
7261 			    (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
7262 			    (tpte & ATTR_SW_WIRED) == 0) {
7263 				pmap_clear_bits(pte, ATTR_AF);
7264 				pmap_invalidate_page(pmap, va, true);
7265 				cleared++;
7266 			} else
7267 				not_cleared++;
7268 		}
7269 		PMAP_UNLOCK(pmap);
7270 		/* Rotate the PV list if it has more than one entry. */
7271 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
7272 			TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7273 			TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
7274 			pvh->pv_gen++;
7275 		}
7276 		if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
7277 			goto out;
7278 	} while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
7279 small_mappings:
7280 	if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
7281 		goto out;
7282 	pv = pvf;
7283 	do {
7284 		if (pvf == NULL)
7285 			pvf = pv;
7286 		pmap = PV_PMAP(pv);
7287 		if (!PMAP_TRYLOCK(pmap)) {
7288 			pvh_gen = pvh->pv_gen;
7289 			md_gen = m->md.pv_gen;
7290 			rw_wunlock(lock);
7291 			PMAP_LOCK(pmap);
7292 			rw_wlock(lock);
7293 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7294 				PMAP_UNLOCK(pmap);
7295 				goto retry;
7296 			}
7297 		}
7298 		pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7299 		tpte = pmap_load(pte);
7300 		if (pmap_pte_dirty(pmap, tpte))
7301 			vm_page_dirty(m);
7302 		if ((tpte & ATTR_AF) != 0) {
7303 			if ((tpte & ATTR_SW_WIRED) == 0) {
7304 				/*
7305 				 * Clear the accessed bit in this L3 entry
7306 				 * regardless of the contiguous bit.
7307 				 */
7308 				pmap_clear_bits(pte, ATTR_AF);
7309 				pmap_invalidate_page(pmap, pv->pv_va, true);
7310 				cleared++;
7311 			} else
7312 				not_cleared++;
7313 		} else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
7314 		    (pmap_load_l3c(pte) & ATTR_AF) != 0) {
7315 			/*
7316 			 * An L3C superpage mapping is regarded as accessed
7317 			 * until the accessed bit has been cleared in all
7318 			 * of its constituent entries.
7319 			 */
7320 			not_cleared++;
7321 		}
7322 		PMAP_UNLOCK(pmap);
7323 		/* Rotate the PV list if it has more than one entry. */
7324 		if (TAILQ_NEXT(pv, pv_next) != NULL) {
7325 			TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7326 			TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7327 			m->md.pv_gen++;
7328 		}
7329 	} while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
7330 	    not_cleared < PMAP_TS_REFERENCED_MAX);
7331 out:
7332 	rw_wunlock(lock);
7333 	vm_page_free_pages_toq(&free, true);
7334 	return (cleared + not_cleared);
7335 }
7336 
7337 /*
7338  *	Apply the given advice to the specified range of addresses within the
7339  *	given pmap.  Depending on the advice, clear the referenced and/or
7340  *	modified flags in each mapping and set the mapped page's dirty field.
7341  */
7342 void
pmap_advise(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,int advice)7343 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7344 {
7345 	struct rwlock *lock;
7346 	vm_offset_t va, va_next, dva;
7347 	vm_page_t m;
7348 	pd_entry_t *l0, *l1, *l2, oldl2;
7349 	pt_entry_t *l3, *dl3, oldl3;
7350 
7351 	PMAP_ASSERT_STAGE1(pmap);
7352 
7353 	if (advice != MADV_DONTNEED && advice != MADV_FREE)
7354 		return;
7355 
7356 	PMAP_LOCK(pmap);
7357 	for (; sva < eva; sva = va_next) {
7358 		l0 = pmap_l0(pmap, sva);
7359 		if (pmap_load(l0) == 0) {
7360 			va_next = (sva + L0_SIZE) & ~L0_OFFSET;
7361 			if (va_next < sva)
7362 				va_next = eva;
7363 			continue;
7364 		}
7365 
7366 		va_next = (sva + L1_SIZE) & ~L1_OFFSET;
7367 		if (va_next < sva)
7368 			va_next = eva;
7369 		l1 = pmap_l0_to_l1(l0, sva);
7370 		if (pmap_load(l1) == 0)
7371 			continue;
7372 		if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
7373 			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7374 			continue;
7375 		}
7376 
7377 		va_next = (sva + L2_SIZE) & ~L2_OFFSET;
7378 		if (va_next < sva)
7379 			va_next = eva;
7380 		l2 = pmap_l1_to_l2(l1, sva);
7381 		oldl2 = pmap_load(l2);
7382 		if (oldl2 == 0)
7383 			continue;
7384 		if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
7385 			if ((oldl2 & ATTR_SW_MANAGED) == 0)
7386 				continue;
7387 			lock = NULL;
7388 			if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
7389 				if (lock != NULL)
7390 					rw_wunlock(lock);
7391 
7392 				/*
7393 				 * The 2MB page mapping was destroyed.
7394 				 */
7395 				continue;
7396 			}
7397 
7398 			/*
7399 			 * Unless the page mappings are wired, remove the
7400 			 * mapping to a single page so that a subsequent
7401 			 * access may repromote.  Choosing the last page
7402 			 * within the address range [sva, min(va_next, eva))
7403 			 * generally results in more repromotions.  Since the
7404 			 * underlying page table page is fully populated, this
7405 			 * removal never frees a page table page.
7406 			 */
7407 			if ((oldl2 & ATTR_SW_WIRED) == 0) {
7408 				va = eva;
7409 				if (va > va_next)
7410 					va = va_next;
7411 				va -= PAGE_SIZE;
7412 				KASSERT(va >= sva,
7413 				    ("pmap_advise: no address gap"));
7414 				l3 = pmap_l2_to_l3(l2, va);
7415 				KASSERT(pmap_load(l3) != 0,
7416 				    ("pmap_advise: invalid PTE"));
7417 				pmap_remove_l3(pmap, l3, va, pmap_load(l2),
7418 				    NULL, &lock);
7419 			}
7420 			if (lock != NULL)
7421 				rw_wunlock(lock);
7422 		}
7423 		KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
7424 		    ("pmap_advise: invalid L2 entry after demotion"));
7425 		if (va_next > eva)
7426 			va_next = eva;
7427 		va = va_next;
7428 		for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
7429 		    sva += L3_SIZE) {
7430 			oldl3 = pmap_load(l3);
7431 			if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
7432 			    (ATTR_SW_MANAGED | L3_PAGE))
7433 				goto maybe_invlrng;
7434 			else if (pmap_pte_dirty(pmap, oldl3)) {
7435 				if (advice == MADV_DONTNEED) {
7436 					/*
7437 					 * Future calls to pmap_is_modified()
7438 					 * can be avoided by making the page
7439 					 * dirty now.
7440 					 */
7441 					m = PTE_TO_VM_PAGE(oldl3);
7442 					vm_page_dirty(m);
7443 				}
7444 				if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
7445 					/*
7446 					 * Unconditionally demote the L3C
7447 					 * superpage because we do not allow
7448 					 * writeable, clean superpages.
7449 					 */
7450 					(void)pmap_demote_l3c(pmap, l3, sva);
7451 
7452 					/*
7453                                          * Destroy the final mapping before the
7454                                          * next L3C boundary or va_next,
7455 					 * whichever comes first, so that a
7456 					 * subsequent access may act as a
7457 					 * repromotion trigger.
7458 					 */
7459                                         if ((oldl3 & ATTR_SW_WIRED) == 0) {
7460 						dva = MIN((sva & ~L3C_OFFSET) +
7461 						    L3C_SIZE - PAGE_SIZE,
7462 						    va_next - PAGE_SIZE);
7463 						dl3 = pmap_l2_to_l3(l2, dva);
7464 						KASSERT(pmap_load(dl3) != 0,
7465 						    ("pmap_advise: invalid PTE"));
7466 						lock = NULL;
7467 						pmap_remove_l3(pmap, dl3, dva,
7468 						    pmap_load(l2), NULL, &lock);
7469 						if (lock != NULL)
7470 							rw_wunlock(lock);
7471 					}
7472 
7473 					/*
7474 					 * The L3 entry's accessed bit may have
7475 					 * changed.
7476 					 */
7477 					oldl3 = pmap_load(l3);
7478 				}
7479 
7480 				/*
7481 				 * Check that we did not just destroy this entry so
7482 				 * we avoid corrupting the page able.
7483 				 */
7484 				if (oldl3 != 0) {
7485 					while (!atomic_fcmpset_long(l3, &oldl3,
7486 					    (oldl3 & ~ATTR_AF) |
7487 					    ATTR_S1_AP(ATTR_S1_AP_RO)))
7488 						cpu_spinwait();
7489 				}
7490 			} else if ((oldl3 & ATTR_AF) != 0) {
7491 				/*
7492 				 * Clear the accessed bit in this L3 entry
7493 				 * regardless of the contiguous bit.
7494 				 */
7495 				pmap_clear_bits(l3, ATTR_AF);
7496 			} else
7497 				goto maybe_invlrng;
7498 			if (va == va_next)
7499 				va = sva;
7500 			continue;
7501 maybe_invlrng:
7502 			if (va != va_next) {
7503 				pmap_s1_invalidate_range(pmap, va, sva, true);
7504 				va = va_next;
7505 			}
7506 		}
7507 		if (va != va_next)
7508 			pmap_s1_invalidate_range(pmap, va, sva, true);
7509 	}
7510 	PMAP_UNLOCK(pmap);
7511 }
7512 
7513 /*
7514  *	Clear the modify bits on the specified physical page.
7515  */
7516 void
pmap_clear_modify(vm_page_t m)7517 pmap_clear_modify(vm_page_t m)
7518 {
7519 	struct md_page *pvh;
7520 	struct rwlock *lock;
7521 	pmap_t pmap;
7522 	pv_entry_t next_pv, pv;
7523 	pd_entry_t *l2, oldl2;
7524 	pt_entry_t *l3, oldl3;
7525 	vm_offset_t va;
7526 	int md_gen, pvh_gen;
7527 
7528 	KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7529 	    ("pmap_clear_modify: page %p is not managed", m));
7530 	vm_page_assert_busied(m);
7531 
7532 	if (!pmap_page_is_write_mapped(m))
7533 		return;
7534 	pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7535 	lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7536 	rw_wlock(lock);
7537 restart:
7538 	TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7539 		pmap = PV_PMAP(pv);
7540 		PMAP_ASSERT_STAGE1(pmap);
7541 		if (!PMAP_TRYLOCK(pmap)) {
7542 			pvh_gen = pvh->pv_gen;
7543 			rw_wunlock(lock);
7544 			PMAP_LOCK(pmap);
7545 			rw_wlock(lock);
7546 			if (pvh_gen != pvh->pv_gen) {
7547 				PMAP_UNLOCK(pmap);
7548 				goto restart;
7549 			}
7550 		}
7551 		va = pv->pv_va;
7552 		l2 = pmap_l2(pmap, va);
7553 		oldl2 = pmap_load(l2);
7554 		/* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7555 		if ((oldl2 & ATTR_SW_DBM) != 0 &&
7556 		    pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7557 		    (oldl2 & ATTR_SW_WIRED) == 0) {
7558 			/*
7559 			 * Write protect the mapping to a single page so that
7560 			 * a subsequent write access may repromote.
7561 			 */
7562 			va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7563 			l3 = pmap_l2_to_l3(l2, va);
7564 			oldl3 = pmap_load(l3);
7565 			while (!atomic_fcmpset_long(l3, &oldl3,
7566 			    (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
7567 				cpu_spinwait();
7568 			vm_page_dirty(m);
7569 			pmap_s1_invalidate_page(pmap, va, true);
7570 		}
7571 		PMAP_UNLOCK(pmap);
7572 	}
7573 	TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7574 		pmap = PV_PMAP(pv);
7575 		PMAP_ASSERT_STAGE1(pmap);
7576 		if (!PMAP_TRYLOCK(pmap)) {
7577 			md_gen = m->md.pv_gen;
7578 			pvh_gen = pvh->pv_gen;
7579 			rw_wunlock(lock);
7580 			PMAP_LOCK(pmap);
7581 			rw_wlock(lock);
7582 			if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7583 				PMAP_UNLOCK(pmap);
7584 				goto restart;
7585 			}
7586 		}
7587 		l2 = pmap_l2(pmap, pv->pv_va);
7588 		l3 = pmap_l2_to_l3(l2, pv->pv_va);
7589 		oldl3 = pmap_load(l3);
7590 		KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
7591 		    (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
7592 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
7593 		    ("writeable L3C superpage not dirty"));
7594 		if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
7595 			if ((oldl3 & ATTR_CONTIGUOUS) != 0)
7596 				(void)pmap_demote_l3c(pmap, l3, pv->pv_va);
7597 			pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
7598 			pmap_s1_invalidate_page(pmap, pv->pv_va, true);
7599 		}
7600 		PMAP_UNLOCK(pmap);
7601 	}
7602 	rw_wunlock(lock);
7603 }
7604 
7605 void *
pmap_mapbios(vm_paddr_t pa,vm_size_t size)7606 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7607 {
7608 	struct pmap_preinit_mapping *ppim;
7609 	vm_offset_t va, offset;
7610 	pd_entry_t old_l2e, *pde;
7611 	pt_entry_t *l2;
7612 	int i, lvl, l2_blocks, free_l2_count, start_idx;
7613 
7614 	if (!vm_initialized) {
7615 		/*
7616 		 * No L3 ptables so map entire L2 blocks where start VA is:
7617 		 * 	preinit_map_va + start_idx * L2_SIZE
7618 		 * There may be duplicate mappings (multiple VA -> same PA) but
7619 		 * ARM64 dcache is always PIPT so that's acceptable.
7620 		 */
7621 		 if (size == 0)
7622 			 return (NULL);
7623 
7624 		 /* Calculate how many L2 blocks are needed for the mapping */
7625 		l2_blocks = (roundup2(pa + size, L2_SIZE) -
7626 		    rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
7627 
7628 		offset = pa & L2_OFFSET;
7629 
7630 		if (preinit_map_va == 0)
7631 			return (NULL);
7632 
7633 		/* Map 2MiB L2 blocks from reserved VA space */
7634 
7635 		free_l2_count = 0;
7636 		start_idx = -1;
7637 		/* Find enough free contiguous VA space */
7638 		for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7639 			ppim = pmap_preinit_mapping + i;
7640 			if (free_l2_count > 0 && ppim->pa != 0) {
7641 				/* Not enough space here */
7642 				free_l2_count = 0;
7643 				start_idx = -1;
7644 				continue;
7645 			}
7646 
7647 			if (ppim->pa == 0) {
7648 				/* Free L2 block */
7649 				if (start_idx == -1)
7650 					start_idx = i;
7651 				free_l2_count++;
7652 				if (free_l2_count == l2_blocks)
7653 					break;
7654 			}
7655 		}
7656 		if (free_l2_count != l2_blocks)
7657 			panic("%s: too many preinit mappings", __func__);
7658 
7659 		va = preinit_map_va + (start_idx * L2_SIZE);
7660 		for (i = start_idx; i < start_idx + l2_blocks; i++) {
7661 			/* Mark entries as allocated */
7662 			ppim = pmap_preinit_mapping + i;
7663 			ppim->pa = pa;
7664 			ppim->va = va + offset;
7665 			ppim->size = size;
7666 		}
7667 
7668 		/* Map L2 blocks */
7669 		pa = rounddown2(pa, L2_SIZE);
7670 		old_l2e = 0;
7671 		for (i = 0; i < l2_blocks; i++) {
7672 			pde = pmap_pde(kernel_pmap, va, &lvl);
7673 			KASSERT(pde != NULL,
7674 			    ("pmap_mapbios: Invalid page entry, va: 0x%lx",
7675 			    va));
7676 			KASSERT(lvl == 1,
7677 			    ("pmap_mapbios: Invalid level %d", lvl));
7678 
7679 			/* Insert L2_BLOCK */
7680 			l2 = pmap_l1_to_l2(pde, va);
7681 			old_l2e |= pmap_load_store(l2,
7682 			    PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_XN |
7683 			    ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
7684 			    L2_BLOCK);
7685 
7686 			va += L2_SIZE;
7687 			pa += L2_SIZE;
7688 		}
7689 		if ((old_l2e & ATTR_DESCR_VALID) != 0)
7690 			pmap_s1_invalidate_all(kernel_pmap);
7691 		else {
7692 			/*
7693 			 * Because the old entries were invalid and the new
7694 			 * mappings are not executable, an isb is not required.
7695 			 */
7696 			dsb(ishst);
7697 		}
7698 
7699 		va = preinit_map_va + (start_idx * L2_SIZE);
7700 
7701 	} else {
7702 		/* kva_alloc may be used to map the pages */
7703 		offset = pa & PAGE_MASK;
7704 		size = round_page(offset + size);
7705 
7706 		va = kva_alloc(size);
7707 		if (va == 0)
7708 			panic("%s: Couldn't allocate KVA", __func__);
7709 
7710 		pde = pmap_pde(kernel_pmap, va, &lvl);
7711 		KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
7712 
7713 		/* L3 table is linked */
7714 		va = trunc_page(va);
7715 		pa = trunc_page(pa);
7716 		pmap_kenter(va, size, pa, memory_mapping_mode(pa));
7717 	}
7718 
7719 	return ((void *)(va + offset));
7720 }
7721 
7722 void
pmap_unmapbios(void * p,vm_size_t size)7723 pmap_unmapbios(void *p, vm_size_t size)
7724 {
7725 	struct pmap_preinit_mapping *ppim;
7726 	vm_offset_t offset, va, va_trunc;
7727 	pd_entry_t *pde;
7728 	pt_entry_t *l2;
7729 	int i, lvl, l2_blocks, block;
7730 	bool preinit_map;
7731 
7732 	va = (vm_offset_t)p;
7733 	l2_blocks =
7734 	   (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
7735 	KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
7736 
7737 	/* Remove preinit mapping */
7738 	preinit_map = false;
7739 	block = 0;
7740 	for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7741 		ppim = pmap_preinit_mapping + i;
7742 		if (ppim->va == va) {
7743 			KASSERT(ppim->size == size,
7744 			    ("pmap_unmapbios: size mismatch"));
7745 			ppim->va = 0;
7746 			ppim->pa = 0;
7747 			ppim->size = 0;
7748 			preinit_map = true;
7749 			offset = block * L2_SIZE;
7750 			va_trunc = rounddown2(va, L2_SIZE) + offset;
7751 
7752 			/* Remove L2_BLOCK */
7753 			pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
7754 			KASSERT(pde != NULL,
7755 			    ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
7756 			    va_trunc));
7757 			l2 = pmap_l1_to_l2(pde, va_trunc);
7758 			pmap_clear(l2);
7759 
7760 			if (block == (l2_blocks - 1))
7761 				break;
7762 			block++;
7763 		}
7764 	}
7765 	if (preinit_map) {
7766 		pmap_s1_invalidate_all(kernel_pmap);
7767 		return;
7768 	}
7769 
7770 	/* Unmap the pages reserved with kva_alloc. */
7771 	if (vm_initialized) {
7772 		offset = va & PAGE_MASK;
7773 		size = round_page(offset + size);
7774 		va = trunc_page(va);
7775 
7776 		/* Unmap and invalidate the pages */
7777 		pmap_kremove_device(va, size);
7778 
7779 		kva_free(va, size);
7780 	}
7781 }
7782 
7783 /*
7784  * Sets the memory attribute for the specified page.
7785  */
7786 void
pmap_page_set_memattr(vm_page_t m,vm_memattr_t ma)7787 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7788 {
7789 
7790 	m->md.pv_memattr = ma;
7791 
7792 	/*
7793 	 * If "m" is a normal page, update its direct mapping.  This update
7794 	 * can be relied upon to perform any cache operations that are
7795 	 * required for data coherence.
7796 	 */
7797 	if ((m->flags & PG_FICTITIOUS) == 0 &&
7798 	    pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7799 	    m->md.pv_memattr) != 0)
7800 		panic("memory attribute change on the direct map failed");
7801 }
7802 
7803 /*
7804  * Changes the specified virtual address range's memory type to that given by
7805  * the parameter "mode".  The specified virtual address range must be
7806  * completely contained within either the direct map or the kernel map.  If
7807  * the virtual address range is contained within the kernel map, then the
7808  * memory type for each of the corresponding ranges of the direct map is also
7809  * changed.  (The corresponding ranges of the direct map are those ranges that
7810  * map the same physical pages as the specified virtual address range.)  These
7811  * changes to the direct map are necessary because Intel describes the
7812  * behavior of their processors as "undefined" if two or more mappings to the
7813  * same physical page have different memory types.
7814  *
7815  * Returns zero if the change completed successfully, and either EINVAL or
7816  * ENOMEM if the change failed.  Specifically, EINVAL is returned if some part
7817  * of the virtual address range was not mapped, and ENOMEM is returned if
7818  * there was insufficient memory available to complete the change.  In the
7819  * latter case, the memory type may have been changed on some part of the
7820  * virtual address range or the direct map.
7821  */
7822 int
pmap_change_attr(vm_offset_t va,vm_size_t size,int mode)7823 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7824 {
7825 	int error;
7826 
7827 	PMAP_LOCK(kernel_pmap);
7828 	error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
7829 	PMAP_UNLOCK(kernel_pmap);
7830 	return (error);
7831 }
7832 
7833 /*
7834  * Changes the specified virtual address range's protections to those
7835  * specified by "prot".  Like pmap_change_attr(), protections for aliases
7836  * in the direct map are updated as well.  Protections on aliasing mappings may
7837  * be a subset of the requested protections; for example, mappings in the direct
7838  * map are never executable.
7839  */
7840 int
pmap_change_prot(vm_offset_t va,vm_size_t size,vm_prot_t prot)7841 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
7842 {
7843 	int error;
7844 
7845 	/* Only supported within the kernel map. */
7846 	if (va < VM_MIN_KERNEL_ADDRESS)
7847 		return (EINVAL);
7848 
7849 	PMAP_LOCK(kernel_pmap);
7850 	error = pmap_change_props_locked(va, size, prot, -1, false);
7851 	PMAP_UNLOCK(kernel_pmap);
7852 	return (error);
7853 }
7854 
7855 static int
pmap_change_props_locked(vm_offset_t va,vm_size_t size,vm_prot_t prot,int mode,bool skip_unmapped)7856 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
7857     int mode, bool skip_unmapped)
7858 {
7859 	vm_offset_t base, offset, tmpva;
7860 	vm_size_t pte_size;
7861 	vm_paddr_t pa;
7862 	pt_entry_t pte, *ptep, *newpte;
7863 	pt_entry_t bits, mask;
7864 	int lvl, rv;
7865 
7866 	PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
7867 	base = trunc_page(va);
7868 	offset = va & PAGE_MASK;
7869 	size = round_page(offset + size);
7870 
7871 	if (!VIRT_IN_DMAP(base) &&
7872 	    !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
7873 		return (EINVAL);
7874 
7875 	bits = 0;
7876 	mask = 0;
7877 	if (mode != -1) {
7878 		bits = ATTR_S1_IDX(mode);
7879 		mask = ATTR_S1_IDX_MASK;
7880 		if (mode == VM_MEMATTR_DEVICE) {
7881 			mask |= ATTR_S1_XN;
7882 			bits |= ATTR_S1_XN;
7883 		}
7884 	}
7885 	if (prot != VM_PROT_NONE) {
7886 		/* Don't mark the DMAP as executable. It never is on arm64. */
7887 		if (VIRT_IN_DMAP(base)) {
7888 			prot &= ~VM_PROT_EXECUTE;
7889 			/*
7890 			 * XXX Mark the DMAP as writable for now. We rely
7891 			 * on this in ddb & dtrace to insert breakpoint
7892 			 * instructions.
7893 			 */
7894 			prot |= VM_PROT_WRITE;
7895 		}
7896 
7897 		if ((prot & VM_PROT_WRITE) == 0) {
7898 			bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
7899 		}
7900 		if ((prot & VM_PROT_EXECUTE) == 0) {
7901 			bits |= ATTR_S1_PXN;
7902 		}
7903 		bits |= ATTR_S1_UXN;
7904 		mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
7905 	}
7906 
7907 	for (tmpva = base; tmpva < base + size; ) {
7908 		ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
7909 		if (ptep == NULL && !skip_unmapped) {
7910 			return (EINVAL);
7911 		} else if ((ptep == NULL && skip_unmapped) ||
7912 		    (pmap_load(ptep) & mask) == bits) {
7913 			/*
7914 			 * We already have the correct attribute or there
7915 			 * is no memory mapped at this address and we are
7916 			 * skipping unmapped memory.
7917 			 */
7918 			switch (lvl) {
7919 			default:
7920 				panic("Invalid DMAP table level: %d\n", lvl);
7921 			case 1:
7922 				tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
7923 				break;
7924 			case 2:
7925 				tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
7926 				break;
7927 			case 3:
7928 				tmpva += PAGE_SIZE;
7929 				break;
7930 			}
7931 		} else {
7932 			/* We can't demote/promote this entry */
7933 			MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
7934 
7935 			/*
7936 			 * Find the entry and demote it if the requested change
7937 			 * only applies to part of the address range mapped by
7938 			 * the entry.
7939 			 */
7940 			switch (lvl) {
7941 			default:
7942 				panic("Invalid DMAP table level: %d\n", lvl);
7943 			case 1:
7944 				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7945 				if ((tmpva & L1_OFFSET) == 0 &&
7946 				    (base + size - tmpva) >= L1_SIZE) {
7947 					pte_size = L1_SIZE;
7948 					break;
7949 				}
7950 				newpte = pmap_demote_l1(kernel_pmap, ptep,
7951 				    tmpva & ~L1_OFFSET);
7952 				if (newpte == NULL)
7953 					return (EINVAL);
7954 				ptep = pmap_l1_to_l2(ptep, tmpva);
7955 				/* FALLTHROUGH */
7956 			case 2:
7957 				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
7958 					if ((tmpva & L2C_OFFSET) == 0 &&
7959 					    (base + size - tmpva) >= L2C_SIZE) {
7960 						pte_size = L2C_SIZE;
7961 						break;
7962 					}
7963 					if (!pmap_demote_l2c(kernel_pmap, ptep,
7964 					    tmpva))
7965 						return (EINVAL);
7966 				}
7967 				if ((tmpva & L2_OFFSET) == 0 &&
7968 				    (base + size - tmpva) >= L2_SIZE) {
7969 					pte_size = L2_SIZE;
7970 					break;
7971 				}
7972 				newpte = pmap_demote_l2(kernel_pmap, ptep,
7973 				    tmpva);
7974 				if (newpte == NULL)
7975 					return (EINVAL);
7976 				ptep = pmap_l2_to_l3(ptep, tmpva);
7977 				/* FALLTHROUGH */
7978 			case 3:
7979 				if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
7980 					if ((tmpva & L3C_OFFSET) == 0 &&
7981 					    (base + size - tmpva) >= L3C_SIZE) {
7982 						pte_size = L3C_SIZE;
7983 						break;
7984 					}
7985 					if (!pmap_demote_l3c(kernel_pmap, ptep,
7986 					    tmpva))
7987 						return (EINVAL);
7988 				}
7989 				pte_size = PAGE_SIZE;
7990 				break;
7991 			}
7992 
7993 			/* Update the entry */
7994 			pte = pmap_load(ptep);
7995 			pte &= ~mask;
7996 			pte |= bits;
7997 
7998 			switch (pte_size) {
7999 			case L2C_SIZE:
8000 				pmap_update_strided(kernel_pmap, ptep, ptep +
8001 				    L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
8002 				break;
8003 			case L3C_SIZE:
8004 				pmap_update_strided(kernel_pmap, ptep, ptep +
8005 				    L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
8006 				break;
8007 			default:
8008 				/*
8009 				 * We are updating a single block or page entry,
8010 				 * so regardless of pte_size pass PAGE_SIZE in
8011 				 * order that a single TLB invalidation is
8012 				 * performed.
8013 				 */
8014 				pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
8015 				    PAGE_SIZE);
8016 				break;
8017 			}
8018 
8019 			pa = PTE_TO_PHYS(pte);
8020 			if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
8021 				/*
8022 				 * Keep the DMAP memory in sync.
8023 				 */
8024 				rv = pmap_change_props_locked(
8025 				    PHYS_TO_DMAP(pa), pte_size,
8026 				    prot, mode, true);
8027 				if (rv != 0)
8028 					return (rv);
8029 			}
8030 
8031 			/*
8032 			 * If moving to a non-cacheable entry flush
8033 			 * the cache.
8034 			 */
8035 			if (mode == VM_MEMATTR_UNCACHEABLE)
8036 				cpu_dcache_wbinv_range((void *)tmpva, pte_size);
8037 			tmpva += pte_size;
8038 		}
8039 	}
8040 
8041 	return (0);
8042 }
8043 
8044 /*
8045  * Create an L2 table to map all addresses within an L1 mapping.
8046  */
8047 static pt_entry_t *
pmap_demote_l1(pmap_t pmap,pt_entry_t * l1,vm_offset_t va)8048 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
8049 {
8050 	pt_entry_t *l2, newl2, oldl1;
8051 	vm_offset_t tmpl1;
8052 	vm_paddr_t l2phys, phys;
8053 	vm_page_t ml2;
8054 	int i;
8055 
8056 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8057 	oldl1 = pmap_load(l1);
8058 	PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8059 	KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
8060 	    ("pmap_demote_l1: Demoting a non-block entry"));
8061 	KASSERT((va & L1_OFFSET) == 0,
8062 	    ("pmap_demote_l1: Invalid virtual address %#lx", va));
8063 	KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
8064 	    ("pmap_demote_l1: Level 1 table shouldn't be managed"));
8065 	KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
8066 	    ("pmap_demote_l1: Demoting entry with no-demote flag set"));
8067 
8068 	tmpl1 = 0;
8069 	if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
8070 		tmpl1 = kva_alloc(PAGE_SIZE);
8071 		if (tmpl1 == 0)
8072 			return (NULL);
8073 	}
8074 
8075 	if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
8076 	    NULL) {
8077 		CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
8078 		    " in pmap %p", va, pmap);
8079 		l2 = NULL;
8080 		goto fail;
8081 	}
8082 
8083 	l2phys = VM_PAGE_TO_PHYS(ml2);
8084 	l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
8085 
8086 	/* Address the range points at */
8087 	phys = PTE_TO_PHYS(oldl1);
8088 	/* The attributed from the old l1 table to be copied */
8089 	newl2 = oldl1 & ATTR_MASK;
8090 
8091 	/* Create the new entries */
8092 	newl2 |= ATTR_CONTIGUOUS;
8093 	for (i = 0; i < Ln_ENTRIES; i++) {
8094 		l2[i] = newl2 | phys;
8095 		phys += L2_SIZE;
8096 	}
8097 	KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) |
8098 	    L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0],
8099 	    ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
8100 
8101 	if (tmpl1 != 0) {
8102 		pmap_kenter(tmpl1, PAGE_SIZE,
8103 		    DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
8104 		    VM_MEMATTR_WRITE_BACK);
8105 		l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
8106 	}
8107 
8108 	pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
8109 
8110 	counter_u64_add(pmap_l1_demotions, 1);
8111 fail:
8112 	if (tmpl1 != 0) {
8113 		pmap_kremove(tmpl1);
8114 		kva_free(tmpl1, PAGE_SIZE);
8115 	}
8116 
8117 	return (l2);
8118 }
8119 
8120 static void
pmap_fill_l3(pt_entry_t * firstl3,pt_entry_t newl3)8121 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8122 {
8123 	pt_entry_t *l3;
8124 
8125 	for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8126 		*l3 = newl3;
8127 		newl3 += L3_SIZE;
8128 	}
8129 }
8130 
8131 static void
pmap_demote_l2_check(pt_entry_t * firstl3p __unused,pt_entry_t newl3e __unused)8132 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8133 {
8134 #ifdef INVARIANTS
8135 #ifdef DIAGNOSTIC
8136 	pt_entry_t *xl3p, *yl3p;
8137 
8138 	for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8139 	    xl3p++, newl3e += PAGE_SIZE) {
8140 		if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8141 			printf("pmap_demote_l2: xl3e %zd and newl3e map "
8142 			    "different pages: found %#lx, expected %#lx\n",
8143 			    xl3p - firstl3p, pmap_load(xl3p), newl3e);
8144 			printf("page table dump\n");
8145 			for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8146 			    yl3p++) {
8147 				printf("%zd %#lx\n", yl3p - firstl3p,
8148 				    pmap_load(yl3p));
8149 			}
8150 			panic("firstpte");
8151 		}
8152 	}
8153 #else
8154 	KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8155 	    ("pmap_demote_l2: firstl3 and newl3e map different physical"
8156 	    " addresses"));
8157 #endif
8158 #endif
8159 }
8160 
8161 static void
pmap_demote_l2_abort(pmap_t pmap,vm_offset_t va,pt_entry_t * l2,struct rwlock ** lockp)8162 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8163     struct rwlock **lockp)
8164 {
8165 	struct spglist free;
8166 
8167 	SLIST_INIT(&free);
8168 	(void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
8169 	    lockp);
8170 	vm_page_free_pages_toq(&free, true);
8171 }
8172 
8173 /*
8174  * Create an L3 table to map all addresses within an L2 mapping.
8175  */
8176 static pt_entry_t *
pmap_demote_l2_locked(pmap_t pmap,pt_entry_t * l2,vm_offset_t va,struct rwlock ** lockp)8177 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8178     struct rwlock **lockp)
8179 {
8180 	pt_entry_t *l3, newl3, oldl2;
8181 	vm_offset_t tmpl2;
8182 	vm_paddr_t l3phys;
8183 	vm_page_t ml3;
8184 
8185 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8186 	PMAP_ASSERT_STAGE1(pmap);
8187 	KASSERT(ADDR_IS_CANONICAL(va),
8188 	    ("%s: Address not in canonical form: %lx", __func__, va));
8189 
8190 	l3 = NULL;
8191 	oldl2 = pmap_load(l2);
8192 	KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8193 	    ("pmap_demote_l2: Demoting a non-block entry"));
8194 	KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8195 	    ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8196 	va &= ~L2_OFFSET;
8197 
8198 	tmpl2 = 0;
8199 	if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8200 		tmpl2 = kva_alloc(PAGE_SIZE);
8201 		if (tmpl2 == 0)
8202 			return (NULL);
8203 	}
8204 
8205 	/*
8206 	 * Invalidate the 2MB page mapping and return "failure" if the
8207 	 * mapping was never accessed.
8208 	 */
8209 	if ((oldl2 & ATTR_AF) == 0) {
8210 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8211 		    ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
8212 		pmap_demote_l2_abort(pmap, va, l2, lockp);
8213 		CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
8214 		    va, pmap);
8215 		goto fail;
8216 	}
8217 
8218 	if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8219 		KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8220 		    ("pmap_demote_l2: page table page for a wired mapping"
8221 		    " is missing"));
8222 
8223 		/*
8224 		 * If the page table page is missing and the mapping
8225 		 * is for a kernel address, the mapping must belong to
8226 		 * either the direct map or the early kernel memory.
8227 		 * Page table pages are preallocated for every other
8228 		 * part of the kernel address space, so the direct map
8229 		 * region and early kernel memory are the only parts of the
8230 		 * kernel address space that must be handled here.
8231 		 */
8232 		KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
8233 		    (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8234 		    ("pmap_demote_l2: No saved mpte for va %#lx", va));
8235 
8236 		/*
8237 		 * If the 2MB page mapping belongs to the direct map
8238 		 * region of the kernel's address space, then the page
8239 		 * allocation request specifies the highest possible
8240 		 * priority (VM_ALLOC_INTERRUPT).  Otherwise, the
8241 		 * priority is normal.
8242 		 */
8243 		ml3 = vm_page_alloc_noobj(
8244 		    (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8245 		    VM_ALLOC_WIRED);
8246 
8247 		/*
8248 		 * If the allocation of the new page table page fails,
8249 		 * invalidate the 2MB page mapping and return "failure".
8250 		 */
8251 		if (ml3 == NULL) {
8252 			pmap_demote_l2_abort(pmap, va, l2, lockp);
8253 			CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8254 			    " in pmap %p", va, pmap);
8255 			goto fail;
8256 		}
8257 		ml3->pindex = pmap_l2_pindex(va);
8258 
8259 		if (!ADDR_IS_KERNEL(va)) {
8260 			ml3->ref_count = NL3PG;
8261 			pmap_resident_count_inc(pmap, 1);
8262 		}
8263 	}
8264 	l3phys = VM_PAGE_TO_PHYS(ml3);
8265 	l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
8266 	newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8267 	KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8268 	    (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8269 	    ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8270 
8271 	/*
8272 	 * If the PTP is not leftover from an earlier promotion or it does not
8273 	 * have ATTR_AF set in every L3E, then fill it.  The new L3Es will all
8274 	 * have ATTR_AF set.
8275 	 *
8276 	 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8277 	 * performs a dsb().  That dsb() ensures that the stores for filling
8278 	 * "l3" are visible before "l3" is added to the page table.
8279 	 */
8280 	if (!vm_page_all_valid(ml3))
8281 		pmap_fill_l3(l3, newl3);
8282 
8283 	pmap_demote_l2_check(l3, newl3);
8284 
8285 	/*
8286 	 * If the mapping has changed attributes, update the L3Es.
8287 	 */
8288 	if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8289 		pmap_fill_l3(l3, newl3);
8290 
8291 	/*
8292 	 * Map the temporary page so we don't lose access to the l2 table.
8293 	 */
8294 	if (tmpl2 != 0) {
8295 		pmap_kenter(tmpl2, PAGE_SIZE,
8296 		    DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
8297 		    VM_MEMATTR_WRITE_BACK);
8298 		l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8299 	}
8300 
8301 	/*
8302 	 * The spare PV entries must be reserved prior to demoting the
8303 	 * mapping, that is, prior to changing the PDE.  Otherwise, the state
8304 	 * of the L2 and the PV lists will be inconsistent, which can result
8305 	 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8306 	 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8307 	 * PV entry for the 2MB page mapping that is being demoted.
8308 	 */
8309 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
8310 		reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8311 
8312 	/*
8313 	 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8314 	 * the 2MB page mapping.
8315 	 */
8316 	pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8317 
8318 	/*
8319 	 * Demote the PV entry.
8320 	 */
8321 	if ((oldl2 & ATTR_SW_MANAGED) != 0)
8322 		pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8323 
8324 	atomic_add_long(&pmap_l2_demotions, 1);
8325 	CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8326 	    " in pmap %p %lx", va, pmap, l3[0]);
8327 
8328 fail:
8329 	if (tmpl2 != 0) {
8330 		pmap_kremove(tmpl2);
8331 		kva_free(tmpl2, PAGE_SIZE);
8332 	}
8333 
8334 	return (l3);
8335 
8336 }
8337 
8338 static pt_entry_t *
pmap_demote_l2(pmap_t pmap,pt_entry_t * l2,vm_offset_t va)8339 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8340 {
8341 	struct rwlock *lock;
8342 	pt_entry_t *l3;
8343 
8344 	lock = NULL;
8345 	l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8346 	if (lock != NULL)
8347 		rw_wunlock(lock);
8348 	return (l3);
8349 }
8350 
8351 /*
8352  * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
8353  */
8354 static bool
pmap_demote_l2c(pmap_t pmap,pt_entry_t * l2p,vm_offset_t va)8355 pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
8356 {
8357 	pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
8358 	vm_offset_t tmpl3;
8359 	register_t intr;
8360 
8361 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8362 	PMAP_ASSERT_STAGE1(pmap);
8363 	l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
8364 	    sizeof(pd_entry_t)) - 1));
8365 	l2c_end = l2c_start + L2C_ENTRIES;
8366 	tmpl3 = 0;
8367 	if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
8368 	    (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
8369 		tmpl3 = kva_alloc(PAGE_SIZE);
8370 		if (tmpl3 == 0)
8371 			return (false);
8372 		pmap_kenter(tmpl3, PAGE_SIZE,
8373 		    DMAP_TO_PHYS((vm_offset_t)l2c_start) & ~L3_OFFSET,
8374 		    VM_MEMATTR_WRITE_BACK);
8375 		l2c_start = (pd_entry_t *)(tmpl3 +
8376 		    ((vm_offset_t)l2c_start & PAGE_MASK));
8377 		l2c_end = (pd_entry_t *)(tmpl3 +
8378 		    ((vm_offset_t)l2c_end & PAGE_MASK));
8379 	}
8380 	mask = 0;
8381 	nbits = ATTR_DESCR_VALID;
8382 	intr = intr_disable();
8383 
8384 	/*
8385 	 * Break the mappings.
8386 	 */
8387 	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8388 		/*
8389 		 * Clear the mapping's contiguous and valid bits, but leave
8390 		 * the rest of the entry unchanged, so that a lockless,
8391 		 * concurrent pmap_kextract() can still lookup the physical
8392 		 * address.
8393 		 */
8394 		l2e = pmap_load(tl2p);
8395 		KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
8396 		    ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
8397 		KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8398 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8399 		    ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
8400 		while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
8401 		    ATTR_DESCR_VALID)))
8402 			cpu_spinwait();
8403 
8404 		/*
8405 		 * Hardware accessed and dirty bit maintenance might only
8406 		 * update a single L2 entry, so we must combine the accessed
8407 		 * and dirty bits from this entire set of contiguous L2
8408 		 * entries.
8409 		 */
8410 		if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8411 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8412 			mask = ATTR_S1_AP_RW_BIT;
8413 		nbits |= l2e & ATTR_AF;
8414 	}
8415 	if ((nbits & ATTR_AF) != 0) {
8416 		pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
8417 		    L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
8418 	}
8419 
8420 	/*
8421 	 * Remake the mappings, updating the accessed and dirty bits.
8422 	 */
8423 	for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8424 		l2e = pmap_load(tl2p);
8425 		while (!atomic_fcmpset_64(tl2p, &l2e, (l2e & ~mask) | nbits))
8426 			cpu_spinwait();
8427 	}
8428 	dsb(ishst);
8429 
8430 	intr_restore(intr);
8431 	if (tmpl3 != 0) {
8432 		pmap_kremove(tmpl3);
8433 		kva_free(tmpl3, PAGE_SIZE);
8434 	}
8435 	counter_u64_add(pmap_l2c_demotions, 1);
8436 	CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
8437 	    va, pmap);
8438 	return (true);
8439 }
8440 
8441 /*
8442  * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8443  */
8444 static bool
pmap_demote_l3c(pmap_t pmap,pt_entry_t * l3p,vm_offset_t va)8445 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8446 {
8447 	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8448 	vm_offset_t tmpl3;
8449 	register_t intr;
8450 
8451 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8452 	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8453 	    sizeof(pt_entry_t)) - 1));
8454 	l3c_end = l3c_start + L3C_ENTRIES;
8455 	tmpl3 = 0;
8456 	if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8457 	    (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8458 		tmpl3 = kva_alloc(PAGE_SIZE);
8459 		if (tmpl3 == 0)
8460 			return (false);
8461 		pmap_kenter(tmpl3, PAGE_SIZE,
8462 		    DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
8463 		    VM_MEMATTR_WRITE_BACK);
8464 		l3c_start = (pt_entry_t *)(tmpl3 +
8465 		    ((vm_offset_t)l3c_start & PAGE_MASK));
8466 		l3c_end = (pt_entry_t *)(tmpl3 +
8467 		    ((vm_offset_t)l3c_end & PAGE_MASK));
8468 	}
8469 	mask = 0;
8470 	nbits = ATTR_DESCR_VALID;
8471 	intr = intr_disable();
8472 
8473 	/*
8474 	 * Break the mappings.
8475 	 */
8476 	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8477 		/*
8478 		 * Clear the mapping's contiguous and valid bits, but leave
8479 		 * the rest of the entry unchanged, so that a lockless,
8480 		 * concurrent pmap_kextract() can still lookup the physical
8481 		 * address.
8482 		 */
8483 		l3e = pmap_load(tl3p);
8484 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8485 		    ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
8486 		KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8487 		    (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8488 		    ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
8489 		while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
8490 		    ATTR_DESCR_VALID)))
8491 			cpu_spinwait();
8492 
8493 		/*
8494 		 * Hardware accessed and dirty bit maintenance might only
8495 		 * update a single L3 entry, so we must combine the accessed
8496 		 * and dirty bits from this entire set of contiguous L3
8497 		 * entries.
8498 		 */
8499 		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8500 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8501 			mask = ATTR_S1_AP_RW_BIT;
8502 		nbits |= l3e & ATTR_AF;
8503 	}
8504 	if ((nbits & ATTR_AF) != 0) {
8505 		pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
8506 		    ~L3C_OFFSET, true);
8507 	}
8508 
8509 	/*
8510 	 * Remake the mappings, updating the accessed and dirty bits.
8511 	 */
8512 	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8513 		l3e = pmap_load(tl3p);
8514 		while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
8515 			cpu_spinwait();
8516 	}
8517 	dsb(ishst);
8518 
8519 	intr_restore(intr);
8520 	if (tmpl3 != 0) {
8521 		pmap_kremove(tmpl3);
8522 		kva_free(tmpl3, PAGE_SIZE);
8523 	}
8524 	counter_u64_add(pmap_l3c_demotions, 1);
8525 	CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
8526 	    va, pmap);
8527 	return (true);
8528 }
8529 
8530 /*
8531  * Accumulate the accessed and dirty bits within a L3C superpage and
8532  * return the specified PTE with them applied correctly.
8533  */
8534 static pt_entry_t
pmap_load_l3c(pt_entry_t * l3p)8535 pmap_load_l3c(pt_entry_t *l3p)
8536 {
8537 	pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8538 
8539 	l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8540 	    sizeof(pt_entry_t)) - 1));
8541 	l3c_end = l3c_start + L3C_ENTRIES;
8542 	mask = 0;
8543 	nbits = 0;
8544 	/* Iterate over each mapping in the superpage. */
8545 	for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8546 		l3e = pmap_load(tl3p);
8547 		KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8548 		    ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
8549 		/* Update mask if the current page has its dirty bit set. */
8550 		if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8551 		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8552 			mask = ATTR_S1_AP_RW_BIT;
8553 		/* Update nbits if the accessed bit is set. */
8554 		nbits |= l3e & ATTR_AF;
8555 	}
8556 	return ((pmap_load(l3p) & ~mask) | nbits);
8557 }
8558 
8559 /*
8560  * Perform the pmap work for mincore(2).  If the page is not both referenced and
8561  * modified by this pmap, returns its physical address so that the caller can
8562  * find other mappings.
8563  */
8564 int
pmap_mincore(pmap_t pmap,vm_offset_t addr,vm_paddr_t * pap)8565 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
8566 {
8567 	pt_entry_t *pte, tpte;
8568 	vm_paddr_t mask, pa;
8569 	int lvl, val;
8570 	bool managed;
8571 
8572 	PMAP_ASSERT_STAGE1(pmap);
8573 	PMAP_LOCK(pmap);
8574 	pte = pmap_pte(pmap, addr, &lvl);
8575 	if (pte != NULL) {
8576 		tpte = pmap_load(pte);
8577 
8578 		switch (lvl) {
8579 		case 3:
8580 			mask = L3_OFFSET;
8581 			break;
8582 		case 2:
8583 			mask = L2_OFFSET;
8584 			break;
8585 		case 1:
8586 			mask = L1_OFFSET;
8587 			break;
8588 		default:
8589 			panic("pmap_mincore: invalid level %d", lvl);
8590 		}
8591 
8592 		managed = (tpte & ATTR_SW_MANAGED) != 0;
8593 		val = MINCORE_INCORE;
8594 		if (lvl != 3)
8595 			val |= MINCORE_PSIND(3 - lvl);
8596 		if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
8597 		    (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
8598 			val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8599 		if ((tpte & ATTR_AF) == ATTR_AF)
8600 			val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8601 
8602 		pa = PTE_TO_PHYS(tpte) | (addr & mask);
8603 	} else {
8604 		managed = false;
8605 		val = 0;
8606 	}
8607 
8608 	if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8609 	    (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
8610 		*pap = pa;
8611 	}
8612 	PMAP_UNLOCK(pmap);
8613 	return (val);
8614 }
8615 
8616 /*
8617  * Garbage collect every ASID that is neither active on a processor nor
8618  * reserved.
8619  */
8620 static void
pmap_reset_asid_set(pmap_t pmap)8621 pmap_reset_asid_set(pmap_t pmap)
8622 {
8623 	pmap_t curpmap;
8624 	int asid, cpuid, epoch;
8625 	struct asid_set *set;
8626 	enum pmap_stage stage;
8627 
8628 	set = pmap->pm_asid_set;
8629 	stage = pmap->pm_stage;
8630 
8631 	set = pmap->pm_asid_set;
8632 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8633 	mtx_assert(&set->asid_set_mutex, MA_OWNED);
8634 
8635 	/*
8636 	 * Ensure that the store to asid_epoch is globally visible before the
8637 	 * loads from pc_curpmap are performed.
8638 	 */
8639 	epoch = set->asid_epoch + 1;
8640 	if (epoch == INT_MAX)
8641 		epoch = 0;
8642 	set->asid_epoch = epoch;
8643 	dsb(ishst);
8644 	if (stage == PM_STAGE1) {
8645 		__asm __volatile("tlbi vmalle1is");
8646 	} else {
8647 		KASSERT(pmap_clean_stage2_tlbi != NULL,
8648 		    ("%s: Unset stage 2 tlb invalidation callback\n",
8649 		    __func__));
8650 		pmap_clean_stage2_tlbi();
8651 	}
8652 	dsb(ish);
8653 	bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
8654 	    set->asid_set_size - 1);
8655 	CPU_FOREACH(cpuid) {
8656 		if (cpuid == curcpu)
8657 			continue;
8658 		if (stage == PM_STAGE1) {
8659 			curpmap = pcpu_find(cpuid)->pc_curpmap;
8660 			PMAP_ASSERT_STAGE1(pmap);
8661 		} else {
8662 			curpmap = pcpu_find(cpuid)->pc_curvmpmap;
8663 			if (curpmap == NULL)
8664 				continue;
8665 			PMAP_ASSERT_STAGE2(pmap);
8666 		}
8667 		KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
8668 		asid = COOKIE_TO_ASID(curpmap->pm_cookie);
8669 		if (asid == -1)
8670 			continue;
8671 		bit_set(set->asid_set, asid);
8672 		curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
8673 	}
8674 }
8675 
8676 /*
8677  * Allocate a new ASID for the specified pmap.
8678  */
8679 static void
pmap_alloc_asid(pmap_t pmap)8680 pmap_alloc_asid(pmap_t pmap)
8681 {
8682 	struct asid_set *set;
8683 	int new_asid;
8684 
8685 	set = pmap->pm_asid_set;
8686 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8687 
8688 	mtx_lock_spin(&set->asid_set_mutex);
8689 
8690 	/*
8691 	 * While this processor was waiting to acquire the asid set mutex,
8692 	 * pmap_reset_asid_set() running on another processor might have
8693 	 * updated this pmap's cookie to the current epoch.  In which case, we
8694 	 * don't need to allocate a new ASID.
8695 	 */
8696 	if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
8697 		goto out;
8698 
8699 	bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
8700 	    &new_asid);
8701 	if (new_asid == -1) {
8702 		bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8703 		    set->asid_next, &new_asid);
8704 		if (new_asid == -1) {
8705 			pmap_reset_asid_set(pmap);
8706 			bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8707 			    set->asid_set_size, &new_asid);
8708 			KASSERT(new_asid != -1, ("ASID allocation failure"));
8709 		}
8710 	}
8711 	bit_set(set->asid_set, new_asid);
8712 	set->asid_next = new_asid + 1;
8713 	pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
8714 out:
8715 	mtx_unlock_spin(&set->asid_set_mutex);
8716 }
8717 
8718 static uint64_t __read_mostly ttbr_flags;
8719 
8720 /*
8721  * Compute the value that should be stored in ttbr0 to activate the specified
8722  * pmap.  This value may change from time to time.
8723  */
8724 uint64_t
pmap_to_ttbr0(pmap_t pmap)8725 pmap_to_ttbr0(pmap_t pmap)
8726 {
8727 	uint64_t ttbr;
8728 
8729 	ttbr = pmap->pm_ttbr;
8730 	ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
8731 	ttbr |= ttbr_flags;
8732 
8733 	return (ttbr);
8734 }
8735 
8736 static void
pmap_set_cnp(void * arg)8737 pmap_set_cnp(void *arg)
8738 {
8739 	uint64_t ttbr0, ttbr1;
8740 	u_int cpuid;
8741 
8742 	cpuid = *(u_int *)arg;
8743 	if (cpuid == curcpu) {
8744 		/*
8745 		 * Set the flags while all CPUs are handling the
8746 		 * smp_rendezvous so will not call pmap_to_ttbr0. Any calls
8747 		 * to pmap_to_ttbr0 after this will have the CnP flag set.
8748 		 * The dsb after invalidating the TLB will act as a barrier
8749 		 * to ensure all CPUs can observe this change.
8750 		 */
8751 		ttbr_flags |= TTBR_CnP;
8752 	}
8753 
8754 	ttbr0 = READ_SPECIALREG(ttbr0_el1);
8755 	ttbr0 |= TTBR_CnP;
8756 
8757 	ttbr1 = READ_SPECIALREG(ttbr1_el1);
8758 	ttbr1 |= TTBR_CnP;
8759 
8760 	/* Update ttbr{0,1}_el1 with the CnP flag */
8761 	WRITE_SPECIALREG(ttbr0_el1, ttbr0);
8762 	WRITE_SPECIALREG(ttbr1_el1, ttbr1);
8763 	isb();
8764 	__asm __volatile("tlbi vmalle1is");
8765 	dsb(ish);
8766 	isb();
8767 }
8768 
8769 /*
8770  * Defer enabling some features until we have read the ID registers to know
8771  * if they are supported on all CPUs.
8772  */
8773 static void
pmap_init_mp(void * dummy __unused)8774 pmap_init_mp(void *dummy __unused)
8775 {
8776 	uint64_t reg;
8777 
8778 	if (get_kernel_reg(ID_AA64PFR1_EL1, &reg)) {
8779 		if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
8780 			if (bootverbose)
8781 				printf("Enabling BTI\n");
8782 			pmap_bti_support = true;
8783 
8784 			pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
8785 			    sizeof(struct rs_el), NULL, NULL, NULL, NULL,
8786 			    UMA_ALIGN_PTR, 0);
8787 		}
8788 	}
8789 }
8790 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
8791 
8792 /*
8793  * Defer enabling CnP until we have read the ID registers to know if it's
8794  * supported on all CPUs.
8795  */
8796 static void
pmap_init_cnp(void * dummy __unused)8797 pmap_init_cnp(void *dummy __unused)
8798 {
8799 	uint64_t reg;
8800 	u_int cpuid;
8801 
8802 	if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
8803 		return;
8804 
8805 	if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
8806 		if (bootverbose)
8807 			printf("Enabling CnP\n");
8808 		cpuid = curcpu;
8809 		smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
8810 	}
8811 
8812 }
8813 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
8814 
8815 static bool
pmap_activate_int(pmap_t pmap)8816 pmap_activate_int(pmap_t pmap)
8817 {
8818 	struct asid_set *set;
8819 	int epoch;
8820 
8821 	KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
8822 	KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
8823 
8824 	if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
8825 	    (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
8826 		/*
8827 		 * Handle the possibility that the old thread was preempted
8828 		 * after an "ic" or "tlbi" instruction but before it performed
8829 		 * a "dsb" instruction.  If the old thread migrates to a new
8830 		 * processor, its completion of a "dsb" instruction on that
8831 		 * new processor does not guarantee that the "ic" or "tlbi"
8832 		 * instructions performed on the old processor have completed.
8833 		 */
8834 		dsb(ish);
8835 		return (false);
8836 	}
8837 
8838 	set = pmap->pm_asid_set;
8839 	KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8840 
8841 	/*
8842 	 * Ensure that the store to curpmap is globally visible before the
8843 	 * load from asid_epoch is performed.
8844 	 */
8845 	if (pmap->pm_stage == PM_STAGE1)
8846 		PCPU_SET(curpmap, pmap);
8847 	else
8848 		PCPU_SET(curvmpmap, pmap);
8849 	dsb(ish);
8850 	epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
8851 	if (epoch >= 0 && epoch != set->asid_epoch)
8852 		pmap_alloc_asid(pmap);
8853 
8854 	if (pmap->pm_stage == PM_STAGE1) {
8855 		set_ttbr0(pmap_to_ttbr0(pmap));
8856 		if (PCPU_GET(bcast_tlbi_workaround) != 0)
8857 			invalidate_local_icache();
8858 	}
8859 	return (true);
8860 }
8861 
8862 void
pmap_activate_vm(pmap_t pmap)8863 pmap_activate_vm(pmap_t pmap)
8864 {
8865 
8866 	PMAP_ASSERT_STAGE2(pmap);
8867 
8868 	(void)pmap_activate_int(pmap);
8869 }
8870 
8871 void
pmap_activate(struct thread * td)8872 pmap_activate(struct thread *td)
8873 {
8874 	pmap_t	pmap;
8875 
8876 	pmap = vmspace_pmap(td->td_proc->p_vmspace);
8877 	PMAP_ASSERT_STAGE1(pmap);
8878 	critical_enter();
8879 	(void)pmap_activate_int(pmap);
8880 	critical_exit();
8881 }
8882 
8883 /*
8884  * Activate the thread we are switching to.
8885  * To simplify the assembly in cpu_throw return the new threads pcb.
8886  */
8887 struct pcb *
pmap_switch(struct thread * new)8888 pmap_switch(struct thread *new)
8889 {
8890 	pcpu_bp_harden bp_harden;
8891 	struct pcb *pcb;
8892 
8893 	/* Store the new curthread */
8894 	PCPU_SET(curthread, new);
8895 
8896 	/* And the new pcb */
8897 	pcb = new->td_pcb;
8898 	PCPU_SET(curpcb, pcb);
8899 
8900 	/*
8901 	 * TODO: We may need to flush the cache here if switching
8902 	 * to a user process.
8903 	 */
8904 
8905 	if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
8906 		/*
8907 		 * Stop userspace from training the branch predictor against
8908 		 * other processes. This will call into a CPU specific
8909 		 * function that clears the branch predictor state.
8910 		 */
8911 		bp_harden = PCPU_GET(bp_harden);
8912 		if (bp_harden != NULL)
8913 			bp_harden();
8914 	}
8915 
8916 	return (pcb);
8917 }
8918 
8919 void
pmap_sync_icache(pmap_t pmap,vm_offset_t va,vm_size_t sz)8920 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
8921 {
8922 
8923 	PMAP_ASSERT_STAGE1(pmap);
8924 	KASSERT(ADDR_IS_CANONICAL(va),
8925 	    ("%s: Address not in canonical form: %lx", __func__, va));
8926 
8927 	if (ADDR_IS_KERNEL(va)) {
8928 		cpu_icache_sync_range((void *)va, sz);
8929 	} else {
8930 		u_int len, offset;
8931 		vm_paddr_t pa;
8932 
8933 		/* Find the length of data in this page to flush */
8934 		offset = va & PAGE_MASK;
8935 		len = imin(PAGE_SIZE - offset, sz);
8936 
8937 		while (sz != 0) {
8938 			/* Extract the physical address & find it in the DMAP */
8939 			pa = pmap_extract(pmap, va);
8940 			if (pa != 0)
8941 				cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
8942 				    len);
8943 
8944 			/* Move to the next page */
8945 			sz -= len;
8946 			va += len;
8947 			/* Set the length for the next iteration */
8948 			len = imin(PAGE_SIZE, sz);
8949 		}
8950 	}
8951 }
8952 
8953 static int
pmap_stage2_fault(pmap_t pmap,uint64_t esr,uint64_t far)8954 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
8955 {
8956 	pd_entry_t *pdep;
8957 	pt_entry_t *ptep, pte;
8958 	int rv, lvl, dfsc;
8959 
8960 	PMAP_ASSERT_STAGE2(pmap);
8961 	rv = KERN_FAILURE;
8962 
8963 	/* Data and insn aborts use same encoding for FSC field. */
8964 	dfsc = esr & ISS_DATA_DFSC_MASK;
8965 	switch (dfsc) {
8966 	case ISS_DATA_DFSC_TF_L0:
8967 	case ISS_DATA_DFSC_TF_L1:
8968 	case ISS_DATA_DFSC_TF_L2:
8969 	case ISS_DATA_DFSC_TF_L3:
8970 		PMAP_LOCK(pmap);
8971 		pdep = pmap_pde(pmap, far, &lvl);
8972 		if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
8973 			PMAP_UNLOCK(pmap);
8974 			break;
8975 		}
8976 
8977 		switch (lvl) {
8978 		case 0:
8979 			ptep = pmap_l0_to_l1(pdep, far);
8980 			break;
8981 		case 1:
8982 			ptep = pmap_l1_to_l2(pdep, far);
8983 			break;
8984 		case 2:
8985 			ptep = pmap_l2_to_l3(pdep, far);
8986 			break;
8987 		default:
8988 			panic("%s: Invalid pde level %d", __func__,lvl);
8989 		}
8990 		goto fault_exec;
8991 
8992 	case ISS_DATA_DFSC_AFF_L1:
8993 	case ISS_DATA_DFSC_AFF_L2:
8994 	case ISS_DATA_DFSC_AFF_L3:
8995 		PMAP_LOCK(pmap);
8996 		ptep = pmap_pte(pmap, far, &lvl);
8997 fault_exec:
8998 		if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
8999 			if (icache_vmid) {
9000 				pmap_invalidate_vpipt_icache();
9001 			} else {
9002 				/*
9003 				 * If accessing an executable page invalidate
9004 				 * the I-cache so it will be valid when we
9005 				 * continue execution in the guest. The D-cache
9006 				 * is assumed to already be clean to the Point
9007 				 * of Coherency.
9008 				 */
9009 				if ((pte & ATTR_S2_XN_MASK) !=
9010 				    ATTR_S2_XN(ATTR_S2_XN_NONE)) {
9011 					invalidate_icache();
9012 				}
9013 			}
9014 			pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
9015 			rv = KERN_SUCCESS;
9016 		}
9017 		PMAP_UNLOCK(pmap);
9018 		break;
9019 	}
9020 
9021 	return (rv);
9022 }
9023 
9024 int
pmap_fault(pmap_t pmap,uint64_t esr,uint64_t far)9025 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9026 {
9027 	pt_entry_t pte, *ptep;
9028 	register_t intr;
9029 	uint64_t ec, par;
9030 	int lvl, rv;
9031 
9032 	rv = KERN_FAILURE;
9033 
9034 	ec = ESR_ELx_EXCEPTION(esr);
9035 	switch (ec) {
9036 	case EXCP_INSN_ABORT_L:
9037 	case EXCP_INSN_ABORT:
9038 	case EXCP_DATA_ABORT_L:
9039 	case EXCP_DATA_ABORT:
9040 		break;
9041 	default:
9042 		return (rv);
9043 	}
9044 
9045 	if (pmap->pm_stage == PM_STAGE2)
9046 		return (pmap_stage2_fault(pmap, esr, far));
9047 
9048 	/* Data and insn aborts use same encoding for FSC field. */
9049 	switch (esr & ISS_DATA_DFSC_MASK) {
9050 	case ISS_DATA_DFSC_AFF_L1:
9051 	case ISS_DATA_DFSC_AFF_L2:
9052 	case ISS_DATA_DFSC_AFF_L3:
9053 		PMAP_LOCK(pmap);
9054 		ptep = pmap_pte(pmap, far, &lvl);
9055 		if (ptep != NULL) {
9056 			pmap_set_bits(ptep, ATTR_AF);
9057 			rv = KERN_SUCCESS;
9058 			/*
9059 			 * XXXMJ as an optimization we could mark the entry
9060 			 * dirty if this is a write fault.
9061 			 */
9062 		}
9063 		PMAP_UNLOCK(pmap);
9064 		break;
9065 	case ISS_DATA_DFSC_PF_L1:
9066 	case ISS_DATA_DFSC_PF_L2:
9067 	case ISS_DATA_DFSC_PF_L3:
9068 		if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
9069 		    (esr & ISS_DATA_WnR) == 0)
9070 			return (rv);
9071 		PMAP_LOCK(pmap);
9072 		ptep = pmap_pte(pmap, far, &lvl);
9073 		if (ptep != NULL &&
9074 		    ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
9075 			if ((pte & ATTR_S1_AP_RW_BIT) ==
9076 			    ATTR_S1_AP(ATTR_S1_AP_RO)) {
9077 				pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
9078 				pmap_s1_invalidate_page(pmap, far, true);
9079 			}
9080 			rv = KERN_SUCCESS;
9081 		}
9082 		PMAP_UNLOCK(pmap);
9083 		break;
9084 	case ISS_DATA_DFSC_TF_L0:
9085 	case ISS_DATA_DFSC_TF_L1:
9086 	case ISS_DATA_DFSC_TF_L2:
9087 	case ISS_DATA_DFSC_TF_L3:
9088 		/*
9089 		 * Retry the translation.  A break-before-make sequence can
9090 		 * produce a transient fault.
9091 		 */
9092 		if (pmap == kernel_pmap) {
9093 			/*
9094 			 * The translation fault may have occurred within a
9095 			 * critical section.  Therefore, we must check the
9096 			 * address without acquiring the kernel pmap's lock.
9097 			 */
9098 			if (pmap_klookup(far, NULL))
9099 				rv = KERN_SUCCESS;
9100 		} else {
9101 			PMAP_LOCK(pmap);
9102 			/* Ask the MMU to check the address. */
9103 			intr = intr_disable();
9104 			par = arm64_address_translate_s1e0r(far);
9105 			intr_restore(intr);
9106 			PMAP_UNLOCK(pmap);
9107 
9108 			/*
9109 			 * If the translation was successful, then we can
9110 			 * return success to the trap handler.
9111 			 */
9112 			if (PAR_SUCCESS(par))
9113 				rv = KERN_SUCCESS;
9114 		}
9115 		break;
9116 	}
9117 
9118 	return (rv);
9119 }
9120 
9121 /*
9122  *	Increase the starting virtual address of the given mapping if a
9123  *	different alignment might result in more superpage mappings.
9124  */
9125 void
pmap_align_superpage(vm_object_t object,vm_ooffset_t offset,vm_offset_t * addr,vm_size_t size)9126 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
9127     vm_offset_t *addr, vm_size_t size)
9128 {
9129 	vm_offset_t superpage_offset;
9130 
9131 	if (size < L2_SIZE)
9132 		return;
9133 	if (object != NULL && (object->flags & OBJ_COLORED) != 0)
9134 		offset += ptoa(object->pg_color);
9135 	superpage_offset = offset & L2_OFFSET;
9136 	if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
9137 	    (*addr & L2_OFFSET) == superpage_offset)
9138 		return;
9139 	if ((*addr & L2_OFFSET) < superpage_offset)
9140 		*addr = (*addr & ~L2_OFFSET) + superpage_offset;
9141 	else
9142 		*addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
9143 }
9144 
9145 /**
9146  * Get the kernel virtual address of a set of physical pages. If there are
9147  * physical addresses not covered by the DMAP perform a transient mapping
9148  * that will be removed when calling pmap_unmap_io_transient.
9149  *
9150  * \param page        The pages the caller wishes to obtain the virtual
9151  *                    address on the kernel memory map.
9152  * \param vaddr       On return contains the kernel virtual memory address
9153  *                    of the pages passed in the page parameter.
9154  * \param count       Number of pages passed in.
9155  * \param can_fault   true if the thread using the mapped pages can take
9156  *                    page faults, false otherwise.
9157  *
9158  * \returns true if the caller must call pmap_unmap_io_transient when
9159  *          finished or false otherwise.
9160  *
9161  */
9162 bool
pmap_map_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,bool can_fault)9163 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9164     bool can_fault)
9165 {
9166 	vm_paddr_t paddr;
9167 	bool needs_mapping;
9168 	int error __diagused, i;
9169 
9170 	/*
9171 	 * Allocate any KVA space that we need, this is done in a separate
9172 	 * loop to prevent calling vmem_alloc while pinned.
9173 	 */
9174 	needs_mapping = false;
9175 	for (i = 0; i < count; i++) {
9176 		paddr = VM_PAGE_TO_PHYS(page[i]);
9177 		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
9178 			error = vmem_alloc(kernel_arena, PAGE_SIZE,
9179 			    M_BESTFIT | M_WAITOK, &vaddr[i]);
9180 			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
9181 			needs_mapping = true;
9182 		} else {
9183 			vaddr[i] = PHYS_TO_DMAP(paddr);
9184 		}
9185 	}
9186 
9187 	/* Exit early if everything is covered by the DMAP */
9188 	if (!needs_mapping)
9189 		return (false);
9190 
9191 	if (!can_fault)
9192 		sched_pin();
9193 	for (i = 0; i < count; i++) {
9194 		paddr = VM_PAGE_TO_PHYS(page[i]);
9195 		if (!PHYS_IN_DMAP(paddr)) {
9196 			panic(
9197 			   "pmap_map_io_transient: TODO: Map out of DMAP data");
9198 		}
9199 	}
9200 
9201 	return (needs_mapping);
9202 }
9203 
9204 void
pmap_unmap_io_transient(vm_page_t page[],vm_offset_t vaddr[],int count,bool can_fault)9205 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9206     bool can_fault)
9207 {
9208 	vm_paddr_t paddr;
9209 	int i;
9210 
9211 	if (!can_fault)
9212 		sched_unpin();
9213 	for (i = 0; i < count; i++) {
9214 		paddr = VM_PAGE_TO_PHYS(page[i]);
9215 		if (!PHYS_IN_DMAP(paddr)) {
9216 			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
9217 		}
9218 	}
9219 }
9220 
9221 bool
pmap_is_valid_memattr(pmap_t pmap __unused,vm_memattr_t mode)9222 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
9223 {
9224 
9225 	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
9226 }
9227 
9228 static void *
bti_dup_range(void * ctx __unused,void * data)9229 bti_dup_range(void *ctx __unused, void *data)
9230 {
9231 	struct rs_el *node, *new_node;
9232 
9233 	new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9234 	if (new_node == NULL)
9235 		return (NULL);
9236 	node = data;
9237 	memcpy(new_node, node, sizeof(*node));
9238 	return (new_node);
9239 }
9240 
9241 static void
bti_free_range(void * ctx __unused,void * node)9242 bti_free_range(void *ctx __unused, void *node)
9243 {
9244 
9245 	uma_zfree(pmap_bti_ranges_zone, node);
9246 }
9247 
9248 static int
pmap_bti_assign(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9249 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9250 {
9251 	struct rs_el *rs;
9252 	int error;
9253 
9254 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9255 	PMAP_ASSERT_STAGE1(pmap);
9256 	MPASS(pmap->pm_bti != NULL);
9257 	rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9258 	if (rs == NULL)
9259 		return (ENOMEM);
9260 	error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
9261 	if (error != 0)
9262 		uma_zfree(pmap_bti_ranges_zone, rs);
9263 	return (error);
9264 }
9265 
9266 static void
pmap_bti_deassign_all(pmap_t pmap)9267 pmap_bti_deassign_all(pmap_t pmap)
9268 {
9269 
9270 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9271 	if (pmap->pm_bti != NULL)
9272 		rangeset_remove_all(pmap->pm_bti);
9273 }
9274 
9275 /*
9276  * Returns true if the BTI setting is the same across the specified address
9277  * range, and false otherwise.  When returning true, updates the referenced PTE
9278  * to reflect the BTI setting.
9279  *
9280  * Only stage 1 pmaps support BTI.  The kernel pmap is always a stage 1 pmap
9281  * that has the same BTI setting implicitly across its entire address range.
9282  */
9283 static bool
pmap_bti_same(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,pt_entry_t * pte)9284 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
9285 {
9286 	struct rs_el *next_rs, *rs;
9287 	vm_offset_t va;
9288 
9289 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9290 	KASSERT(ADDR_IS_CANONICAL(sva),
9291 	    ("%s: Start address not in canonical form: %lx", __func__, sva));
9292 	KASSERT(ADDR_IS_CANONICAL(eva),
9293 	    ("%s: End address not in canonical form: %lx", __func__, eva));
9294 	KASSERT((*pte & ATTR_S1_GP) == 0,
9295 	    ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte));
9296 
9297 	if (pmap == kernel_pmap) {
9298 		*pte |= ATTR_KERN_GP;
9299 		return (true);
9300 	}
9301 	if (pmap->pm_bti == NULL)
9302 		return (true);
9303 	PMAP_ASSERT_STAGE1(pmap);
9304 	rs = rangeset_lookup(pmap->pm_bti, sva);
9305 	if (rs == NULL) {
9306 		rs = rangeset_next(pmap->pm_bti, sva);
9307 		return (rs == NULL ||
9308 			rs->re_start >= eva);
9309 	}
9310 	while ((va = rs->re_end) < eva) {
9311 		next_rs = rangeset_next(pmap->pm_bti, va);
9312 		if (next_rs == NULL ||
9313 		    va != next_rs->re_start)
9314 			return (false);
9315 		rs = next_rs;
9316 	}
9317 	if (rs != NULL)
9318 		*pte |= ATTR_S1_GP;
9319 	return (true);
9320 }
9321 
9322 static pt_entry_t
pmap_pte_bti(pmap_t pmap,vm_offset_t va)9323 pmap_pte_bti(pmap_t pmap, vm_offset_t va)
9324 {
9325 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9326 	MPASS(ADDR_IS_CANONICAL(va));
9327 
9328 	if (pmap->pm_stage != PM_STAGE1)
9329 		return (0);
9330 	if (pmap == kernel_pmap)
9331 		return (ATTR_KERN_GP);
9332 	if (pmap->pm_bti != NULL && rangeset_lookup(pmap->pm_bti, va) != NULL)
9333 		return (ATTR_S1_GP);
9334 	return (0);
9335 }
9336 
9337 static void
pmap_bti_on_remove(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9338 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9339 {
9340 
9341 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9342 	if (pmap->pm_bti != NULL)
9343 		rangeset_remove(pmap->pm_bti, sva, eva);
9344 }
9345 
9346 static int
pmap_bti_copy(pmap_t dst_pmap,pmap_t src_pmap)9347 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
9348 {
9349 
9350 	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
9351 	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
9352 	MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
9353 	MPASS(src_pmap->pm_bti != NULL);
9354 	MPASS(dst_pmap->pm_bti != NULL);
9355 	if (src_pmap->pm_bti->rs_data_ctx == NULL)
9356 		return (0);
9357 	return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
9358 }
9359 
9360 static void
pmap_bti_update_range(pmap_t pmap,vm_offset_t sva,vm_offset_t eva,bool set)9361 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
9362 {
9363 	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9364 	PMAP_ASSERT_STAGE1(pmap);
9365 
9366 	pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
9367 	    true);
9368 }
9369 
9370 int
pmap_bti_set(pmap_t pmap,vm_offset_t sva,vm_offset_t eva)9371 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9372 {
9373 	int error;
9374 
9375 	if (pmap->pm_bti == NULL)
9376 		return (0);
9377 	if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
9378 		return (EINVAL);
9379 	if (pmap->pm_stage != PM_STAGE1)
9380 		return (EINVAL);
9381 	if (eva <= sva || ADDR_IS_KERNEL(eva))
9382 		return (EFAULT);
9383 
9384 	sva = trunc_page(sva);
9385 	eva = round_page(eva);
9386 	for (;;) {
9387 		PMAP_LOCK(pmap);
9388 		error = pmap_bti_assign(pmap, sva, eva);
9389 		if (error == 0)
9390 			pmap_bti_update_range(pmap, sva, eva, true);
9391 		PMAP_UNLOCK(pmap);
9392 		if (error != ENOMEM)
9393 			break;
9394 		vm_wait(NULL);
9395 	}
9396 	return (error);
9397 }
9398 
9399 #if defined(KASAN) || defined(KMSAN)
9400 static pd_entry_t	*pmap_san_early_l2;
9401 
9402 #define	SAN_BOOTSTRAP_L2_SIZE	(1 * L2_SIZE)
9403 #define	SAN_BOOTSTRAP_SIZE	(2 * PAGE_SIZE)
9404 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_l2(void)9405 pmap_san_enter_bootstrap_alloc_l2(void)
9406 {
9407 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
9408 	static size_t offset = 0;
9409 	vm_offset_t addr;
9410 
9411 	if (offset + L2_SIZE > sizeof(bootstrap_data)) {
9412 		panic("%s: out of memory for the bootstrap shadow map L2 entries",
9413 		    __func__);
9414 	}
9415 
9416 	addr = (uintptr_t)&bootstrap_data[offset];
9417 	offset += L2_SIZE;
9418 	return (addr);
9419 }
9420 
9421 /*
9422  * SAN L1 + L2 pages, maybe L3 entries later?
9423  */
9424 static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_pages(int npages)9425 pmap_san_enter_bootstrap_alloc_pages(int npages)
9426 {
9427 	static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
9428 	static size_t offset = 0;
9429 	vm_offset_t addr;
9430 
9431 	if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
9432 		panic("%s: out of memory for the bootstrap shadow map",
9433 		    __func__);
9434 	}
9435 
9436 	addr = (uintptr_t)&bootstrap_data[offset];
9437 	offset += (npages * PAGE_SIZE);
9438 	return (addr);
9439 }
9440 
9441 static void __nosanitizeaddress
pmap_san_enter_bootstrap(void)9442 pmap_san_enter_bootstrap(void)
9443 {
9444 	vm_offset_t freemempos;
9445 
9446 	/* L1, L2 */
9447 	freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
9448 	bs_state.freemempos = freemempos;
9449 	bs_state.va = KASAN_MIN_ADDRESS;
9450 	pmap_bootstrap_l1_table(&bs_state);
9451 	pmap_san_early_l2 = bs_state.l2;
9452 }
9453 
9454 static vm_page_t
pmap_san_enter_alloc_l3(void)9455 pmap_san_enter_alloc_l3(void)
9456 {
9457 	vm_page_t m;
9458 
9459 	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
9460 	    VM_ALLOC_ZERO);
9461 	if (m == NULL)
9462 		panic("%s: no memory to grow shadow map", __func__);
9463 	return (m);
9464 }
9465 
9466 static vm_page_t
pmap_san_enter_alloc_l2(void)9467 pmap_san_enter_alloc_l2(void)
9468 {
9469 	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
9470 	    Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
9471 }
9472 
9473 void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)9474 pmap_san_enter(vm_offset_t va)
9475 {
9476 	pd_entry_t *l1, *l2;
9477 	pt_entry_t *l3;
9478 	vm_page_t m;
9479 
9480 	if (virtual_avail == 0) {
9481 		vm_offset_t block;
9482 		int slot;
9483 		bool first;
9484 
9485 		/* Temporary shadow map prior to pmap_bootstrap(). */
9486 		first = pmap_san_early_l2 == NULL;
9487 		if (first)
9488 			pmap_san_enter_bootstrap();
9489 
9490 		l2 = pmap_san_early_l2;
9491 		slot = pmap_l2_index(va);
9492 
9493 		if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
9494 			MPASS(first);
9495 			block = pmap_san_enter_bootstrap_alloc_l2();
9496 			pmap_store(&l2[slot],
9497 			    PHYS_TO_PTE(pmap_early_vtophys(block)) |
9498 			    PMAP_SAN_PTE_BITS | L2_BLOCK);
9499 			dmb(ishst);
9500 		}
9501 
9502 		return;
9503 	}
9504 
9505 	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
9506 	l1 = pmap_l1(kernel_pmap, va);
9507 	MPASS(l1 != NULL);
9508 	if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
9509 		m = pmap_san_enter_alloc_l3();
9510 		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
9511 	}
9512 	l2 = pmap_l1_to_l2(l1, va);
9513 	if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
9514 		m = pmap_san_enter_alloc_l2();
9515 		if (m != NULL) {
9516 			pmap_store(l2, VM_PAGE_TO_PTE(m) |
9517 			    PMAP_SAN_PTE_BITS | L2_BLOCK);
9518 		} else {
9519 			m = pmap_san_enter_alloc_l3();
9520 			pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
9521 		}
9522 		dmb(ishst);
9523 	}
9524 	if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
9525 		return;
9526 	l3 = pmap_l2_to_l3(l2, va);
9527 	if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
9528 		return;
9529 	m = pmap_san_enter_alloc_l3();
9530 	pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
9531 	dmb(ishst);
9532 }
9533 #endif /* KASAN || KMSAN */
9534 
9535 /*
9536  * Track a range of the kernel's virtual address space that is contiguous
9537  * in various mapping attributes.
9538  */
9539 struct pmap_kernel_map_range {
9540 	vm_offset_t sva;
9541 	pt_entry_t attrs;
9542 	int l3pages;
9543 	int l3contig;
9544 	int l2blocks;
9545 	int l2contig;
9546 	int l1blocks;
9547 };
9548 
9549 static void
sysctl_kmaps_dump(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t eva)9550 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
9551     vm_offset_t eva)
9552 {
9553 	const char *mode;
9554 	int index;
9555 
9556 	if (eva <= range->sva)
9557 		return;
9558 
9559 	index = range->attrs & ATTR_S1_IDX_MASK;
9560 	switch (index) {
9561 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
9562 		mode = "DEV-NP";
9563 		break;
9564 	case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
9565 		mode = "DEV";
9566 		break;
9567 	case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
9568 		mode = "UC";
9569 		break;
9570 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
9571 		mode = "WB";
9572 		break;
9573 	case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
9574 		mode = "WT";
9575 		break;
9576 	default:
9577 		printf(
9578 		    "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
9579 		    __func__, index, range->sva, eva);
9580 		mode = "??";
9581 		break;
9582 	}
9583 
9584 	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n",
9585 	    range->sva, eva,
9586 	    (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
9587 	    (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
9588 	    (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
9589 	    (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
9590 	    (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
9591 	    mode, range->l1blocks, range->l2contig, range->l2blocks,
9592 	    range->l3contig, range->l3pages);
9593 
9594 	/* Reset to sentinel value. */
9595 	range->sva = 0xfffffffffffffffful;
9596 }
9597 
9598 /*
9599  * Determine whether the attributes specified by a page table entry match those
9600  * being tracked by the current range.
9601  */
9602 static bool
sysctl_kmaps_match(struct pmap_kernel_map_range * range,pt_entry_t attrs)9603 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
9604 {
9605 
9606 	return (range->attrs == attrs);
9607 }
9608 
9609 static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range * range,vm_offset_t va,pt_entry_t attrs)9610 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
9611     pt_entry_t attrs)
9612 {
9613 
9614 	memset(range, 0, sizeof(*range));
9615 	range->sva = va;
9616 	range->attrs = attrs;
9617 }
9618 
9619 /* Get the block/page attributes that correspond to the table attributes */
9620 static pt_entry_t
sysctl_kmaps_table_attrs(pd_entry_t table)9621 sysctl_kmaps_table_attrs(pd_entry_t table)
9622 {
9623 	pt_entry_t attrs;
9624 
9625 	attrs = 0;
9626 	if ((table & TATTR_UXN_TABLE) != 0)
9627 		attrs |= ATTR_S1_UXN;
9628 	if ((table & TATTR_PXN_TABLE) != 0)
9629 		attrs |= ATTR_S1_PXN;
9630 	if ((table & TATTR_AP_TABLE_RO) != 0)
9631 		attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
9632 
9633 	return (attrs);
9634 }
9635 
9636 /* Read the block/page attributes we care about */
9637 static pt_entry_t
sysctl_kmaps_block_attrs(pt_entry_t block)9638 sysctl_kmaps_block_attrs(pt_entry_t block)
9639 {
9640 	return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
9641 	    ATTR_S1_GP));
9642 }
9643 
9644 /*
9645  * Given a leaf PTE, derive the mapping's attributes.  If they do not match
9646  * those of the current run, dump the address range and its attributes, and
9647  * begin a new run.
9648  */
9649 static void
sysctl_kmaps_check(struct sbuf * sb,struct pmap_kernel_map_range * range,vm_offset_t va,pd_entry_t l0e,pd_entry_t l1e,pd_entry_t l2e,pt_entry_t l3e)9650 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
9651     vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
9652     pt_entry_t l3e)
9653 {
9654 	pt_entry_t attrs;
9655 
9656 	attrs = sysctl_kmaps_table_attrs(l0e);
9657 
9658 	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9659 		attrs |= sysctl_kmaps_block_attrs(l1e);
9660 		goto done;
9661 	}
9662 	attrs |= sysctl_kmaps_table_attrs(l1e);
9663 
9664 	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9665 		attrs |= sysctl_kmaps_block_attrs(l2e);
9666 		goto done;
9667 	}
9668 	attrs |= sysctl_kmaps_table_attrs(l2e);
9669 	attrs |= sysctl_kmaps_block_attrs(l3e);
9670 
9671 done:
9672 	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
9673 		sysctl_kmaps_dump(sb, range, va);
9674 		sysctl_kmaps_reinit(range, va, attrs);
9675 	}
9676 }
9677 
9678 static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)9679 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
9680 {
9681 	struct pmap_kernel_map_range range;
9682 	struct sbuf sbuf, *sb;
9683 	pd_entry_t l0e, *l1, l1e, *l2, l2e;
9684 	pt_entry_t *l3, l3e;
9685 	vm_offset_t sva;
9686 	vm_paddr_t pa;
9687 	int error, i, j, k, l;
9688 
9689 	error = sysctl_wire_old_buffer(req, 0);
9690 	if (error != 0)
9691 		return (error);
9692 	sb = &sbuf;
9693 	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
9694 
9695 	/* Sentinel value. */
9696 	range.sva = 0xfffffffffffffffful;
9697 
9698 	/*
9699 	 * Iterate over the kernel page tables without holding the kernel pmap
9700 	 * lock.  Kernel page table pages are never freed, so at worst we will
9701 	 * observe inconsistencies in the output.
9702 	 */
9703 	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
9704 	    i++) {
9705 		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
9706 			sbuf_printf(sb, "\nDirect map:\n");
9707 		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
9708 			sbuf_printf(sb, "\nKernel map:\n");
9709 #ifdef KASAN
9710 		else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
9711 			sbuf_printf(sb, "\nKASAN shadow map:\n");
9712 #endif
9713 #ifdef KMSAN
9714 		else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
9715 			sbuf_printf(sb, "\nKMSAN shadow map:\n");
9716 		else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
9717 			sbuf_printf(sb, "\nKMSAN origin map:\n");
9718 #endif
9719 
9720 		l0e = kernel_pmap->pm_l0[i];
9721 		if ((l0e & ATTR_DESCR_VALID) == 0) {
9722 			sysctl_kmaps_dump(sb, &range, sva);
9723 			sva += L0_SIZE;
9724 			continue;
9725 		}
9726 		pa = PTE_TO_PHYS(l0e);
9727 		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9728 
9729 		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
9730 			l1e = l1[j];
9731 			if ((l1e & ATTR_DESCR_VALID) == 0) {
9732 				sysctl_kmaps_dump(sb, &range, sva);
9733 				sva += L1_SIZE;
9734 				continue;
9735 			}
9736 			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
9737 				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
9738 				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
9739 				    0, 0);
9740 				range.l1blocks++;
9741 				sva += L1_SIZE;
9742 				continue;
9743 			}
9744 			pa = PTE_TO_PHYS(l1e);
9745 			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9746 
9747 			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
9748 				l2e = l2[k];
9749 				if ((l2e & ATTR_DESCR_VALID) == 0) {
9750 					sysctl_kmaps_dump(sb, &range, sva);
9751 					sva += L2_SIZE;
9752 					continue;
9753 				}
9754 				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
9755 					sysctl_kmaps_check(sb, &range, sva,
9756 					    l0e, l1e, l2e, 0);
9757 					if ((l2e & ATTR_CONTIGUOUS) != 0)
9758 						range.l2contig +=
9759 						    k % L2C_ENTRIES == 0 ?
9760 						    1 : 0;
9761 					else
9762 						range.l2blocks++;
9763 					sva += L2_SIZE;
9764 					continue;
9765 				}
9766 				pa = PTE_TO_PHYS(l2e);
9767 				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
9768 
9769 				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
9770 				    l++, sva += L3_SIZE) {
9771 					l3e = l3[l];
9772 					if ((l3e & ATTR_DESCR_VALID) == 0) {
9773 						sysctl_kmaps_dump(sb, &range,
9774 						    sva);
9775 						continue;
9776 					}
9777 					sysctl_kmaps_check(sb, &range, sva,
9778 					    l0e, l1e, l2e, l3e);
9779 					if ((l3e & ATTR_CONTIGUOUS) != 0)
9780 						range.l3contig +=
9781 						    l % L3C_ENTRIES == 0 ?
9782 						    1 : 0;
9783 					else
9784 						range.l3pages++;
9785 				}
9786 			}
9787 		}
9788 	}
9789 
9790 	error = sbuf_finish(sb);
9791 	sbuf_delete(sb);
9792 	return (error);
9793 }
9794 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
9795     CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
9796     NULL, 0, sysctl_kmaps, "A",
9797     "Dump kernel address layout");
9798