1 /*-
2 * Copyright (c) 1991 Regents of the University of California.
3 * All rights reserved.
4 * Copyright (c) 1994 John S. Dyson
5 * All rights reserved.
6 * Copyright (c) 1994 David Greenman
7 * All rights reserved.
8 * Copyright (c) 2003 Peter Wemm
9 * All rights reserved.
10 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
11 * All rights reserved.
12 * Copyright (c) 2014 Andrew Turner
13 * All rights reserved.
14 * Copyright (c) 2014-2016 The FreeBSD Foundation
15 * All rights reserved.
16 *
17 * This code is derived from software contributed to Berkeley by
18 * the Systems Programming Group of the University of Utah Computer
19 * Science Department and William Jolitz of UUNET Technologies Inc.
20 *
21 * This software was developed by Andrew Turner under sponsorship from
22 * the FreeBSD Foundation.
23 *
24 * Redistribution and use in source and binary forms, with or without
25 * modification, are permitted provided that the following conditions
26 * are met:
27 * 1. Redistributions of source code must retain the above copyright
28 * notice, this list of conditions and the following disclaimer.
29 * 2. Redistributions in binary form must reproduce the above copyright
30 * notice, this list of conditions and the following disclaimer in the
31 * documentation and/or other materials provided with the distribution.
32 * 3. All advertising materials mentioning features or use of this software
33 * must display the following acknowledgement:
34 * This product includes software developed by the University of
35 * California, Berkeley and its contributors.
36 * 4. Neither the name of the University nor the names of its contributors
37 * may be used to endorse or promote products derived from this software
38 * without specific prior written permission.
39 *
40 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
41 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
42 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
43 * ARE DISCLAIMED. IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
44 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
45 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
46 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
47 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
48 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
49 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
50 * SUCH DAMAGE.
51 */
52 /*-
53 * Copyright (c) 2003 Networks Associates Technology, Inc.
54 * All rights reserved.
55 *
56 * This software was developed for the FreeBSD Project by Jake Burkholder,
57 * Safeport Network Services, and Network Associates Laboratories, the
58 * Security Research Division of Network Associates, Inc. under
59 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
60 * CHATS research program.
61 *
62 * Redistribution and use in source and binary forms, with or without
63 * modification, are permitted provided that the following conditions
64 * are met:
65 * 1. Redistributions of source code must retain the above copyright
66 * notice, this list of conditions and the following disclaimer.
67 * 2. Redistributions in binary form must reproduce the above copyright
68 * notice, this list of conditions and the following disclaimer in the
69 * documentation and/or other materials provided with the distribution.
70 *
71 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
72 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
73 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
74 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
75 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
76 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
77 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
78 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
79 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
80 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
81 * SUCH DAMAGE.
82 */
83
84 #include <sys/cdefs.h>
85 /*
86 * Manages physical address maps.
87 *
88 * Since the information managed by this module is
89 * also stored by the logical address mapping module,
90 * this module may throw away valid virtual-to-physical
91 * mappings at almost any time. However, invalidations
92 * of virtual-to-physical mappings must be done as
93 * requested.
94 *
 * In order to cope with hardware architectures that make
 * virtual-to-physical map invalidations expensive, this module
 * may delay invalidation or protection-reduction operations
 * until they are actually necessary.  This module is given full
 * information as to which processors are currently using which
 * maps, and as to when physical maps must be made correct.
102 */
103
104 #include "opt_vm.h"
105
106 #include <sys/param.h>
107 #include <sys/asan.h>
108 #include <sys/bitstring.h>
109 #include <sys/bus.h>
110 #include <sys/systm.h>
111 #include <sys/kernel.h>
112 #include <sys/ktr.h>
113 #include <sys/limits.h>
114 #include <sys/lock.h>
115 #include <sys/malloc.h>
116 #include <sys/mman.h>
117 #include <sys/msan.h>
118 #include <sys/msgbuf.h>
119 #include <sys/mutex.h>
120 #include <sys/physmem.h>
121 #include <sys/proc.h>
122 #include <sys/rangeset.h>
123 #include <sys/rwlock.h>
124 #include <sys/sbuf.h>
125 #include <sys/sx.h>
126 #include <sys/vmem.h>
127 #include <sys/vmmeter.h>
128 #include <sys/sched.h>
129 #include <sys/sysctl.h>
130 #include <sys/_unrhdr.h>
131 #include <sys/smp.h>
132
133 #include <vm/vm.h>
134 #include <vm/vm_param.h>
135 #include <vm/vm_kern.h>
136 #include <vm/vm_page.h>
137 #include <vm/vm_map.h>
138 #include <vm/vm_object.h>
139 #include <vm/vm_extern.h>
140 #include <vm/vm_pageout.h>
141 #include <vm/vm_pager.h>
142 #include <vm/vm_phys.h>
143 #include <vm/vm_radix.h>
144 #include <vm/vm_reserv.h>
145 #include <vm/vm_dumpset.h>
146 #include <vm/uma.h>
147
148 #include <machine/asan.h>
149 #include <machine/machdep.h>
150 #include <machine/md_var.h>
151 #include <machine/pcb.h>
152
153 #ifdef NUMA
154 #define PMAP_MEMDOM MAXMEMDOM
155 #else
156 #define PMAP_MEMDOM 1
157 #endif
158
159 #define PMAP_ASSERT_STAGE1(pmap) MPASS((pmap)->pm_stage == PM_STAGE1)
160 #define PMAP_ASSERT_STAGE2(pmap) MPASS((pmap)->pm_stage == PM_STAGE2)
161
162 #define NL0PG (PAGE_SIZE/(sizeof (pd_entry_t)))
163 #define NL1PG (PAGE_SIZE/(sizeof (pd_entry_t)))
164 #define NL2PG (PAGE_SIZE/(sizeof (pd_entry_t)))
165 #define NL3PG (PAGE_SIZE/(sizeof (pt_entry_t)))
166
167 #define NUL0E L0_ENTRIES
168 #define NUL1E (NUL0E * NL1PG)
169 #define NUL2E (NUL1E * NL2PG)
170
171 #ifdef PV_STATS
172 #define PV_STAT(x) do { x ; } while (0)
173 #define __pvused
174 #else
175 #define PV_STAT(x) do { } while (0)
176 #define __pvused __unused
177 #endif
178
179 #define pmap_l0_pindex(v) (NUL2E + NUL1E + ((v) >> L0_SHIFT))
180 #define pmap_l1_pindex(v) (NUL2E + ((v) >> L1_SHIFT))
181 #define pmap_l2_pindex(v) ((v) >> L2_SHIFT)
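
/*
 * Worked example (illustrative only; it assumes a 4K granule, where
 * L2_SHIFT is 21, L1_SHIFT is 30, and L0_SHIFT is 39): the page table
 * page that maps the 2MB region starting at va 0x40200000 has pindex
 * pmap_l2_pindex(0x40200000) == 0x201, while the L1 and L0 page table
 * pages covering the same va get indices offset by NUL2E and
 * NUL2E + NUL1E, so the three kinds of page table pages occupy
 * non-overlapping pindex ranges.
 */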
182
183 #ifdef __ARM_FEATURE_BTI_DEFAULT
184 #define ATTR_KERN_GP ATTR_S1_GP
185 #else
186 #define ATTR_KERN_GP 0
187 #endif
188 #define PMAP_SAN_PTE_BITS (ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | \
189 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))
190
191 struct pmap_large_md_page {
192 struct rwlock pv_lock;
193 struct md_page pv_page;
194 /* Pad to a power of 2, see pmap_init_pv_table(). */
195 int pv_pad[2];
196 };
197
198 __exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
199 #define pv_dummy pv_dummy_large.pv_page
200 __read_mostly static struct pmap_large_md_page *pv_table;
201
202 static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)
204 {
205 struct vm_phys_seg *seg;
206
207 if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
208 return ((struct pmap_large_md_page *)seg->md_first +
209 pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
210 return (NULL);
211 }
212
213 static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)
215 {
216 struct pmap_large_md_page *pvd;
217
218 pvd = _pa_to_pmdp(pa);
219 if (pvd == NULL)
220 panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
221 return (pvd);
222 }
223
224 static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)
226 {
227 struct vm_phys_seg *seg;
228
229 seg = &vm_phys_segs[m->segind];
230 return ((struct pmap_large_md_page *)seg->md_first +
231 pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
232 }
233
234 #define pa_to_pvh(pa) (&(pa_to_pmdp(pa)->pv_page))
235 #define page_to_pvh(m) (&(page_to_pmdp(m)->pv_page))
236
237 #define PHYS_TO_PV_LIST_LOCK(pa) ({ \
238 struct pmap_large_md_page *_pvd; \
239 struct rwlock *_lock; \
240 _pvd = _pa_to_pmdp(pa); \
241 if (__predict_false(_pvd == NULL)) \
242 _lock = &pv_dummy_large.pv_lock; \
243 else \
244 _lock = &(_pvd->pv_lock); \
245 _lock; \
246 })
247
248 static struct rwlock *
VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
250 {
251 if ((m->flags & PG_FICTITIOUS) == 0)
252 return (&page_to_pmdp(m)->pv_lock);
253 else
254 return (&pv_dummy_large.pv_lock);
255 }
256
257 #define CHANGE_PV_LIST_LOCK(lockp, new_lock) do { \
258 struct rwlock **_lockp = (lockp); \
259 struct rwlock *_new_lock = (new_lock); \
260 \
261 if (_new_lock != *_lockp) { \
262 if (*_lockp != NULL) \
263 rw_wunlock(*_lockp); \
264 *_lockp = _new_lock; \
265 rw_wlock(*_lockp); \
266 } \
267 } while (0)
268
269 #define CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa) \
270 CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))
271
272 #define CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m) \
273 CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))
274
275 #define RELEASE_PV_LIST_LOCK(lockp) do { \
276 struct rwlock **_lockp = (lockp); \
277 \
278 if (*_lockp != NULL) { \
279 rw_wunlock(*_lockp); \
280 *_lockp = NULL; \
281 } \
282 } while (0)
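
/*
 * Usage sketch (illustrative, not taken verbatim from this file): a
 * caller that visits several pages typically starts with *lockp == NULL,
 * invokes CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m) whenever it moves to
 * a page covered by a different pv list lock, and finishes with
 * RELEASE_PV_LIST_LOCK(&lock).  The macros drop the previously held lock
 * before acquiring the new one, so at most one pv list lock is held at
 * any time.
 */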
283
284 #define PTE_TO_VM_PAGE(pte) PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
285 #define VM_PAGE_TO_PTE(m) PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))
286
287 /*
288 * The presence of this flag indicates that the mapping is writeable.
289 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
290 * it is dirty. This flag may only be set on managed mappings.
291 *
292 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
293 * as a software managed bit.
294 */
295 #define ATTR_SW_DBM ATTR_DBM
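
/*
 * A sketch of the resulting stage 1 encodings for a managed mapping,
 * following the comment above and pmap_pte_dirty() below (illustrative
 * only):
 *
 *	ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM	writeable, dirty
 *	ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM	writeable, clean
 *	ATTR_S1_AP(ATTR_S1_AP_RO)			read-only
 */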
296
297 struct pmap kernel_pmap_store;
298
299 /* Used for mapping ACPI memory before VM is initialized */
300 #define PMAP_PREINIT_MAPPING_COUNT 32
301 #define PMAP_PREINIT_MAPPING_SIZE (PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
302 static vm_offset_t preinit_map_va; /* Start VA of pre-init mapping space */
303 static int vm_initialized = 0; /* No need to use pre-init maps when set */
304
305 /*
306 * Reserve a few L2 blocks starting from 'preinit_map_va' pointer.
307 * Always map entire L2 block for simplicity.
308 * VA of L2 block = preinit_map_va + i * L2_SIZE
309 */
310 static struct pmap_preinit_mapping {
311 vm_paddr_t pa;
312 vm_offset_t va;
313 vm_size_t size;
314 } pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];
315
316 vm_offset_t virtual_avail; /* VA of first avail page (after kernel bss) */
317 vm_offset_t virtual_end; /* VA of last avail page (end of kernel AS) */
318 vm_offset_t kernel_vm_end = 0;
319
320 /*
321 * Data for the pv entry allocation mechanism.
322 */
323 #ifdef NUMA
324 static __inline int
pc_to_domain(struct pv_chunk *pc)
326 {
327 return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
328 }
329 #else
330 static __inline int
pc_to_domain(struct pv_chunk *pc __unused)
332 {
333 return (0);
334 }
335 #endif
336
337 struct pv_chunks_list {
338 struct mtx pvc_lock;
339 TAILQ_HEAD(pch, pv_chunk) pvc_list;
340 int active_reclaims;
341 } __aligned(CACHE_LINE_SIZE);
342
343 struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];
344
345 vm_paddr_t dmap_phys_base; /* The start of the dmap region */
346 vm_paddr_t dmap_phys_max; /* The limit of the dmap region */
347 vm_offset_t dmap_max_addr; /* The virtual address limit of the dmap */
348
349 extern pt_entry_t pagetable_l0_ttbr1[];
350
351 #define PHYSMAP_SIZE (2 * (VM_PHYSSEG_MAX - 1))
352 static vm_paddr_t physmap[PHYSMAP_SIZE];
353 static u_int physmap_idx;
354
355 static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
356 "VM/pmap parameters");
357
358 #if PAGE_SIZE == PAGE_SIZE_4K
359 #define L1_BLOCKS_SUPPORTED 1
360 #else
361 /* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
362 #define L1_BLOCKS_SUPPORTED 0
363 #endif
364
365 #define PMAP_ASSERT_L1_BLOCKS_SUPPORTED MPASS(L1_BLOCKS_SUPPORTED)
366
367 /*
 * This ASID allocator uses a bit vector ("asid_set") to remember which
 * ASIDs it has currently allocated to a pmap, a cursor ("asid_next") to
370 * optimize its search for a free ASID in the bit vector, and an epoch number
371 * ("asid_epoch") to indicate when it has reclaimed all previously allocated
372 * ASIDs that are not currently active on a processor.
373 *
374 * The current epoch number is always in the range [0, INT_MAX). Negative
375 * numbers and INT_MAX are reserved for special cases that are described
376 * below.
377 */
378 struct asid_set {
379 int asid_bits;
380 bitstr_t *asid_set;
381 int asid_set_size;
382 int asid_next;
383 int asid_epoch;
384 struct mtx asid_set_mutex;
385 };
386
387 static struct asid_set asids;
388 static struct asid_set vmids;
389
390 static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
391 "ASID allocator");
392 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
393 "The number of bits in an ASID");
394 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
395 "The last allocated ASID plus one");
396 SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
397 "The current epoch number");
398
399 static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
400 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in a VMID");
402 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
403 "The last allocated VMID plus one");
404 SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
405 "The current epoch number");
406
407 void (*pmap_clean_stage2_tlbi)(void);
408 void (*pmap_invalidate_vpipt_icache)(void);
409 void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
410 void (*pmap_stage2_invalidate_all)(uint64_t);
411
412 /*
413 * A pmap's cookie encodes an ASID and epoch number. Cookies for reserved
414 * ASIDs have a negative epoch number, specifically, INT_MIN. Cookies for
415 * dynamically allocated ASIDs have a non-negative epoch number.
416 *
417 * An invalid ASID is represented by -1.
418 *
419 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
420 * which indicates that an ASID should never be allocated to the pmap, and
421 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
422 * allocated when the pmap is next activated.
423 */
424 #define COOKIE_FROM(asid, epoch) ((long)((u_int)(asid) | \
425 ((u_long)(epoch) << 32)))
426 #define COOKIE_TO_ASID(cookie) ((int)(cookie))
427 #define COOKIE_TO_EPOCH(cookie) ((int)((u_long)(cookie) >> 32))
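
/*
 * Worked example (illustrative only): COOKIE_FROM(5, 2) stores ASID 5 in
 * the low 32 bits and epoch 2 in the high 32 bits, so COOKIE_TO_ASID()
 * recovers 5 and COOKIE_TO_EPOCH() recovers 2.  The reserved cookies
 * described above use the same packing with an ASID of -1.
 */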
428
429 #define TLBI_VA_SHIFT 12
430 #define TLBI_VA_MASK ((1ul << 44) - 1)
431 #define TLBI_VA(addr) (((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
432 #define TLBI_VA_L3_INCR (L3_SIZE >> TLBI_VA_SHIFT)
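
/*
 * Illustrative only: TLBI_VA() yields the page number of the address
 * truncated to the 44-bit VA field of a TLBI operand, and
 * TLBI_VA_L3_INCR is the amount by which that operand advances from one
 * L3 page (4KB with a 4K granule) to the next when a range of pages is
 * invalidated.
 */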
433
434 static int __read_frequently superpages_enabled = 1;
435 SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
436 CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
437 "Are large page mappings enabled?");
438
439 /*
440 * True when Branch Target Identification should be used by userspace. This
441 * allows pmap to mark pages as guarded with ATTR_S1_GP.
442 */
443 __read_mostly static bool pmap_bti_support = false;
444
445 /*
446 * Internal flags for pmap_enter()'s helper functions.
447 */
448 #define PMAP_ENTER_NORECLAIM 0x1000000 /* Don't reclaim PV entries. */
449 #define PMAP_ENTER_NOREPLACE 0x2000000 /* Don't replace mappings. */
450
451 TAILQ_HEAD(pv_chunklist, pv_chunk);
452
453 static void free_pv_chunk(struct pv_chunk *pc);
454 static void free_pv_chunk_batch(struct pv_chunklist *batch);
455 static void free_pv_entry(pmap_t pmap, pv_entry_t pv);
456 static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
457 static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
458 static void pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
459 static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
460 vm_offset_t va);
461
462 static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
463 static bool pmap_activate_int(pmap_t pmap);
464 static void pmap_alloc_asid(pmap_t pmap);
465 static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
466 vm_prot_t prot, int mode, bool skip_unmapped);
467 static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
468 pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
469 static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
470 static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
471 vm_offset_t va, struct rwlock **lockp);
472 static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
473 static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
474 static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
475 vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
476 static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
477 u_int flags, vm_page_t m, struct rwlock **lockp);
478 static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
479 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
480 static bool pmap_every_pte_zero(vm_paddr_t pa);
481 static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
482 bool all_l3e_AF_set);
483 static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
484 static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
485 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
486 static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
487 struct rwlock **lockp);
488 static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
489 static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
490 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
491 static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
492 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
493 static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
494 vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
495 struct rwlock **lockp);
496 static void pmap_reset_asid_set(pmap_t pmap);
497 static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
498 vm_page_t m, struct rwlock **lockp);
499
500 static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
501 struct rwlock **lockp);
502
503 static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
504 struct spglist *free);
505 static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
506 static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
507 vm_offset_t va, vm_size_t size);
508 static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);
509
510 static uma_zone_t pmap_bti_ranges_zone;
511 static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
512 static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
513 static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
514 static void *bti_dup_range(void *ctx, void *data);
515 static void bti_free_range(void *ctx, void *node);
516 static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
517 static void pmap_bti_deassign_all(pmap_t pmap);
518
519 /*
520 * These load the old table data and store the new value.
521 * They need to be atomic as the System MMU may write to the table at
522 * the same time as the CPU.
523 */
524 #define pmap_clear(table) atomic_store_64(table, 0)
525 #define pmap_clear_bits(table, bits) atomic_clear_64(table, bits)
526 #define pmap_load(table) (*table)
527 #define pmap_load_clear(table) atomic_swap_64(table, 0)
528 #define pmap_load_store(table, entry) atomic_swap_64(table, entry)
529 #define pmap_set_bits(table, bits) atomic_set_64(table, bits)
530 #define pmap_store(table, entry) atomic_store_64(table, entry)
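
/*
 * Typical usage (a sketch, not taken from this file): read the old entry
 * and publish a fully formed new one with a single atomic store, so that
 * neither the MMU nor the System MMU can observe a torn entry:
 *
 *	old = pmap_load(pte);
 *	pmap_store(pte, VM_PAGE_TO_PTE(m) | ATTR_DEFAULT | L3_PAGE);
 */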
531
532 /********************/
533 /* Inline functions */
534 /********************/
535
536 static __inline void
pagecopy(void *s, void *d)
538 {
539
540 memcpy(d, s, PAGE_SIZE);
541 }
542
543 static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
545 {
546
547 return (&pmap->pm_l0[pmap_l0_index(va)]);
548 }
549
550 static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
552 {
553 pd_entry_t *l1;
554
555 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
556 return (&l1[pmap_l1_index(va)]);
557 }
558
559 static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
561 {
562 pd_entry_t *l0;
563
564 l0 = pmap_l0(pmap, va);
565 if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
566 return (NULL);
567
568 return (pmap_l0_to_l1(l0, va));
569 }
570
571 static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
573 {
574 pd_entry_t l1, *l2p;
575
576 l1 = pmap_load(l1p);
577
578 KASSERT(ADDR_IS_CANONICAL(va),
579 ("%s: Address not in canonical form: %lx", __func__, va));
580 /*
581 * The valid bit may be clear if pmap_update_entry() is concurrently
582 * modifying the entry, so for KVA only the entry type may be checked.
583 */
584 KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
585 ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
586 KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
587 ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
588 l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
589 return (&l2p[pmap_l2_index(va)]);
590 }
591
592 static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
594 {
595 pd_entry_t *l1;
596
597 l1 = pmap_l1(pmap, va);
598 if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
599 return (NULL);
600
601 return (pmap_l1_to_l2(l1, va));
602 }
603
604 static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
606 {
607 pd_entry_t l2;
608 pt_entry_t *l3p;
609
610 l2 = pmap_load(l2p);
611
612 KASSERT(ADDR_IS_CANONICAL(va),
613 ("%s: Address not in canonical form: %lx", __func__, va));
614 /*
615 * The valid bit may be clear if pmap_update_entry() is concurrently
616 * modifying the entry, so for KVA only the entry type may be checked.
617 */
618 KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
619 ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
620 KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
621 ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
622 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
623 return (&l3p[pmap_l3_index(va)]);
624 }
625
626 /*
627 * Returns the lowest valid pde for a given virtual address.
628 * The next level may or may not point to a valid page or block.
629 */
630 static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
632 {
633 pd_entry_t *l0, *l1, *l2, desc;
634
635 l0 = pmap_l0(pmap, va);
636 desc = pmap_load(l0) & ATTR_DESCR_MASK;
637 if (desc != L0_TABLE) {
638 *level = -1;
639 return (NULL);
640 }
641
642 l1 = pmap_l0_to_l1(l0, va);
643 desc = pmap_load(l1) & ATTR_DESCR_MASK;
644 if (desc != L1_TABLE) {
645 *level = 0;
646 return (l0);
647 }
648
649 l2 = pmap_l1_to_l2(l1, va);
650 desc = pmap_load(l2) & ATTR_DESCR_MASK;
651 if (desc != L2_TABLE) {
652 *level = 1;
653 return (l1);
654 }
655
656 *level = 2;
657 return (l2);
658 }
659
660 /*
661 * Returns the lowest valid pte block or table entry for a given virtual
662 * address. If there are no valid entries return NULL and set the level to
663 * the first invalid level.
664 */
665 static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
667 {
668 pd_entry_t *l1, *l2, desc;
669 pt_entry_t *l3;
670
671 l1 = pmap_l1(pmap, va);
672 if (l1 == NULL) {
673 *level = 0;
674 return (NULL);
675 }
676 desc = pmap_load(l1) & ATTR_DESCR_MASK;
677 if (desc == L1_BLOCK) {
678 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
679 *level = 1;
680 return (l1);
681 }
682
683 if (desc != L1_TABLE) {
684 *level = 1;
685 return (NULL);
686 }
687
688 l2 = pmap_l1_to_l2(l1, va);
689 desc = pmap_load(l2) & ATTR_DESCR_MASK;
690 if (desc == L2_BLOCK) {
691 *level = 2;
692 return (l2);
693 }
694
695 if (desc != L2_TABLE) {
696 *level = 2;
697 return (NULL);
698 }
699
700 *level = 3;
701 l3 = pmap_l2_to_l3(l2, va);
702 if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
703 return (NULL);
704
705 return (l3);
706 }
707
708 /*
709 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
710 * level that maps the specified virtual address, then a pointer to that entry
711 * is returned. Otherwise, NULL is returned, unless INVARIANTS are enabled
712 * and a diagnostic message is provided, in which case this function panics.
713 */
714 static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
716 {
717 pd_entry_t *l0p, *l1p, *l2p;
718 pt_entry_t desc, *l3p;
719 int walk_level __diagused;
720
721 KASSERT(level >= 0 && level < 4,
722 ("%s: %s passed an out-of-range level (%d)", __func__, diag,
723 level));
724 l0p = pmap_l0(pmap, va);
725 desc = pmap_load(l0p) & ATTR_DESCR_MASK;
726 if (desc == L0_TABLE && level > 0) {
727 l1p = pmap_l0_to_l1(l0p, va);
728 desc = pmap_load(l1p) & ATTR_DESCR_MASK;
729 if (desc == L1_BLOCK && level == 1) {
730 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
731 return (l1p);
732 }
733 if (desc == L1_TABLE && level > 1) {
734 l2p = pmap_l1_to_l2(l1p, va);
735 desc = pmap_load(l2p) & ATTR_DESCR_MASK;
736 if (desc == L2_BLOCK && level == 2)
737 return (l2p);
738 else if (desc == L2_TABLE && level > 2) {
739 l3p = pmap_l2_to_l3(l2p, va);
740 desc = pmap_load(l3p) & ATTR_DESCR_MASK;
741 if (desc == L3_PAGE && level == 3)
742 return (l3p);
743 else
744 walk_level = 3;
745 } else
746 walk_level = 2;
747 } else
748 walk_level = 1;
749 } else
750 walk_level = 0;
751 KASSERT(diag == NULL,
752 ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
753 diag, va, level, desc, walk_level));
754 return (NULL);
755 }
756
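/*
 * Usage sketch (illustrative, not from this file):
 *
 *	l3p = pmap_pte_exists(pmap, va, 3, __func__);
 *
 * returns the L3_PAGE entry mapping "va" or, under INVARIANTS, panics
 * with the caller's name if no such entry exists; passing NULL for
 * "diag" turns the failure case into a plain NULL return instead.
 */
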
757 bool
pmap_ps_enabled(pmap_t pmap)
759 {
760 /*
	 * Promotion requires a hypervisor call when the kernel is running
	 * in EL1.  To avoid this, disable superpage support on non-stage 1
	 * pmaps for now.
764 */
765 if (pmap->pm_stage != PM_STAGE1)
766 return (false);
767
768 #ifdef KMSAN
769 /*
770 * The break-before-make in pmap_update_entry() results in a situation
771 * where a CPU may call into the KMSAN runtime while the entry is
772 * invalid. If the entry is used to map the current thread structure,
773 * then the runtime will attempt to access unmapped memory. Avoid this
774 * by simply disabling superpage promotion for the kernel map.
775 */
776 if (pmap == kernel_pmap)
777 return (false);
778 #endif
779
780 return (superpages_enabled != 0);
781 }
782
783 bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
785 pd_entry_t **l2, pt_entry_t **l3)
786 {
787 pd_entry_t *l0p, *l1p, *l2p;
788
789 if (pmap->pm_l0 == NULL)
790 return (false);
791
792 l0p = pmap_l0(pmap, va);
793 *l0 = l0p;
794
795 if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
796 return (false);
797
798 l1p = pmap_l0_to_l1(l0p, va);
799 *l1 = l1p;
800
801 if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
802 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
803 *l2 = NULL;
804 *l3 = NULL;
805 return (true);
806 }
807
808 if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
809 return (false);
810
811 l2p = pmap_l1_to_l2(l1p, va);
812 *l2 = l2p;
813
814 if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
815 *l3 = NULL;
816 return (true);
817 }
818
819 if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
820 return (false);
821
822 *l3 = pmap_l2_to_l3(l2p, va);
823
824 return (true);
825 }
826
827 static __inline int
pmap_l3_valid(pt_entry_t l3)
829 {
830
831 return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
832 }
833
834 CTASSERT(L1_BLOCK == L2_BLOCK);
835
836 static pt_entry_t
pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
838 {
839 pt_entry_t val;
840
841 if (pmap->pm_stage == PM_STAGE1) {
842 val = ATTR_S1_IDX(memattr);
843 if (memattr == VM_MEMATTR_DEVICE)
844 val |= ATTR_S1_XN;
845 return (val);
846 }
847
848 val = 0;
849
850 switch (memattr) {
851 case VM_MEMATTR_DEVICE:
852 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
853 ATTR_S2_XN(ATTR_S2_XN_ALL));
854 case VM_MEMATTR_UNCACHEABLE:
855 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
856 case VM_MEMATTR_WRITE_BACK:
857 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
858 case VM_MEMATTR_WRITE_THROUGH:
859 return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
860 default:
861 panic("%s: invalid memory attribute %x", __func__, memattr);
862 }
863 }
864
865 static pt_entry_t
pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
867 {
868 pt_entry_t val;
869
870 val = 0;
871 if (pmap->pm_stage == PM_STAGE1) {
872 if ((prot & VM_PROT_EXECUTE) == 0)
873 val |= ATTR_S1_XN;
874 if ((prot & VM_PROT_WRITE) == 0)
875 val |= ATTR_S1_AP(ATTR_S1_AP_RO);
876 } else {
877 if ((prot & VM_PROT_WRITE) != 0)
878 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
879 if ((prot & VM_PROT_READ) != 0)
880 val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
881 if ((prot & VM_PROT_EXECUTE) == 0)
882 val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
883 }
884
885 return (val);
886 }
887
888 /*
889 * Checks if the PTE is dirty.
890 */
891 static inline int
pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
893 {
894
895 KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));
896
897 if (pmap->pm_stage == PM_STAGE1) {
898 KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
899 ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));
900
901 return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
902 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
903 }
904
905 return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
906 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
907 }
908
909 static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
911 {
912
913 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
914 pmap->pm_stats.resident_count += count;
915 }
916
917 static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
919 {
920
921 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
922 KASSERT(pmap->pm_stats.resident_count >= count,
923 ("pmap %p resident count underflow %ld %d", pmap,
924 pmap->pm_stats.resident_count, count));
925 pmap->pm_stats.resident_count -= count;
926 }
927
928 static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)
930 {
931 vm_paddr_t pa_page;
932
933 pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
934 return (pa_page | (va & PAR_LOW_MASK));
935 }
936
937 /* State of the bootstrapped DMAP page tables */
938 struct pmap_bootstrap_state {
939 pt_entry_t *l1;
940 pt_entry_t *l2;
941 pt_entry_t *l3;
942 vm_offset_t freemempos;
943 vm_offset_t va;
944 vm_paddr_t pa;
945 pt_entry_t table_attrs;
946 u_int l0_slot;
947 u_int l1_slot;
948 u_int l2_slot;
949 bool dmap_valid;
950 };
951
952 /* The bootstrap state */
953 static struct pmap_bootstrap_state bs_state = {
954 .l1 = NULL,
955 .l2 = NULL,
956 .l3 = NULL,
957 .table_attrs = TATTR_PXN_TABLE,
958 .l0_slot = L0_ENTRIES,
959 .l1_slot = Ln_ENTRIES,
960 .l2_slot = Ln_ENTRIES,
961 .dmap_valid = false,
962 };
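
/*
 * A sketch of how the bootstrap state is used (illustrative only):
 * pmap_bootstrap_dmap() walks each physmap range, advancing bs_state.pa
 * and bs_state.va, while the pmap_bootstrap_l{0,1,2}_table() helpers
 * lazily carve page table pages out of bs_state.freemempos whenever the
 * walk crosses into a new L0/L1/L2 slot.
 */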
963
964 static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
966 {
967 vm_paddr_t l1_pa;
968 pd_entry_t l0e;
969 u_int l0_slot;
970
971 /* Link the level 0 table to a level 1 table */
972 l0_slot = pmap_l0_index(state->va);
973 if (l0_slot != state->l0_slot) {
974 /*
975 * Make sure we move from a low address to high address
976 * before the DMAP region is ready. This ensures we never
977 * modify an existing mapping until we can map from a
978 * physical address to a virtual address.
979 */
980 MPASS(state->l0_slot < l0_slot ||
981 state->l0_slot == L0_ENTRIES ||
982 state->dmap_valid);
983
984 /* Reset lower levels */
985 state->l2 = NULL;
986 state->l3 = NULL;
987 state->l1_slot = Ln_ENTRIES;
988 state->l2_slot = Ln_ENTRIES;
989
990 /* Check the existing L0 entry */
991 state->l0_slot = l0_slot;
992 if (state->dmap_valid) {
993 l0e = pagetable_l0_ttbr1[l0_slot];
994 if ((l0e & ATTR_DESCR_VALID) != 0) {
995 MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
996 l1_pa = PTE_TO_PHYS(l0e);
997 state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
998 return;
999 }
1000 }
1001
1002 /* Create a new L0 table entry */
1003 state->l1 = (pt_entry_t *)state->freemempos;
1004 memset(state->l1, 0, PAGE_SIZE);
1005 state->freemempos += PAGE_SIZE;
1006
1007 l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
1008 MPASS((l1_pa & Ln_TABLE_MASK) == 0);
1009 MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
1010 pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
1011 TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
1012 }
1013 KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
1014 }
1015
1016 static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
1018 {
1019 vm_paddr_t l2_pa;
1020 pd_entry_t l1e;
1021 u_int l1_slot;
1022
1023 /* Make sure there is a valid L0 -> L1 table */
1024 pmap_bootstrap_l0_table(state);
1025
1026 /* Link the level 1 table to a level 2 table */
1027 l1_slot = pmap_l1_index(state->va);
1028 if (l1_slot != state->l1_slot) {
1029 /* See pmap_bootstrap_l0_table for a description */
1030 MPASS(state->l1_slot < l1_slot ||
1031 state->l1_slot == Ln_ENTRIES ||
1032 state->dmap_valid);
1033
1034 /* Reset lower levels */
1035 state->l3 = NULL;
1036 state->l2_slot = Ln_ENTRIES;
1037
1038 /* Check the existing L1 entry */
1039 state->l1_slot = l1_slot;
1040 if (state->dmap_valid) {
1041 l1e = state->l1[l1_slot];
1042 if ((l1e & ATTR_DESCR_VALID) != 0) {
1043 MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
1044 l2_pa = PTE_TO_PHYS(l1e);
1045 state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
1046 return;
1047 }
1048 }
1049
1050 /* Create a new L1 table entry */
1051 state->l2 = (pt_entry_t *)state->freemempos;
1052 memset(state->l2, 0, PAGE_SIZE);
1053 state->freemempos += PAGE_SIZE;
1054
1055 l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
1056 MPASS((l2_pa & Ln_TABLE_MASK) == 0);
1057 MPASS(state->l1[l1_slot] == 0);
1058 pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
1059 state->table_attrs | L1_TABLE);
1060 }
1061 KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
1062 }
1063
1064 static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
1066 {
1067 vm_paddr_t l3_pa;
1068 pd_entry_t l2e;
1069 u_int l2_slot;
1070
1071 /* Make sure there is a valid L1 -> L2 table */
1072 pmap_bootstrap_l1_table(state);
1073
1074 /* Link the level 2 table to a level 3 table */
1075 l2_slot = pmap_l2_index(state->va);
1076 if (l2_slot != state->l2_slot) {
1077 /* See pmap_bootstrap_l0_table for a description */
1078 MPASS(state->l2_slot < l2_slot ||
1079 state->l2_slot == Ln_ENTRIES ||
1080 state->dmap_valid);
1081
1082 /* Check the existing L2 entry */
1083 state->l2_slot = l2_slot;
1084 if (state->dmap_valid) {
1085 l2e = state->l2[l2_slot];
1086 if ((l2e & ATTR_DESCR_VALID) != 0) {
1087 MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
1088 l3_pa = PTE_TO_PHYS(l2e);
1089 state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
1090 return;
1091 }
1092 }
1093
1094 /* Create a new L2 table entry */
1095 state->l3 = (pt_entry_t *)state->freemempos;
1096 memset(state->l3, 0, PAGE_SIZE);
1097 state->freemempos += PAGE_SIZE;
1098
1099 l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
1100 MPASS((l3_pa & Ln_TABLE_MASK) == 0);
1101 MPASS(state->l2[l2_slot] == 0);
1102 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
1103 state->table_attrs | L2_TABLE);
1104 }
1105 KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
1106 }
1107
1108 static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
1110 {
1111 u_int l2_slot;
1112 bool first;
1113
1114 if ((physmap[i + 1] - state->pa) < L2_SIZE)
1115 return;
1116
1117 /* Make sure there is a valid L1 table */
1118 pmap_bootstrap_l1_table(state);
1119
1120 MPASS((state->va & L2_OFFSET) == 0);
1121 for (first = true;
1122 state->va < DMAP_MAX_ADDRESS &&
1123 (physmap[i + 1] - state->pa) >= L2_SIZE;
1124 state->va += L2_SIZE, state->pa += L2_SIZE) {
1125 /*
1126 * Stop if we are about to walk off the end of what the
1127 * current L1 slot can address.
1128 */
1129 if (!first && (state->pa & L1_OFFSET) == 0)
1130 break;
1131
1132 first = false;
1133 l2_slot = pmap_l2_index(state->va);
1134 MPASS((state->pa & L2_OFFSET) == 0);
1135 MPASS(state->l2[l2_slot] == 0);
1136 pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
1137 ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1138 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | L2_BLOCK);
1139 }
1140 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1141 }
1142
1143 static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
1145 {
1146 pt_entry_t contig;
1147 u_int l3_slot;
1148 bool first;
1149
1150 if (physmap[i + 1] - state->pa < L3_SIZE)
1151 return;
1152
1153 /* Make sure there is a valid L2 table */
1154 pmap_bootstrap_l2_table(state);
1155
1156 MPASS((state->va & L3_OFFSET) == 0);
1157 for (first = true, contig = 0;
1158 state->va < DMAP_MAX_ADDRESS &&
1159 physmap[i + 1] - state->pa >= L3_SIZE;
1160 state->va += L3_SIZE, state->pa += L3_SIZE) {
1161 /*
1162 * Stop if we are about to walk off the end of what the
1163 * current L2 slot can address.
1164 */
1165 if (!first && (state->pa & L2_OFFSET) == 0)
1166 break;
1167
1168 /*
1169 * If we have an aligned, contiguous chunk of L3C_ENTRIES
1170 * L3 pages, set the contiguous bit within each PTE so that
1171 * the chunk can be cached using only one TLB entry.
1172 */
1173 if ((state->pa & L3C_OFFSET) == 0) {
1174 if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
1175 physmap[i + 1] - state->pa >= L3C_SIZE) {
1176 contig = ATTR_CONTIGUOUS;
1177 } else {
1178 contig = 0;
1179 }
1180 }
1181
1182 first = false;
1183 l3_slot = pmap_l3_index(state->va);
1184 MPASS((state->pa & L3_OFFSET) == 0);
1185 MPASS(state->l3[l3_slot] == 0);
1186 pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
1187 ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
1188 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
1189 }
1190 MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
1191 }
1192
1193 static void
pmap_bootstrap_dmap(vm_paddr_t min_pa)
1195 {
1196 int i;
1197
1198 dmap_phys_base = min_pa & ~L1_OFFSET;
1199 dmap_phys_max = 0;
1200 dmap_max_addr = 0;
1201
1202 for (i = 0; i < (physmap_idx * 2); i += 2) {
1203 bs_state.pa = physmap[i] & ~L3_OFFSET;
1204 bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;
1205
1206 /* Create L3 mappings at the start of the region */
1207 if ((bs_state.pa & L2_OFFSET) != 0)
1208 pmap_bootstrap_l3_page(&bs_state, i);
1209 MPASS(bs_state.pa <= physmap[i + 1]);
1210
1211 if (L1_BLOCKS_SUPPORTED) {
1212 /* Create L2 mappings at the start of the region */
1213 if ((bs_state.pa & L1_OFFSET) != 0)
1214 pmap_bootstrap_l2_block(&bs_state, i);
1215 MPASS(bs_state.pa <= physmap[i + 1]);
1216
1217 /* Create the main L1 block mappings */
1218 for (; bs_state.va < DMAP_MAX_ADDRESS &&
1219 (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
1220 bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
1221 /* Make sure there is a valid L1 table */
1222 pmap_bootstrap_l0_table(&bs_state);
1223 MPASS((bs_state.pa & L1_OFFSET) == 0);
1224 pmap_store(
1225 &bs_state.l1[pmap_l1_index(bs_state.va)],
1226 PHYS_TO_PTE(bs_state.pa) | ATTR_DEFAULT |
1227 ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
1228 ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
1229 }
1230 MPASS(bs_state.pa <= physmap[i + 1]);
1231
1232 /* Create L2 mappings at the end of the region */
1233 pmap_bootstrap_l2_block(&bs_state, i);
1234 } else {
1235 while (bs_state.va < DMAP_MAX_ADDRESS &&
1236 (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
1237 pmap_bootstrap_l2_block(&bs_state, i);
1238 }
1239 }
1240 MPASS(bs_state.pa <= physmap[i + 1]);
1241
1242 /* Create L3 mappings at the end of the region */
1243 pmap_bootstrap_l3_page(&bs_state, i);
1244 MPASS(bs_state.pa == physmap[i + 1]);
1245
1246 if (bs_state.pa > dmap_phys_max) {
1247 dmap_phys_max = bs_state.pa;
1248 dmap_max_addr = bs_state.va;
1249 }
1250 }
1251
1252 cpu_tlb_flushID();
1253 }
1254
1255 static void
pmap_bootstrap_l2(vm_offset_t va)
1257 {
1258 KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));
1259
	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages. */
1261 bs_state.va = va;
1262
1263 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
1264 pmap_bootstrap_l1_table(&bs_state);
1265 }
1266
1267 static void
pmap_bootstrap_l3(vm_offset_t va)
1269 {
1270 KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));
1271
	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages. */
1273 bs_state.va = va;
1274
1275 for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
1276 pmap_bootstrap_l2_table(&bs_state);
1277 }
1278
1279 /*
1280 * Bootstrap the system enough to run with virtual memory.
1281 */
1282 void
pmap_bootstrap(vm_size_t kernlen)
1284 {
1285 vm_offset_t dpcpu, msgbufpv;
1286 vm_paddr_t start_pa, pa, min_pa;
1287 int i;
1288
1289 /* Verify that the ASID is set through TTBR0. */
1290 KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
1291 ("pmap_bootstrap: TCR_EL1.A1 != 0"));
1292
1293 /* Set this early so we can use the pagetable walking functions */
1294 kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
1295 PMAP_LOCK_INIT(kernel_pmap);
1296 kernel_pmap->pm_l0_paddr =
1297 pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
1298 TAILQ_INIT(&kernel_pmap->pm_pvchunk);
1299 vm_radix_init(&kernel_pmap->pm_root);
1300 kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
1301 kernel_pmap->pm_stage = PM_STAGE1;
1302 kernel_pmap->pm_levels = 4;
1303 kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
1304 kernel_pmap->pm_asid_set = &asids;
1305
1306 /* Assume the address we were loaded to is a valid physical address */
1307 min_pa = pmap_early_vtophys(KERNBASE);
1308
1309 physmap_idx = physmem_avail(physmap, nitems(physmap));
1310 physmap_idx /= 2;
1311
1312 /*
1313 * Find the minimum physical address. physmap is sorted,
1314 * but may contain empty ranges.
1315 */
1316 for (i = 0; i < physmap_idx * 2; i += 2) {
1317 if (physmap[i] == physmap[i + 1])
1318 continue;
1319 if (physmap[i] <= min_pa)
1320 min_pa = physmap[i];
1321 }
1322
1323 bs_state.freemempos = KERNBASE + kernlen;
1324 bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);
1325
1326 /* Create a direct map region early so we can use it for pa -> va */
1327 pmap_bootstrap_dmap(min_pa);
1328 bs_state.dmap_valid = true;
1329 /*
1330 * We only use PXN when we know nothing will be executed from it, e.g.
1331 * the DMAP region.
1332 */
1333 bs_state.table_attrs &= ~TATTR_PXN_TABLE;
1334
1335 start_pa = pa = pmap_early_vtophys(KERNBASE);
1336
1337 /*
1338 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS. We assume that the
1339 * loader allocated the first and only l2 page table page used to map
1340 * the kernel, preloaded files and module metadata.
1341 */
1342 pmap_bootstrap_l2(KERNBASE + L1_SIZE);
1343 /* And the l3 tables for the early devmap */
1344 pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));
1345
1346 cpu_tlb_flushID();
1347
1348 #define alloc_pages(var, np) \
1349 (var) = bs_state.freemempos; \
1350 bs_state.freemempos += (np * PAGE_SIZE); \
1351 memset((char *)(var), 0, ((np) * PAGE_SIZE));
1352
1353 /* Allocate dynamic per-cpu area. */
1354 alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
1355 dpcpu_init((void *)dpcpu, 0);
1356
1357 /* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
1358 alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
1359 msgbufp = (void *)msgbufpv;
1360
1361 /* Reserve some VA space for early BIOS/ACPI mapping */
1362 preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);
1363
1364 virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
1365 virtual_avail = roundup2(virtual_avail, L1_SIZE);
1366 virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
1367 kernel_vm_end = virtual_avail;
1368
1369 pa = pmap_early_vtophys(bs_state.freemempos);
1370
1371 physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);
1372
1373 cpu_tlb_flushID();
1374 }
1375
1376 #if defined(KASAN) || defined(KMSAN)
1377 static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
1379 vm_offset_t *vap, vm_offset_t eva)
1380 {
1381 vm_paddr_t pa;
1382 vm_offset_t va;
1383 pd_entry_t *l2;
1384
1385 va = *vap;
1386 pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
1387 for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
1388 l2 = pmap_l2(kernel_pmap, va);
1389
1390 /*
1391 * KASAN stack checking results in us having already allocated
1392 * part of our shadow map, so we can just skip those segments.
1393 */
1394 if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
1395 pa += L2_SIZE;
1396 continue;
1397 }
1398
1399 bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
1400 physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
1401 pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
1402 }
1403 *vap = va;
1404 }
1405
1406 /*
1407 * Finish constructing the initial shadow map:
1408 * - Count how many pages from KERNBASE to virtual_avail (scaled for
1409 * shadow map)
1410 * - Map that entire range using L2 superpages.
1411 */
1412 static void
pmap_bootstrap_san1(vm_offset_t va, int scale)
1414 {
1415 vm_offset_t eva;
1416 vm_paddr_t kernstart;
1417 int i;
1418
1419 kernstart = pmap_early_vtophys(KERNBASE);
1420
1421 /*
	 * Rebuild physmap one more time; we may have excluded more regions
	 * from allocation since pmap_bootstrap().
1424 */
1425 bzero(physmap, sizeof(physmap));
1426 physmap_idx = physmem_avail(physmap, nitems(physmap));
1427 physmap_idx /= 2;
1428
1429 eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;
1430
1431 /*
	 * Find a slot in the physmap large enough for what we need.  We try
	 * to put the shadow map as high up as we can to avoid depleting the
	 * lower 4GB in case it's needed for, e.g., an xhci controller that
	 * can only do 32-bit DMA.
1435 */
1436 for (i = (physmap_idx * 2) - 2; i >= 0; i -= 2) {
1437 vm_paddr_t plow, phigh;
1438
1439 /* L2 mappings must be backed by memory that is L2-aligned */
1440 plow = roundup2(physmap[i], L2_SIZE);
1441 phigh = physmap[i + 1];
1442 if (plow >= phigh)
1443 continue;
1444 if (kernstart >= plow && kernstart < phigh)
1445 phigh = kernstart;
1446 if (phigh - plow >= L2_SIZE) {
1447 pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1448 if (va >= eva)
1449 break;
1450 }
1451 }
1452 if (i < 0)
1453 panic("Could not find phys region for shadow map");
1454
1455 /*
1456 * Done. We should now have a valid shadow address mapped for all KVA
1457 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1458 * shadow accesses by the sanitizer runtime will succeed for this range.
1459 * When the kernel virtual address range is later expanded, as will
1460 * happen in vm_mem_init(), the shadow map will be grown as well. This
1461 * is handled by pmap_san_enter().
1462 */
1463 }
1464
1465 void
pmap_bootstrap_san(void)
1467 {
1468 #ifdef KASAN
1469 pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
1470 #else
1471 static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1472 static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1473 pd_entry_t *l0, *l1;
1474
1475 if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
1476 panic("initial kernel map is too large");
1477
1478 l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
1479 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1480 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
1481 l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
1482 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1483 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
1484 pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);
1485
1486 l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
1487 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1488 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
1489 l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
1490 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1491 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
1492 pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
1493 #endif
1494 }
1495 #endif
1496
1497 /*
1498 * Initialize a vm_page's machine-dependent fields.
1499 */
1500 void
pmap_page_init(vm_page_t m)
1502 {
1503
1504 TAILQ_INIT(&m->md.pv_list);
1505 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1506 }
1507
1508 static void
pmap_init_asids(struct asid_set *set, int bits)
1510 {
1511 int i;
1512
1513 set->asid_bits = bits;
1514
1515 /*
1516 * We may be too early in the overall initialization process to use
1517 * bit_alloc().
1518 */
1519 set->asid_set_size = 1 << set->asid_bits;
1520 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1521 M_WAITOK | M_ZERO);
1522 for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1523 bit_set(set->asid_set, i);
1524 set->asid_next = ASID_FIRST_AVAILABLE;
1525 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1526 }
1527
1528 static void
pmap_init_pv_table(void)
1530 {
1531 struct vm_phys_seg *seg, *next_seg;
1532 struct pmap_large_md_page *pvd;
1533 vm_size_t s;
1534 int domain, i, j, pages;
1535
1536 /*
1537 * We strongly depend on the size being a power of two, so the assert
1538 * is overzealous. However, should the struct be resized to a
1539 * different power of two, the code below needs to be revisited.
1540 */
1541 CTASSERT((sizeof(*pvd) == 64));
1542
1543 /*
1544 * Calculate the size of the array.
1545 */
1546 s = 0;
1547 for (i = 0; i < vm_phys_nsegs; i++) {
1548 seg = &vm_phys_segs[i];
1549 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1550 pmap_l2_pindex(seg->start);
1551 s += round_page(pages * sizeof(*pvd));
1552 }
1553 pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1554 if (pv_table == NULL)
1555 panic("%s: kva_alloc failed\n", __func__);
1556
1557 /*
1558 * Iterate physical segments to allocate domain-local memory for PV
1559 * list headers.
1560 */
1561 pvd = pv_table;
1562 for (i = 0; i < vm_phys_nsegs; i++) {
1563 seg = &vm_phys_segs[i];
1564 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1565 pmap_l2_pindex(seg->start);
1566 domain = seg->domain;
1567
1568 s = round_page(pages * sizeof(*pvd));
1569
1570 for (j = 0; j < s; j += PAGE_SIZE) {
1571 vm_page_t m = vm_page_alloc_noobj_domain(domain,
1572 VM_ALLOC_ZERO);
1573 if (m == NULL)
1574 panic("failed to allocate PV table page");
1575 pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1576 }
1577
1578 for (j = 0; j < s / sizeof(*pvd); j++) {
1579 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1580 TAILQ_INIT(&pvd->pv_page.pv_list);
1581 pvd++;
1582 }
1583 }
1584 pvd = &pv_dummy_large;
1585 memset(pvd, 0, sizeof(*pvd));
1586 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1587 TAILQ_INIT(&pvd->pv_page.pv_list);
1588
1589 /*
1590 * Set pointers from vm_phys_segs to pv_table.
1591 */
1592 for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1593 seg = &vm_phys_segs[i];
1594 seg->md_first = pvd;
1595 pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1596 pmap_l2_pindex(seg->start);
1597
1598 /*
1599 * If there is a following segment, and the final
1600 * superpage of this segment and the initial superpage
1601 * of the next segment are the same then adjust the
1602 * pv_table entry for that next segment down by one so
1603 * that the pv_table entries will be shared.
1604 */
1605 if (i + 1 < vm_phys_nsegs) {
1606 next_seg = &vm_phys_segs[i + 1];
1607 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1608 pmap_l2_pindex(next_seg->start)) {
1609 pvd--;
1610 }
1611 }
1612 }
1613 }
1614
1615 /*
1616 * Initialize the pmap module.
1617 * Called by vm_init, to initialize any structures that the pmap
1618 * system needs to map virtual memory.
1619 */
1620 void
pmap_init(void)
1622 {
1623 uint64_t mmfr1;
1624 int i, vmid_bits;
1625
1626 /*
1627 * Are large page mappings enabled?
1628 */
1629 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1630 if (superpages_enabled) {
1631 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1632 ("pmap_init: can't assign to pagesizes[1]"));
1633 pagesizes[1] = L2_SIZE;
1634 if (L1_BLOCKS_SUPPORTED) {
1635 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1636 ("pmap_init: can't assign to pagesizes[2]"));
1637 pagesizes[2] = L1_SIZE;
1638 }
1639 }
1640
1641 /*
1642 * Initialize the ASID allocator.
1643 */
1644 pmap_init_asids(&asids,
1645 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1646
1647 if (has_hyp()) {
1648 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1649 vmid_bits = 8;
1650
1651 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1652 ID_AA64MMFR1_VMIDBits_16)
1653 vmid_bits = 16;
1654 pmap_init_asids(&vmids, vmid_bits);
1655 }
1656
1657 /*
1658 * Initialize pv chunk lists.
1659 */
1660 for (i = 0; i < PMAP_MEMDOM; i++) {
1661 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1662 MTX_DEF);
1663 TAILQ_INIT(&pv_chunks[i].pvc_list);
1664 }
1665 pmap_init_pv_table();
1666
1667 vm_initialized = 1;
1668 }
1669
1670 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1671 "2MB page mapping counters");
1672
1673 static u_long pmap_l2_demotions;
1674 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1675 &pmap_l2_demotions, 0, "2MB page demotions");
1676
1677 static u_long pmap_l2_mappings;
1678 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1679 &pmap_l2_mappings, 0, "2MB page mappings");
1680
1681 static u_long pmap_l2_p_failures;
1682 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1683 &pmap_l2_p_failures, 0, "2MB page promotion failures");
1684
1685 static u_long pmap_l2_promotions;
1686 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1687 &pmap_l2_promotions, 0, "2MB page promotions");
1688
1689 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1690 "L3C (64KB/2MB) page mapping counters");
1691
1692 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
1693 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
1694 &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");
1695
1696 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
1697 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
1698 &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");
1699
1700 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
1701 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
1702 &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");
1703
1704 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
1705 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
1706 &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1707
1708 /*
1709 * If the given value for "final_only" is false, then any cached intermediate-
1710 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1711 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1712 * Otherwise, just the cached final-level entry is invalidated.
1713 */
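/*
 * For example, pmap_kremove() passes "final_only" as true because it only
 * clears a leaf L3 entry, whereas _pmap_unwire_l3() passes false because it
 * also removes an intermediate table entry.
 */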
1714 static __inline void
1715 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1716 {
1717 if (final_only)
1718 __asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1719 else
1720 __asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1721 }
1722
1723 static __inline void
1724 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1725 {
1726 if (final_only)
1727 __asm __volatile("tlbi vale1is, %0" : : "r" (r));
1728 else
1729 __asm __volatile("tlbi vae1is, %0" : : "r" (r));
1730 }
1731
1732 /*
1733 * Invalidates any cached final- and optionally intermediate-level TLB entries
1734 * for the specified virtual address in the given virtual address space.
1735 */
1736 static __inline void
1737 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1738 {
1739 uint64_t r;
1740
1741 PMAP_ASSERT_STAGE1(pmap);
1742
1743 dsb(ishst);
1744 r = TLBI_VA(va);
1745 if (pmap == kernel_pmap) {
1746 pmap_s1_invalidate_kernel(r, final_only);
1747 } else {
1748 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1749 pmap_s1_invalidate_user(r, final_only);
1750 }
1751 dsb(ish);
1752 isb();
1753 }
1754
1755 static __inline void
1756 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1757 {
1758 PMAP_ASSERT_STAGE2(pmap);
1759 MPASS(pmap_stage2_invalidate_range != NULL);
1760 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1761 final_only);
1762 }
1763
1764 static __inline void
1765 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1766 {
1767 if (pmap->pm_stage == PM_STAGE1)
1768 pmap_s1_invalidate_page(pmap, va, final_only);
1769 else
1770 pmap_s2_invalidate_page(pmap, va, final_only);
1771 }
1772
1773 /*
1774 * Invalidates any cached final- and optionally intermediate-level TLB entries
1775 * for the specified virtual address range in the given virtual address space.
1776 */
1777 static __inline void
1778 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1779 bool final_only)
1780 {
1781 uint64_t end, r, start;
1782
1783 PMAP_ASSERT_STAGE1(pmap);
1784
1785 dsb(ishst);
1786 if (pmap == kernel_pmap) {
1787 start = TLBI_VA(sva);
1788 end = TLBI_VA(eva);
1789 for (r = start; r < end; r += TLBI_VA_L3_INCR)
1790 pmap_s1_invalidate_kernel(r, final_only);
1791 } else {
1792 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1793 start |= TLBI_VA(sva);
1794 end |= TLBI_VA(eva);
1795 for (r = start; r < end; r += TLBI_VA_L3_INCR)
1796 pmap_s1_invalidate_user(r, final_only);
1797 }
1798 dsb(ish);
1799 isb();
1800 }
1801
1802 static __inline void
1803 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1804 bool final_only)
1805 {
1806 PMAP_ASSERT_STAGE2(pmap);
1807 MPASS(pmap_stage2_invalidate_range != NULL);
1808 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1809 }
1810
1811 static __inline void
1812 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1813 bool final_only)
1814 {
1815 if (pmap->pm_stage == PM_STAGE1)
1816 pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1817 else
1818 pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1819 }
1820
1821 /*
1822 * Invalidates all cached intermediate- and final-level TLB entries for the
1823 * given virtual address space.
1824 */
1825 static __inline void
1826 pmap_s1_invalidate_all(pmap_t pmap)
1827 {
1828 uint64_t r;
1829
1830 PMAP_ASSERT_STAGE1(pmap);
1831
1832 dsb(ishst);
1833 if (pmap == kernel_pmap) {
1834 __asm __volatile("tlbi vmalle1is");
1835 } else {
1836 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1837 __asm __volatile("tlbi aside1is, %0" : : "r" (r));
1838 }
1839 dsb(ish);
1840 isb();
1841 }
1842
1843 static __inline void
1844 pmap_s2_invalidate_all(pmap_t pmap)
1845 {
1846 PMAP_ASSERT_STAGE2(pmap);
1847 MPASS(pmap_stage2_invalidate_all != NULL);
1848 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1849 }
1850
1851 static __inline void
1852 pmap_invalidate_all(pmap_t pmap)
1853 {
1854 if (pmap->pm_stage == PM_STAGE1)
1855 pmap_s1_invalidate_all(pmap);
1856 else
1857 pmap_s2_invalidate_all(pmap);
1858 }
1859
1860 /*
1861 * Routine: pmap_extract
1862 * Function:
1863 * Extract the physical page address associated
1864 * with the given map/virtual_address pair.
1865 */
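/*
 * For example, with 4 KB base pages, a va covered by a 2 MB L2 block
 * resolves to the block's physical base (PTE_TO_PHYS(tpte)) plus the low
 * 21 bits of va (va & L2_OFFSET); L1 blocks and L3 pages are handled
 * analogously using L1_OFFSET and L3_OFFSET.
 */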
1866 vm_paddr_t
1867 pmap_extract(pmap_t pmap, vm_offset_t va)
1868 {
1869 pt_entry_t *pte, tpte;
1870 vm_paddr_t pa;
1871 int lvl;
1872
1873 pa = 0;
1874 PMAP_LOCK(pmap);
1875 /*
1876 * Find the block or page map for this virtual address. pmap_pte
1877 * will return either a valid block/page entry, or NULL.
1878 */
1879 pte = pmap_pte(pmap, va, &lvl);
1880 if (pte != NULL) {
1881 tpte = pmap_load(pte);
1882 pa = PTE_TO_PHYS(tpte);
1883 switch (lvl) {
1884 case 1:
1885 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
1886 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1887 ("pmap_extract: Invalid L1 pte found: %lx",
1888 tpte & ATTR_DESCR_MASK));
1889 pa |= (va & L1_OFFSET);
1890 break;
1891 case 2:
1892 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1893 ("pmap_extract: Invalid L2 pte found: %lx",
1894 tpte & ATTR_DESCR_MASK));
1895 pa |= (va & L2_OFFSET);
1896 break;
1897 case 3:
1898 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1899 ("pmap_extract: Invalid L3 pte found: %lx",
1900 tpte & ATTR_DESCR_MASK));
1901 pa |= (va & L3_OFFSET);
1902 break;
1903 }
1904 }
1905 PMAP_UNLOCK(pmap);
1906 return (pa);
1907 }
1908
1909 /*
1910 * Routine: pmap_extract_and_hold
1911 * Function:
1912 * Atomically extract and hold the physical page
1913 * with the given pmap and virtual address pair
1914 * if that mapping permits the given protection.
1915 */
1916 vm_page_t
1917 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1918 {
1919 pt_entry_t *pte, tpte;
1920 vm_offset_t off;
1921 vm_page_t m;
1922 int lvl;
1923 bool use;
1924
1925 m = NULL;
1926 PMAP_LOCK(pmap);
1927 pte = pmap_pte(pmap, va, &lvl);
1928 if (pte != NULL) {
1929 tpte = pmap_load(pte);
1930
1931 KASSERT(lvl > 0 && lvl <= 3,
1932 ("pmap_extract_and_hold: Invalid level %d", lvl));
1933 /*
1934 * Check that the pte is either an L3 page, or an L1 or L2 block
1935 * entry. We can assume L1_BLOCK == L2_BLOCK.
1936 */
1937 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1938 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1939 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1940 tpte & ATTR_DESCR_MASK));
1941
1942 use = false;
1943 if ((prot & VM_PROT_WRITE) == 0)
1944 use = true;
1945 else if (pmap->pm_stage == PM_STAGE1 &&
1946 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1947 use = true;
1948 else if (pmap->pm_stage == PM_STAGE2 &&
1949 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1950 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1951 use = true;
1952
1953 if (use) {
1954 switch (lvl) {
1955 case 1:
1956 off = va & L1_OFFSET;
1957 break;
1958 case 2:
1959 off = va & L2_OFFSET;
1960 break;
1961 case 3:
1962 default:
1963 off = 0;
1964 }
1965 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
1966 if (m != NULL && !vm_page_wire_mapped(m))
1967 m = NULL;
1968 }
1969 }
1970 PMAP_UNLOCK(pmap);
1971 return (m);
1972 }
1973
1974 /*
1975 * Walks the page tables to translate a kernel virtual address to a
1976 * physical address. Returns true if the kva is valid and stores the
1977 * physical address in pa if it is not NULL.
1978 *
1979 * See the comment above data_abort() for the rationale for specifying
1980 * NO_PERTHREAD_SSP here.
1981 */
1982 bool NO_PERTHREAD_SSP
1983 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
1984 {
1985 pt_entry_t *pte, tpte;
1986 register_t intr;
1987 uint64_t par;
1988
1989 /*
1990 * Disable interrupts so we don't get interrupted between asking
1991 * for address translation, and getting the result back.
1992 */
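/*
 * (The address translation instruction issued by
 * arm64_address_translate_s1e1r() reports its result through PAR_EL1,
 * which an interrupt handler performing its own translation could
 * overwrite.)
 */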
1993 intr = intr_disable();
1994 par = arm64_address_translate_s1e1r(va);
1995 intr_restore(intr);
1996
1997 if (PAR_SUCCESS(par)) {
1998 if (pa != NULL)
1999 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2000 return (true);
2001 }
2002
2003 /*
2004 * Fall back to walking the page table. The address translation
2005 * instruction may fail when the page is in a break-before-make
2006 * sequence. As we only clear the valid bit in said sequence we
2007 * can walk the page table to find the physical address.
2008 */
2009
2010 pte = pmap_l1(kernel_pmap, va);
2011 if (pte == NULL)
2012 return (false);
2013
2014 /*
2015 * A concurrent pmap_update_entry() will clear the entry's valid bit
2016 * but leave the rest of the entry unchanged. Therefore, we treat a
2017 * non-zero entry as being valid, and we ignore the valid bit when
2018 * determining whether the entry maps a block, page, or table.
2019 */
2020 tpte = pmap_load(pte);
2021 if (tpte == 0)
2022 return (false);
2023 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2024 if (pa != NULL)
2025 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2026 return (true);
2027 }
2028 pte = pmap_l1_to_l2(&tpte, va);
2029 tpte = pmap_load(pte);
2030 if (tpte == 0)
2031 return (false);
2032 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2033 if (pa != NULL)
2034 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2035 return (true);
2036 }
2037 pte = pmap_l2_to_l3(&tpte, va);
2038 tpte = pmap_load(pte);
2039 if (tpte == 0)
2040 return (false);
2041 if (pa != NULL)
2042 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2043 return (true);
2044 }
2045
2046 /*
2047 * Routine: pmap_kextract
2048 * Function:
2049 * Extract the physical page address associated with the given kernel
2050 * virtual address.
2051 */
2052 vm_paddr_t
2053 pmap_kextract(vm_offset_t va)
2054 {
2055 vm_paddr_t pa;
2056
2057 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2058 return (DMAP_TO_PHYS(va));
2059
2060 if (pmap_klookup(va, &pa) == false)
2061 return (0);
2062 return (pa);
2063 }
2064
2065 /***************************************************
2066 * Low level mapping routines.....
2067 ***************************************************/
2068
2069 void
2070 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
2071 {
2072 pd_entry_t *pde;
2073 pt_entry_t attr, old_l3e, *pte;
2074 vm_offset_t va;
2075 vm_page_t mpte;
2076 int error, lvl;
2077
2078 KASSERT((pa & L3_OFFSET) == 0,
2079 ("pmap_kenter: Invalid physical address"));
2080 KASSERT((sva & L3_OFFSET) == 0,
2081 ("pmap_kenter: Invalid virtual address"));
2082 KASSERT((size & PAGE_MASK) == 0,
2083 ("pmap_kenter: Mapping is not page-sized"));
2084
2085 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2086 ATTR_KERN_GP | ATTR_S1_IDX(mode);
2087 old_l3e = 0;
2088 va = sva;
2089 while (size != 0) {
2090 pde = pmap_pde(kernel_pmap, va, &lvl);
2091 KASSERT(pde != NULL,
2092 ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
2093 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2094
2095 /*
2096 * If we have an aligned, contiguous chunk of L2_SIZE, try
2097 * to create an L2_BLOCK mapping.
2098 */
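/*
 * The (empty) L3 page table page currently referenced by this L2 entry
 * is stashed in the kernel pmap's radix tree by pmap_insert_pt_page() so
 * that it can be recovered if the L2 block mapping is later demoted.
 */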
2099 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
2100 (pa & L2_OFFSET) == 0 && vm_initialized) {
2101 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
2102 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
2103 ("pmap_kenter: Unexpected mapping"));
2104 PMAP_LOCK(kernel_pmap);
2105 error = pmap_insert_pt_page(kernel_pmap, mpte, false,
2106 false);
2107 if (error == 0) {
2108 attr &= ~ATTR_CONTIGUOUS;
2109
2110 /*
2111 * Although the page table page "mpte" should
2112 * be devoid of mappings, the TLB might hold
2113 * intermediate entries that reference it, so
2114 * we perform a single-page invalidation.
2115 */
2116 pmap_update_entry(kernel_pmap, pde,
2117 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
2118 PAGE_SIZE);
2119 }
2120 PMAP_UNLOCK(kernel_pmap);
2121 if (error == 0) {
2122 va += L2_SIZE;
2123 pa += L2_SIZE;
2124 size -= L2_SIZE;
2125 continue;
2126 }
2127 }
2128
2129 /*
2130 * If we have an aligned, contiguous chunk of L3C_ENTRIES
2131 * L3 pages, set the contiguous bit within each PTE so that
2132 * the chunk can be cached using only one TLB entry.
2133 */
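/*
 * For example, with 4 KB base pages an L3C chunk is 16 contiguous L3
 * entries (64 KB); with 16 KB base pages it is 128 entries (2 MB),
 * matching the "L3C (64KB/2MB)" counters above.
 */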
2134 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
2135 if (size >= L3C_SIZE)
2136 attr |= ATTR_CONTIGUOUS;
2137 else
2138 attr &= ~ATTR_CONTIGUOUS;
2139 }
2140
2141 pte = pmap_l2_to_l3(pde, va);
2142 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
2143 L3_PAGE);
2144
2145 va += PAGE_SIZE;
2146 pa += PAGE_SIZE;
2147 size -= PAGE_SIZE;
2148 }
2149 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2150 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2151 else {
2152 /*
2153 * Because the old entries were invalid and the new mappings
2154 * are not executable, an isb is not required.
2155 */
2156 dsb(ishst);
2157 }
2158 }
2159
2160 void
2161 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2162 {
2163
2164 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2165 }
2166
2167 /*
2168 * Remove a page from the kernel pagetables.
2169 */
2170 void
2171 pmap_kremove(vm_offset_t va)
2172 {
2173 pt_entry_t *pte;
2174
2175 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2176 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2177 ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2178 pmap_clear(pte);
2179 pmap_s1_invalidate_page(kernel_pmap, va, true);
2180 }
2181
2182 /*
2183 * Remove the specified range of mappings from the kernel address space.
2184 *
2185 * Should only be applied to mappings that were created by pmap_kenter() or
2186 * pmap_kenter_device(). Nothing about this function is actually specific
2187 * to device mappings.
2188 */
2189 void
2190 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2191 {
2192 pt_entry_t *ptep, *ptep_end;
2193 vm_offset_t va;
2194 int lvl;
2195
2196 KASSERT((sva & L3_OFFSET) == 0,
2197 ("pmap_kremove_device: Invalid virtual address"));
2198 KASSERT((size & PAGE_MASK) == 0,
2199 ("pmap_kremove_device: Mapping is not page-sized"));
2200
2201 va = sva;
2202 while (size != 0) {
2203 ptep = pmap_pte(kernel_pmap, va, &lvl);
2204 KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2205 switch (lvl) {
2206 case 2:
2207 KASSERT((va & L2_OFFSET) == 0,
2208 ("Unaligned virtual address"));
2209 KASSERT(size >= L2_SIZE, ("Insufficient size"));
2210
2211 if (va != sva) {
2212 pmap_s1_invalidate_range(kernel_pmap, sva, va,
2213 true);
2214 }
2215 pmap_clear(ptep);
2216 pmap_s1_invalidate_page(kernel_pmap, va, true);
2217 PMAP_LOCK(kernel_pmap);
2218 pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2219 PMAP_UNLOCK(kernel_pmap);
2220
2221 va += L2_SIZE;
2222 sva = va;
2223 size -= L2_SIZE;
2224 break;
2225 case 3:
2226 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2227 KASSERT((va & L3C_OFFSET) == 0,
2228 ("Unaligned L3C virtual address"));
2229 KASSERT(size >= L3C_SIZE,
2230 ("Insufficient L3C size"));
2231
2232 ptep_end = ptep + L3C_ENTRIES;
2233 for (; ptep < ptep_end; ptep++)
2234 pmap_clear(ptep);
2235
2236 va += L3C_SIZE;
2237 size -= L3C_SIZE;
2238 break;
2239 }
2240 pmap_clear(ptep);
2241
2242 va += PAGE_SIZE;
2243 size -= PAGE_SIZE;
2244 break;
2245 default:
2246 __assert_unreachable();
2247 break;
2248 }
2249 }
2250 if (va != sva)
2251 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2252 }
2253
2254 /*
2255 * Used to map a range of physical addresses into kernel
2256 * virtual address space.
2257 *
2258 * The value passed in '*virt' is a suggested virtual address for
2259 * the mapping. Architectures which can support a direct-mapped
2260 * physical to virtual region can return the appropriate address
2261 * within that region, leaving '*virt' unchanged. Other
2262 * architectures should map the pages starting at '*virt' and
2263 * update '*virt' with the first usable address after the mapped
2264 * region.
2265 */
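/*
 * This implementation relies on the arm64 direct map: it returns the DMAP
 * address corresponding to 'start' and leaves '*virt' unchanged.
 */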
2266 vm_offset_t
2267 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2268 {
2269 return (PHYS_TO_DMAP(start));
2270 }
2271
2272 /*
2273 * Add a list of wired pages to the kva.
2274 * This routine is only used for temporary
2275 * kernel mappings that do not need to have
2276 * page modification or references recorded.
2277 * Note that old mappings are simply written
2278 * over. The page *must* be wired.
2279 * Note: SMP coherent. Uses a ranged, broadcast TLB invalidation.
2280 */
2281 void
2282 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2283 {
2284 pd_entry_t *pde;
2285 pt_entry_t attr, old_l3e, *pte;
2286 vm_offset_t va;
2287 vm_page_t m;
2288 int i, lvl;
2289
2290 old_l3e = 0;
2291 va = sva;
2292 for (i = 0; i < count; i++) {
2293 pde = pmap_pde(kernel_pmap, va, &lvl);
2294 KASSERT(pde != NULL,
2295 ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2296 KASSERT(lvl == 2,
2297 ("pmap_qenter: Invalid level %d", lvl));
2298
2299 m = ma[i];
2300 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2301 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2302 pte = pmap_l2_to_l3(pde, va);
2303 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2304
2305 va += L3_SIZE;
2306 }
2307 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2308 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2309 else {
2310 /*
2311 * Because the old entries were invalid and the new mappings
2312 * are not executable, an isb is not required.
2313 */
2314 dsb(ishst);
2315 }
2316 }
2317
2318 /*
2319 * This routine tears out page mappings from the
2320 * kernel -- it is meant only for temporary mappings.
2321 */
2322 void
2323 pmap_qremove(vm_offset_t sva, int count)
2324 {
2325 pt_entry_t *pte;
2326 vm_offset_t va;
2327
2328 KASSERT(ADDR_IS_CANONICAL(sva),
2329 ("%s: Address not in canonical form: %lx", __func__, sva));
2330 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2331
2332 va = sva;
2333 while (count-- > 0) {
2334 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2335 if (pte != NULL) {
2336 pmap_clear(pte);
2337 }
2338
2339 va += PAGE_SIZE;
2340 }
2341 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2342 }
2343
2344 /***************************************************
2345 * Page table page management routines.....
2346 ***************************************************/
2347 /*
2348 * Schedule the specified unused page table page to be freed. Specifically,
2349 * add the page to the specified list of pages that will be released to the
2350 * physical memory manager after the TLB has been updated.
2351 */
2352 static __inline void
2353 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2354 {
2355
2356 if (set_PG_ZERO)
2357 m->flags |= PG_ZERO;
2358 else
2359 m->flags &= ~PG_ZERO;
2360 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2361 }
2362
2363 /*
2364 * Decrements a page table page's reference count, which is used to record the
2365 * number of valid page table entries within the page. If the reference count
2366 * drops to zero, then the page table page is unmapped. Returns true if the
2367 * page table page was unmapped and false otherwise.
2368 */
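/*
 * For example, removing the last 4 KB mapping within an L3 page table page
 * drops that page's reference count to zero; _pmap_unwire_l3() then clears
 * the L2 entry referencing it and recursively unwires the L2 (and possibly
 * L1) page table pages above it.
 */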
2369 static inline bool
2370 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2371 {
2372
2373 --m->ref_count;
2374 if (m->ref_count == 0) {
2375 _pmap_unwire_l3(pmap, va, m, free);
2376 return (true);
2377 } else
2378 return (false);
2379 }
2380
2381 static void
2382 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2383 {
2384
2385 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2386 /*
2387 * unmap the page table page
2388 */
2389 if (m->pindex >= (NUL2E + NUL1E)) {
2390 /* l1 page */
2391 pd_entry_t *l0;
2392
2393 l0 = pmap_l0(pmap, va);
2394 pmap_clear(l0);
2395 } else if (m->pindex >= NUL2E) {
2396 /* l2 page */
2397 pd_entry_t *l1;
2398
2399 l1 = pmap_l1(pmap, va);
2400 pmap_clear(l1);
2401 } else {
2402 /* l3 page */
2403 pd_entry_t *l2;
2404
2405 l2 = pmap_l2(pmap, va);
2406 pmap_clear(l2);
2407 }
2408 pmap_resident_count_dec(pmap, 1);
2409 if (m->pindex < NUL2E) {
2410 /* We just released an l3, unhold the matching l2 */
2411 pd_entry_t *l1, tl1;
2412 vm_page_t l2pg;
2413
2414 l1 = pmap_l1(pmap, va);
2415 tl1 = pmap_load(l1);
2416 l2pg = PTE_TO_VM_PAGE(tl1);
2417 pmap_unwire_l3(pmap, va, l2pg, free);
2418 } else if (m->pindex < (NUL2E + NUL1E)) {
2419 /* We just released an l2, unhold the matching l1 */
2420 pd_entry_t *l0, tl0;
2421 vm_page_t l1pg;
2422
2423 l0 = pmap_l0(pmap, va);
2424 tl0 = pmap_load(l0);
2425 l1pg = PTE_TO_VM_PAGE(tl0);
2426 pmap_unwire_l3(pmap, va, l1pg, free);
2427 }
2428 pmap_invalidate_page(pmap, va, false);
2429
2430 /*
2431 * Put page on a list so that it is released after
2432 * *ALL* TLB shootdown is done
2433 */
2434 pmap_add_delayed_free_list(m, free, true);
2435 }
2436
2437 /*
2438 * After removing a page table entry, this routine is used to
2439 * conditionally free the page table page and manage its reference count.
2440 */
2441 static int
2442 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2443 struct spglist *free)
2444 {
2445 vm_page_t mpte;
2446
2447 KASSERT(ADDR_IS_CANONICAL(va),
2448 ("%s: Address not in canonical form: %lx", __func__, va));
2449 if (ADDR_IS_KERNEL(va))
2450 return (0);
2451 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2452 mpte = PTE_TO_VM_PAGE(ptepde);
2453 return (pmap_unwire_l3(pmap, va, mpte, free));
2454 }
2455
2456 /*
2457 * Release a page table page reference after a failed attempt to create a
2458 * mapping.
2459 */
2460 static void
2461 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2462 {
2463 struct spglist free;
2464
2465 SLIST_INIT(&free);
2466 if (pmap_unwire_l3(pmap, va, mpte, &free))
2467 vm_page_free_pages_toq(&free, true);
2468 }
2469
2470 void
2471 pmap_pinit0(pmap_t pmap)
2472 {
2473
2474 PMAP_LOCK_INIT(pmap);
2475 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2476 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2477 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2478 TAILQ_INIT(&pmap->pm_pvchunk);
2479 vm_radix_init(&pmap->pm_root);
2480 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2481 pmap->pm_stage = PM_STAGE1;
2482 pmap->pm_levels = 4;
2483 pmap->pm_ttbr = pmap->pm_l0_paddr;
2484 pmap->pm_asid_set = &asids;
2485 pmap->pm_bti = NULL;
2486
2487 PCPU_SET(curpmap, pmap);
2488 }
2489
2490 int
2491 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2492 {
2493 vm_page_t m;
2494
2495 /*
2496 * allocate the l0 page
2497 */
2498 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2499 VM_ALLOC_ZERO);
2500 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2501 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2502
2503 TAILQ_INIT(&pmap->pm_pvchunk);
2504 vm_radix_init(&pmap->pm_root);
2505 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2506 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2507
2508 MPASS(levels == 3 || levels == 4);
2509 pmap->pm_levels = levels;
2510 pmap->pm_stage = stage;
2511 pmap->pm_bti = NULL;
2512 switch (stage) {
2513 case PM_STAGE1:
2514 pmap->pm_asid_set = &asids;
2515 if (pmap_bti_support) {
2516 pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
2517 M_ZERO | M_WAITOK);
2518 rangeset_init(pmap->pm_bti, bti_dup_range,
2519 bti_free_range, pmap, M_NOWAIT);
2520 }
2521 break;
2522 case PM_STAGE2:
2523 pmap->pm_asid_set = &vmids;
2524 break;
2525 default:
2526 panic("%s: Invalid pmap type %d", __func__, stage);
2527 break;
2528 }
2529
2530 /* XXX Temporarily disable deferred ASID allocation. */
2531 pmap_alloc_asid(pmap);
2532
2533 /*
2534 * Allocate the level 1 entry to use as the root. This will increase
2535 * the refcount on the level 1 page so it won't be removed until
2536 * pmap_release() is called.
2537 */
2538 if (pmap->pm_levels == 3) {
2539 PMAP_LOCK(pmap);
2540 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2541 PMAP_UNLOCK(pmap);
2542 }
2543 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2544
2545 return (1);
2546 }
2547
2548 int
2549 pmap_pinit(pmap_t pmap)
2550 {
2551
2552 return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2553 }
2554
2555 /*
2556 * This routine is called if the desired page table page does not exist.
2557 *
2558 * If page table page allocation fails, this routine may sleep before
2559 * returning NULL. It sleeps only if a lock pointer was given.
2560 *
2561 * Note: If a page allocation fails at page table level two or three,
2562 * one or two pages may be held during the wait, only to be released
2563 * afterwards. This conservative approach is easily argued to avoid
2564 * race conditions.
2565 */
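/*
 * Page table page indices are laid out as follows: pindexes in [0, NUL2E)
 * name L3 page table pages, [NUL2E, NUL2E + NUL1E) name L2 pages, and
 * indices of NUL2E + NUL1E and above name L1 pages, mirroring the checks
 * in _pmap_unwire_l3().
 */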
2566 static vm_page_t
2567 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2568 {
2569 vm_page_t m, l1pg, l2pg;
2570
2571 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2572
2573 /*
2574 * Allocate a page table page.
2575 */
2576 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2577 if (lockp != NULL) {
2578 RELEASE_PV_LIST_LOCK(lockp);
2579 PMAP_UNLOCK(pmap);
2580 vm_wait(NULL);
2581 PMAP_LOCK(pmap);
2582 }
2583
2584 /*
2585 * Indicate the need to retry. While waiting, the page table
2586 * page may have been allocated.
2587 */
2588 return (NULL);
2589 }
2590 m->pindex = ptepindex;
2591
2592 /*
2593 * Because of AArch64's weak memory consistency model, we must have a
2594 * barrier here to ensure that the stores for zeroing "m", whether by
2595 * pmap_zero_page() or an earlier function, are visible before adding
2596 * "m" to the page table. Otherwise, a page table walk by another
2597 * processor's MMU could see the mapping to "m" and a stale, non-zero
2598 * PTE within "m".
2599 */
2600 dmb(ishst);
2601
2602 /*
2603 * Map the pagetable page into the process address space, if
2604 * it isn't already there.
2605 */
2606
2607 if (ptepindex >= (NUL2E + NUL1E)) {
2608 pd_entry_t *l0p, l0e;
2609 vm_pindex_t l0index;
2610
2611 l0index = ptepindex - (NUL2E + NUL1E);
2612 l0p = &pmap->pm_l0[l0index];
2613 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2614 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2615 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2616
2617 /*
2618 * Mark all kernel memory as not accessible from userspace
2619 * and userspace memory as not executable from the kernel.
2620 * This has been done for the bootstrap L0 entries in
2621 * locore.S.
2622 */
2623 if (pmap == kernel_pmap)
2624 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2625 else
2626 l0e |= TATTR_PXN_TABLE;
2627 pmap_store(l0p, l0e);
2628 } else if (ptepindex >= NUL2E) {
2629 vm_pindex_t l0index, l1index;
2630 pd_entry_t *l0, *l1;
2631 pd_entry_t tl0;
2632
2633 l1index = ptepindex - NUL2E;
2634 l0index = l1index >> Ln_ENTRIES_SHIFT;
2635
2636 l0 = &pmap->pm_l0[l0index];
2637 tl0 = pmap_load(l0);
2638 if (tl0 == 0) {
2639 /* recurse for allocating page dir */
2640 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2641 lockp) == NULL) {
2642 vm_page_unwire_noq(m);
2643 vm_page_free_zero(m);
2644 return (NULL);
2645 }
2646 } else {
2647 l1pg = PTE_TO_VM_PAGE(tl0);
2648 l1pg->ref_count++;
2649 }
2650
2651 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2652 l1 = &l1[ptepindex & Ln_ADDR_MASK];
2653 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2654 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2655 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2656 } else {
2657 vm_pindex_t l0index, l1index;
2658 pd_entry_t *l0, *l1, *l2;
2659 pd_entry_t tl0, tl1;
2660
2661 l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2662 l0index = l1index >> Ln_ENTRIES_SHIFT;
2663
2664 l0 = &pmap->pm_l0[l0index];
2665 tl0 = pmap_load(l0);
2666 if (tl0 == 0) {
2667 /* recurse for allocating page dir */
2668 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2669 lockp) == NULL) {
2670 vm_page_unwire_noq(m);
2671 vm_page_free_zero(m);
2672 return (NULL);
2673 }
2674 tl0 = pmap_load(l0);
2675 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2676 l1 = &l1[l1index & Ln_ADDR_MASK];
2677 } else {
2678 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2679 l1 = &l1[l1index & Ln_ADDR_MASK];
2680 tl1 = pmap_load(l1);
2681 if (tl1 == 0) {
2682 /* recurse for allocating page dir */
2683 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2684 lockp) == NULL) {
2685 vm_page_unwire_noq(m);
2686 vm_page_free_zero(m);
2687 return (NULL);
2688 }
2689 } else {
2690 l2pg = PTE_TO_VM_PAGE(tl1);
2691 l2pg->ref_count++;
2692 }
2693 }
2694
2695 l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2696 l2 = &l2[ptepindex & Ln_ADDR_MASK];
2697 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2698 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2699 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
2700 }
2701
2702 pmap_resident_count_inc(pmap, 1);
2703
2704 return (m);
2705 }
2706
2707 static pd_entry_t *
2708 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2709 struct rwlock **lockp)
2710 {
2711 pd_entry_t *l1, *l2;
2712 vm_page_t l2pg;
2713 vm_pindex_t l2pindex;
2714
2715 KASSERT(ADDR_IS_CANONICAL(va),
2716 ("%s: Address not in canonical form: %lx", __func__, va));
2717
2718 retry:
2719 l1 = pmap_l1(pmap, va);
2720 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2721 l2 = pmap_l1_to_l2(l1, va);
2722 if (!ADDR_IS_KERNEL(va)) {
2723 /* Add a reference to the L2 page. */
2724 l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
2725 l2pg->ref_count++;
2726 } else
2727 l2pg = NULL;
2728 } else if (!ADDR_IS_KERNEL(va)) {
2729 /* Allocate a L2 page. */
2730 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2731 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2732 if (l2pg == NULL) {
2733 if (lockp != NULL)
2734 goto retry;
2735 else
2736 return (NULL);
2737 }
2738 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2739 l2 = &l2[pmap_l2_index(va)];
2740 } else
2741 panic("pmap_alloc_l2: missing page table page for va %#lx",
2742 va);
2743 *l2pgp = l2pg;
2744 return (l2);
2745 }
2746
2747 static vm_page_t
2748 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2749 {
2750 vm_pindex_t ptepindex;
2751 pd_entry_t *pde, tpde;
2752 #ifdef INVARIANTS
2753 pt_entry_t *pte;
2754 #endif
2755 vm_page_t m;
2756 int lvl;
2757
2758 /*
2759 * Calculate pagetable page index
2760 */
2761 ptepindex = pmap_l2_pindex(va);
2762 retry:
2763 /*
2764 * Get the page directory entry
2765 */
2766 pde = pmap_pde(pmap, va, &lvl);
2767
2768 /*
2769 * If the page table page is mapped, we just increment the hold count,
2770 * and activate it. If we get a level 2 pde it will point to a level 3
2771 * table.
2772 */
2773 switch (lvl) {
2774 case -1:
2775 break;
2776 case 0:
2777 #ifdef INVARIANTS
2778 pte = pmap_l0_to_l1(pde, va);
2779 KASSERT(pmap_load(pte) == 0,
2780 ("pmap_alloc_l3: TODO: l0 superpages"));
2781 #endif
2782 break;
2783 case 1:
2784 #ifdef INVARIANTS
2785 pte = pmap_l1_to_l2(pde, va);
2786 KASSERT(pmap_load(pte) == 0,
2787 ("pmap_alloc_l3: TODO: l1 superpages"));
2788 #endif
2789 break;
2790 case 2:
2791 tpde = pmap_load(pde);
2792 if (tpde != 0) {
2793 m = PTE_TO_VM_PAGE(tpde);
2794 m->ref_count++;
2795 return (m);
2796 }
2797 break;
2798 default:
2799 panic("pmap_alloc_l3: Invalid level %d", lvl);
2800 }
2801
2802 /*
2803 * Here if the pte page isn't mapped, or if it has been deallocated.
2804 */
2805 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2806 if (m == NULL && lockp != NULL)
2807 goto retry;
2808
2809 return (m);
2810 }
2811
2812 /***************************************************
2813 * Pmap allocation/deallocation routines.
2814 ***************************************************/
2815
2816 /*
2817 * Release any resources held by the given physical map.
2818 * Called when a pmap initialized by pmap_pinit is being released.
2819 * Should only be called if the map contains no valid mappings.
2820 */
2821 void
2822 pmap_release(pmap_t pmap)
2823 {
2824 bool rv __diagused;
2825 struct spglist freelist;
2826 struct asid_set *set;
2827 vm_page_t m;
2828 int asid;
2829
2830 if (pmap->pm_levels != 4) {
2831 PMAP_ASSERT_STAGE2(pmap);
2832 KASSERT(pmap->pm_stats.resident_count == 1,
2833 ("pmap_release: pmap resident count %ld != 1",
2834 pmap->pm_stats.resident_count));
2835 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2836 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2837
2838 SLIST_INIT(&freelist);
2839 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2840 PMAP_LOCK(pmap);
2841 rv = pmap_unwire_l3(pmap, 0, m, &freelist);
2842 PMAP_UNLOCK(pmap);
2843 MPASS(rv == true);
2844 vm_page_free_pages_toq(&freelist, true);
2845 }
2846
2847 KASSERT(pmap->pm_stats.resident_count == 0,
2848 ("pmap_release: pmap resident count %ld != 0",
2849 pmap->pm_stats.resident_count));
2850 KASSERT(vm_radix_is_empty(&pmap->pm_root),
2851 ("pmap_release: pmap has reserved page table page(s)"));
2852
2853 set = pmap->pm_asid_set;
2854 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2855
2856 /*
2857 * Allow the ASID to be reused. For stage 2 pmaps we don't invalidate
2858 * TLB entries when removing mappings, and instead rely on a later TLB
2859 * invalidation that happens when the VMID generation is updated.
2860 * Because of this we don't reuse VMIDs within a generation.
2861 */
2862 if (pmap->pm_stage == PM_STAGE1) {
2863 mtx_lock_spin(&set->asid_set_mutex);
2864 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2865 asid = COOKIE_TO_ASID(pmap->pm_cookie);
2866 KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2867 asid < set->asid_set_size,
2868 ("pmap_release: pmap cookie has out-of-range asid"));
2869 bit_clear(set->asid_set, asid);
2870 }
2871 mtx_unlock_spin(&set->asid_set_mutex);
2872
2873 if (pmap->pm_bti != NULL) {
2874 rangeset_fini(pmap->pm_bti);
2875 free(pmap->pm_bti, M_DEVBUF);
2876 }
2877 }
2878
2879 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2880 vm_page_unwire_noq(m);
2881 vm_page_free_zero(m);
2882 }
2883
2884 static int
2885 kvm_size(SYSCTL_HANDLER_ARGS)
2886 {
2887 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2888
2889 return sysctl_handle_long(oidp, &ksize, 0, req);
2890 }
2891 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2892 0, 0, kvm_size, "LU",
2893 "Size of KVM");
2894
2895 static int
2896 kvm_free(SYSCTL_HANDLER_ARGS)
2897 {
2898 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2899
2900 return sysctl_handle_long(oidp, &kfree, 0, req);
2901 }
2902 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2903 0, 0, kvm_free, "LU",
2904 "Amount of KVM free");
2905
2906 /*
2907 * grow the number of kernel page table entries, if needed
2908 */
2909 void
2910 pmap_growkernel(vm_offset_t addr)
2911 {
2912 vm_page_t nkpg;
2913 pd_entry_t *l0, *l1, *l2;
2914
2915 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2916
2917 addr = roundup2(addr, L2_SIZE);
2918 if (addr - 1 >= vm_map_max(kernel_map))
2919 addr = vm_map_max(kernel_map);
2920 if (kernel_vm_end < addr) {
2921 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2922 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2923 }
2924 while (kernel_vm_end < addr) {
2925 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2926 KASSERT(pmap_load(l0) != 0,
2927 ("pmap_growkernel: No level 0 kernel entry"));
2928
2929 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2930 if (pmap_load(l1) == 0) {
2931 /* We need a new PDP entry */
2932 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2933 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2934 if (nkpg == NULL)
2935 panic("pmap_growkernel: no memory to grow kernel");
2936 nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2937 /* See the dmb() in _pmap_alloc_l3(). */
2938 dmb(ishst);
2939 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
2940 continue; /* try again */
2941 }
2942 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2943 if (pmap_load(l2) != 0) {
2944 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2945 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2946 kernel_vm_end = vm_map_max(kernel_map);
2947 break;
2948 }
2949 continue;
2950 }
2951
2952 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2953 VM_ALLOC_ZERO);
2954 if (nkpg == NULL)
2955 panic("pmap_growkernel: no memory to grow kernel");
2956 nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2957 /* See the dmb() in _pmap_alloc_l3(). */
2958 dmb(ishst);
2959 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
2960
2961 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2962 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2963 kernel_vm_end = vm_map_max(kernel_map);
2964 break;
2965 }
2966 }
2967 }
2968
2969 /***************************************************
2970 * page management routines.
2971 ***************************************************/
2972
2973 static const uint64_t pc_freemask[_NPCM] = {
2974 [0 ... _NPCM - 2] = PC_FREEN,
2975 [_NPCM - 1] = PC_FREEL
2976 };
2977
2978 #ifdef PV_STATS
2979 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
2980
2981 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
2982 "Current number of pv entry chunks");
2983 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
2984 "Current number of pv entry chunks allocated");
2985 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
2986 "Current number of pv entry chunk frees");
2987 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
2988 "Number of times tried to get a chunk page but failed.");
2989
2990 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
2991 static int pv_entry_spare;
2992
2993 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
2994 "Current number of pv entry frees");
2995 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
2996 "Current number of pv entry allocs");
2997 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
2998 "Current number of pv entries");
2999 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3000 "Current number of spare pv entries");
3001 #endif
3002
3003 /*
3004 * We are in a serious low memory condition. Resort to
3005 * drastic measures to free some pages so we can allocate
3006 * another pv entry chunk.
3007 *
3008 * Returns NULL if PV entries were reclaimed from the specified pmap.
3009 *
3010 * We do not, however, unmap 2mpages because subsequent accesses will
3011 * allocate per-page pv entries until repromotion occurs, thereby
3012 * exacerbating the shortage of free pv entries.
3013 */
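/*
 * On success, the returned page is either the page backing a completely
 * freed PV chunk or a reclaimed page table page (with its ref_count reset
 * to 1); the caller reuses it to back a new PV chunk.
 */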
3014 static vm_page_t
3015 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3016 {
3017 struct pv_chunks_list *pvc;
3018 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3019 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3020 struct md_page *pvh;
3021 pd_entry_t *pde;
3022 pmap_t next_pmap, pmap;
3023 pt_entry_t *pte, tpte;
3024 pv_entry_t pv;
3025 vm_offset_t va;
3026 vm_page_t m, m_pc;
3027 struct spglist free;
3028 uint64_t inuse;
3029 int bit, field, freed, lvl;
3030
3031 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3032 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3033
3034 pmap = NULL;
3035 m_pc = NULL;
3036 SLIST_INIT(&free);
3037 bzero(&pc_marker_b, sizeof(pc_marker_b));
3038 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3039 pc_marker = (struct pv_chunk *)&pc_marker_b;
3040 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3041
3042 pvc = &pv_chunks[domain];
3043 mtx_lock(&pvc->pvc_lock);
3044 pvc->active_reclaims++;
3045 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3046 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3047 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3048 SLIST_EMPTY(&free)) {
3049 next_pmap = pc->pc_pmap;
3050 if (next_pmap == NULL) {
3051 /*
3052 * The next chunk is a marker. However, it is
3053 * not our marker, so active_reclaims must be
3054 * > 1. Consequently, the next_chunk code
3055 * will not rotate the pv_chunks list.
3056 */
3057 goto next_chunk;
3058 }
3059 mtx_unlock(&pvc->pvc_lock);
3060
3061 /*
3062 * A pv_chunk can only be removed from the pc_lru list
3063 * when both pvc->pvc_lock is owned and the
3064 * corresponding pmap is locked.
3065 */
3066 if (pmap != next_pmap) {
3067 if (pmap != NULL && pmap != locked_pmap)
3068 PMAP_UNLOCK(pmap);
3069 pmap = next_pmap;
3070 /* Avoid deadlock and lock recursion. */
3071 if (pmap > locked_pmap) {
3072 RELEASE_PV_LIST_LOCK(lockp);
3073 PMAP_LOCK(pmap);
3074 mtx_lock(&pvc->pvc_lock);
3075 continue;
3076 } else if (pmap != locked_pmap) {
3077 if (PMAP_TRYLOCK(pmap)) {
3078 mtx_lock(&pvc->pvc_lock);
3079 continue;
3080 } else {
3081 pmap = NULL; /* pmap is not locked */
3082 mtx_lock(&pvc->pvc_lock);
3083 pc = TAILQ_NEXT(pc_marker, pc_lru);
3084 if (pc == NULL ||
3085 pc->pc_pmap != next_pmap)
3086 continue;
3087 goto next_chunk;
3088 }
3089 }
3090 }
3091
3092 /*
3093 * Destroy every non-wired, 4 KB page mapping in the chunk.
3094 */
3095 freed = 0;
3096 for (field = 0; field < _NPCM; field++) {
3097 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3098 inuse != 0; inuse &= ~(1UL << bit)) {
3099 bit = ffsl(inuse) - 1;
3100 pv = &pc->pc_pventry[field * 64 + bit];
3101 va = pv->pv_va;
3102 pde = pmap_pde(pmap, va, &lvl);
3103 if (lvl != 2)
3104 continue;
3105 pte = pmap_l2_to_l3(pde, va);
3106 tpte = pmap_load(pte);
3107 if ((tpte & ATTR_SW_WIRED) != 0)
3108 continue;
3109 if ((tpte & ATTR_CONTIGUOUS) != 0)
3110 (void)pmap_demote_l3c(pmap, pte, va);
3111 tpte = pmap_load_clear(pte);
3112 m = PTE_TO_VM_PAGE(tpte);
3113 if (pmap_pte_dirty(pmap, tpte))
3114 vm_page_dirty(m);
3115 if ((tpte & ATTR_AF) != 0) {
3116 pmap_s1_invalidate_page(pmap, va, true);
3117 vm_page_aflag_set(m, PGA_REFERENCED);
3118 }
3119 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3120 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3121 m->md.pv_gen++;
3122 if (TAILQ_EMPTY(&m->md.pv_list) &&
3123 (m->flags & PG_FICTITIOUS) == 0) {
3124 pvh = page_to_pvh(m);
3125 if (TAILQ_EMPTY(&pvh->pv_list)) {
3126 vm_page_aflag_clear(m,
3127 PGA_WRITEABLE);
3128 }
3129 }
3130 pc->pc_map[field] |= 1UL << bit;
3131 pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3132 freed++;
3133 }
3134 }
3135 if (freed == 0) {
3136 mtx_lock(&pvc->pvc_lock);
3137 goto next_chunk;
3138 }
3139 /* Every freed mapping is for a 4 KB page. */
3140 pmap_resident_count_dec(pmap, freed);
3141 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3142 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3143 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3144 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3145 if (pc_is_free(pc)) {
3146 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3147 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3148 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3149 /* Entire chunk is free; return it. */
3150 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3151 dump_drop_page(m_pc->phys_addr);
3152 mtx_lock(&pvc->pvc_lock);
3153 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3154 break;
3155 }
3156 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3157 mtx_lock(&pvc->pvc_lock);
3158 /* One freed pv entry in locked_pmap is sufficient. */
3159 if (pmap == locked_pmap)
3160 break;
3161
3162 next_chunk:
3163 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3164 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3165 if (pvc->active_reclaims == 1 && pmap != NULL) {
3166 /*
3167 * Rotate the pv chunks list so that we do not
3168 * scan the same pv chunks that could not be
3169 * freed (because they contained a wired
3170 * and/or superpage mapping) on every
3171 * invocation of reclaim_pv_chunk().
3172 */
3173 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3174 MPASS(pc->pc_pmap != NULL);
3175 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3176 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3177 }
3178 }
3179 }
3180 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3181 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3182 pvc->active_reclaims--;
3183 mtx_unlock(&pvc->pvc_lock);
3184 if (pmap != NULL && pmap != locked_pmap)
3185 PMAP_UNLOCK(pmap);
3186 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3187 m_pc = SLIST_FIRST(&free);
3188 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3189 /* Recycle a freed page table page. */
3190 m_pc->ref_count = 1;
3191 }
3192 vm_page_free_pages_toq(&free, true);
3193 return (m_pc);
3194 }
3195
3196 static vm_page_t
3197 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3198 {
3199 vm_page_t m;
3200 int i, domain;
3201
3202 domain = PCPU_GET(domain);
3203 for (i = 0; i < vm_ndomains; i++) {
3204 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3205 if (m != NULL)
3206 break;
3207 domain = (domain + 1) % vm_ndomains;
3208 }
3209
3210 return (m);
3211 }
3212
3213 /*
3214 * free the pv_entry back to the free list
3215 */
3216 static void
3217 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3218 {
3219 struct pv_chunk *pc;
3220 int idx, field, bit;
3221
3222 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3223 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3224 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3225 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3226 pc = pv_to_chunk(pv);
3227 idx = pv - &pc->pc_pventry[0];
3228 field = idx / 64;
3229 bit = idx % 64;
3230 pc->pc_map[field] |= 1ul << bit;
3231 if (!pc_is_free(pc)) {
3232 /* 98% of the time, pc is already at the head of the list. */
3233 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3234 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3235 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3236 }
3237 return;
3238 }
3239 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3240 free_pv_chunk(pc);
3241 }
3242
3243 static void
3244 free_pv_chunk_dequeued(struct pv_chunk *pc)
3245 {
3246 vm_page_t m;
3247
3248 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3249 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3250 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3251 /* entire chunk is free, return it */
3252 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3253 dump_drop_page(m->phys_addr);
3254 vm_page_unwire_noq(m);
3255 vm_page_free(m);
3256 }
3257
3258 static void
3259 free_pv_chunk(struct pv_chunk *pc)
3260 {
3261 struct pv_chunks_list *pvc;
3262
3263 pvc = &pv_chunks[pc_to_domain(pc)];
3264 mtx_lock(&pvc->pvc_lock);
3265 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3266 mtx_unlock(&pvc->pvc_lock);
3267 free_pv_chunk_dequeued(pc);
3268 }
3269
3270 static void
3271 free_pv_chunk_batch(struct pv_chunklist *batch)
3272 {
3273 struct pv_chunks_list *pvc;
3274 struct pv_chunk *pc, *npc;
3275 int i;
3276
3277 for (i = 0; i < vm_ndomains; i++) {
3278 if (TAILQ_EMPTY(&batch[i]))
3279 continue;
3280 pvc = &pv_chunks[i];
3281 mtx_lock(&pvc->pvc_lock);
3282 TAILQ_FOREACH(pc, &batch[i], pc_list) {
3283 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3284 }
3285 mtx_unlock(&pvc->pvc_lock);
3286 }
3287
3288 for (i = 0; i < vm_ndomains; i++) {
3289 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3290 free_pv_chunk_dequeued(pc);
3291 }
3292 }
3293 }
3294
3295 /*
3296 * Returns a new PV entry, allocating a new PV chunk from the system when
3297 * needed. If this PV chunk allocation fails and a PV list lock pointer was
3298 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
3299 * returned.
3300 *
3301 * The given PV list lock may be released.
3302 */
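/*
 * PV entries are carved out of page-sized "pv chunk" pages: each chunk
 * holds _NPCPV entries whose free slots are tracked by the pc_map[]
 * bitmap, and chunks with free entries are kept toward the head of the
 * pmap's pm_pvchunk list.
 */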
3303 static pv_entry_t
3304 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3305 {
3306 struct pv_chunks_list *pvc;
3307 int bit, field;
3308 pv_entry_t pv;
3309 struct pv_chunk *pc;
3310 vm_page_t m;
3311
3312 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3313 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3314 retry:
3315 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3316 if (pc != NULL) {
3317 for (field = 0; field < _NPCM; field++) {
3318 if (pc->pc_map[field]) {
3319 bit = ffsl(pc->pc_map[field]) - 1;
3320 break;
3321 }
3322 }
3323 if (field < _NPCM) {
3324 pv = &pc->pc_pventry[field * 64 + bit];
3325 pc->pc_map[field] &= ~(1ul << bit);
3326 /* If this was the last item, move it to tail */
3327 if (pc_is_full(pc)) {
3328 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3329 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3330 pc_list);
3331 }
3332 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3333 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3334 return (pv);
3335 }
3336 }
3337 /* No free items, allocate another chunk */
3338 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3339 if (m == NULL) {
3340 if (lockp == NULL) {
3341 PV_STAT(pc_chunk_tryfail++);
3342 return (NULL);
3343 }
3344 m = reclaim_pv_chunk(pmap, lockp);
3345 if (m == NULL)
3346 goto retry;
3347 }
3348 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3349 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3350 dump_add_page(m->phys_addr);
3351 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3352 pc->pc_pmap = pmap;
3353 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3354 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */
3355 pvc = &pv_chunks[vm_page_domain(m)];
3356 mtx_lock(&pvc->pvc_lock);
3357 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3358 mtx_unlock(&pvc->pvc_lock);
3359 pv = &pc->pc_pventry[0];
3360 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3361 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3362 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3363 return (pv);
3364 }
3365
3366 /*
3367 * Ensure that the number of spare PV entries in the specified pmap meets or
3368 * exceeds the given count, "needed".
3369 *
3370 * The given PV list lock may be released.
3371 */
3372 static void
3373 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3374 {
3375 struct pv_chunks_list *pvc;
3376 struct pch new_tail[PMAP_MEMDOM];
3377 struct pv_chunk *pc;
3378 vm_page_t m;
3379 int avail, free, i;
3380 bool reclaimed;
3381
3382 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3383 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3384
3385 /*
3386 * Newly allocated PV chunks must be stored in a private list until
3387 * the required number of PV chunks have been allocated. Otherwise,
3388 * reclaim_pv_chunk() could recycle one of these chunks. In
3389 * contrast, these chunks must be added to the pmap upon allocation.
3390 */
3391 for (i = 0; i < PMAP_MEMDOM; i++)
3392 TAILQ_INIT(&new_tail[i]);
3393 retry:
3394 avail = 0;
3395 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3396 bit_count((bitstr_t *)pc->pc_map, 0,
3397 sizeof(pc->pc_map) * NBBY, &free);
3398 if (free == 0)
3399 break;
3400 avail += free;
3401 if (avail >= needed)
3402 break;
3403 }
3404 for (reclaimed = false; avail < needed; avail += _NPCPV) {
3405 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3406 if (m == NULL) {
3407 m = reclaim_pv_chunk(pmap, lockp);
3408 if (m == NULL)
3409 goto retry;
3410 reclaimed = true;
3411 }
3412 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3413 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3414 dump_add_page(m->phys_addr);
3415 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3416 pc->pc_pmap = pmap;
3417 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3418 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3419 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3420 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3421
3422 /*
3423 * The reclaim might have freed a chunk from the current pmap.
3424 * If that chunk contained available entries, we need to
3425 * re-count the number of available entries.
3426 */
3427 if (reclaimed)
3428 goto retry;
3429 }
3430 for (i = 0; i < vm_ndomains; i++) {
3431 if (TAILQ_EMPTY(&new_tail[i]))
3432 continue;
3433 pvc = &pv_chunks[i];
3434 mtx_lock(&pvc->pvc_lock);
3435 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3436 mtx_unlock(&pvc->pvc_lock);
3437 }
3438 }
3439
3440 /*
3441 * First find and then remove the pv entry for the specified pmap and virtual
3442 * address from the specified pv list. Returns the pv entry if found and NULL
3443 * otherwise. This operation can be performed on pv lists for either 4KB or
3444 * 2MB page mappings.
3445 */
3446 static __inline pv_entry_t
3447 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3448 {
3449 pv_entry_t pv;
3450
3451 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3452 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3453 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3454 pvh->pv_gen++;
3455 break;
3456 }
3457 }
3458 return (pv);
3459 }
3460
3461 /*
3462 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3463 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3464 * entries for each of the 4KB page mappings.
3465 */
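/*
 * This consumes Ln_ENTRIES - 1 (511 with 4 KB base pages) additional pv
 * entries, which callers are expected to have reserved beforehand (e.g.,
 * via reserve_pv_entries()); hence the "missing spare" assertion below.
 */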
3466 static void
3467 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3468 struct rwlock **lockp)
3469 {
3470 struct md_page *pvh;
3471 struct pv_chunk *pc;
3472 pv_entry_t pv;
3473 vm_offset_t va_last;
3474 vm_page_t m;
3475 int bit, field;
3476
3477 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3478 KASSERT((va & L2_OFFSET) == 0,
3479 ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3480 KASSERT((pa & L2_OFFSET) == 0,
3481 ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3482 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3483
3484 /*
3485 * Transfer the 2mpage's pv entry for this mapping to the first
3486 * page's pv list. Once this transfer begins, the pv list lock
3487 * must not be released until the last pv entry is reinstantiated.
3488 */
3489 pvh = pa_to_pvh(pa);
3490 pv = pmap_pvh_remove(pvh, pmap, va);
3491 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3492 m = PHYS_TO_VM_PAGE(pa);
3493 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3494 m->md.pv_gen++;
3495 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3496 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3497 va_last = va + L2_SIZE - PAGE_SIZE;
3498 for (;;) {
3499 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3500 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3501 for (field = 0; field < _NPCM; field++) {
3502 while (pc->pc_map[field]) {
3503 bit = ffsl(pc->pc_map[field]) - 1;
3504 pc->pc_map[field] &= ~(1ul << bit);
3505 pv = &pc->pc_pventry[field * 64 + bit];
3506 va += PAGE_SIZE;
3507 pv->pv_va = va;
3508 m++;
3509 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3510 ("pmap_pv_demote_l2: page %p is not managed", m));
3511 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3512 m->md.pv_gen++;
3513 if (va == va_last)
3514 goto out;
3515 }
3516 }
3517 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3518 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3519 }
3520 out:
3521 if (pc_is_full(pc)) {
3522 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3523 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3524 }
3525 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3526 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3527 }
3528
3529 /*
3530 * First find and then destroy the pv entry for the specified pmap and virtual
3531 * address. This operation can be performed on pv lists for either 4KB or 2MB
3532 * page mappings.
3533 */
3534 static void
3535 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3536 {
3537 pv_entry_t pv;
3538
3539 pv = pmap_pvh_remove(pvh, pmap, va);
3540 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3541 free_pv_entry(pmap, pv);
3542 }
3543
3544 /*
3545 * Conditionally create the PV entry for a 4KB page mapping if the required
3546 * memory can be allocated without resorting to reclamation.
3547 */
3548 static bool
3549 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3550 struct rwlock **lockp)
3551 {
3552 pv_entry_t pv;
3553
3554 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3555 /* Pass NULL instead of the lock pointer to disable reclamation. */
3556 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3557 pv->pv_va = va;
3558 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3559 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3560 m->md.pv_gen++;
3561 return (true);
3562 } else
3563 return (false);
3564 }
3565
3566 /*
3567 * Create the PV entry for a 2MB page mapping. Always returns true unless the
3568 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
3569 * false if the PV entry cannot be allocated without resorting to reclamation.
3570 */
3571 static bool
3572 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3573 struct rwlock **lockp)
3574 {
3575 struct md_page *pvh;
3576 pv_entry_t pv;
3577 vm_paddr_t pa;
3578
3579 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3580 /* Pass NULL instead of the lock pointer to disable reclamation. */
3581 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3582 NULL : lockp)) == NULL)
3583 return (false);
3584 pv->pv_va = va;
3585 pa = PTE_TO_PHYS(l2e);
3586 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3587 pvh = pa_to_pvh(pa);
3588 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3589 pvh->pv_gen++;
3590 return (true);
3591 }
3592
3593 /*
3594 * Conditionally creates the PV entries for an L3C superpage mapping if
3595 * the required memory can be allocated without resorting to reclamation.
3596 */
3597 static bool
3598 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3599 struct rwlock **lockp)
3600 {
3601 pv_entry_t pv;
3602 vm_offset_t tva;
3603 vm_paddr_t pa __diagused;
3604 vm_page_t mt;
3605
3606 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3607 KASSERT((va & L3C_OFFSET) == 0,
3608 ("pmap_pv_insert_l3c: va is not aligned"));
3609 pa = VM_PAGE_TO_PHYS(m);
3610 KASSERT((pa & L3C_OFFSET) == 0,
3611 ("pmap_pv_insert_l3c: pa is not aligned"));
3612 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3613 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3614 /* Pass NULL instead of lockp to disable reclamation. */
3615 pv = get_pv_entry(pmap, NULL);
3616 if (__predict_false(pv == NULL)) {
3617 while (tva > va) {
3618 mt--;
3619 tva -= L3_SIZE;
3620 pmap_pvh_free(&mt->md, pmap, tva);
3621 }
3622 return (false);
3623 }
3624 pv->pv_va = tva;
3625 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3626 mt->md.pv_gen++;
3627 }
3628 return (true);
3629 }
3630
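/*
 * Replaces the now-invalidated 2MB kernel mapping at "va" with an L2_TABLE
 * entry pointing to the level 3 page table page that was saved when the
 * mapping was promoted, zeroing that page first if it still contains stale
 * 4KB mappings from the promotion.
 */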
3631 static void
3632 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3633 {
3634 pt_entry_t newl2, oldl2 __diagused;
3635 vm_page_t ml3;
3636 vm_paddr_t ml3pa;
3637
3638 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3639 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3640 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3641
3642 ml3 = pmap_remove_pt_page(pmap, va);
3643 if (ml3 == NULL)
3644 panic("pmap_remove_kernel_l2: Missing pt page");
3645
3646 ml3pa = VM_PAGE_TO_PHYS(ml3);
3647 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3648
3649 /*
3650 * If this page table page was unmapped by a promotion, then it
3651 * contains valid mappings. Zero it to invalidate those mappings.
3652 */
3653 if (vm_page_any_valid(ml3))
3654 pagezero((void *)PHYS_TO_DMAP(ml3pa));
3655
3656 /*
3657 * Demote the mapping. The caller must have already invalidated the
3658 * mapping (i.e., the "break" in break-before-make).
3659 */
3660 oldl2 = pmap_load_store(l2, newl2);
3661 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3662 __func__, l2, oldl2));
3663 }
3664
3665 /*
3666 * pmap_remove_l2: Unmap a level 2 superpage mapping.
3667 */
3668 static int
3669 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3670 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3671 {
3672 struct md_page *pvh;
3673 pt_entry_t old_l2;
3674 vm_page_t m, ml3, mt;
3675
3676 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3677 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3678 old_l2 = pmap_load_clear(l2);
3679 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3680 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3681
3682 /*
3683 * Since a promotion must break the 4KB page mappings before making
3684 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3685 */
3686 pmap_s1_invalidate_page(pmap, sva, true);
3687
3688 if (old_l2 & ATTR_SW_WIRED)
3689 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3690 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3691 if (old_l2 & ATTR_SW_MANAGED) {
3692 m = PTE_TO_VM_PAGE(old_l2);
3693 pvh = page_to_pvh(m);
3694 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3695 pmap_pvh_free(pvh, pmap, sva);
3696 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3697 if (pmap_pte_dirty(pmap, old_l2))
3698 vm_page_dirty(mt);
3699 if (old_l2 & ATTR_AF)
3700 vm_page_aflag_set(mt, PGA_REFERENCED);
3701 if (TAILQ_EMPTY(&mt->md.pv_list) &&
3702 TAILQ_EMPTY(&pvh->pv_list))
3703 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3704 }
3705 }
3706 if (pmap == kernel_pmap) {
3707 pmap_remove_kernel_l2(pmap, l2, sva);
3708 } else {
3709 ml3 = pmap_remove_pt_page(pmap, sva);
3710 if (ml3 != NULL) {
3711 KASSERT(vm_page_any_valid(ml3),
3712 ("pmap_remove_l2: l3 page not promoted"));
3713 pmap_resident_count_dec(pmap, 1);
3714 KASSERT(ml3->ref_count == NL3PG,
3715 ("pmap_remove_l2: l3 page ref count error"));
3716 ml3->ref_count = 0;
3717 pmap_add_delayed_free_list(ml3, free, false);
3718 }
3719 }
3720 return (pmap_unuse_pt(pmap, sva, l1e, free));
3721 }
3722
3723 /*
3724 * pmap_remove_l3: Unmap a single 4KB page mapping in a process.
3725 */
3726 static int
3727 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3728 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3729 {
3730 struct md_page *pvh;
3731 pt_entry_t old_l3;
3732 vm_page_t m;
3733
3734 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3735 old_l3 = pmap_load(l3);
3736 if ((old_l3 & ATTR_CONTIGUOUS) != 0)
3737 (void)pmap_demote_l3c(pmap, l3, va);
3738 old_l3 = pmap_load_clear(l3);
3739 pmap_s1_invalidate_page(pmap, va, true);
3740 if (old_l3 & ATTR_SW_WIRED)
3741 pmap->pm_stats.wired_count -= 1;
3742 pmap_resident_count_dec(pmap, 1);
3743 if (old_l3 & ATTR_SW_MANAGED) {
3744 m = PTE_TO_VM_PAGE(old_l3);
3745 if (pmap_pte_dirty(pmap, old_l3))
3746 vm_page_dirty(m);
3747 if (old_l3 & ATTR_AF)
3748 vm_page_aflag_set(m, PGA_REFERENCED);
3749 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3750 pmap_pvh_free(&m->md, pmap, va);
3751 if (TAILQ_EMPTY(&m->md.pv_list) &&
3752 (m->flags & PG_FICTITIOUS) == 0) {
3753 pvh = page_to_pvh(m);
3754 if (TAILQ_EMPTY(&pvh->pv_list))
3755 vm_page_aflag_clear(m, PGA_WRITEABLE);
3756 }
3757 }
3758 return (pmap_unuse_pt(pmap, va, l2e, free));
3759 }
3760
3761 /*
3762 * Removes the specified L3C superpage mapping. Requests TLB invalidations
3763 * to be performed by the caller through the returned "*vap". Returns true
3764 * if the level 3 table "ml3" was unmapped and added to the spglist "free".
3765 * Otherwise, returns false.
3766 */
3767 static bool
3768 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
3769 vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
3770 struct rwlock **lockp)
3771 {
3772 struct md_page *pvh;
3773 struct rwlock *new_lock;
3774 pt_entry_t first_l3e, l3e, *tl3p;
3775 vm_offset_t tva;
3776 vm_page_t m, mt;
3777
3778 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3779 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
3780 0, ("pmap_remove_l3c: l3p is not aligned"));
3781 KASSERT((va & L3C_OFFSET) == 0,
3782 ("pmap_remove_l3c: va is not aligned"));
3783
3784 /*
3785 * Hardware accessed and dirty bit maintenance might only update a
3786 * single L3 entry, so we must combine the accessed and dirty bits
3787 * from this entire set of contiguous L3 entries.
3788 */
3789 first_l3e = pmap_load_clear(l3p);
3790 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
3791 l3e = pmap_load_clear(tl3p);
3792 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
3793 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
3794 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
3795 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
3796 first_l3e &= ~ATTR_S1_AP_RW_BIT;
3797 first_l3e |= l3e & ATTR_AF;
3798 }
3799 if ((first_l3e & ATTR_SW_WIRED) != 0)
3800 pmap->pm_stats.wired_count -= L3C_ENTRIES;
3801 pmap_resident_count_dec(pmap, L3C_ENTRIES);
3802 if ((first_l3e & ATTR_SW_MANAGED) != 0) {
3803 m = PTE_TO_VM_PAGE(first_l3e);
3804 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3805 if (new_lock != *lockp) {
3806 if (*lockp != NULL) {
3807 /*
3808 * Pending TLB invalidations must be
3809 * performed before the PV list lock is
3810 * released. Otherwise, a concurrent
3811 * pmap_remove_all() on a physical page
3812 * could return while a stale TLB entry
3813 * still provides access to that page.
3814 */
3815 if (*vap != va_next) {
3816 pmap_invalidate_range(pmap, *vap, va,
3817 true);
3818 *vap = va_next;
3819 }
3820 rw_wunlock(*lockp);
3821 }
3822 *lockp = new_lock;
3823 rw_wlock(*lockp);
3824 }
3825 pvh = page_to_pvh(m);
3826 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
3827 L3_SIZE) {
3828 if (pmap_pte_dirty(pmap, first_l3e))
3829 vm_page_dirty(mt);
3830 if ((first_l3e & ATTR_AF) != 0)
3831 vm_page_aflag_set(mt, PGA_REFERENCED);
3832 pmap_pvh_free(&mt->md, pmap, tva);
3833 if (TAILQ_EMPTY(&mt->md.pv_list) &&
3834 TAILQ_EMPTY(&pvh->pv_list))
3835 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3836 }
3837 }
3838 if (*vap == va_next)
3839 *vap = va;
3840 if (ml3 != NULL) {
3841 ml3->ref_count -= L3C_ENTRIES;
3842 if (ml3->ref_count == 0) {
3843 _pmap_unwire_l3(pmap, va, ml3, free);
3844 return (true);
3845 }
3846 }
3847 return (false);
3848 }
3849
3850 /*
3851 * Remove the specified range of addresses from the L3 page table that is
3852 * identified by the given L2 entry.
3853 */
3854 static void
3855 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3856 vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3857 {
3858 struct md_page *pvh;
3859 struct rwlock *new_lock;
3860 pt_entry_t *l3, old_l3;
3861 vm_offset_t va;
3862 vm_page_t l3pg, m;
3863
3864 KASSERT(ADDR_IS_CANONICAL(sva),
3865 ("%s: Start address not in canonical form: %lx", __func__, sva));
3866 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3867 ("%s: End address not in canonical form: %lx", __func__, eva));
3868
3869 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3870 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3871 ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3872 l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
3873 va = eva;
3874 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3875 old_l3 = pmap_load(l3);
3876 if (!pmap_l3_valid(old_l3)) {
3877 if (va != eva) {
3878 pmap_invalidate_range(pmap, va, sva, true);
3879 va = eva;
3880 }
3881 continue;
3882 }
3883 if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
3884 /*
3885 * Is this entire set of contiguous L3 entries being
3886 * removed? Handle the possibility that "eva" is zero
3887 * because of address wraparound.
3888 */
3889 if ((sva & L3C_OFFSET) == 0 &&
3890 sva + L3C_OFFSET <= eva - 1) {
3891 if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
3892 l3pg, free, lockp)) {
3893 /* The L3 table was unmapped. */
3894 sva += L3C_SIZE;
3895 break;
3896 }
3897 l3 += L3C_ENTRIES - 1;
3898 sva += L3C_SIZE - L3_SIZE;
3899 continue;
3900 }
3901
3902 (void)pmap_demote_l3c(pmap, l3, sva);
3903 }
3904 old_l3 = pmap_load_clear(l3);
3905 if ((old_l3 & ATTR_SW_WIRED) != 0)
3906 pmap->pm_stats.wired_count--;
3907 pmap_resident_count_dec(pmap, 1);
3908 if ((old_l3 & ATTR_SW_MANAGED) != 0) {
3909 m = PTE_TO_VM_PAGE(old_l3);
3910 if (pmap_pte_dirty(pmap, old_l3))
3911 vm_page_dirty(m);
3912 if ((old_l3 & ATTR_AF) != 0)
3913 vm_page_aflag_set(m, PGA_REFERENCED);
3914 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3915 if (new_lock != *lockp) {
3916 if (*lockp != NULL) {
3917 /*
3918 * Pending TLB invalidations must be
3919 * performed before the PV list lock is
3920 * released. Otherwise, a concurrent
3921 * pmap_remove_all() on a physical page
3922 * could return while a stale TLB entry
3923 * still provides access to that page.
3924 */
3925 if (va != eva) {
3926 pmap_invalidate_range(pmap, va,
3927 sva, true);
3928 va = eva;
3929 }
3930 rw_wunlock(*lockp);
3931 }
3932 *lockp = new_lock;
3933 rw_wlock(*lockp);
3934 }
3935 pmap_pvh_free(&m->md, pmap, sva);
3936 if (TAILQ_EMPTY(&m->md.pv_list) &&
3937 (m->flags & PG_FICTITIOUS) == 0) {
3938 pvh = page_to_pvh(m);
3939 if (TAILQ_EMPTY(&pvh->pv_list))
3940 vm_page_aflag_clear(m, PGA_WRITEABLE);
3941 }
3942 }
3943 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3944 /*
3945 * _pmap_unwire_l3() has already invalidated the TLB
3946 * entries at all levels for "sva". So, we need not
3947 * perform "sva += L3_SIZE;" here. Moreover, we need
3948 * not perform "va = sva;" if "sva" is at the start
3949 * of a new valid range consisting of a single page.
3950 */
3951 break;
3952 }
3953 if (va == eva)
3954 va = sva;
3955 }
3956 if (va != eva)
3957 pmap_invalidate_range(pmap, va, sva, true);
3958 }
3959
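/*
 * Removes the given range of addresses from the specified map.  This does
 * the work for both pmap_remove() and pmap_map_delete(); when "map_delete"
 * is true, per-range metadata such as the BTI state is cleared as well.
 */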
3960 static void
3961 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
3962 {
3963 struct rwlock *lock;
3964 vm_offset_t va_next;
3965 pd_entry_t *l0, *l1, *l2;
3966 pt_entry_t l3_paddr;
3967 struct spglist free;
3968
3969 /*
3970 * Perform an unsynchronized read. This is, however, safe.
3971 */
3972 if (pmap->pm_stats.resident_count == 0)
3973 return;
3974
3975 SLIST_INIT(&free);
3976
3977 PMAP_LOCK(pmap);
3978 if (map_delete)
3979 pmap_bti_on_remove(pmap, sva, eva);
3980
3981 lock = NULL;
3982 for (; sva < eva; sva = va_next) {
3983 if (pmap->pm_stats.resident_count == 0)
3984 break;
3985
3986 l0 = pmap_l0(pmap, sva);
3987 if (pmap_load(l0) == 0) {
3988 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
3989 if (va_next < sva)
3990 va_next = eva;
3991 continue;
3992 }
3993
3994 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
3995 if (va_next < sva)
3996 va_next = eva;
3997 l1 = pmap_l0_to_l1(l0, sva);
3998 if (pmap_load(l1) == 0)
3999 continue;
4000 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4001 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4002 KASSERT(va_next <= eva,
4003 ("partial update of non-transparent 1G page "
4004 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4005 pmap_load(l1), sva, eva, va_next));
4006 MPASS(pmap != kernel_pmap);
4007 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4008 pmap_clear(l1);
4009 pmap_s1_invalidate_page(pmap, sva, true);
4010 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
4011 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
4012 continue;
4013 }
4014
4015 /*
4016 * Calculate index for next page table.
4017 */
4018 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4019 if (va_next < sva)
4020 va_next = eva;
4021
4022 l2 = pmap_l1_to_l2(l1, sva);
4023 if (l2 == NULL)
4024 continue;
4025
4026 l3_paddr = pmap_load(l2);
4027
4028 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4029 if (sva + L2_SIZE == va_next && eva >= va_next) {
4030 pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
4031 &free, &lock);
4032 continue;
4033 } else if (pmap_demote_l2_locked(pmap, l2, sva,
4034 &lock) == NULL)
4035 continue;
4036 l3_paddr = pmap_load(l2);
4037 }
4038
4039 /*
4040 * Weed out invalid mappings.
4041 */
4042 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
4043 continue;
4044
4045 /*
4046 * Limit our scan to either the end of the va represented
4047 * by the current page table page, or to the end of the
4048 * range being removed.
4049 */
4050 if (va_next > eva)
4051 va_next = eva;
4052
4053 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
4054 &lock);
4055 }
4056 if (lock != NULL)
4057 rw_wunlock(lock);
4058 PMAP_UNLOCK(pmap);
4059 vm_page_free_pages_toq(&free, true);
4060 }
4061
4062 /*
4063 * Remove the given range of addresses from the specified map.
4064 *
4065 * It is assumed that the start and end are properly
4066 * rounded to the page size.
4067 */
4068 void
4069 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4070 {
4071 pmap_remove1(pmap, sva, eva, false);
4072 }
4073
4074 /*
4075 * Remove the given range of addresses as part of a logical unmap
4076 * operation. This has the effect of calling pmap_remove(), but
4077 * also clears any metadata that should persist for the lifetime
4078 * of a logical mapping.
4079 */
4080 void
4081 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4082 {
4083 pmap_remove1(pmap, sva, eva, true);
4084 }
4085
4086 /*
4087 * Routine: pmap_remove_all
4088 * Function:
4089 * Removes this physical page from
4090 * all physical maps in which it resides.
4091 * Reflects back modify bits to the pager.
4092 *
4093 * Notes:
4094 * Original versions of this routine were very
4095 * inefficient because they iteratively called
4096 * pmap_remove (slow...)
4097 */
4098
4099 void
4100 pmap_remove_all(vm_page_t m)
4101 {
4102 struct md_page *pvh;
4103 pv_entry_t pv;
4104 pmap_t pmap;
4105 struct rwlock *lock;
4106 pd_entry_t *pde, tpde;
4107 pt_entry_t *pte, tpte;
4108 vm_offset_t va;
4109 struct spglist free;
4110 int lvl, pvh_gen, md_gen;
4111
4112 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4113 ("pmap_remove_all: page %p is not managed", m));
4114 SLIST_INIT(&free);
4115 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4116 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4117 rw_wlock(lock);
4118 retry:
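	/*
	 * First, demote any 2MB mappings of the page so that only 4KB
	 * mappings remain to be destroyed by the second loop below.
	 */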
4119 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4120 pmap = PV_PMAP(pv);
4121 if (!PMAP_TRYLOCK(pmap)) {
4122 pvh_gen = pvh->pv_gen;
4123 rw_wunlock(lock);
4124 PMAP_LOCK(pmap);
4125 rw_wlock(lock);
4126 if (pvh_gen != pvh->pv_gen) {
4127 PMAP_UNLOCK(pmap);
4128 goto retry;
4129 }
4130 }
4131 va = pv->pv_va;
4132 pte = pmap_pte_exists(pmap, va, 2, __func__);
4133 pmap_demote_l2_locked(pmap, pte, va, &lock);
4134 PMAP_UNLOCK(pmap);
4135 }
4136 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4137 pmap = PV_PMAP(pv);
4138 if (!PMAP_TRYLOCK(pmap)) {
4139 pvh_gen = pvh->pv_gen;
4140 md_gen = m->md.pv_gen;
4141 rw_wunlock(lock);
4142 PMAP_LOCK(pmap);
4143 rw_wlock(lock);
4144 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4145 PMAP_UNLOCK(pmap);
4146 goto retry;
4147 }
4148 }
4149 pmap_resident_count_dec(pmap, 1);
4150
4151 pde = pmap_pde(pmap, pv->pv_va, &lvl);
4152 KASSERT(pde != NULL,
4153 ("pmap_remove_all: no page directory entry found"));
4154 KASSERT(lvl == 2,
4155 ("pmap_remove_all: invalid pde level %d", lvl));
4156 tpde = pmap_load(pde);
4157
4158 pte = pmap_l2_to_l3(pde, pv->pv_va);
4159 tpte = pmap_load(pte);
4160 if ((tpte & ATTR_CONTIGUOUS) != 0)
4161 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4162 tpte = pmap_load_clear(pte);
4163 if (tpte & ATTR_SW_WIRED)
4164 pmap->pm_stats.wired_count--;
4165 if ((tpte & ATTR_AF) != 0) {
4166 pmap_invalidate_page(pmap, pv->pv_va, true);
4167 vm_page_aflag_set(m, PGA_REFERENCED);
4168 }
4169
4170 /*
4171 * Update the vm_page_t clean and reference bits.
4172 */
4173 if (pmap_pte_dirty(pmap, tpte))
4174 vm_page_dirty(m);
4175 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4176 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4177 m->md.pv_gen++;
4178 free_pv_entry(pmap, pv);
4179 PMAP_UNLOCK(pmap);
4180 }
4181 vm_page_aflag_clear(m, PGA_WRITEABLE);
4182 rw_wunlock(lock);
4183 vm_page_free_pages_toq(&free, true);
4184 }
4185
4186 /*
4187 * Masks and sets bits in a level 2 page table entries in the specified pmap
4188 */
4189 static void
4190 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4191 pt_entry_t nbits)
4192 {
4193 pd_entry_t old_l2;
4194 vm_page_t m, mt;
4195
4196 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4197 PMAP_ASSERT_STAGE1(pmap);
4198 KASSERT((sva & L2_OFFSET) == 0,
4199 ("pmap_protect_l2: sva is not 2mpage aligned"));
4200 old_l2 = pmap_load(l2);
4201 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4202 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4203
4204 /*
4205 * Return if the L2 entry already has the desired access restrictions
4206 * in place.
4207 */
4208 if ((old_l2 & mask) == nbits)
4209 return;
4210
4211 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4212 cpu_spinwait();
4213
4214 /*
4215 * When a dirty read/write superpage mapping is write protected,
4216 * update the dirty field of each of the superpage's constituent 4KB
4217 * pages.
4218 */
4219 if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4220 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4221 pmap_pte_dirty(pmap, old_l2)) {
4222 m = PTE_TO_VM_PAGE(old_l2);
4223 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4224 vm_page_dirty(mt);
4225 }
4226
4227 /*
4228 * Since a promotion must break the 4KB page mappings before making
4229 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4230 */
4231 pmap_s1_invalidate_page(pmap, sva, true);
4232 }
4233
4234 /*
4235 * Masks and sets bits in the specified L3C superpage mapping.
4236 *
4237 * Requests TLB invalidations to be performed by the caller through the
4238 * returned "*vap".
4239 */
4240 static void
4241 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4242 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4243 {
4244 pt_entry_t l3e, *tl3p;
4245 vm_page_t m, mt;
4246 bool dirty;
4247
4248 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4249 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4250 0, ("pmap_mask_set_l3c: l3p is not aligned"));
4251 KASSERT((va & L3C_OFFSET) == 0,
4252 ("pmap_mask_set_l3c: va is not aligned"));
4253 dirty = false;
4254 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4255 l3e = pmap_load(tl3p);
4256 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4257 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4258 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4259 cpu_spinwait();
4260 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4261 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4262 dirty = true;
4263 }
4264
4265 /*
4266 * When a dirty read/write superpage mapping is write protected,
4267 * update the dirty field of each of the superpage's constituent 4KB
4268 * pages.
4269 */
4270 if ((l3e & ATTR_SW_MANAGED) != 0 &&
4271 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4272 dirty) {
4273 m = PTE_TO_VM_PAGE(pmap_load(l3p));
4274 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4275 vm_page_dirty(mt);
4276 }
4277
4278 if (*vap == va_next)
4279 *vap = va;
4280 }
4281
4282 /*
4283 * Masks and sets bits in the last-level page table entries within the
4284 * specified pmap and address range.
4285 */
4286 static void
4287 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4288 pt_entry_t nbits, bool invalidate)
4289 {
4290 vm_offset_t va, va_next;
4291 pd_entry_t *l0, *l1, *l2;
4292 pt_entry_t *l3p, l3;
4293
4294 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4295 for (; sva < eva; sva = va_next) {
4296 l0 = pmap_l0(pmap, sva);
4297 if (pmap_load(l0) == 0) {
4298 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4299 if (va_next < sva)
4300 va_next = eva;
4301 continue;
4302 }
4303
4304 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4305 if (va_next < sva)
4306 va_next = eva;
4307 l1 = pmap_l0_to_l1(l0, sva);
4308 if (pmap_load(l1) == 0)
4309 continue;
4310 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4311 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4312 KASSERT(va_next <= eva,
4313 ("partial update of non-transparent 1G page "
4314 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4315 pmap_load(l1), sva, eva, va_next));
4316 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4317 if ((pmap_load(l1) & mask) != nbits) {
4318 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4319 if (invalidate)
4320 pmap_s1_invalidate_page(pmap, sva, true);
4321 }
4322 continue;
4323 }
4324
4325 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4326 if (va_next < sva)
4327 va_next = eva;
4328
4329 l2 = pmap_l1_to_l2(l1, sva);
4330 if (pmap_load(l2) == 0)
4331 continue;
4332
4333 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4334 if (sva + L2_SIZE == va_next && eva >= va_next) {
4335 pmap_protect_l2(pmap, l2, sva, mask, nbits);
4336 continue;
4337 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
4338 continue;
4339 }
4340 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4341 ("pmap_protect: Invalid L2 entry after demotion"));
4342
4343 if (va_next > eva)
4344 va_next = eva;
4345
4346 va = va_next;
4347 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4348 sva += L3_SIZE) {
4349 l3 = pmap_load(l3p);
4350
4351 /*
4352 * Go to the next L3 entry if the current one is
4353 * invalid or already has the desired access
4354 * restrictions in place. (The latter case occurs
4355 * frequently. For example, in a "buildworld"
4356 * workload, almost 1 out of 4 L3 entries already
4357 * have the desired restrictions.)
4358 */
4359 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4360 if (va != va_next) {
4361 if (invalidate)
4362 pmap_s1_invalidate_range(pmap,
4363 va, sva, true);
4364 va = va_next;
4365 }
4366 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4367 l3p += L3C_ENTRIES - 1;
4368 sva += L3C_SIZE - L3_SIZE;
4369 }
4370 continue;
4371 }
4372
4373 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4374 /*
4375 * Is this entire set of contiguous L3 entries
4376 * being protected? Handle the possibility
4377 * that "va_next" is zero because of address
4378 * wraparound.
4379 */
4380 if ((sva & L3C_OFFSET) == 0 &&
4381 sva + L3C_OFFSET <= va_next - 1) {
4382 pmap_mask_set_l3c(pmap, l3p, sva, &va,
4383 va_next, mask, nbits);
4384 l3p += L3C_ENTRIES - 1;
4385 sva += L3C_SIZE - L3_SIZE;
4386 continue;
4387 }
4388
4389 (void)pmap_demote_l3c(pmap, l3p, sva);
4390
4391 /*
4392 * The L3 entry's accessed bit may have changed.
4393 */
4394 l3 = pmap_load(l3p);
4395 }
4396 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4397 nbits))
4398 cpu_spinwait();
4399
4400 /*
4401 * When a dirty read/write mapping is write protected,
4402 * update the page's dirty field.
4403 */
4404 if ((l3 & ATTR_SW_MANAGED) != 0 &&
4405 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4406 pmap_pte_dirty(pmap, l3))
4407 vm_page_dirty(PTE_TO_VM_PAGE(l3));
4408
4409 if (va == va_next)
4410 va = sva;
4411 }
4412 if (va != va_next && invalidate)
4413 pmap_s1_invalidate_range(pmap, va, sva, true);
4414 }
4415 }
4416
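/*
 * Acquires the pmap lock and applies pmap_mask_set_locked() to the given
 * range.
 */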
4417 static void
4418 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4419 pt_entry_t nbits, bool invalidate)
4420 {
4421 PMAP_LOCK(pmap);
4422 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4423 PMAP_UNLOCK(pmap);
4424 }
4425
4426 /*
4427 * Set the physical protection on the
4428 * specified range of this map as requested.
4429 */
4430 void
4431 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4432 {
4433 pt_entry_t mask, nbits;
4434
4435 PMAP_ASSERT_STAGE1(pmap);
4436 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4437 if (prot == VM_PROT_NONE) {
4438 pmap_remove(pmap, sva, eva);
4439 return;
4440 }
4441
4442 mask = nbits = 0;
4443 if ((prot & VM_PROT_WRITE) == 0) {
4444 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4445 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4446 }
4447 if ((prot & VM_PROT_EXECUTE) == 0) {
4448 mask |= ATTR_S1_XN;
4449 nbits |= ATTR_S1_XN;
4450 }
4451 if (pmap == kernel_pmap) {
4452 mask |= ATTR_KERN_GP;
4453 nbits |= ATTR_KERN_GP;
4454 }
4455 if (mask == 0)
4456 return;
4457
4458 pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4459 }
4460
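/*
 * Prevents the mappings within the specified range of kernel virtual
 * addresses from ever being promoted to superpage mappings by setting
 * ATTR_SW_NO_PROMOTE on the existing entries.
 */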
4461 void
4462 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4463 {
4464
4465 MPASS((sva & L3_OFFSET) == 0);
4466 MPASS(((sva + size) & L3_OFFSET) == 0);
4467
4468 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4469 ATTR_SW_NO_PROMOTE, false);
4470 }
4471
4472 /*
4473 * Inserts the specified page table page into the specified pmap's collection
4474 * of idle page table pages. Each of a pmap's page table pages is responsible
4475 * for mapping a distinct range of virtual addresses. The pmap's collection is
4476 * ordered by this virtual address range.
4477 *
4478 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4479 * "mpte"'s valid field will be set to 0.
4480 *
4481 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4482 * contain valid mappings with identical attributes except for ATTR_AF;
4483 * "mpte"'s valid field will be set to 1.
4484 *
4485 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4486 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4487 * field will be set to VM_PAGE_BITS_ALL.
4488 */
4489 static __inline int
4490 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4491 bool all_l3e_AF_set)
4492 {
4493
4494 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4495 KASSERT(promoted || !all_l3e_AF_set,
4496 ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4497 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4498 return (vm_radix_insert(&pmap->pm_root, mpte));
4499 }
4500
4501 /*
4502 * Removes the page table page mapping the specified virtual address from the
4503 * specified pmap's collection of idle page table pages, and returns it.
4504 * Otherwise, returns NULL if there is no page table page corresponding to the
4505 * specified virtual address.
4506 */
4507 static __inline vm_page_t
4508 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4509 {
4510
4511 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4512 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4513 }
4514
4515 /*
4516 * Performs a break-before-make update of a pmap entry. This is needed when
4517 * either promoting or demoting pages to ensure the TLB doesn't get into an
4518 * inconsistent state.
4519 */
4520 static void
4521 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4522 vm_offset_t va, vm_size_t size)
4523 {
4524 pd_entry_t *lip, *ptep_end;
4525 register_t intr;
4526
4527 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4528
4529 if ((newpte & ATTR_SW_NO_PROMOTE) != 0)
4530 panic("%s: Updating non-promote pte", __func__);
4531
4532 if (size == L3C_SIZE)
4533 ptep_end = ptep + L3C_ENTRIES;
4534 else
4535 ptep_end = ptep + 1;
4536
4537 /*
4538 * Ensure we don't get switched out with the page table in an
4539 * inconsistent state. We also need to ensure no interrupts fire
4540 * as they may make use of an address we are about to invalidate.
4541 */
4542 intr = intr_disable();
4543
4544 /*
4545 * Clear the old mapping's valid bit, but leave the rest of the entry
4546 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4547 * lookup the physical address.
4548 */
4549 for (lip = ptep; lip < ptep_end; lip++)
4550 pmap_clear_bits(lip, ATTR_DESCR_VALID);
4551
4552 /*
4553 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4554 * be cached, so we invalidate intermediate entries as well as final
4555 * entries.
4556 */
4557 pmap_s1_invalidate_range(pmap, va, va + size, size == L3C_SIZE);
4558
4559 /* Create the new mapping */
4560 for (lip = ptep; lip < ptep_end; lip++) {
4561 pmap_store(lip, newpte);
4562 newpte += PAGE_SIZE;
4563 }
4564 dsb(ishst);
4565
4566 intr_restore(intr);
4567 }
4568
4569 #if VM_NRESERVLEVEL > 0
4570 /*
4571 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4572 * replace the many pv entries for the 4KB page mappings by a single pv entry
4573 * for the 2MB page mapping.
4574 */
4575 static void
4576 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4577 struct rwlock **lockp)
4578 {
4579 struct md_page *pvh;
4580 pv_entry_t pv;
4581 vm_offset_t va_last;
4582 vm_page_t m;
4583
4584 KASSERT((pa & L2_OFFSET) == 0,
4585 ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4586 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4587
4588 /*
4589 * Transfer the first page's pv entry for this mapping to the 2mpage's
4590 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
4591 * a transfer avoids the possibility that get_pv_entry() calls
4592 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4593 * mappings that is being promoted.
4594 */
4595 m = PHYS_TO_VM_PAGE(pa);
4596 va = va & ~L2_OFFSET;
4597 pv = pmap_pvh_remove(&m->md, pmap, va);
4598 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4599 pvh = page_to_pvh(m);
4600 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4601 pvh->pv_gen++;
4602 /* Free the remaining NPTEPG - 1 pv entries. */
4603 va_last = va + L2_SIZE - PAGE_SIZE;
4604 do {
4605 m++;
4606 va += PAGE_SIZE;
4607 pmap_pvh_free(&m->md, pmap, va);
4608 } while (va < va_last);
4609 }
4610
4611 /*
4612 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4613 * single level 2 table entry to a single 2MB page mapping. For promotion
4614 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4615 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4616 * identical characteristics.
4617 */
4618 static bool
4619 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4620 struct rwlock **lockp)
4621 {
4622 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4623
4624 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4625
4626 /*
4627 * Currently, this function only supports promotion on stage 1 pmaps
4628 * because it tests stage 1 specific fields and performs a break-
4629 * before-make sequence that is incorrect for stage 2 pmaps.
4630 */
4631 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4632 return (false);
4633
4634 /*
4635 * Examine the first L3E in the specified PTP. Abort if this L3E is
4636 * ineligible for promotion...
4637 */
4638 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
4639 newl2 = pmap_load(firstl3);
4640 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4641 return (false);
4642 /* ... is not the first physical page within an L2 block */
4643 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
4644 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
4645 atomic_add_long(&pmap_l2_p_failures, 1);
4646 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4647 " in pmap %p", va, pmap);
4648 return (false);
4649 }
4650
4651 /*
4652 * Both here and in the below "for" loop, to allow for repromotion
4653 * after MADV_FREE, conditionally write protect a clean L3E before
4654 * possibly aborting the promotion due to other L3E attributes. Why?
4655 * Suppose that MADV_FREE is applied to a part of a superpage, the
4656 * address range [S, E). pmap_advise() will demote the superpage
4657 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4658 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later,
4659 * imagine that the memory in [S, E) is recycled, but the last 4KB
4660 * page in [S, E) is not the last to be rewritten, or simply accessed.
4661 * In other words, there is still a 4KB page in [S, E), call it P,
4662 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4663 * Unless we write protect P before aborting the promotion, if and
4664 * when P is finally rewritten, there won't be a page fault to trigger
4665 * repromotion.
4666 */
4667 setl2:
4668 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4669 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4670 /*
4671 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4672 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4673 */
4674 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4675 goto setl2;
4676 newl2 &= ~ATTR_SW_DBM;
4677 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
4678 " in pmap %p", va & ~L2_OFFSET, pmap);
4679 }
4680
4681 /*
4682 * Examine each of the other L3Es in the specified PTP. Abort if this
4683 * L3E maps an unexpected 4KB physical page or does not have identical
4684 * characteristics to the first L3E. If ATTR_AF is not set in every
4685 * PTE, then request that the PTP be refilled on demotion.
4686 */
4687 all_l3e_AF = newl2 & ATTR_AF;
4688 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
4689 + L2_SIZE - PAGE_SIZE;
4690 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4691 oldl3 = pmap_load(l3);
4692 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4693 atomic_add_long(&pmap_l2_p_failures, 1);
4694 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4695 " in pmap %p", va, pmap);
4696 return (false);
4697 }
4698 setl3:
4699 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4700 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4701 /*
4702 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4703 * set, ATTR_SW_DBM can be cleared without a TLB
4704 * invalidation.
4705 */
4706 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4707 ~ATTR_SW_DBM))
4708 goto setl3;
4709 oldl3 &= ~ATTR_SW_DBM;
4710 }
4711 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
4712 atomic_add_long(&pmap_l2_p_failures, 1);
4713 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4714 " in pmap %p", va, pmap);
4715 return (false);
4716 }
4717 all_l3e_AF &= oldl3;
4718 pa -= PAGE_SIZE;
4719 }
4720
4721 /*
4722 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4723 * mapping, so that promotions triggered by speculative mappings,
4724 * such as pmap_enter_quick(), don't automatically mark the
4725 * underlying pages as referenced.
4726 */
4727 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
4728
4729 /*
4730 * Save the page table page in its current state until the L2
4731 * mapping the superpage is demoted by pmap_demote_l2() or
4732 * destroyed by pmap_remove_l3().
4733 */
4734 if (mpte == NULL)
4735 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
4736 KASSERT(mpte >= vm_page_array &&
4737 mpte < &vm_page_array[vm_page_array_size],
4738 ("pmap_promote_l2: page table page is out of range"));
4739 KASSERT(mpte->pindex == pmap_l2_pindex(va),
4740 ("pmap_promote_l2: page table page's pindex is wrong"));
4741 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
4742 atomic_add_long(&pmap_l2_p_failures, 1);
4743 CTR2(KTR_PMAP,
4744 "pmap_promote_l2: failure for va %#lx in pmap %p", va,
4745 pmap);
4746 return (false);
4747 }
4748
4749 if ((newl2 & ATTR_SW_MANAGED) != 0)
4750 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
4751
4752 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
4753
4754 atomic_add_long(&pmap_l2_promotions, 1);
4755 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
4756 pmap);
4757 return (true);
4758 }
4759
4760 /*
4761 * Tries to promote an aligned, contiguous set of base page mappings to a
4762 * single L3C page mapping. For promotion to occur, two conditions must be
4763 * met: (1) the base page mappings must map aligned, contiguous physical
4764 * memory and (2) the base page mappings must have identical characteristics
4765 * except for the accessed flag.
4766 */
4767 static bool
4768 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
4769 {
4770 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
4771
4772 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4773
4774 /*
4775 * Currently, this function only supports promotion on stage 1 pmaps
4776 * because it tests stage 1 specific fields and performs a break-
4777 * before-make sequence that is incorrect for stage 2 pmaps.
4778 */
4779 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4780 return (false);
4781
4782 /*
4783 * Compute the address of the first L3 entry in the superpage
4784 * candidate.
4785 */
4786 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
4787 sizeof(pt_entry_t)) - 1));
4788
4789 firstl3c = pmap_load(l3p);
4790
4791 /*
4792 * Examine the first L3 entry. Abort if this L3E is ineligible for
4793 * promotion...
4794 */
4795 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
4796 return (false);
4797 /* ...is not properly aligned... */
4798 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
4799 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
4800 counter_u64_add(pmap_l3c_p_failures, 1);
4801 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4802 " in pmap %p", va, pmap);
4803 return (false);
4804 }
4805
4806 /*
4807 * If the first L3 entry is a clean read-write mapping, convert it
4808 * to a read-only mapping. See pmap_promote_l2() for the rationale.
4809 */
4810 set_first:
4811 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4812 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4813 /*
4814 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4815 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4816 */
4817 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
4818 goto set_first;
4819 firstl3c &= ~ATTR_SW_DBM;
4820 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4821 " in pmap %p", va & ~L3C_OFFSET, pmap);
4822 }
4823
4824 /*
4825 * Check that the rest of the L3 entries are compatible with the first,
4826 * and convert clean read-write mappings to read-only mappings.
4827 */
4828 all_l3e_AF = firstl3c & ATTR_AF;
4829 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
4830 L3C_SIZE - PAGE_SIZE;
4831 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
4832 oldl3 = pmap_load(l3);
4833 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4834 counter_u64_add(pmap_l3c_p_failures, 1);
4835 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4836 " in pmap %p", va, pmap);
4837 return (false);
4838 }
4839 set_l3:
4840 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4841 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4842 /*
4843 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4844 * set, ATTR_SW_DBM can be cleared without a TLB
4845 * invalidation.
4846 */
4847 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4848 ~ATTR_SW_DBM))
4849 goto set_l3;
4850 oldl3 &= ~ATTR_SW_DBM;
4851 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4852 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
4853 (va & ~L3C_OFFSET), pmap);
4854 }
4855 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
4856 counter_u64_add(pmap_l3c_p_failures, 1);
4857 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4858 " in pmap %p", va, pmap);
4859 return (false);
4860 }
4861 all_l3e_AF &= oldl3;
4862 pa -= PAGE_SIZE;
4863 }
4864
4865 /*
4866 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4867 * mapping, so that promotions triggered by speculative mappings,
4868 * such as pmap_enter_quick(), don't automatically mark the
4869 * underlying pages as referenced.
4870 */
4871 firstl3c &= ~ATTR_AF | all_l3e_AF;
4872
4873 /*
4874 * Remake the mappings with the contiguous bit set.
4875 */
4876 pmap_update_entry(pmap, l3p, firstl3c | ATTR_CONTIGUOUS, va &
4877 ~L3C_OFFSET, L3C_SIZE);
4878
4879 counter_u64_add(pmap_l3c_promotions, 1);
4880 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
4881 pmap);
4882 return (true);
4883 }
4884 #endif /* VM_NRESERVLEVEL > 0 */
4885
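/*
 * Enters a single 1GB (psind == 2) or 2MB (psind == 1) block mapping for
 * "va", allocating any missing intermediate page table pages.  Called by
 * pmap_enter() for PMAP_ENTER_LARGEPAGE requests.
 */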
4886 static int
4887 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t newpte, int flags,
4888 int psind)
4889 {
4890 pd_entry_t *l0p, *l1p, *l2p, origpte;
4891 vm_page_t mp;
4892
4893 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4894 KASSERT(psind > 0 && psind < MAXPAGESIZES,
4895 ("psind %d unexpected", psind));
4896 KASSERT((PTE_TO_PHYS(newpte) & (pagesizes[psind] - 1)) == 0,
4897 ("unaligned phys address %#lx newpte %#lx psind %d",
4898 PTE_TO_PHYS(newpte), newpte, psind));
4899
4900 restart:
4901 if (!pmap_bti_same(pmap, va, va + pagesizes[psind]))
4902 return (KERN_PROTECTION_FAILURE);
4903 if (psind == 2) {
4904 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4905
4906 l0p = pmap_l0(pmap, va);
4907 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
4908 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
4909 if (mp == NULL) {
4910 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4911 return (KERN_RESOURCE_SHORTAGE);
4912 PMAP_UNLOCK(pmap);
4913 vm_wait(NULL);
4914 PMAP_LOCK(pmap);
4915 goto restart;
4916 }
4917 l1p = pmap_l0_to_l1(l0p, va);
4918 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4919 origpte = pmap_load(l1p);
4920 } else {
4921 l1p = pmap_l0_to_l1(l0p, va);
4922 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4923 origpte = pmap_load(l1p);
4924 if ((origpte & ATTR_DESCR_VALID) == 0) {
4925 mp = PTE_TO_VM_PAGE(pmap_load(l0p));
4926 mp->ref_count++;
4927 }
4928 }
4929 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
4930 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
4931 (origpte & ATTR_DESCR_VALID) == 0,
4932 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
4933 va, origpte, newpte));
4934 pmap_store(l1p, newpte);
4935 } else /* (psind == 1) */ {
4936 l2p = pmap_l2(pmap, va);
4937 if (l2p == NULL) {
4938 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
4939 if (mp == NULL) {
4940 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4941 return (KERN_RESOURCE_SHORTAGE);
4942 PMAP_UNLOCK(pmap);
4943 vm_wait(NULL);
4944 PMAP_LOCK(pmap);
4945 goto restart;
4946 }
4947 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
4948 l2p = &l2p[pmap_l2_index(va)];
4949 origpte = pmap_load(l2p);
4950 } else {
4951 l1p = pmap_l1(pmap, va);
4952 origpte = pmap_load(l2p);
4953 if ((origpte & ATTR_DESCR_VALID) == 0) {
4954 mp = PTE_TO_VM_PAGE(pmap_load(l1p));
4955 mp->ref_count++;
4956 }
4957 }
4958 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
4959 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
4960 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
4961 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
4962 va, origpte, newpte));
4963 pmap_store(l2p, newpte);
4964 }
4965 dsb(ishst);
4966
4967 if ((origpte & ATTR_DESCR_VALID) == 0)
4968 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
4969 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
4970 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
4971 else if ((newpte & ATTR_SW_WIRED) == 0 &&
4972 (origpte & ATTR_SW_WIRED) != 0)
4973 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
4974
4975 return (KERN_SUCCESS);
4976 }
4977
4978 /*
4979 * Insert the given physical page (p) at
4980 * the specified virtual address (v) in the
4981 * target physical map with the protection requested.
4982 *
4983 * If specified, the page will be wired down, meaning
4984 * that the related pte can not be reclaimed.
4985 *
4986 * NB: This is the only routine which MAY NOT lazy-evaluate
4987 * or lose information. That is, this routine must actually
4988 * insert this page into the given map NOW.
4989 */
4990 int
4991 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
4992 u_int flags, int8_t psind)
4993 {
4994 struct rwlock *lock;
4995 pd_entry_t *pde;
4996 pt_entry_t new_l3, orig_l3;
4997 pt_entry_t *l2, *l3;
4998 pv_entry_t pv;
4999 vm_paddr_t opa, pa;
5000 vm_page_t mpte, om;
5001 bool nosleep;
5002 int lvl, rv;
5003
5004 KASSERT(ADDR_IS_CANONICAL(va),
5005 ("%s: Address not in canonical form: %lx", __func__, va));
5006
5007 va = trunc_page(va);
5008 if ((m->oflags & VPO_UNMANAGED) == 0)
5009 VM_PAGE_OBJECT_BUSY_ASSERT(m);
5010 pa = VM_PAGE_TO_PHYS(m);
5011 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_DEFAULT | L3_PAGE);
5012 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5013 new_l3 |= pmap_pte_prot(pmap, prot);
5014 if ((flags & PMAP_ENTER_WIRED) != 0)
5015 new_l3 |= ATTR_SW_WIRED;
5016 if (pmap->pm_stage == PM_STAGE1) {
5017 if (!ADDR_IS_KERNEL(va))
5018 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5019 else
5020 new_l3 |= ATTR_S1_UXN;
5021 if (pmap != kernel_pmap)
5022 new_l3 |= ATTR_S1_nG;
5023 } else {
5024 /*
5025 * Clear the access flag on executable mappings, this will be
5026 * set later when the page is accessed. The fault handler is
5027 * required to invalidate the I-cache.
5028 *
5029 * TODO: Switch to the valid flag to allow hardware management
5030 * of the access flag. Much of the pmap code assumes the
5031 * valid flag is set and fails to destroy the old page tables
5032 * correctly if it is clear.
5033 */
5034 if (prot & VM_PROT_EXECUTE)
5035 new_l3 &= ~ATTR_AF;
5036 }
5037 if ((m->oflags & VPO_UNMANAGED) == 0) {
5038 new_l3 |= ATTR_SW_MANAGED;
5039 if ((prot & VM_PROT_WRITE) != 0) {
5040 new_l3 |= ATTR_SW_DBM;
5041 if ((flags & VM_PROT_WRITE) == 0) {
5042 if (pmap->pm_stage == PM_STAGE1)
5043 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5044 else
5045 new_l3 &=
5046 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5047 }
5048 }
5049 }
5050
5051 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5052
5053 lock = NULL;
5054 PMAP_LOCK(pmap);
5055 /* Wait until we lock the pmap to protect the bti rangeset */
5056 new_l3 |= pmap_pte_bti(pmap, va);
5057
5058 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5059 KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5060 ("managed largepage va %#lx flags %#x", va, flags));
5061 new_l3 &= ~L3_PAGE;
5062 if (psind == 2) {
5063 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
5064 new_l3 |= L1_BLOCK;
5065 } else /* (psind == 1) */
5066 new_l3 |= L2_BLOCK;
5067 rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
5068 goto out;
5069 }
5070 if (psind == 1) {
5071 /* Assert the required virtual and physical alignment. */
5072 KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
5073 KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
5074 rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
5075 flags, m, &lock);
5076 goto out;
5077 }
5078 mpte = NULL;
5079
5080 /*
5081 * In the case that a page table page is not
5082 * resident, we are creating it here.
5083 */
5084 retry:
5085 pde = pmap_pde(pmap, va, &lvl);
5086 if (pde != NULL && lvl == 2) {
5087 l3 = pmap_l2_to_l3(pde, va);
5088 if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
5089 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
5090 mpte->ref_count++;
5091 }
5092 goto havel3;
5093 } else if (pde != NULL && lvl == 1) {
5094 l2 = pmap_l1_to_l2(pde, va);
5095 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
5096 (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
5097 l3 = &l3[pmap_l3_index(va)];
5098 if (!ADDR_IS_KERNEL(va)) {
5099 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5100 mpte->ref_count++;
5101 }
5102 goto havel3;
5103 }
5104 /* We need to allocate an L3 table. */
5105 }
5106 if (!ADDR_IS_KERNEL(va)) {
5107 nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;
5108
5109 /*
5110 * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
5111 * to handle the possibility that a superpage mapping for "va"
5112 * was created while we slept.
5113 */
5114 mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
5115 nosleep ? NULL : &lock);
5116 if (mpte == NULL && nosleep) {
5117 CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
5118 rv = KERN_RESOURCE_SHORTAGE;
5119 goto out;
5120 }
5121 goto retry;
5122 } else
5123 panic("pmap_enter: missing L3 table for kernel va %#lx", va);
5124
5125 havel3:
5126 orig_l3 = pmap_load(l3);
5127 opa = PTE_TO_PHYS(orig_l3);
5128 pv = NULL;
5129
5130 /*
5131 * Is the specified virtual address already mapped?
5132 */
5133 if (pmap_l3_valid(orig_l3)) {
5134 /*
5135 * Wiring change, just update stats. We don't worry about
5136 * wiring PT pages as they remain resident as long as there
5137 * are valid mappings in them. Hence, if a user page is wired,
5138 * the PT page will be also.
5139 */
5140 if ((flags & PMAP_ENTER_WIRED) != 0 &&
5141 (orig_l3 & ATTR_SW_WIRED) == 0)
5142 pmap->pm_stats.wired_count++;
5143 else if ((flags & PMAP_ENTER_WIRED) == 0 &&
5144 (orig_l3 & ATTR_SW_WIRED) != 0)
5145 pmap->pm_stats.wired_count--;
5146
5147 /*
5148 * Remove the extra PT page reference.
5149 */
5150 if (mpte != NULL) {
5151 mpte->ref_count--;
5152 KASSERT(mpte->ref_count > 0,
5153 ("pmap_enter: missing reference to page table page,"
5154 " va: 0x%lx", va));
5155 }
5156
5157 /*
5158 * Has the physical page changed?
5159 */
5160 if (opa == pa) {
5161 /*
5162 * No, might be a protection or wiring change.
5163 */
5164 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5165 (new_l3 & ATTR_SW_DBM) != 0)
5166 vm_page_aflag_set(m, PGA_WRITEABLE);
5167 goto validate;
5168 }
5169
5170 /*
5171 * The physical page has changed. Temporarily invalidate
5172 * the mapping.
5173 */
5174 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5175 (void)pmap_demote_l3c(pmap, l3, va);
5176 orig_l3 = pmap_load_clear(l3);
5177 KASSERT(PTE_TO_PHYS(orig_l3) == opa,
5178 ("pmap_enter: unexpected pa update for %#lx", va));
5179 if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
5180 om = PHYS_TO_VM_PAGE(opa);
5181
5182 /*
5183 * The pmap lock is sufficient to synchronize with
5184 * concurrent calls to pmap_page_test_mappings() and
5185 * pmap_ts_referenced().
5186 */
5187 if (pmap_pte_dirty(pmap, orig_l3))
5188 vm_page_dirty(om);
5189 if ((orig_l3 & ATTR_AF) != 0) {
5190 pmap_invalidate_page(pmap, va, true);
5191 vm_page_aflag_set(om, PGA_REFERENCED);
5192 }
5193 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
5194 pv = pmap_pvh_remove(&om->md, pmap, va);
5195 if ((m->oflags & VPO_UNMANAGED) != 0)
5196 free_pv_entry(pmap, pv);
5197 if ((om->a.flags & PGA_WRITEABLE) != 0 &&
5198 TAILQ_EMPTY(&om->md.pv_list) &&
5199 ((om->flags & PG_FICTITIOUS) != 0 ||
5200 TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
5201 vm_page_aflag_clear(om, PGA_WRITEABLE);
5202 } else {
5203 KASSERT((orig_l3 & ATTR_AF) != 0,
5204 ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
5205 pmap_invalidate_page(pmap, va, true);
5206 }
5207 orig_l3 = 0;
5208 } else {
5209 /*
5210 * Increment the counters.
5211 */
5212 if ((new_l3 & ATTR_SW_WIRED) != 0)
5213 pmap->pm_stats.wired_count++;
5214 pmap_resident_count_inc(pmap, 1);
5215 }
5216 /*
5217 * Enter on the PV list if part of our managed memory.
5218 */
5219 if ((m->oflags & VPO_UNMANAGED) == 0) {
5220 if (pv == NULL) {
5221 pv = get_pv_entry(pmap, &lock);
5222 pv->pv_va = va;
5223 }
5224 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
5225 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
5226 m->md.pv_gen++;
5227 if ((new_l3 & ATTR_SW_DBM) != 0)
5228 vm_page_aflag_set(m, PGA_WRITEABLE);
5229 }
5230
5231 validate:
5232 if (pmap->pm_stage == PM_STAGE1) {
5233 /*
5234 * Sync the icache if the mapping has execute permission and the
5235 * VM_MEMATTR_WRITE_BACK attribute. Do it now, before the mapping
5236 * is stored and made valid for hardware table walks. If it were
5237 * done later, another thread could access this page before the
5238 * caches are properly synced. Don't do it for kernel memory, which
5239 * is mapped with exec permission even if the memory isn't going
5240 * to hold executable code. The only time an icache sync is needed
5241 * there is after a kernel module is loaded and its relocation
5242 * info is processed, which is done in elf_cpu_load_file().
5243 */
5244 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5245 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
5246 (opa != pa || (orig_l3 & ATTR_S1_XN))) {
5247 PMAP_ASSERT_STAGE1(pmap);
5248 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
5249 PAGE_SIZE);
5250 }
5251 } else {
5252 cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5253 }
5254
5255 /*
5256 * Update the L3 entry
5257 */
5258 if (pmap_l3_valid(orig_l3)) {
5259 KASSERT(opa == pa, ("pmap_enter: invalid update"));
5260 if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
5261 /* same PA, different attributes */
5262 if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
5263 (void)pmap_demote_l3c(pmap, l3, va);
5264 orig_l3 = pmap_load_store(l3, new_l3);
5265 pmap_invalidate_page(pmap, va, true);
5266 if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
5267 pmap_pte_dirty(pmap, orig_l3))
5268 vm_page_dirty(m);
5269 } else {
5270 /*
5271 * orig_l3 == new_l3
5272 * This can happen if multiple threads simultaneously
5273 * access a not-yet-mapped page. It is bad for performance
5274 * since it can cause a full demotion-NOP-promotion
5275 * cycle.
5276 * Other possible reasons are:
5277 * - the VM and pmap memory layouts have diverged
5278 * - a TLB flush is missing somewhere and the CPU doesn't
5279 * see the actual mapping.
5280 */
5281 CTR4(KTR_PMAP, "%s: already mapped page - "
5282 "pmap %p va 0x%#lx pte 0x%lx",
5283 __func__, pmap, va, new_l3);
5284 }
5285 } else {
5286 /* New mapping */
5287 pmap_store(l3, new_l3);
5288 dsb(ishst);
5289 }
5290
5291 #if VM_NRESERVLEVEL > 0
5292 /*
5293 * First, attempt L3C promotion, if the virtual and physical addresses
5294 * are aligned with each other and an underlying reservation has the
5295 * neighboring L3 pages allocated. The first condition is simply an
5296 * optimization that recognizes some eventual promotion failures early
5297 * at a lower run-time cost. Then, if both the page table page and
5298 * the reservation are fully populated, attempt L2 promotion.
5299 */
5300 if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
5301 (m->flags & PG_FICTITIOUS) == 0 &&
5302 vm_reserv_is_populated(m, L3C_ENTRIES) &&
5303 pmap_promote_l3c(pmap, l3, va) &&
5304 (mpte == NULL || mpte->ref_count == NL3PG) &&
5305 vm_reserv_level_iffullpop(m) == 0)
5306 (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
5307 #endif
5308
5309 rv = KERN_SUCCESS;
5310 out:
5311 if (lock != NULL)
5312 rw_wunlock(lock);
5313 PMAP_UNLOCK(pmap);
5314 return (rv);
5315 }
5316
5317 /*
5318 * Tries to create a read- and/or execute-only L2 page mapping. Returns
5319 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5320 * value. See pmap_enter_l2() for the possible error values when "no sleep",
5321 * "no replace", and "no reclaim" are specified.
5322 */
5323 static int
5324 pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5325 struct rwlock **lockp)
5326 {
5327 pd_entry_t new_l2;
5328
5329 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5330 PMAP_ASSERT_STAGE1(pmap);
5331 KASSERT(ADDR_IS_CANONICAL(va),
5332 ("%s: Address not in canonical form: %lx", __func__, va));
5333
5334 new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5335 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5336 L2_BLOCK);
5337 new_l2 |= pmap_pte_bti(pmap, va);
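/*
 * Note: for managed pages the access flag is cleared below, so the
 * first reference (via a hardware access-flag update or an access
 * fault) is observed and the page can later be reported as referenced.
 */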
5338 if ((m->oflags & VPO_UNMANAGED) == 0) {
5339 new_l2 |= ATTR_SW_MANAGED;
5340 new_l2 &= ~ATTR_AF;
5341 }
5342 if ((prot & VM_PROT_EXECUTE) == 0 ||
5343 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5344 new_l2 |= ATTR_S1_XN;
5345 if (!ADDR_IS_KERNEL(va))
5346 new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5347 else
5348 new_l2 |= ATTR_S1_UXN;
5349 if (pmap != kernel_pmap)
5350 new_l2 |= ATTR_S1_nG;
5351 return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
5352 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
5353 }
5354
5355 /*
5356 * Returns true if every page table entry in the specified page table is
5357 * zero.
5358 */
5359 static bool
5360 pmap_every_pte_zero(vm_paddr_t pa)
5361 {
5362 pt_entry_t *pt_end, *pte;
5363
5364 KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
5365 pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
5366 for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
5367 if (*pte != 0)
5368 return (false);
5369 }
5370 return (true);
5371 }
5372
5373 /*
5374 * Tries to create the specified L2 page mapping. Returns KERN_SUCCESS if
5375 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
5376 * KERN_RESOURCE_SHORTAGE otherwise. Returns KERN_FAILURE if
5377 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
5378 * within the L2 virtual address range starting at the specified virtual
5379 * address. Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and a
5380 * L2 page mapping already exists at the specified virtual address. Returns
5381 * KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was specified and a
5382 * page table page allocation failed or (2) PMAP_ENTER_NORECLAIM was specified
5383 * and a PV entry allocation failed.
5384 */
5385 static int
5386 pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
5387 vm_page_t m, struct rwlock **lockp)
5388 {
5389 struct spglist free;
5390 pd_entry_t *l2, old_l2;
5391 vm_page_t l2pg, mt;
5392 vm_page_t uwptpg;
5393
5394 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5395 KASSERT(ADDR_IS_CANONICAL(va),
5396 ("%s: Address not in canonical form: %lx", __func__, va));
5397
5398 if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
5399 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
5400 CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
5401 va, pmap);
5402 return (KERN_RESOURCE_SHORTAGE);
5403 }
5404
5405 /*
5406 * If bti is not the same for the whole l2 range, return failure
5407 * and let vm_fault() cope. Check after l2 allocation, since
5408 * it could sleep.
5409 */
5410 if (!pmap_bti_same(pmap, va, va + L2_SIZE)) {
5411 KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
5412 pmap_abort_ptp(pmap, va, l2pg);
5413 return (KERN_PROTECTION_FAILURE);
5414 }
5415
5416 /*
5417 * If there are existing mappings, either abort or remove them.
5418 */
5419 if ((old_l2 = pmap_load(l2)) != 0) {
5420 KASSERT(l2pg == NULL || l2pg->ref_count > 1,
5421 ("pmap_enter_l2: l2pg's ref count is too low"));
5422 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5423 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
5424 if (l2pg != NULL)
5425 l2pg->ref_count--;
5426 CTR2(KTR_PMAP,
5427 "pmap_enter_l2: no space for va %#lx"
5428 " in pmap %p", va, pmap);
5429 return (KERN_NO_SPACE);
5430 } else if (!ADDR_IS_KERNEL(va) ||
5431 !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
5432 if (l2pg != NULL)
5433 l2pg->ref_count--;
5434 CTR2(KTR_PMAP,
5435 "pmap_enter_l2: failure for va %#lx"
5436 " in pmap %p", va, pmap);
5437 return (KERN_FAILURE);
5438 }
5439 }
5440 SLIST_INIT(&free);
5441 if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
5442 (void)pmap_remove_l2(pmap, l2, va,
5443 pmap_load(pmap_l1(pmap, va)), &free, lockp);
5444 else
5445 pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
5446 &free, lockp);
5447 if (!ADDR_IS_KERNEL(va)) {
5448 vm_page_free_pages_toq(&free, true);
5449 KASSERT(pmap_load(l2) == 0,
5450 ("pmap_enter_l2: non-zero L2 entry %p", l2));
5451 } else {
5452 KASSERT(SLIST_EMPTY(&free),
5453 ("pmap_enter_l2: freed kernel page table page"));
5454
5455 /*
5456 * Both pmap_remove_l2() and pmap_remove_l3_range()
5457 * will leave the kernel page table page zero filled.
5458 * Nonetheless, the TLB could have an intermediate
5459 * entry for the kernel page table page, so request
5460 * an invalidation at all levels after clearing
5461 * the L2_TABLE entry.
5462 */
5463 mt = PTE_TO_VM_PAGE(pmap_load(l2));
5464 if (pmap_insert_pt_page(pmap, mt, false, false))
5465 panic("pmap_enter_l2: trie insert failed");
5466 pmap_clear(l2);
5467 pmap_s1_invalidate_page(pmap, va, false);
5468 }
5469 }
5470
5471 /*
5472 * Allocate leaf ptpage for wired userspace pages.
5473 */
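/*
 * Preallocating the leaf page table page for a wired userspace
 * superpage means a later demotion of the wired mapping should not
 * need a page table page allocation that could fail; pmap_demote_l2()
 * can find this page via the trie populated by pmap_insert_pt_page().
 */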
5474 uwptpg = NULL;
5475 if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
5476 uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
5477 if (uwptpg == NULL) {
5478 return (KERN_RESOURCE_SHORTAGE);
5479 }
5480 uwptpg->pindex = pmap_l2_pindex(va);
5481 if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
5482 vm_page_unwire_noq(uwptpg);
5483 vm_page_free(uwptpg);
5484 return (KERN_RESOURCE_SHORTAGE);
5485 }
5486 pmap_resident_count_inc(pmap, 1);
5487 uwptpg->ref_count = NL3PG;
5488 }
5489 if ((new_l2 & ATTR_SW_MANAGED) != 0) {
5490 /*
5491 * Abort this mapping if its PV entry could not be created.
5492 */
5493 if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
5494 if (l2pg != NULL)
5495 pmap_abort_ptp(pmap, va, l2pg);
5496 if (uwptpg != NULL) {
5497 mt = pmap_remove_pt_page(pmap, va);
5498 KASSERT(mt == uwptpg,
5499 ("removed pt page %p, expected %p", mt,
5500 uwptpg));
5501 pmap_resident_count_dec(pmap, 1);
5502 uwptpg->ref_count = 1;
5503 vm_page_unwire_noq(uwptpg);
5504 vm_page_free(uwptpg);
5505 }
5506 CTR2(KTR_PMAP,
5507 "pmap_enter_l2: failure for va %#lx in pmap %p",
5508 va, pmap);
5509 return (KERN_RESOURCE_SHORTAGE);
5510 }
5511 if ((new_l2 & ATTR_SW_DBM) != 0)
5512 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
5513 vm_page_aflag_set(mt, PGA_WRITEABLE);
5514 }
5515
5516 /*
5517 * Increment counters.
5518 */
5519 if ((new_l2 & ATTR_SW_WIRED) != 0)
5520 pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
5521 pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;
5522
5523 /*
5524 * Conditionally sync the icache. See pmap_enter() for details.
5525 */
5526 if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
5527 PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
5528 pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
5529 cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
5530 L2_SIZE);
5531 }
5532
5533 /*
5534 * Map the superpage.
5535 */
5536 pmap_store(l2, new_l2);
5537 dsb(ishst);
5538
5539 atomic_add_long(&pmap_l2_mappings, 1);
5540 CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
5541 va, pmap);
5542
5543 return (KERN_SUCCESS);
5544 }
5545
5546 /*
5547 * Tries to create a read- and/or execute-only L3C page mapping. Returns
5548 * KERN_SUCCESS if the mapping was created. Otherwise, returns an error
5549 * value.
5550 */
5551 static int
5552 pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
5553 vm_prot_t prot, struct rwlock **lockp)
5554 {
5555 pt_entry_t l3e;
5556
5557 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5558 PMAP_ASSERT_STAGE1(pmap);
5559 KASSERT(ADDR_IS_CANONICAL(va),
5560 ("%s: Address not in canonical form: %lx", __func__, va));
5561
5562 l3e = VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
5563 ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
5564 ATTR_CONTIGUOUS | L3_PAGE;
5565 l3e |= pmap_pte_bti(pmap, va);
5566 if ((m->oflags & VPO_UNMANAGED) == 0) {
5567 l3e |= ATTR_SW_MANAGED;
5568 l3e &= ~ATTR_AF;
5569 }
5570 if ((prot & VM_PROT_EXECUTE) == 0 ||
5571 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5572 l3e |= ATTR_S1_XN;
5573 if (!ADDR_IS_KERNEL(va))
5574 l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5575 else
5576 l3e |= ATTR_S1_UXN;
5577 if (pmap != kernel_pmap)
5578 l3e |= ATTR_S1_nG;
5579 return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
5580 PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
5581 }
5582
5583 static int
5584 pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
5585 vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
5586 {
5587 pd_entry_t *l2p, *pde;
5588 pt_entry_t *l3p, *tl3p;
5589 vm_page_t mt;
5590 vm_paddr_t pa;
5591 vm_pindex_t l2pindex;
5592 int lvl;
5593
5594 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5595 KASSERT((va & L3C_OFFSET) == 0,
5596 ("pmap_enter_l3c: va is not aligned"));
5597 KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
5598 ("pmap_enter_l3c: managed mapping within the clean submap"));
5599
5600 /*
5601 * If the L3 PTP is not resident, we attempt to create it here.
5602 */
5603 if (!ADDR_IS_KERNEL(va)) {
5604 /*
5605 * Were we given the correct L3 PTP? If so, we can simply
5606 * increment its ref count.
5607 */
5608 l2pindex = pmap_l2_pindex(va);
5609 if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
5610 (*ml3p)->ref_count += L3C_ENTRIES;
5611 } else {
5612 retry:
5613 /*
5614 * Get the L2 entry.
5615 */
5616 pde = pmap_pde(pmap, va, &lvl);
5617
5618 /*
5619 * If the L2 entry is a superpage, we either abort or
5620 * demote depending on the given flags.
5621 */
5622 if (lvl == 1) {
5623 l2p = pmap_l1_to_l2(pde, va);
5624 if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
5625 L2_BLOCK) {
5626 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5627 return (KERN_FAILURE);
5628 l3p = pmap_demote_l2_locked(pmap, l2p,
5629 va, lockp);
5630 if (l3p != NULL) {
5631 *ml3p = PTE_TO_VM_PAGE(
5632 pmap_load(l2p));
5633 (*ml3p)->ref_count +=
5634 L3C_ENTRIES;
5635 goto have_l3p;
5636 }
5637 }
5638 /* We need to allocate an L3 PTP. */
5639 }
5640
5641 /*
5642 * If the L3 PTP is mapped, we just increment its ref
5643 * count. Otherwise, we attempt to allocate it.
5644 */
5645 if (lvl == 2 && pmap_load(pde) != 0) {
5646 *ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
5647 (*ml3p)->ref_count += L3C_ENTRIES;
5648 } else {
5649 *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
5650 PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
5651 if (*ml3p == NULL) {
5652 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5653 return (KERN_FAILURE);
5654
5655 /*
5656 * The page table may have changed
5657 * while we slept.
5658 */
5659 goto retry;
5660 }
5661 (*ml3p)->ref_count += L3C_ENTRIES - 1;
5662 }
5663 }
5664 l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
5665
5666 have_l3p:
5667 /*
5668 * If bti is not the same for the whole L3C range, return
5669 * failure and let vm_fault() cope. Check after L3 allocation,
5670 * since it could sleep.
5671 */
5672 if (!pmap_bti_same(pmap, va, va + L3C_SIZE)) {
5673 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
5674 pmap_abort_ptp(pmap, va, *ml3p);
5675 *ml3p = NULL;
5676 return (KERN_PROTECTION_FAILURE);
5677 }
5678 } else {
5679 *ml3p = NULL;
5680
5681 /*
5682 * If the L2 entry is a superpage, we either abort or demote
5683 * depending on the given flags.
5684 */
5685 pde = pmap_pde(kernel_pmap, va, &lvl);
5686 if (lvl == 1) {
5687 l2p = pmap_l1_to_l2(pde, va);
5688 KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
5689 ("pmap_enter_l3c: missing L2 block"));
5690 if ((flags & PMAP_ENTER_NOREPLACE) != 0)
5691 return (KERN_FAILURE);
5692 l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
5693 } else {
5694 KASSERT(lvl == 2,
5695 ("pmap_enter_l3c: Invalid level %d", lvl));
5696 l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
5697 pmap_load(pde)));
5698 }
5699 }
5700 l3p = &l3p[pmap_l3_index(va)];
5701
5702 /*
5703 * If there are existing mappings, either abort or remove them.
5704 */
5705 if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
5706 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5707 if (pmap_load(tl3p) != 0) {
5708 if (*ml3p != NULL)
5709 (*ml3p)->ref_count -= L3C_ENTRIES;
5710 return (KERN_FAILURE);
5711 }
5712 }
5713 } else {
5714 /*
5715 * Because we increment the L3 page's reference count above,
5716 * it is guaranteed not to be freed here and we can pass NULL
5717 * instead of a valid free list.
5718 */
5719 pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
5720 va + L3C_SIZE, NULL, lockp);
5721 }
5722
5723 /*
5724 * Enter on the PV list if part of our managed memory.
5725 */
5726 if ((l3e & ATTR_SW_MANAGED) != 0) {
5727 if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
5728 if (*ml3p != NULL) {
5729 (*ml3p)->ref_count -= L3C_ENTRIES - 1;
5730 pmap_abort_ptp(pmap, va, *ml3p);
5731 *ml3p = NULL;
5732 }
5733 return (KERN_RESOURCE_SHORTAGE);
5734 }
5735 if ((l3e & ATTR_SW_DBM) != 0)
5736 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
5737 vm_page_aflag_set(mt, PGA_WRITEABLE);
5738 }
5739
5740 /*
5741 * Increment counters.
5742 */
5743 if ((l3e & ATTR_SW_WIRED) != 0)
5744 pmap->pm_stats.wired_count += L3C_ENTRIES;
5745 pmap_resident_count_inc(pmap, L3C_ENTRIES);
5746
5747 pa = VM_PAGE_TO_PHYS(m);
5748 KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));
5749
5750 /*
5751 * Sync the icache before the mapping is stored.
5752 */
5753 if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
5754 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5755 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);
5756
5757 /*
5758 * Map the superpage.
5759 */
5760 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
5761 pmap_store(tl3p, l3e);
5762 l3e += L3_SIZE;
5763 }
5764 dsb(ishst);
5765
5766 counter_u64_add(pmap_l3c_mappings, 1);
5767 CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
5768 va, pmap);
5769 return (KERN_SUCCESS);
5770 }
5771
5772 /*
5773 * Maps a sequence of resident pages belonging to the same object.
5774 * The sequence begins with the given page m_start. This page is
5775 * mapped at the given virtual address start. Each subsequent page is
5776 * mapped at a virtual address that is offset from start by the same
5777 * amount as the page is offset from m_start within the object. The
5778 * last page in the sequence is the page with the largest offset from
5779 * m_start that can be mapped at a virtual address less than the given
5780 * virtual address end. Not every virtual page between start and end
5781 * is mapped; only those for which a resident page exists with the
5782 * corresponding offset from m_start are mapped.
5783 */
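/*
 * For example, assuming m_start has pindex 10 and is mapped at "start",
 * a resident page with pindex 13 in the same object is mapped at
 * start + 3 * PAGE_SIZE, i.e., va = start + ptoa(diff) with
 * diff = m->pindex - m_start->pindex.
 */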
5784 void
5785 pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
5786 vm_page_t m_start, vm_prot_t prot)
5787 {
5788 struct rwlock *lock;
5789 vm_offset_t va;
5790 vm_page_t m, mpte;
5791 vm_pindex_t diff, psize;
5792 int rv;
5793
5794 VM_OBJECT_ASSERT_LOCKED(m_start->object);
5795
5796 psize = atop(end - start);
5797 mpte = NULL;
5798 m = m_start;
5799 lock = NULL;
5800 PMAP_LOCK(pmap);
5801 while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
5802 va = start + ptoa(diff);
5803 if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
5804 m->psind == 1 && pmap_ps_enabled(pmap) &&
5805 ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
5806 KERN_SUCCESS || rv == KERN_NO_SPACE))
5807 m = &m[L2_SIZE / PAGE_SIZE - 1];
5808 else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
5809 (VM_PAGE_TO_PHYS(m) & L3C_OFFSET) == 0 &&
5810 vm_reserv_is_populated(m, L3C_ENTRIES) &&
5811 pmap_ps_enabled(pmap) &&
5812 ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
5813 &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE))
5814 m = &m[L3C_ENTRIES - 1];
5815 else
5816 mpte = pmap_enter_quick_locked(pmap, va, m, prot, mpte,
5817 &lock);
5818 m = TAILQ_NEXT(m, listq);
5819 }
5820 if (lock != NULL)
5821 rw_wunlock(lock);
5822 PMAP_UNLOCK(pmap);
5823 }
5824
5825 /*
5826 * This code makes some *MAJOR* assumptions:
5827 * 1. The current pmap and the given pmap exist.
5828 * 2. Not wired.
5829 * 3. Read access.
5830 * 4. No page table pages.
5831 * but is *MUCH* faster than pmap_enter...
5832 */
5833
5834 void
5835 pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
5836 {
5837 struct rwlock *lock;
5838
5839 lock = NULL;
5840 PMAP_LOCK(pmap);
5841 (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
5842 if (lock != NULL)
5843 rw_wunlock(lock);
5844 PMAP_UNLOCK(pmap);
5845 }
5846
5847 static vm_page_t
5848 pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
5849 vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
5850 {
5851 pd_entry_t *pde;
5852 pt_entry_t *l1, *l2, *l3, l3_val;
5853 vm_paddr_t pa;
5854 int lvl;
5855
5856 KASSERT(!VA_IS_CLEANMAP(va) ||
5857 (m->oflags & VPO_UNMANAGED) != 0,
5858 ("pmap_enter_quick_locked: managed mapping within the clean submap"));
5859 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
5860 PMAP_ASSERT_STAGE1(pmap);
5861 KASSERT(ADDR_IS_CANONICAL(va),
5862 ("%s: Address not in canonical form: %lx", __func__, va));
5863 l2 = NULL;
5864
5865 CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
5866 /*
5867 * In the case that a page table page is not
5868 * resident, we are creating it here.
5869 */
5870 if (!ADDR_IS_KERNEL(va)) {
5871 vm_pindex_t l2pindex;
5872
5873 /*
5874 * Calculate pagetable page index
5875 */
5876 l2pindex = pmap_l2_pindex(va);
5877 if (mpte && (mpte->pindex == l2pindex)) {
5878 mpte->ref_count++;
5879 } else {
5880 /*
5881 * If the page table page is mapped, we just increment
5882 * the hold count, and activate it. Otherwise, we
5883 * attempt to allocate a page table page, passing NULL
5884 * instead of the PV list lock pointer because we don't
5885 * intend to sleep. If this attempt fails, we don't
5886 * retry. Instead, we give up.
5887 */
5888 l1 = pmap_l1(pmap, va);
5889 if (l1 != NULL && pmap_load(l1) != 0) {
5890 if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
5891 L1_BLOCK)
5892 return (NULL);
5893 l2 = pmap_l1_to_l2(l1, va);
5894 if (pmap_load(l2) != 0) {
5895 if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
5896 L2_BLOCK)
5897 return (NULL);
5898 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
5899 mpte->ref_count++;
5900 } else {
5901 mpte = _pmap_alloc_l3(pmap, l2pindex,
5902 NULL);
5903 if (mpte == NULL)
5904 return (mpte);
5905 }
5906 } else {
5907 mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
5908 if (mpte == NULL)
5909 return (mpte);
5910 }
5911 }
5912 l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
5913 l3 = &l3[pmap_l3_index(va)];
5914 } else {
5915 mpte = NULL;
5916 pde = pmap_pde(kernel_pmap, va, &lvl);
5917 KASSERT(pde != NULL,
5918 ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
5919 va));
5920 KASSERT(lvl == 2,
5921 ("pmap_enter_quick_locked: Invalid level %d", lvl));
5922 l3 = pmap_l2_to_l3(pde, va);
5923 }
5924
5925 /*
5926 * Abort if a mapping already exists.
5927 */
5928 if (pmap_load(l3) != 0) {
5929 if (mpte != NULL)
5930 mpte->ref_count--;
5931 return (NULL);
5932 }
5933
5934 /*
5935 * Enter on the PV list if part of our managed memory.
5936 */
5937 if ((m->oflags & VPO_UNMANAGED) == 0 &&
5938 !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
5939 if (mpte != NULL)
5940 pmap_abort_ptp(pmap, va, mpte);
5941 return (NULL);
5942 }
5943
5944 /*
5945 * Increment counters
5946 */
5947 pmap_resident_count_inc(pmap, 1);
5948
5949 pa = VM_PAGE_TO_PHYS(m);
5950 l3_val = PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_IDX(m->md.pv_memattr) |
5951 ATTR_S1_AP(ATTR_S1_AP_RO) | L3_PAGE;
5952 l3_val |= pmap_pte_bti(pmap, va);
5953 if ((prot & VM_PROT_EXECUTE) == 0 ||
5954 m->md.pv_memattr == VM_MEMATTR_DEVICE)
5955 l3_val |= ATTR_S1_XN;
5956 if (!ADDR_IS_KERNEL(va))
5957 l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5958 else
5959 l3_val |= ATTR_S1_UXN;
5960 if (pmap != kernel_pmap)
5961 l3_val |= ATTR_S1_nG;
5962
5963 /*
5964 * Now validate mapping with RO protection
5965 */
5966 if ((m->oflags & VPO_UNMANAGED) == 0) {
5967 l3_val |= ATTR_SW_MANAGED;
5968 l3_val &= ~ATTR_AF;
5969 }
5970
5971 /* Sync icache before the mapping is stored to PTE */
5972 if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
5973 m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
5974 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
5975
5976 pmap_store(l3, l3_val);
5977 dsb(ishst);
5978
5979 #if VM_NRESERVLEVEL > 0
5980 /*
5981 * If both the PTP and the reservation are fully populated, then
5982 * attempt promotion.
5983 */
5984 if ((mpte == NULL || mpte->ref_count == NL3PG) &&
5985 (m->flags & PG_FICTITIOUS) == 0 &&
5986 vm_reserv_level_iffullpop(m) == 0) {
5987 if (l2 == NULL)
5988 l2 = pmap_pde(pmap, va, &lvl);
5989
5990 /*
5991 * If promotion succeeds, then the next call to this function
5992 * should not be given the unmapped PTP as a hint.
5993 */
5994 if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
5995 mpte = NULL;
5996 }
5997 #endif
5998
5999 return (mpte);
6000 }
6001
6002 /*
6003 * This code would map large physical mmap regions into the
6004 * processor address space. On arm64 it currently does nothing
6005 * beyond asserting that the object is a device or SG object.
6006 */
6007 void
6008 pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
6009 vm_pindex_t pindex, vm_size_t size)
6010 {
6011
6012 VM_OBJECT_ASSERT_WLOCKED(object);
6013 KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
6014 ("pmap_object_init_pt: non-device object"));
6015 }
6016
6017 /*
6018 * Clear the wired attribute from the mappings for the specified range of
6019 * addresses in the given pmap. Every valid mapping within that range
6020 * must have the wired attribute set. In contrast, invalid mappings
6021 * cannot have the wired attribute set, so they are ignored.
6022 *
6023 * The wired attribute of the page table entry is not a hardware feature,
6024 * so there is no need to invalidate any TLB entries.
6025 */
6026 void
6027 pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
6028 {
6029 vm_offset_t va_next;
6030 pd_entry_t *l0, *l1, *l2;
6031 pt_entry_t *l3;
6032 bool partial_l3c;
6033
6034 PMAP_LOCK(pmap);
6035 for (; sva < eva; sva = va_next) {
6036 l0 = pmap_l0(pmap, sva);
6037 if (pmap_load(l0) == 0) {
6038 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
6039 if (va_next < sva)
6040 va_next = eva;
6041 continue;
6042 }
6043
6044 l1 = pmap_l0_to_l1(l0, sva);
6045 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
6046 if (va_next < sva)
6047 va_next = eva;
6048 if (pmap_load(l1) == 0)
6049 continue;
6050
6051 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6052 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6053 KASSERT(va_next <= eva,
6054 ("partial update of non-transparent 1G page "
6055 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
6056 pmap_load(l1), sva, eva, va_next));
6057 MPASS(pmap != kernel_pmap);
6058 MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
6059 ATTR_SW_WIRED)) == ATTR_SW_WIRED);
6060 pmap_clear_bits(l1, ATTR_SW_WIRED);
6061 pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
6062 continue;
6063 }
6064
6065 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
6066 if (va_next < sva)
6067 va_next = eva;
6068
6069 l2 = pmap_l1_to_l2(l1, sva);
6070 if (pmap_load(l2) == 0)
6071 continue;
6072
6073 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
6074 if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
6075 panic("pmap_unwire: l2 %#jx is missing "
6076 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));
6077
6078 /*
6079 * Are we unwiring the entire large page? If not,
6080 * demote the mapping and fall through.
6081 */
6082 if (sva + L2_SIZE == va_next && eva >= va_next) {
6083 pmap_clear_bits(l2, ATTR_SW_WIRED);
6084 pmap->pm_stats.wired_count -= L2_SIZE /
6085 PAGE_SIZE;
6086 continue;
6087 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
6088 panic("pmap_unwire: demotion failed");
6089 }
6090 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
6091 ("pmap_unwire: Invalid l2 entry after demotion"));
6092
6093 if (va_next > eva)
6094 va_next = eva;
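/*
 * "partial_l3c" tracks whether the current ATTR_CONTIGUOUS (L3C)
 * range is only partially covered by [sva, va_next). A fully covered
 * L3C superpage is unwired PTE by PTE without demotion; a partially
 * covered one must be demoted first.
 */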
6095 for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
6096 sva != va_next; l3++, sva += L3_SIZE) {
6097 if (pmap_load(l3) == 0)
6098 continue;
6099 if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
6100 /*
6101 * Avoid demotion for whole-page unwiring.
6102 */
6103 if ((sva & L3C_OFFSET) == 0) {
6104 /*
6105 * Handle the possibility that
6106 * "va_next" is zero because of
6107 * address wraparound.
6108 */
6109 partial_l3c = sva + L3C_OFFSET >
6110 va_next - 1;
6111 }
6112 if (partial_l3c)
6113 (void)pmap_demote_l3c(pmap, l3, sva);
6114 }
6115 if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
6116 panic("pmap_unwire: l3 %#jx is missing "
6117 "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));
6118
6119 /*
6120 * ATTR_SW_WIRED must be cleared atomically. Although
6121 * the pmap lock synchronizes access to ATTR_SW_WIRED,
6122 * the System MMU may write to the entry concurrently.
6123 */
6124 pmap_clear_bits(l3, ATTR_SW_WIRED);
6125 pmap->pm_stats.wired_count--;
6126 }
6127 }
6128 PMAP_UNLOCK(pmap);
6129 }
6130
6131 /*
6132 * This function requires that the caller has already added one to ml3's
6133 * ref_count in anticipation of creating a 4KB page mapping.
6134 */
6135 static bool
6136 pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
6137 vm_page_t ml3, struct rwlock **lockp)
6138 {
6139 pt_entry_t *tl3p;
6140
6141 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
6142 KASSERT((va & L3C_OFFSET) == 0,
6143 ("pmap_copy_l3c: va is not aligned"));
6144 KASSERT((l3e & ATTR_SW_MANAGED) != 0,
6145 ("pmap_copy_l3c: l3e is not managed"));
6146
6147 /*
6148 * Abort if a mapping already exists.
6149 */
6150 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
6151 if (pmap_load(tl3p) != 0) {
6152 if (ml3 != NULL)
6153 ml3->ref_count--;
6154 return (false);
6155 }
6156
6157 if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
6158 if (ml3 != NULL)
6159 pmap_abort_ptp(pmap, va, ml3);
6160 return (false);
6161 }
6162 ml3->ref_count += L3C_ENTRIES - 1;
6163
6164 /*
6165 * Clear the wired and accessed bits. However, leave the dirty bit
6166 * unchanged because read/write superpage mappings are required to be
6167 * dirty.
6168 */
6169 l3e &= ~(ATTR_SW_WIRED | ATTR_AF);
6170
6171 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
6172 pmap_store(tl3p, l3e);
6173 l3e += L3_SIZE;
6174 }
6175 pmap_resident_count_inc(pmap, L3C_ENTRIES);
6176 counter_u64_add(pmap_l3c_mappings, 1);
6177 CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
6178 va, pmap);
6179 return (true);
6180 }
6181
6182 /*
6183 * Copy the range specified by src_addr/len
6184 * from the source map to the range dst_addr/len
6185 * in the destination map.
6186 *
6187 * This routine is only advisory and need not do anything.
6188 *
6189 * Because the executable mappings created by this routine are copied,
6190 * it should not have to flush the instruction cache.
6191 */
6192 void
6193 pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
6194 vm_offset_t src_addr)
6195 {
6196 struct rwlock *lock;
6197 pd_entry_t *l0, *l1, *l2, srcptepaddr;
6198 pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
6199 vm_offset_t addr, end_addr, va_next;
6200 vm_page_t dst_m, dstmpte, srcmpte;
6201
6202 PMAP_ASSERT_STAGE1(dst_pmap);
6203 PMAP_ASSERT_STAGE1(src_pmap);
6204
6205 if (dst_addr != src_addr)
6206 return;
6207 end_addr = src_addr + len;
6208 lock = NULL;
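/*
 * Acquire both pmap locks in a consistent (address) order to avoid a
 * deadlock when two threads lock the same pair of pmaps.
 */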
6209 if (dst_pmap < src_pmap) {
6210 PMAP_LOCK(dst_pmap);
6211 PMAP_LOCK(src_pmap);
6212 } else {
6213 PMAP_LOCK(src_pmap);
6214 PMAP_LOCK(dst_pmap);
6215 }
6216 for (addr = src_addr; addr < end_addr; addr = va_next) {
6217 l0 = pmap_l0(src_pmap, addr);
6218 if (pmap_load(l0) == 0) {
6219 va_next = (addr + L0_SIZE) & ~L0_OFFSET;
6220 if (va_next < addr)
6221 va_next = end_addr;
6222 continue;
6223 }
6224
6225 va_next = (addr + L1_SIZE) & ~L1_OFFSET;
6226 if (va_next < addr)
6227 va_next = end_addr;
6228 l1 = pmap_l0_to_l1(l0, addr);
6229 if (pmap_load(l1) == 0)
6230 continue;
6231 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
6232 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
6233 KASSERT(va_next <= end_addr,
6234 ("partial update of non-transparent 1G page "
6235 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6236 pmap_load(l1), addr, end_addr, va_next));
6237 srcptepaddr = pmap_load(l1);
6238 l1 = pmap_l1(dst_pmap, addr);
6239 if (l1 == NULL) {
6240 if (_pmap_alloc_l3(dst_pmap,
6241 pmap_l0_pindex(addr), NULL) == NULL)
6242 break;
6243 l1 = pmap_l1(dst_pmap, addr);
6244 } else {
6245 l0 = pmap_l0(dst_pmap, addr);
6246 dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
6247 dst_m->ref_count++;
6248 }
6249 KASSERT(pmap_load(l1) == 0,
6250 ("1G mapping present in dst pmap "
6251 "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
6252 pmap_load(l1), addr, end_addr, va_next));
6253 pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
6254 pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
6255 continue;
6256 }
6257
6258 va_next = (addr + L2_SIZE) & ~L2_OFFSET;
6259 if (va_next < addr)
6260 va_next = end_addr;
6261 l2 = pmap_l1_to_l2(l1, addr);
6262 srcptepaddr = pmap_load(l2);
6263 if (srcptepaddr == 0)
6264 continue;
6265 if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
6266 /*
6267 * We can only virtual copy whole superpages.
6268 */
6269 if ((addr & L2_OFFSET) != 0 ||
6270 addr + L2_SIZE > end_addr)
6271 continue;
6272 l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
6273 if (l2 == NULL)
6274 break;
6275 if (pmap_load(l2) == 0 &&
6276 ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
6277 pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
6278 PMAP_ENTER_NORECLAIM, &lock))) {
6279 /*
6280 * We leave the dirty bit unchanged because
6281 * managed read/write superpage mappings are
6282 * required to be dirty. However, managed
6283 * superpage mappings are not required to
6284 * have their accessed bit set, so we clear
6285 * it because we don't know if this mapping
6286 * will be used.
6287 */
6288 srcptepaddr &= ~ATTR_SW_WIRED;
6289 if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
6290 srcptepaddr &= ~ATTR_AF;
6291 pmap_store(l2, srcptepaddr);
6292 pmap_resident_count_inc(dst_pmap, L2_SIZE /
6293 PAGE_SIZE);
6294 atomic_add_long(&pmap_l2_mappings, 1);
6295 } else
6296 pmap_abort_ptp(dst_pmap, addr, dst_m);
6297 continue;
6298 }
6299 KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
6300 ("pmap_copy: invalid L2 entry"));
6301 srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
6302 KASSERT(srcmpte->ref_count > 0,
6303 ("pmap_copy: source page table page is unused"));
6304 if (va_next > end_addr)
6305 va_next = end_addr;
6306 src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
6307 src_pte = &src_pte[pmap_l3_index(addr)];
6308 dstmpte = NULL;
6309 for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
6310 ptetemp = pmap_load(src_pte);
6311
6312 /*
6313 * We only virtual copy managed pages.
6314 */
6315 if ((ptetemp & ATTR_SW_MANAGED) == 0)
6316 continue;
6317
6318 if (dstmpte != NULL) {
6319 KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
6320 ("dstmpte pindex/addr mismatch"));
6321 dstmpte->ref_count++;
6322 } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
6323 NULL)) == NULL)
6324 goto out;
6325 dst_pte = (pt_entry_t *)
6326 PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
6327 dst_pte = &dst_pte[pmap_l3_index(addr)];
6328 if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
6329 L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
6330 va_next - 1) {
6331 if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
6332 ptetemp, dstmpte, &lock))
6333 goto out;
6334 addr += L3C_SIZE - PAGE_SIZE;
6335 src_pte += L3C_ENTRIES - 1;
6336 } else if (pmap_load(dst_pte) == 0 &&
6337 pmap_try_insert_pv_entry(dst_pmap, addr,
6338 PTE_TO_VM_PAGE(ptetemp), &lock)) {
6339 /*
6340 * Clear the wired, contiguous, modified, and
6341 * accessed bits from the destination PTE.
6342 * The contiguous bit is cleared because we
6343 * are not copying the entire L3C superpage.
6344 */
6345 mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
6346 ATTR_AF;
6347 nbits = 0;
6348 if ((ptetemp & ATTR_SW_DBM) != 0)
6349 nbits |= ATTR_S1_AP_RW_BIT;
6350 pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
6351 pmap_resident_count_inc(dst_pmap, 1);
6352 } else {
6353 pmap_abort_ptp(dst_pmap, addr, dstmpte);
6354 goto out;
6355 }
6356 /* Have we copied all of the valid mappings? */
6357 if (dstmpte->ref_count >= srcmpte->ref_count)
6358 break;
6359 }
6360 }
6361 out:
6362 /*
6363 * XXX This barrier may not be needed because the destination pmap is
6364 * not active.
6365 */
6366 dsb(ishst);
6367
6368 if (lock != NULL)
6369 rw_wunlock(lock);
6370 PMAP_UNLOCK(src_pmap);
6371 PMAP_UNLOCK(dst_pmap);
6372 }
6373
6374 int
6375 pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
6376 {
6377 int error;
6378
6379 if (dst_pmap->pm_stage != src_pmap->pm_stage)
6380 return (EINVAL);
6381
6382 if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
6383 return (0);
6384
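/*
 * Copy the BTI settings, retrying if the copy fails for lack of
 * memory: undo the partial copy, wait for free pages, and try again.
 */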
6385 for (;;) {
6386 if (dst_pmap < src_pmap) {
6387 PMAP_LOCK(dst_pmap);
6388 PMAP_LOCK(src_pmap);
6389 } else {
6390 PMAP_LOCK(src_pmap);
6391 PMAP_LOCK(dst_pmap);
6392 }
6393 error = pmap_bti_copy(dst_pmap, src_pmap);
6394 /* Clean up partial copy on failure due to no memory. */
6395 if (error == ENOMEM)
6396 pmap_bti_deassign_all(dst_pmap);
6397 PMAP_UNLOCK(src_pmap);
6398 PMAP_UNLOCK(dst_pmap);
6399 if (error != ENOMEM)
6400 break;
6401 vm_wait(NULL);
6402 }
6403 return (error);
6404 }
6405
6406 /*
6407 * pmap_zero_page zeros the specified hardware page by mapping
6408 * the page into KVM and using bzero to clear its contents.
6409 */
6410 void
6411 pmap_zero_page(vm_page_t m)
6412 {
6413 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6414
6415 pagezero((void *)va);
6416 }
6417
6418 /*
6419 * pmap_zero_page_area zeros the specified hardware page by mapping
6420 * the page into KVM and using bzero to clear its contents.
6421 *
6422 * off and size may not cover an area beyond a single hardware page.
6423 */
6424 void
6425 pmap_zero_page_area(vm_page_t m, int off, int size)
6426 {
6427 vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));
6428
6429 if (off == 0 && size == PAGE_SIZE)
6430 pagezero((void *)va);
6431 else
6432 bzero((char *)va + off, size);
6433 }
6434
6435 /*
6436 * pmap_copy_page copies the specified (machine independent)
6437 * page by mapping the page into virtual memory and using
6438 * bcopy to copy the page, one machine dependent page at a
6439 * time.
6440 */
6441 void
6442 pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
6443 {
6444 vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
6445 vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));
6446
6447 pagecopy((void *)src, (void *)dst);
6448 }
6449
6450 int unmapped_buf_allowed = 1;
6451
6452 void
6453 pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
6454 vm_offset_t b_offset, int xfersize)
6455 {
6456 void *a_cp, *b_cp;
6457 vm_page_t m_a, m_b;
6458 vm_paddr_t p_a, p_b;
6459 vm_offset_t a_pg_offset, b_pg_offset;
6460 int cnt;
6461
6462 while (xfersize > 0) {
6463 a_pg_offset = a_offset & PAGE_MASK;
6464 m_a = ma[a_offset >> PAGE_SHIFT];
6465 p_a = m_a->phys_addr;
6466 b_pg_offset = b_offset & PAGE_MASK;
6467 m_b = mb[b_offset >> PAGE_SHIFT];
6468 p_b = m_b->phys_addr;
6469 cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
6470 cnt = min(cnt, PAGE_SIZE - b_pg_offset);
6471 if (__predict_false(!PHYS_IN_DMAP(p_a))) {
6472 panic("!DMAP a %lx", p_a);
6473 } else {
6474 a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
6475 }
6476 if (__predict_false(!PHYS_IN_DMAP(p_b))) {
6477 panic("!DMAP b %lx", p_b);
6478 } else {
6479 b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
6480 }
6481 bcopy(a_cp, b_cp, cnt);
6482 a_offset += cnt;
6483 b_offset += cnt;
6484 xfersize -= cnt;
6485 }
6486 }
6487
6488 vm_offset_t
6489 pmap_quick_enter_page(vm_page_t m)
6490 {
6491
6492 return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
6493 }
6494
6495 void
6496 pmap_quick_remove_page(vm_offset_t addr)
6497 {
6498 }
6499
6500 /*
6501 * Returns true if the pmap's pv is one of the first
6502 * 16 pvs linked to from this page. This count may
6503 * be changed upwards or downwards in the future; it
6504 * is only necessary that true be returned for a small
6505 * subset of pmaps for proper page aging.
6506 */
6507 bool
6508 pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
6509 {
6510 struct md_page *pvh;
6511 struct rwlock *lock;
6512 pv_entry_t pv;
6513 int loops = 0;
6514 bool rv;
6515
6516 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6517 ("pmap_page_exists_quick: page %p is not managed", m));
6518 rv = false;
6519 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6520 rw_rlock(lock);
6521 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6522 if (PV_PMAP(pv) == pmap) {
6523 rv = true;
6524 break;
6525 }
6526 loops++;
6527 if (loops >= 16)
6528 break;
6529 }
6530 if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
6531 pvh = page_to_pvh(m);
6532 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6533 if (PV_PMAP(pv) == pmap) {
6534 rv = true;
6535 break;
6536 }
6537 loops++;
6538 if (loops >= 16)
6539 break;
6540 }
6541 }
6542 rw_runlock(lock);
6543 return (rv);
6544 }
6545
6546 /*
6547 * pmap_page_wired_mappings:
6548 *
6549 * Return the number of managed mappings to the given physical page
6550 * that are wired.
6551 */
6552 int
6553 pmap_page_wired_mappings(vm_page_t m)
6554 {
6555 struct rwlock *lock;
6556 struct md_page *pvh;
6557 pmap_t pmap;
6558 pt_entry_t *pte;
6559 pv_entry_t pv;
6560 int count, md_gen, pvh_gen;
6561
6562 if ((m->oflags & VPO_UNMANAGED) != 0)
6563 return (0);
6564 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6565 rw_rlock(lock);
6566 restart:
6567 count = 0;
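/*
 * If a pmap lock cannot be acquired without blocking, drop the pv
 * list lock, block on the pmap lock, and recheck the pv list
 * generation count; if the list changed while unlocked, start over.
 */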
6568 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6569 pmap = PV_PMAP(pv);
6570 if (!PMAP_TRYLOCK(pmap)) {
6571 md_gen = m->md.pv_gen;
6572 rw_runlock(lock);
6573 PMAP_LOCK(pmap);
6574 rw_rlock(lock);
6575 if (md_gen != m->md.pv_gen) {
6576 PMAP_UNLOCK(pmap);
6577 goto restart;
6578 }
6579 }
6580 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
6581 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6582 count++;
6583 PMAP_UNLOCK(pmap);
6584 }
6585 if ((m->flags & PG_FICTITIOUS) == 0) {
6586 pvh = page_to_pvh(m);
6587 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6588 pmap = PV_PMAP(pv);
6589 if (!PMAP_TRYLOCK(pmap)) {
6590 md_gen = m->md.pv_gen;
6591 pvh_gen = pvh->pv_gen;
6592 rw_runlock(lock);
6593 PMAP_LOCK(pmap);
6594 rw_rlock(lock);
6595 if (md_gen != m->md.pv_gen ||
6596 pvh_gen != pvh->pv_gen) {
6597 PMAP_UNLOCK(pmap);
6598 goto restart;
6599 }
6600 }
6601 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6602 if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
6603 count++;
6604 PMAP_UNLOCK(pmap);
6605 }
6606 }
6607 rw_runlock(lock);
6608 return (count);
6609 }
6610
6611 /*
6612 * Returns true if the given page is mapped individually or as part of
6613 * a 2mpage. Otherwise, returns false.
6614 */
6615 bool
6616 pmap_page_is_mapped(vm_page_t m)
6617 {
6618 struct rwlock *lock;
6619 bool rv;
6620
6621 if ((m->oflags & VPO_UNMANAGED) != 0)
6622 return (false);
6623 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6624 rw_rlock(lock);
6625 rv = !TAILQ_EMPTY(&m->md.pv_list) ||
6626 ((m->flags & PG_FICTITIOUS) == 0 &&
6627 !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
6628 rw_runlock(lock);
6629 return (rv);
6630 }
6631
6632 /*
6633 * Destroy all managed, non-wired mappings in the given user-space
6634 * pmap. This pmap cannot be active on any processor besides the
6635 * caller.
6636 *
6637 * This function cannot be applied to the kernel pmap. Moreover, it
6638 * is not intended for general use. It is only to be used during
6639 * process termination. Consequently, it can be implemented in ways
6640 * that make it faster than pmap_remove(). First, it can more quickly
6641 * destroy mappings by iterating over the pmap's collection of PV
6642 * entries, rather than searching the page table. Second, it doesn't
6643 * have to test and clear the page table entries atomically, because
6644 * no processor is currently accessing the user address space. In
6645 * particular, a page table entry's dirty bit won't change state once
6646 * this function starts.
6647 */
6648 void
6649 pmap_remove_pages(pmap_t pmap)
6650 {
6651 pd_entry_t *pde;
6652 pt_entry_t *pte, tpte;
6653 struct spglist free;
6654 struct pv_chunklist free_chunks[PMAP_MEMDOM];
6655 vm_page_t m, ml3, mt;
6656 pv_entry_t pv;
6657 struct md_page *pvh;
6658 struct pv_chunk *pc, *npc;
6659 struct rwlock *lock;
6660 int64_t bit;
6661 uint64_t inuse, bitmask;
6662 int allfree, field, i, idx, lvl;
6663 int freed __pvused;
6664 vm_paddr_t pa;
6665
6666 lock = NULL;
6667
6668 for (i = 0; i < PMAP_MEMDOM; i++)
6669 TAILQ_INIT(&free_chunks[i]);
6670 SLIST_INIT(&free);
6671 PMAP_LOCK(pmap);
6672 TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
6673 allfree = 1;
6674 freed = 0;
6675 for (field = 0; field < _NPCM; field++) {
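/*
 * A set bit in pc_map[] marks a free PV entry slot, so the
 * complement masked with pc_freemask[] enumerates the in-use
 * entries of this chunk.
 */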
6676 inuse = ~pc->pc_map[field] & pc_freemask[field];
6677 while (inuse != 0) {
6678 bit = ffsl(inuse) - 1;
6679 bitmask = 1UL << bit;
6680 idx = field * 64 + bit;
6681 pv = &pc->pc_pventry[idx];
6682 inuse &= ~bitmask;
6683
6684 pde = pmap_pde(pmap, pv->pv_va, &lvl);
6685 KASSERT(pde != NULL,
6686 ("Attempting to remove an unmapped page"));
6687
6688 switch(lvl) {
6689 case 1:
6690 pte = pmap_l1_to_l2(pde, pv->pv_va);
6691 tpte = pmap_load(pte);
6692 KASSERT((tpte & ATTR_DESCR_MASK) ==
6693 L2_BLOCK,
6694 ("Attempting to remove an invalid "
6695 "block: %lx", tpte));
6696 break;
6697 case 2:
6698 pte = pmap_l2_to_l3(pde, pv->pv_va);
6699 tpte = pmap_load(pte);
6700 KASSERT((tpte & ATTR_DESCR_MASK) ==
6701 L3_PAGE,
6702 ("Attempting to remove an invalid "
6703 "page: %lx", tpte));
6704 break;
6705 default:
6706 panic(
6707 "Invalid page directory level: %d",
6708 lvl);
6709 }
6710
6711 /*
6712 * We cannot remove wired mappings at this time.
6713 *
6714 * For L3C superpages, all of the constituent PTEs
6715 * should have the wired bit set, so we don't
6716 * check for ATTR_CONTIGUOUS here.
6717 */
6718 if (tpte & ATTR_SW_WIRED) {
6719 allfree = 0;
6720 continue;
6721 }
6722
6723 /* Mark free */
6724 pc->pc_map[field] |= bitmask;
6725
6726 /*
6727 * Because this pmap is not active on other
6728 * processors, the dirty bit cannot have
6729 * changed state since we last loaded pte.
6730 */
6731 pmap_clear(pte);
6732
6733 pa = PTE_TO_PHYS(tpte);
6734
6735 m = PHYS_TO_VM_PAGE(pa);
6736 KASSERT(m->phys_addr == pa,
6737 ("vm_page_t %p phys_addr mismatch %016jx %016jx",
6738 m, (uintmax_t)m->phys_addr,
6739 (uintmax_t)tpte));
6740
6741 KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
6742 m < &vm_page_array[vm_page_array_size],
6743 ("pmap_remove_pages: bad pte %#jx",
6744 (uintmax_t)tpte));
6745
6746 /*
6747 * Update the vm_page_t clean/reference bits.
6748 *
6749 * We don't check for ATTR_CONTIGUOUS here
6750 * because writeable L3C superpages are expected
6751 * to be dirty, i.e., every constituent PTE
6752 * should be dirty.
6753 */
6754 if (pmap_pte_dirty(pmap, tpte)) {
6755 switch (lvl) {
6756 case 1:
6757 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6758 vm_page_dirty(mt);
6759 break;
6760 case 2:
6761 vm_page_dirty(m);
6762 break;
6763 }
6764 }
6765
6766 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
6767
6768 switch (lvl) {
6769 case 1:
6770 pmap_resident_count_dec(pmap,
6771 L2_SIZE / PAGE_SIZE);
6772 pvh = page_to_pvh(m);
6773 TAILQ_REMOVE(&pvh->pv_list, pv,pv_next);
6774 pvh->pv_gen++;
6775 if (TAILQ_EMPTY(&pvh->pv_list)) {
6776 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
6777 if ((mt->a.flags & PGA_WRITEABLE) != 0 &&
6778 TAILQ_EMPTY(&mt->md.pv_list))
6779 vm_page_aflag_clear(mt, PGA_WRITEABLE);
6780 }
6781 ml3 = pmap_remove_pt_page(pmap,
6782 pv->pv_va);
6783 if (ml3 != NULL) {
6784 KASSERT(vm_page_any_valid(ml3),
6785 ("pmap_remove_pages: l3 page not promoted"));
6786 pmap_resident_count_dec(pmap,1);
6787 KASSERT(ml3->ref_count == NL3PG,
6788 ("pmap_remove_pages: l3 page ref count error"));
6789 ml3->ref_count = 0;
6790 pmap_add_delayed_free_list(ml3,
6791 &free, false);
6792 }
6793 break;
6794 case 2:
6795 pmap_resident_count_dec(pmap, 1);
6796 TAILQ_REMOVE(&m->md.pv_list, pv,
6797 pv_next);
6798 m->md.pv_gen++;
6799 if ((m->a.flags & PGA_WRITEABLE) != 0 &&
6800 TAILQ_EMPTY(&m->md.pv_list) &&
6801 (m->flags & PG_FICTITIOUS) == 0) {
6802 pvh = page_to_pvh(m);
6803 if (TAILQ_EMPTY(&pvh->pv_list))
6804 vm_page_aflag_clear(m,
6805 PGA_WRITEABLE);
6806 }
6807 break;
6808 }
6809 pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
6810 &free);
6811 freed++;
6812 }
6813 }
6814 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
6815 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
6816 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
6817 if (allfree) {
6818 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
6819 TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
6820 pc_list);
6821 }
6822 }
6823 if (lock != NULL)
6824 rw_wunlock(lock);
6825 pmap_invalidate_all(pmap);
6826 pmap_bti_deassign_all(pmap);
6827 free_pv_chunk_batch(free_chunks);
6828 PMAP_UNLOCK(pmap);
6829 vm_page_free_pages_toq(&free, true);
6830 }
6831
6832 /*
6833 * This is used to check if a page has been accessed or modified.
6834 */
6835 static bool
6836 pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
6837 {
6838 struct rwlock *lock;
6839 pv_entry_t pv;
6840 struct md_page *pvh;
6841 pt_entry_t l3e, mask, *pte, value;
6842 pmap_t pmap;
6843 int md_gen, pvh_gen;
6844 bool rv;
6845
6846 rv = false;
6847 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
6848 rw_rlock(lock);
6849 restart:
6850 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
6851 pmap = PV_PMAP(pv);
6852 PMAP_ASSERT_STAGE1(pmap);
6853 if (!PMAP_TRYLOCK(pmap)) {
6854 md_gen = m->md.pv_gen;
6855 rw_runlock(lock);
6856 PMAP_LOCK(pmap);
6857 rw_rlock(lock);
6858 if (md_gen != m->md.pv_gen) {
6859 PMAP_UNLOCK(pmap);
6860 goto restart;
6861 }
6862 }
6863 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
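/*
 * Build a mask/value pair so that one comparison checks the requested
 * bits: write permission in the stage 1 AP field for "modified" and
 * the access flag on a valid L3 page descriptor for "accessed".
 */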
6864 mask = 0;
6865 value = 0;
6866 if (modified) {
6867 mask |= ATTR_S1_AP_RW_BIT;
6868 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
6869 }
6870 if (accessed) {
6871 mask |= ATTR_AF | ATTR_DESCR_MASK;
6872 value |= ATTR_AF | L3_PAGE;
6873 }
6874 l3e = pmap_load(pte);
6875 if ((l3e & ATTR_CONTIGUOUS) != 0)
6876 l3e = pmap_load_l3c(pte);
6877 PMAP_UNLOCK(pmap);
6878 rv = (l3e & mask) == value;
6879 if (rv)
6880 goto out;
6881 }
6882 if ((m->flags & PG_FICTITIOUS) == 0) {
6883 pvh = page_to_pvh(m);
6884 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6885 pmap = PV_PMAP(pv);
6886 PMAP_ASSERT_STAGE1(pmap);
6887 if (!PMAP_TRYLOCK(pmap)) {
6888 md_gen = m->md.pv_gen;
6889 pvh_gen = pvh->pv_gen;
6890 rw_runlock(lock);
6891 PMAP_LOCK(pmap);
6892 rw_rlock(lock);
6893 if (md_gen != m->md.pv_gen ||
6894 pvh_gen != pvh->pv_gen) {
6895 PMAP_UNLOCK(pmap);
6896 goto restart;
6897 }
6898 }
6899 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6900 mask = 0;
6901 value = 0;
6902 if (modified) {
6903 mask |= ATTR_S1_AP_RW_BIT;
6904 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
6905 }
6906 if (accessed) {
6907 mask |= ATTR_AF | ATTR_DESCR_MASK;
6908 value |= ATTR_AF | L2_BLOCK;
6909 }
6910 rv = (pmap_load(pte) & mask) == value;
6911 PMAP_UNLOCK(pmap);
6912 if (rv)
6913 goto out;
6914 }
6915 }
6916 out:
6917 rw_runlock(lock);
6918 return (rv);
6919 }
6920
6921 /*
6922 * pmap_is_modified:
6923 *
6924 * Return whether or not the specified physical page was modified
6925 * in any physical maps.
6926 */
6927 bool
6928 pmap_is_modified(vm_page_t m)
6929 {
6930
6931 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6932 ("pmap_is_modified: page %p is not managed", m));
6933
6934 /*
6935 * If the page is not busied then this check is racy.
6936 */
6937 if (!pmap_page_is_write_mapped(m))
6938 return (false);
6939 return (pmap_page_test_mappings(m, false, true));
6940 }
6941
6942 /*
6943 * pmap_is_prefaultable:
6944 *
6945 * Return whether or not the specified virtual address is eligible
6946 * for prefault.
6947 */
6948 bool
6949 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
6950 {
6951 pd_entry_t *pde;
6952 pt_entry_t *pte;
6953 bool rv;
6954 int lvl;
6955
6956 /*
6957 * Return true if and only if the L3 entry for the specified virtual
6958 * address is allocated but invalid.
6959 */
6960 rv = false;
6961 PMAP_LOCK(pmap);
6962 pde = pmap_pde(pmap, addr, &lvl);
6963 if (pde != NULL && lvl == 2) {
6964 pte = pmap_l2_to_l3(pde, addr);
6965 rv = pmap_load(pte) == 0;
6966 }
6967 PMAP_UNLOCK(pmap);
6968 return (rv);
6969 }
6970
6971 /*
6972 * pmap_is_referenced:
6973 *
6974 * Return whether or not the specified physical page was referenced
6975 * in any physical maps.
6976 */
6977 bool
6978 pmap_is_referenced(vm_page_t m)
6979 {
6980
6981 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
6982 ("pmap_is_referenced: page %p is not managed", m));
6983 return (pmap_page_test_mappings(m, true, false));
6984 }
6985
6986 /*
6987 * Clear the write and modified bits in each of the given page's mappings.
6988 */
6989 void
6990 pmap_remove_write(vm_page_t m)
6991 {
6992 struct md_page *pvh;
6993 pmap_t pmap;
6994 struct rwlock *lock;
6995 pv_entry_t next_pv, pv;
6996 pt_entry_t oldpte, *pte, set, clear, mask, val;
6997 vm_offset_t va;
6998 int md_gen, pvh_gen;
6999
7000 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7001 ("pmap_remove_write: page %p is not managed", m));
7002 vm_page_assert_busied(m);
7003
7004 if (!pmap_page_is_write_mapped(m))
7005 return;
7006 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7007 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7008 rw_wlock(lock);
7009 retry:
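/*
 * First demote every write-capable 2MB mapping of the page so that
 * the second loop below only has to clear write access at the 4KB
 * (L3) granularity.
 */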
7010 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7011 pmap = PV_PMAP(pv);
7012 PMAP_ASSERT_STAGE1(pmap);
7013 if (!PMAP_TRYLOCK(pmap)) {
7014 pvh_gen = pvh->pv_gen;
7015 rw_wunlock(lock);
7016 PMAP_LOCK(pmap);
7017 rw_wlock(lock);
7018 if (pvh_gen != pvh->pv_gen) {
7019 PMAP_UNLOCK(pmap);
7020 goto retry;
7021 }
7022 }
7023 va = pv->pv_va;
7024 pte = pmap_pte_exists(pmap, va, 2, __func__);
7025 if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
7026 (void)pmap_demote_l2_locked(pmap, pte, va, &lock);
7027 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7028 ("inconsistent pv lock %p %p for page %p",
7029 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7030 PMAP_UNLOCK(pmap);
7031 }
7032 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7033 pmap = PV_PMAP(pv);
7034 if (!PMAP_TRYLOCK(pmap)) {
7035 pvh_gen = pvh->pv_gen;
7036 md_gen = m->md.pv_gen;
7037 rw_wunlock(lock);
7038 PMAP_LOCK(pmap);
7039 rw_wlock(lock);
7040 if (pvh_gen != pvh->pv_gen ||
7041 md_gen != m->md.pv_gen) {
7042 PMAP_UNLOCK(pmap);
7043 goto retry;
7044 }
7045 }
7046 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7047 oldpte = pmap_load(pte);
7048 if ((oldpte & ATTR_SW_DBM) != 0) {
7049 if ((oldpte & ATTR_CONTIGUOUS) != 0) {
7050 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
7051
7052 /*
7053 * The L3 entry's accessed bit may have
7054 * changed.
7055 */
7056 oldpte = pmap_load(pte);
7057 }
7058 if (pmap->pm_stage == PM_STAGE1) {
7059 set = ATTR_S1_AP_RW_BIT;
7060 clear = 0;
7061 mask = ATTR_S1_AP_RW_BIT;
7062 val = ATTR_S1_AP(ATTR_S1_AP_RW);
7063 } else {
7064 set = 0;
7065 clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7066 mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7067 val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7068 }
7069 clear |= ATTR_SW_DBM;
7070 while (!atomic_fcmpset_64(pte, &oldpte,
7071 (oldpte | set) & ~clear))
7072 cpu_spinwait();
7073
7074 if ((oldpte & mask) == val)
7075 vm_page_dirty(m);
7076 pmap_invalidate_page(pmap, pv->pv_va, true);
7077 }
7078 PMAP_UNLOCK(pmap);
7079 }
7080 rw_wunlock(lock);
7081 vm_page_aflag_clear(m, PGA_WRITEABLE);
7082 }
7083
7084 /*
7085 * pmap_ts_referenced:
7086 *
7087 * Return a count of reference bits for a page, clearing those bits.
7088 * It is not necessary for every reference bit to be cleared, but it
7089 * is necessary that 0 only be returned when there are truly no
7090 * reference bits set.
7091 *
7092 * As an optimization, update the page's dirty field if a modified bit is
7093 * found while counting reference bits. This opportunistic update can be
7094 * performed at low cost and can eliminate the need for some future calls
7095 * to pmap_is_modified(). However, since this function stops after
7096 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7097 * dirty pages. Those dirty pages will only be detected by a future call
7098 * to pmap_is_modified().
7099 */
7100 int
7101 pmap_ts_referenced(vm_page_t m)
7102 {
7103 struct md_page *pvh;
7104 pv_entry_t pv, pvf;
7105 pmap_t pmap;
7106 struct rwlock *lock;
7107 pt_entry_t *pte, tpte;
7108 vm_offset_t va;
7109 vm_paddr_t pa;
7110 int cleared, md_gen, not_cleared, pvh_gen;
7111 struct spglist free;
7112
7113 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7114 ("pmap_ts_referenced: page %p is not managed", m));
7115 SLIST_INIT(&free);
7116 cleared = 0;
7117 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7118 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7119 rw_wlock(lock);
7120 retry:
7121 not_cleared = 0;
7122 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
7123 goto small_mappings;
7124 pv = pvf;
7125 do {
7126 if (pvf == NULL)
7127 pvf = pv;
7128 pmap = PV_PMAP(pv);
7129 if (!PMAP_TRYLOCK(pmap)) {
7130 pvh_gen = pvh->pv_gen;
7131 rw_wunlock(lock);
7132 PMAP_LOCK(pmap);
7133 rw_wlock(lock);
7134 if (pvh_gen != pvh->pv_gen) {
7135 PMAP_UNLOCK(pmap);
7136 goto retry;
7137 }
7138 }
7139 va = pv->pv_va;
7140 pte = pmap_pte_exists(pmap, va, 2, __func__);
7141 tpte = pmap_load(pte);
7142 if (pmap_pte_dirty(pmap, tpte)) {
7143 /*
7144 * Although "tpte" is mapping a 2MB page, because
7145 * this function is called at a 4KB page granularity,
7146 * we only update the 4KB page under test.
7147 */
7148 vm_page_dirty(m);
7149 }
7150 if ((tpte & ATTR_AF) != 0) {
7151 pa = VM_PAGE_TO_PHYS(m);
7152
7153 /*
7154 * Since this reference bit is shared by 512 4KB pages,
7155 * it should not be cleared every time it is tested.
7156 * Apply a simple "hash" function on the physical page
7157 * number, the virtual superpage number, and the pmap
7158 * address to select one 4KB page out of the 512 on
7159 * which testing the reference bit will result in
7160 * clearing that reference bit. This function is
7161 * designed to avoid the selection of the same 4KB page
7162 * for every 2MB page mapping.
7163 *
7164 * On demotion, a mapping that hasn't been referenced
7165 * is simply destroyed. To avoid the possibility of a
7166 * subsequent page fault on a demoted wired mapping,
7167 * always leave its reference bit set. Moreover,
7168 * since the superpage is wired, the current state of
7169 * its reference bit won't affect page replacement.
7170 */
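			/*
			 * Illustrative note (editor's addition): with 4KB base
			 * pages and 2MB L2 superpages, Ln_ENTRIES is 512, so
			 * "pa >> PAGE_SHIFT" takes 512 consecutive values
			 * across the superpage and the XOR/mask expression
			 * below is zero for exactly one of the 512 constituent
			 * 4KB pages; only the call made for that one page
			 * clears ATTR_AF on the 2MB mapping.
			 */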
7171 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
7172 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
7173 (tpte & ATTR_SW_WIRED) == 0) {
7174 pmap_clear_bits(pte, ATTR_AF);
7175 pmap_invalidate_page(pmap, va, true);
7176 cleared++;
7177 } else
7178 not_cleared++;
7179 }
7180 PMAP_UNLOCK(pmap);
7181 /* Rotate the PV list if it has more than one entry. */
7182 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7183 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7184 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
7185 pvh->pv_gen++;
7186 }
7187 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
7188 goto out;
7189 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
7190 small_mappings:
7191 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
7192 goto out;
7193 pv = pvf;
7194 do {
7195 if (pvf == NULL)
7196 pvf = pv;
7197 pmap = PV_PMAP(pv);
7198 if (!PMAP_TRYLOCK(pmap)) {
7199 pvh_gen = pvh->pv_gen;
7200 md_gen = m->md.pv_gen;
7201 rw_wunlock(lock);
7202 PMAP_LOCK(pmap);
7203 rw_wlock(lock);
7204 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7205 PMAP_UNLOCK(pmap);
7206 goto retry;
7207 }
7208 }
7209 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7210 tpte = pmap_load(pte);
7211 if (pmap_pte_dirty(pmap, tpte))
7212 vm_page_dirty(m);
7213 if ((tpte & ATTR_AF) != 0) {
7214 if ((tpte & ATTR_SW_WIRED) == 0) {
7215 /*
7216 * Clear the accessed bit in this L3 entry
7217 * regardless of the contiguous bit.
7218 */
7219 pmap_clear_bits(pte, ATTR_AF);
7220 pmap_invalidate_page(pmap, pv->pv_va, true);
7221 cleared++;
7222 } else
7223 not_cleared++;
7224 } else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
7225 (pmap_load_l3c(pte) & ATTR_AF) != 0) {
7226 /*
7227 * An L3C superpage mapping is regarded as accessed
7228 * until the accessed bit has been cleared in all
7229 * of its constituent entries.
7230 */
7231 not_cleared++;
7232 }
7233 PMAP_UNLOCK(pmap);
7234 /* Rotate the PV list if it has more than one entry. */
7235 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7236 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7237 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7238 m->md.pv_gen++;
7239 }
7240 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
7241 not_cleared < PMAP_TS_REFERENCED_MAX);
7242 out:
7243 rw_wunlock(lock);
7244 vm_page_free_pages_toq(&free, true);
7245 return (cleared + not_cleared);
7246 }
7247
7248 /*
7249 * Apply the given advice to the specified range of addresses within the
7250 * given pmap. Depending on the advice, clear the referenced and/or
7251 * modified flags in each mapping and set the mapped page's dirty field.
7252 */
7253 void
7254 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7255 {
7256 struct rwlock *lock;
7257 vm_offset_t va, va_next, dva;
7258 vm_page_t m;
7259 pd_entry_t *l0, *l1, *l2, oldl2;
7260 pt_entry_t *l3, *dl3, oldl3;
7261
7262 PMAP_ASSERT_STAGE1(pmap);
7263
7264 if (advice != MADV_DONTNEED && advice != MADV_FREE)
7265 return;
7266
7267 PMAP_LOCK(pmap);
7268 for (; sva < eva; sva = va_next) {
7269 l0 = pmap_l0(pmap, sva);
7270 if (pmap_load(l0) == 0) {
7271 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
7272 if (va_next < sva)
7273 va_next = eva;
7274 continue;
7275 }
7276
7277 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
7278 if (va_next < sva)
7279 va_next = eva;
7280 l1 = pmap_l0_to_l1(l0, sva);
7281 if (pmap_load(l1) == 0)
7282 continue;
7283 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
7284 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7285 continue;
7286 }
7287
7288 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
7289 if (va_next < sva)
7290 va_next = eva;
7291 l2 = pmap_l1_to_l2(l1, sva);
7292 oldl2 = pmap_load(l2);
7293 if (oldl2 == 0)
7294 continue;
7295 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
7296 if ((oldl2 & ATTR_SW_MANAGED) == 0)
7297 continue;
7298 lock = NULL;
7299 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
7300 if (lock != NULL)
7301 rw_wunlock(lock);
7302
7303 /*
7304 * The 2MB page mapping was destroyed.
7305 */
7306 continue;
7307 }
7308
7309 /*
7310 * Unless the page mappings are wired, remove the
7311 * mapping to a single page so that a subsequent
7312 * access may repromote. Choosing the last page
7313 * within the address range [sva, min(va_next, eva))
7314 * generally results in more repromotions. Since the
7315 * underlying page table page is fully populated, this
7316 * removal never frees a page table page.
7317 */
7318 if ((oldl2 & ATTR_SW_WIRED) == 0) {
7319 va = eva;
7320 if (va > va_next)
7321 va = va_next;
7322 va -= PAGE_SIZE;
7323 KASSERT(va >= sva,
7324 ("pmap_advise: no address gap"));
7325 l3 = pmap_l2_to_l3(l2, va);
7326 KASSERT(pmap_load(l3) != 0,
7327 ("pmap_advise: invalid PTE"));
7328 pmap_remove_l3(pmap, l3, va, pmap_load(l2),
7329 NULL, &lock);
7330 }
7331 if (lock != NULL)
7332 rw_wunlock(lock);
7333 }
7334 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
7335 ("pmap_advise: invalid L2 entry after demotion"));
7336 if (va_next > eva)
7337 va_next = eva;
7338 va = va_next;
7339 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
7340 sva += L3_SIZE) {
7341 oldl3 = pmap_load(l3);
7342 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
7343 (ATTR_SW_MANAGED | L3_PAGE))
7344 goto maybe_invlrng;
7345 else if (pmap_pte_dirty(pmap, oldl3)) {
7346 if (advice == MADV_DONTNEED) {
7347 /*
7348 * Future calls to pmap_is_modified()
7349 * can be avoided by making the page
7350 * dirty now.
7351 */
7352 m = PTE_TO_VM_PAGE(oldl3);
7353 vm_page_dirty(m);
7354 }
7355 if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
7356 /*
7357 * Unconditionally demote the L3C
7358 * superpage because we do not allow
7359 * writeable, clean superpages.
7360 */
7361 (void)pmap_demote_l3c(pmap, l3, sva);
7362
7363 /*
7364 * Destroy the final mapping before the
7365 * next L3C boundary or va_next,
7366 * whichever comes first, so that a
7367 * subsequent access may act as a
7368 * repromotion trigger.
7369 */
7370 if ((oldl3 & ATTR_SW_WIRED) == 0) {
7371 dva = MIN((sva & ~L3C_OFFSET) +
7372 L3C_SIZE - PAGE_SIZE,
7373 va_next - PAGE_SIZE);
7374 dl3 = pmap_l2_to_l3(l2, dva);
7375 KASSERT(pmap_load(dl3) != 0,
7376 ("pmap_advise: invalid PTE"));
7377 lock = NULL;
7378 pmap_remove_l3(pmap, dl3, dva,
7379 pmap_load(l2), NULL, &lock);
7380 if (lock != NULL)
7381 rw_wunlock(lock);
7382 }
7383
7384 /*
7385 * The L3 entry's accessed bit may have
7386 * changed.
7387 */
7388 oldl3 = pmap_load(l3);
7389 }
7390
7391 /*
7392 * Check that we did not just destroy this entry so
7393 			 * we avoid corrupting the page table.
7394 */
7395 if (oldl3 != 0) {
7396 while (!atomic_fcmpset_long(l3, &oldl3,
7397 (oldl3 & ~ATTR_AF) |
7398 ATTR_S1_AP(ATTR_S1_AP_RO)))
7399 cpu_spinwait();
7400 }
7401 } else if ((oldl3 & ATTR_AF) != 0) {
7402 /*
7403 * Clear the accessed bit in this L3 entry
7404 * regardless of the contiguous bit.
7405 */
7406 pmap_clear_bits(l3, ATTR_AF);
7407 } else
7408 goto maybe_invlrng;
7409 if (va == va_next)
7410 va = sva;
7411 continue;
7412 maybe_invlrng:
7413 if (va != va_next) {
7414 pmap_s1_invalidate_range(pmap, va, sva, true);
7415 va = va_next;
7416 }
7417 }
7418 if (va != va_next)
7419 pmap_s1_invalidate_range(pmap, va, sva, true);
7420 }
7421 PMAP_UNLOCK(pmap);
7422 }
7423
7424 /*
7425 * Clear the modify bits on the specified physical page.
7426 */
7427 void
7428 pmap_clear_modify(vm_page_t m)
7429 {
7430 struct md_page *pvh;
7431 struct rwlock *lock;
7432 pmap_t pmap;
7433 pv_entry_t next_pv, pv;
7434 pd_entry_t *l2, oldl2;
7435 pt_entry_t *l3, oldl3;
7436 vm_offset_t va;
7437 int md_gen, pvh_gen;
7438
7439 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7440 ("pmap_clear_modify: page %p is not managed", m));
7441 vm_page_assert_busied(m);
7442
7443 if (!pmap_page_is_write_mapped(m))
7444 return;
7445 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7446 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7447 rw_wlock(lock);
7448 restart:
7449 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7450 pmap = PV_PMAP(pv);
7451 PMAP_ASSERT_STAGE1(pmap);
7452 if (!PMAP_TRYLOCK(pmap)) {
7453 pvh_gen = pvh->pv_gen;
7454 rw_wunlock(lock);
7455 PMAP_LOCK(pmap);
7456 rw_wlock(lock);
7457 if (pvh_gen != pvh->pv_gen) {
7458 PMAP_UNLOCK(pmap);
7459 goto restart;
7460 }
7461 }
7462 va = pv->pv_va;
7463 l2 = pmap_l2(pmap, va);
7464 oldl2 = pmap_load(l2);
7465 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7466 if ((oldl2 & ATTR_SW_DBM) != 0 &&
7467 pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7468 (oldl2 & ATTR_SW_WIRED) == 0) {
7469 /*
7470 * Write protect the mapping to a single page so that
7471 * a subsequent write access may repromote.
7472 */
7473 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7474 l3 = pmap_l2_to_l3(l2, va);
7475 oldl3 = pmap_load(l3);
7476 while (!atomic_fcmpset_long(l3, &oldl3,
7477 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
7478 cpu_spinwait();
7479 vm_page_dirty(m);
7480 pmap_s1_invalidate_page(pmap, va, true);
7481 }
7482 PMAP_UNLOCK(pmap);
7483 }
7484 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7485 pmap = PV_PMAP(pv);
7486 PMAP_ASSERT_STAGE1(pmap);
7487 if (!PMAP_TRYLOCK(pmap)) {
7488 md_gen = m->md.pv_gen;
7489 pvh_gen = pvh->pv_gen;
7490 rw_wunlock(lock);
7491 PMAP_LOCK(pmap);
7492 rw_wlock(lock);
7493 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7494 PMAP_UNLOCK(pmap);
7495 goto restart;
7496 }
7497 }
7498 l2 = pmap_l2(pmap, pv->pv_va);
7499 l3 = pmap_l2_to_l3(l2, pv->pv_va);
7500 oldl3 = pmap_load(l3);
7501 KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
7502 (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
7503 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
7504 ("writeable L3C superpage not dirty"));
7505 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
7506 if ((oldl3 & ATTR_CONTIGUOUS) != 0)
7507 (void)pmap_demote_l3c(pmap, l3, pv->pv_va);
7508 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
7509 pmap_s1_invalidate_page(pmap, pv->pv_va, true);
7510 }
7511 PMAP_UNLOCK(pmap);
7512 }
7513 rw_wunlock(lock);
7514 }
7515
7516 void *
7517 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7518 {
7519 struct pmap_preinit_mapping *ppim;
7520 vm_offset_t va, offset;
7521 pd_entry_t old_l2e, *pde;
7522 pt_entry_t *l2;
7523 int i, lvl, l2_blocks, free_l2_count, start_idx;
7524
7525 if (!vm_initialized) {
7526 /*
7527 * No L3 ptables so map entire L2 blocks where start VA is:
7528 * preinit_map_va + start_idx * L2_SIZE
7529 * There may be duplicate mappings (multiple VA -> same PA) but
7530 * ARM64 dcache is always PIPT so that's acceptable.
7531 */
7532 if (size == 0)
7533 return (NULL);
7534
7535 /* Calculate how many L2 blocks are needed for the mapping */
7536 l2_blocks = (roundup2(pa + size, L2_SIZE) -
7537 rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
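		/*
		 * Worked example (editor's addition): for pa 0x80001000 and
		 * size 0x300000, the range rounds out to
		 * [0x80000000, 0x80400000), so l2_blocks is 2.
		 */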
7538
7539 offset = pa & L2_OFFSET;
7540
7541 if (preinit_map_va == 0)
7542 return (NULL);
7543
7544 /* Map 2MiB L2 blocks from reserved VA space */
7545
7546 free_l2_count = 0;
7547 start_idx = -1;
7548 /* Find enough free contiguous VA space */
7549 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7550 ppim = pmap_preinit_mapping + i;
7551 if (free_l2_count > 0 && ppim->pa != 0) {
7552 /* Not enough space here */
7553 free_l2_count = 0;
7554 start_idx = -1;
7555 continue;
7556 }
7557
7558 if (ppim->pa == 0) {
7559 /* Free L2 block */
7560 if (start_idx == -1)
7561 start_idx = i;
7562 free_l2_count++;
7563 if (free_l2_count == l2_blocks)
7564 break;
7565 }
7566 }
7567 if (free_l2_count != l2_blocks)
7568 panic("%s: too many preinit mappings", __func__);
7569
7570 va = preinit_map_va + (start_idx * L2_SIZE);
7571 for (i = start_idx; i < start_idx + l2_blocks; i++) {
7572 /* Mark entries as allocated */
7573 ppim = pmap_preinit_mapping + i;
7574 ppim->pa = pa;
7575 ppim->va = va + offset;
7576 ppim->size = size;
7577 }
7578
7579 /* Map L2 blocks */
7580 pa = rounddown2(pa, L2_SIZE);
7581 old_l2e = 0;
7582 for (i = 0; i < l2_blocks; i++) {
7583 pde = pmap_pde(kernel_pmap, va, &lvl);
7584 KASSERT(pde != NULL,
7585 ("pmap_mapbios: Invalid page entry, va: 0x%lx",
7586 va));
7587 KASSERT(lvl == 1,
7588 ("pmap_mapbios: Invalid level %d", lvl));
7589
7590 /* Insert L2_BLOCK */
7591 l2 = pmap_l1_to_l2(pde, va);
7592 old_l2e |= pmap_load_store(l2,
7593 PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_XN |
7594 ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
7595 L2_BLOCK);
7596
7597 va += L2_SIZE;
7598 pa += L2_SIZE;
7599 }
7600 if ((old_l2e & ATTR_DESCR_VALID) != 0)
7601 pmap_s1_invalidate_all(kernel_pmap);
7602 else {
7603 /*
7604 * Because the old entries were invalid and the new
7605 * mappings are not executable, an isb is not required.
7606 */
7607 dsb(ishst);
7608 }
7609
7610 va = preinit_map_va + (start_idx * L2_SIZE);
7611
7612 } else {
7613 /* kva_alloc may be used to map the pages */
7614 offset = pa & PAGE_MASK;
7615 size = round_page(offset + size);
7616
7617 va = kva_alloc(size);
7618 if (va == 0)
7619 panic("%s: Couldn't allocate KVA", __func__);
7620
7621 pde = pmap_pde(kernel_pmap, va, &lvl);
7622 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
7623
7624 /* L3 table is linked */
7625 va = trunc_page(va);
7626 pa = trunc_page(pa);
7627 pmap_kenter(va, size, pa, memory_mapping_mode(pa));
7628 }
7629
7630 return ((void *)(va + offset));
7631 }
7632
7633 void
7634 pmap_unmapbios(void *p, vm_size_t size)
7635 {
7636 struct pmap_preinit_mapping *ppim;
7637 vm_offset_t offset, va, va_trunc;
7638 pd_entry_t *pde;
7639 pt_entry_t *l2;
7640 int i, lvl, l2_blocks, block;
7641 bool preinit_map;
7642
7643 va = (vm_offset_t)p;
7644 l2_blocks =
7645 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
7646 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
7647
7648 /* Remove preinit mapping */
7649 preinit_map = false;
7650 block = 0;
7651 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7652 ppim = pmap_preinit_mapping + i;
7653 if (ppim->va == va) {
7654 KASSERT(ppim->size == size,
7655 ("pmap_unmapbios: size mismatch"));
7656 ppim->va = 0;
7657 ppim->pa = 0;
7658 ppim->size = 0;
7659 preinit_map = true;
7660 offset = block * L2_SIZE;
7661 va_trunc = rounddown2(va, L2_SIZE) + offset;
7662
7663 /* Remove L2_BLOCK */
7664 pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
7665 KASSERT(pde != NULL,
7666 ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
7667 va_trunc));
7668 l2 = pmap_l1_to_l2(pde, va_trunc);
7669 pmap_clear(l2);
7670
7671 if (block == (l2_blocks - 1))
7672 break;
7673 block++;
7674 }
7675 }
7676 if (preinit_map) {
7677 pmap_s1_invalidate_all(kernel_pmap);
7678 return;
7679 }
7680
7681 /* Unmap the pages reserved with kva_alloc. */
7682 if (vm_initialized) {
7683 offset = va & PAGE_MASK;
7684 size = round_page(offset + size);
7685 va = trunc_page(va);
7686
7687 /* Unmap and invalidate the pages */
7688 pmap_kremove_device(va, size);
7689
7690 kva_free(va, size);
7691 }
7692 }
7693
7694 /*
7695 * Sets the memory attribute for the specified page.
7696 */
7697 void
7698 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7699 {
7700
7701 m->md.pv_memattr = ma;
7702
7703 /*
7704 * If "m" is a normal page, update its direct mapping. This update
7705 * can be relied upon to perform any cache operations that are
7706 * required for data coherence.
7707 */
7708 if ((m->flags & PG_FICTITIOUS) == 0 &&
7709 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7710 m->md.pv_memattr) != 0)
7711 panic("memory attribute change on the direct map failed");
7712 }
7713
7714 /*
7715 * Changes the specified virtual address range's memory type to that given by
7716 * the parameter "mode". The specified virtual address range must be
7717 * completely contained within either the direct map or the kernel map. If
7718 * the virtual address range is contained within the kernel map, then the
7719 * memory type for each of the corresponding ranges of the direct map is also
7720 * changed. (The corresponding ranges of the direct map are those ranges that
7721 * map the same physical pages as the specified virtual address range.) These
7722 * changes to the direct map are necessary because Intel describes the
7723 * behavior of their processors as "undefined" if two or more mappings to the
7724 * same physical page have different memory types.
7725 *
7726 * Returns zero if the change completed successfully, and either EINVAL or
7727 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
7728 * of the virtual address range was not mapped, and ENOMEM is returned if
7729 * there was insufficient memory available to complete the change. In the
7730 * latter case, the memory type may have been changed on some part of the
7731 * virtual address range or the direct map.
7732 */
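/*
 * Illustrative usage (editor's addition, not part of the original sources):
 * a caller that needs an uncached view of an already mapped kernel range
 * might do
 *
 *	error = pmap_change_attr(va, len, VM_MEMATTR_UNCACHEABLE);
 *
 * and interpret a non-zero return as described above.
 */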
7733 int
7734 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7735 {
7736 int error;
7737
7738 PMAP_LOCK(kernel_pmap);
7739 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
7740 PMAP_UNLOCK(kernel_pmap);
7741 return (error);
7742 }
7743
7744 /*
7745 * Changes the specified virtual address range's protections to those
7746 * specified by "prot". Like pmap_change_attr(), protections for aliases
7747 * in the direct map are updated as well. Protections on aliasing mappings may
7748 * be a subset of the requested protections; for example, mappings in the direct
7749 * map are never executable.
7750 */
7751 int
7752 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
7753 {
7754 int error;
7755
7756 /* Only supported within the kernel map. */
7757 if (va < VM_MIN_KERNEL_ADDRESS)
7758 return (EINVAL);
7759
7760 PMAP_LOCK(kernel_pmap);
7761 error = pmap_change_props_locked(va, size, prot, -1, false);
7762 PMAP_UNLOCK(kernel_pmap);
7763 return (error);
7764 }
7765
7766 static int
7767 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
7768 int mode, bool skip_unmapped)
7769 {
7770 vm_offset_t base, offset, tmpva;
7771 vm_size_t pte_size;
7772 vm_paddr_t pa;
7773 pt_entry_t pte, *ptep, *newpte;
7774 pt_entry_t bits, mask;
7775 int lvl, rv;
7776
7777 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
7778 base = trunc_page(va);
7779 offset = va & PAGE_MASK;
7780 size = round_page(offset + size);
7781
7782 if (!VIRT_IN_DMAP(base) &&
7783 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
7784 return (EINVAL);
7785
7786 bits = 0;
7787 mask = 0;
7788 if (mode != -1) {
7789 bits = ATTR_S1_IDX(mode);
7790 mask = ATTR_S1_IDX_MASK;
7791 if (mode == VM_MEMATTR_DEVICE) {
7792 mask |= ATTR_S1_XN;
7793 bits |= ATTR_S1_XN;
7794 }
7795 }
7796 if (prot != VM_PROT_NONE) {
7797 /* Don't mark the DMAP as executable. It never is on arm64. */
7798 if (VIRT_IN_DMAP(base)) {
7799 prot &= ~VM_PROT_EXECUTE;
7800 /*
7801 * XXX Mark the DMAP as writable for now. We rely
7802 * on this in ddb & dtrace to insert breakpoint
7803 * instructions.
7804 */
7805 prot |= VM_PROT_WRITE;
7806 }
7807
7808 if ((prot & VM_PROT_WRITE) == 0) {
7809 bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
7810 }
7811 if ((prot & VM_PROT_EXECUTE) == 0) {
7812 bits |= ATTR_S1_PXN;
7813 }
7814 bits |= ATTR_S1_UXN;
7815 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
7816 }
7817
7818 for (tmpva = base; tmpva < base + size; ) {
7819 ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
7820 if (ptep == NULL && !skip_unmapped) {
7821 return (EINVAL);
7822 } else if ((ptep == NULL && skip_unmapped) ||
7823 (pmap_load(ptep) & mask) == bits) {
7824 /*
7825 * We already have the correct attribute or there
7826 * is no memory mapped at this address and we are
7827 * skipping unmapped memory.
7828 */
7829 switch (lvl) {
7830 default:
7831 panic("Invalid DMAP table level: %d\n", lvl);
7832 case 1:
7833 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
7834 break;
7835 case 2:
7836 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
7837 break;
7838 case 3:
7839 tmpva += PAGE_SIZE;
7840 break;
7841 }
7842 } else {
7843 /* We can't demote/promote this entry */
7844 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
7845
7846 /*
7847 			 * Split the entry to a level 3 table, then
7848 * set the new attribute.
7849 */
7850 switch (lvl) {
7851 default:
7852 panic("Invalid DMAP table level: %d\n", lvl);
7853 case 1:
7854 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7855 if ((tmpva & L1_OFFSET) == 0 &&
7856 (base + size - tmpva) >= L1_SIZE) {
7857 pte_size = L1_SIZE;
7858 break;
7859 }
7860 newpte = pmap_demote_l1(kernel_pmap, ptep,
7861 tmpva & ~L1_OFFSET);
7862 if (newpte == NULL)
7863 return (EINVAL);
7864 ptep = pmap_l1_to_l2(ptep, tmpva);
7865 /* FALLTHROUGH */
7866 case 2:
7867 if ((tmpva & L2_OFFSET) == 0 &&
7868 (base + size - tmpva) >= L2_SIZE) {
7869 pte_size = L2_SIZE;
7870 break;
7871 }
7872 newpte = pmap_demote_l2(kernel_pmap, ptep,
7873 tmpva);
7874 if (newpte == NULL)
7875 return (EINVAL);
7876 ptep = pmap_l2_to_l3(ptep, tmpva);
7877 /* FALLTHROUGH */
7878 case 3:
7879 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
7880 if ((tmpva & L3C_OFFSET) == 0 &&
7881 (base + size - tmpva) >= L3C_SIZE) {
7882 pte_size = L3C_SIZE;
7883 break;
7884 }
7885 if (!pmap_demote_l3c(kernel_pmap, ptep,
7886 tmpva))
7887 return (EINVAL);
7888 }
7889 pte_size = PAGE_SIZE;
7890 break;
7891 }
7892
7893 /* Update the entry */
7894 pte = pmap_load(ptep);
7895 pte &= ~mask;
7896 pte |= bits;
7897
7898 pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
7899 pte_size);
7900
7901 pa = PTE_TO_PHYS(pte);
7902 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
7903 /*
7904 * Keep the DMAP memory in sync.
7905 */
7906 rv = pmap_change_props_locked(
7907 PHYS_TO_DMAP(pa), pte_size,
7908 prot, mode, true);
7909 if (rv != 0)
7910 return (rv);
7911 }
7912
7913 /*
7914 * If moving to a non-cacheable entry flush
7915 * the cache.
7916 */
7917 if (mode == VM_MEMATTR_UNCACHEABLE)
7918 cpu_dcache_wbinv_range((void *)tmpva, pte_size);
7919 tmpva += pte_size;
7920 }
7921 }
7922
7923 return (0);
7924 }
7925
7926 /*
7927 * Create an L2 table to map all addresses within an L1 mapping.
7928 */
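/*
 * Editor's note: with 4KB granules an L1 block maps 1GB, so the demotion
 * below replaces a single L1 block entry with an L2 table whose 512 entries
 * each map a 2MB block carrying the attributes of the original entry.
 */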
7929 static pt_entry_t *
7930 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
7931 {
7932 pt_entry_t *l2, newl2, oldl1;
7933 vm_offset_t tmpl1;
7934 vm_paddr_t l2phys, phys;
7935 vm_page_t ml2;
7936 int i;
7937
7938 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
7939 oldl1 = pmap_load(l1);
7940 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7941 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
7942 ("pmap_demote_l1: Demoting a non-block entry"));
7943 KASSERT((va & L1_OFFSET) == 0,
7944 ("pmap_demote_l1: Invalid virtual address %#lx", va));
7945 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
7946 ("pmap_demote_l1: Level 1 table shouldn't be managed"));
7947 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
7948 ("pmap_demote_l1: Demoting entry with no-demote flag set"));
7949
7950 tmpl1 = 0;
7951 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
7952 tmpl1 = kva_alloc(PAGE_SIZE);
7953 if (tmpl1 == 0)
7954 return (NULL);
7955 }
7956
7957 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
7958 NULL) {
7959 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
7960 " in pmap %p", va, pmap);
7961 l2 = NULL;
7962 goto fail;
7963 }
7964
7965 l2phys = VM_PAGE_TO_PHYS(ml2);
7966 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
7967
7968 /* Address the range points at */
7969 phys = PTE_TO_PHYS(oldl1);
7970 	/* The attributes from the old l1 entry to be copied */
7971 newl2 = oldl1 & ATTR_MASK;
7972
7973 /* Create the new entries */
7974 for (i = 0; i < Ln_ENTRIES; i++) {
7975 l2[i] = newl2 | phys;
7976 phys += L2_SIZE;
7977 }
7978 KASSERT(l2[0] == ((oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK),
7979 ("Invalid l2 page (%lx != %lx)", l2[0],
7980 (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
7981
7982 if (tmpl1 != 0) {
7983 pmap_kenter(tmpl1, PAGE_SIZE,
7984 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
7985 VM_MEMATTR_WRITE_BACK);
7986 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
7987 }
7988
7989 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
7990
7991 fail:
7992 if (tmpl1 != 0) {
7993 pmap_kremove(tmpl1);
7994 kva_free(tmpl1, PAGE_SIZE);
7995 }
7996
7997 return (l2);
7998 }
7999
8000 static void
8001 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8002 {
8003 pt_entry_t *l3;
8004
8005 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8006 *l3 = newl3;
8007 newl3 += L3_SIZE;
8008 }
8009 }
8010
8011 static void
8012 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8013 {
8014 #ifdef INVARIANTS
8015 #ifdef DIAGNOSTIC
8016 pt_entry_t *xl3p, *yl3p;
8017
8018 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8019 xl3p++, newl3e += PAGE_SIZE) {
8020 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8021 printf("pmap_demote_l2: xl3e %zd and newl3e map "
8022 "different pages: found %#lx, expected %#lx\n",
8023 xl3p - firstl3p, pmap_load(xl3p), newl3e);
8024 printf("page table dump\n");
8025 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8026 yl3p++) {
8027 printf("%zd %#lx\n", yl3p - firstl3p,
8028 pmap_load(yl3p));
8029 }
8030 panic("firstpte");
8031 }
8032 }
8033 #else
8034 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8035 ("pmap_demote_l2: firstl3 and newl3e map different physical"
8036 " addresses"));
8037 #endif
8038 #endif
8039 }
8040
8041 static void
8042 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8043 struct rwlock **lockp)
8044 {
8045 struct spglist free;
8046
8047 SLIST_INIT(&free);
8048 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
8049 lockp);
8050 vm_page_free_pages_toq(&free, true);
8051 }
8052
8053 /*
8054 * Create an L3 table to map all addresses within an L2 mapping.
8055 */
8056 static pt_entry_t *
8057 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8058 struct rwlock **lockp)
8059 {
8060 pt_entry_t *l3, newl3, oldl2;
8061 vm_offset_t tmpl2;
8062 vm_paddr_t l3phys;
8063 vm_page_t ml3;
8064
8065 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8066 PMAP_ASSERT_STAGE1(pmap);
8067 KASSERT(ADDR_IS_CANONICAL(va),
8068 ("%s: Address not in canonical form: %lx", __func__, va));
8069
8070 l3 = NULL;
8071 oldl2 = pmap_load(l2);
8072 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8073 ("pmap_demote_l2: Demoting a non-block entry"));
8074 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8075 ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8076 va &= ~L2_OFFSET;
8077
8078 tmpl2 = 0;
8079 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8080 tmpl2 = kva_alloc(PAGE_SIZE);
8081 if (tmpl2 == 0)
8082 return (NULL);
8083 }
8084
8085 /*
8086 * Invalidate the 2MB page mapping and return "failure" if the
8087 * mapping was never accessed.
8088 */
8089 if ((oldl2 & ATTR_AF) == 0) {
8090 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8091 ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
8092 pmap_demote_l2_abort(pmap, va, l2, lockp);
8093 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
8094 va, pmap);
8095 goto fail;
8096 }
8097
8098 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8099 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8100 ("pmap_demote_l2: page table page for a wired mapping"
8101 " is missing"));
8102
8103 /*
8104 * If the page table page is missing and the mapping
8105 * is for a kernel address, the mapping must belong to
8106 * either the direct map or the early kernel memory.
8107 * Page table pages are preallocated for every other
8108 * part of the kernel address space, so the direct map
8109 * region and early kernel memory are the only parts of the
8110 * kernel address space that must be handled here.
8111 */
8112 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
8113 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8114 ("pmap_demote_l2: No saved mpte for va %#lx", va));
8115
8116 /*
8117 * If the 2MB page mapping belongs to the direct map
8118 * region of the kernel's address space, then the page
8119 * allocation request specifies the highest possible
8120 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
8121 * priority is normal.
8122 */
8123 ml3 = vm_page_alloc_noobj(
8124 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8125 VM_ALLOC_WIRED);
8126
8127 /*
8128 * If the allocation of the new page table page fails,
8129 * invalidate the 2MB page mapping and return "failure".
8130 */
8131 if (ml3 == NULL) {
8132 pmap_demote_l2_abort(pmap, va, l2, lockp);
8133 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8134 " in pmap %p", va, pmap);
8135 goto fail;
8136 }
8137 ml3->pindex = pmap_l2_pindex(va);
8138
8139 if (!ADDR_IS_KERNEL(va)) {
8140 ml3->ref_count = NL3PG;
8141 pmap_resident_count_inc(pmap, 1);
8142 }
8143 }
8144 l3phys = VM_PAGE_TO_PHYS(ml3);
8145 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
8146 newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8147 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8148 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8149 ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8150
8151 /*
8152 * If the PTP is not leftover from an earlier promotion or it does not
8153 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all
8154 * have ATTR_AF set.
8155 *
8156 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8157 * performs a dsb(). That dsb() ensures that the stores for filling
8158 * "l3" are visible before "l3" is added to the page table.
8159 */
8160 if (!vm_page_all_valid(ml3))
8161 pmap_fill_l3(l3, newl3);
8162
8163 pmap_demote_l2_check(l3, newl3);
8164
8165 /*
8166 * If the mapping has changed attributes, update the L3Es.
8167 */
8168 if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8169 pmap_fill_l3(l3, newl3);
8170
8171 /*
8172 * Map the temporary page so we don't lose access to the l2 table.
8173 */
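	/*
	 * Editor's note: this handles the self-referential case where the 2MB
	 * region being demoted contains the page holding "l2" itself; once the
	 * block mapping is replaced, the original virtual address of that page
	 * may no longer be usable, so a temporary KVA alias is written instead.
	 */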
8174 if (tmpl2 != 0) {
8175 pmap_kenter(tmpl2, PAGE_SIZE,
8176 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
8177 VM_MEMATTR_WRITE_BACK);
8178 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8179 }
8180
8181 /*
8182 * The spare PV entries must be reserved prior to demoting the
8183 * mapping, that is, prior to changing the PDE. Otherwise, the state
8184 * of the L2 and the PV lists will be inconsistent, which can result
8185 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8186 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8187 * PV entry for the 2MB page mapping that is being demoted.
8188 */
8189 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8190 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8191
8192 /*
8193 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8194 * the 2MB page mapping.
8195 */
8196 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8197
8198 /*
8199 * Demote the PV entry.
8200 */
8201 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8202 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8203
8204 atomic_add_long(&pmap_l2_demotions, 1);
8205 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8206 " in pmap %p %lx", va, pmap, l3[0]);
8207
8208 fail:
8209 if (tmpl2 != 0) {
8210 pmap_kremove(tmpl2);
8211 kva_free(tmpl2, PAGE_SIZE);
8212 }
8213
8214 return (l3);
8215
8216 }
8217
8218 static pt_entry_t *
8219 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8220 {
8221 struct rwlock *lock;
8222 pt_entry_t *l3;
8223
8224 lock = NULL;
8225 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8226 if (lock != NULL)
8227 rw_wunlock(lock);
8228 return (l3);
8229 }
8230
8231 /*
8232  * Demote an L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8233 */
8234 static bool
8235 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8236 {
8237 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8238 vm_offset_t tmpl3;
8239 register_t intr;
8240
8241 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8242 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8243 sizeof(pt_entry_t)) - 1));
8244 l3c_end = l3c_start + L3C_ENTRIES;
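	/*
	 * Editor's note: with 4KB granules, L3C_ENTRIES is 16 and PTEs are 8
	 * bytes, so the mask above rounds "l3p" down to a 128-byte boundary,
	 * i.e. to the first of the 16 contiguous L3 entries backing the 64KB
	 * superpage.
	 */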
8245 tmpl3 = 0;
8246 if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8247 (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8248 tmpl3 = kva_alloc(PAGE_SIZE);
8249 if (tmpl3 == 0)
8250 return (false);
8251 pmap_kenter(tmpl3, PAGE_SIZE,
8252 DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
8253 VM_MEMATTR_WRITE_BACK);
8254 l3c_start = (pt_entry_t *)(tmpl3 +
8255 ((vm_offset_t)l3c_start & PAGE_MASK));
8256 l3c_end = (pt_entry_t *)(tmpl3 +
8257 ((vm_offset_t)l3c_end & PAGE_MASK));
8258 }
8259 mask = 0;
8260 nbits = ATTR_DESCR_VALID;
8261 intr = intr_disable();
8262
8263 /*
8264 * Break the mappings.
8265 */
8266 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8267 /*
8268 * Clear the mapping's contiguous and valid bits, but leave
8269 * the rest of the entry unchanged, so that a lockless,
8270 * concurrent pmap_kextract() can still lookup the physical
8271 * address.
8272 */
8273 l3e = pmap_load(tl3p);
8274 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8275 ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
8276 KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8277 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8278 ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
8279 while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
8280 ATTR_DESCR_VALID)))
8281 cpu_spinwait();
8282
8283 /*
8284 * Hardware accessed and dirty bit maintenance might only
8285 * update a single L3 entry, so we must combine the accessed
8286 * and dirty bits from this entire set of contiguous L3
8287 * entries.
8288 */
8289 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8290 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8291 mask = ATTR_S1_AP_RW_BIT;
8292 nbits |= l3e & ATTR_AF;
8293 }
8294 if ((nbits & ATTR_AF) != 0) {
8295 pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
8296 ~L3C_OFFSET, true);
8297 }
8298
8299 /*
8300 * Remake the mappings, updating the accessed and dirty bits.
8301 */
8302 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8303 l3e = pmap_load(tl3p);
8304 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
8305 cpu_spinwait();
8306 }
8307 dsb(ishst);
8308
8309 intr_restore(intr);
8310 if (tmpl3 != 0) {
8311 pmap_kremove(tmpl3);
8312 kva_free(tmpl3, PAGE_SIZE);
8313 }
8314 counter_u64_add(pmap_l3c_demotions, 1);
8315 CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
8316 va, pmap);
8317 return (true);
8318 }
8319
8320 /*
8321  * Accumulate the accessed and dirty bits within an L3C superpage and
8322 * return the specified PTE with them applied correctly.
8323 */
8324 static pt_entry_t
8325 pmap_load_l3c(pt_entry_t *l3p)
8326 {
8327 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8328
8329 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8330 sizeof(pt_entry_t)) - 1));
8331 l3c_end = l3c_start + L3C_ENTRIES;
8332 mask = 0;
8333 nbits = 0;
8334 /* Iterate over each mapping in the superpage. */
8335 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8336 l3e = pmap_load(tl3p);
8337 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8338 ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
8339 /* Update mask if the current page has its dirty bit set. */
8340 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8341 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8342 mask = ATTR_S1_AP_RW_BIT;
8343 /* Update nbits if the accessed bit is set. */
8344 nbits |= l3e & ATTR_AF;
8345 }
8346 return ((pmap_load(l3p) & ~mask) | nbits);
8347 }
8348
8349 /*
8350 * Perform the pmap work for mincore(2). If the page is not both referenced and
8351 * modified by this pmap, returns its physical address so that the caller can
8352 * find other mappings.
8353 */
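/*
 * Editor's note: besides MINCORE_INCORE, the return value encodes the mapping
 * size via MINCORE_PSIND(3 - lvl) below, so a level 2 (2MB) or level 1 (1GB)
 * mapping reports page size index 1 or 2, respectively.
 */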
8354 int
8355 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
8356 {
8357 pt_entry_t *pte, tpte;
8358 vm_paddr_t mask, pa;
8359 int lvl, val;
8360 bool managed;
8361
8362 PMAP_ASSERT_STAGE1(pmap);
8363 PMAP_LOCK(pmap);
8364 pte = pmap_pte(pmap, addr, &lvl);
8365 if (pte != NULL) {
8366 tpte = pmap_load(pte);
8367
8368 switch (lvl) {
8369 case 3:
8370 mask = L3_OFFSET;
8371 break;
8372 case 2:
8373 mask = L2_OFFSET;
8374 break;
8375 case 1:
8376 mask = L1_OFFSET;
8377 break;
8378 default:
8379 panic("pmap_mincore: invalid level %d", lvl);
8380 }
8381
8382 managed = (tpte & ATTR_SW_MANAGED) != 0;
8383 val = MINCORE_INCORE;
8384 if (lvl != 3)
8385 val |= MINCORE_PSIND(3 - lvl);
8386 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
8387 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
8388 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8389 if ((tpte & ATTR_AF) == ATTR_AF)
8390 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8391
8392 pa = PTE_TO_PHYS(tpte) | (addr & mask);
8393 } else {
8394 managed = false;
8395 val = 0;
8396 }
8397
8398 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8399 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
8400 *pap = pa;
8401 }
8402 PMAP_UNLOCK(pmap);
8403 return (val);
8404 }
8405
8406 /*
8407 * Garbage collect every ASID that is neither active on a processor nor
8408 * reserved.
8409 */
8410 static void
8411 pmap_reset_asid_set(pmap_t pmap)
8412 {
8413 pmap_t curpmap;
8414 int asid, cpuid, epoch;
8415 struct asid_set *set;
8416 enum pmap_stage stage;
8417
8418 set = pmap->pm_asid_set;
8419 stage = pmap->pm_stage;
8420
8422 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8423 mtx_assert(&set->asid_set_mutex, MA_OWNED);
8424
8425 /*
8426 * Ensure that the store to asid_epoch is globally visible before the
8427 * loads from pc_curpmap are performed.
8428 */
8429 epoch = set->asid_epoch + 1;
8430 if (epoch == INT_MAX)
8431 epoch = 0;
8432 set->asid_epoch = epoch;
8433 dsb(ishst);
8434 if (stage == PM_STAGE1) {
8435 __asm __volatile("tlbi vmalle1is");
8436 } else {
8437 KASSERT(pmap_clean_stage2_tlbi != NULL,
8438 ("%s: Unset stage 2 tlb invalidation callback\n",
8439 __func__));
8440 pmap_clean_stage2_tlbi();
8441 }
8442 dsb(ish);
8443 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
8444 set->asid_set_size - 1);
8445 CPU_FOREACH(cpuid) {
8446 if (cpuid == curcpu)
8447 continue;
8448 if (stage == PM_STAGE1) {
8449 curpmap = pcpu_find(cpuid)->pc_curpmap;
8450 PMAP_ASSERT_STAGE1(pmap);
8451 } else {
8452 curpmap = pcpu_find(cpuid)->pc_curvmpmap;
8453 if (curpmap == NULL)
8454 continue;
8455 PMAP_ASSERT_STAGE2(pmap);
8456 }
8457 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
8458 asid = COOKIE_TO_ASID(curpmap->pm_cookie);
8459 if (asid == -1)
8460 continue;
8461 bit_set(set->asid_set, asid);
8462 curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
8463 }
8464 }
8465
8466 /*
8467 * Allocate a new ASID for the specified pmap.
8468 */
8469 static void
8470 pmap_alloc_asid(pmap_t pmap)
8471 {
8472 struct asid_set *set;
8473 int new_asid;
8474
8475 set = pmap->pm_asid_set;
8476 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8477
8478 mtx_lock_spin(&set->asid_set_mutex);
8479
8480 /*
8481 * While this processor was waiting to acquire the asid set mutex,
8482 * pmap_reset_asid_set() running on another processor might have
8483 * updated this pmap's cookie to the current epoch. In which case, we
8484 * don't need to allocate a new ASID.
8485 */
8486 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
8487 goto out;
8488
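	/*
	 * Scan for a free ASID starting at asid_next, wrapping around to the
	 * first non-reserved ASID if the upper part of the set is exhausted.
	 * If both passes fail, every ASID is in use, so reset the set (which
	 * starts a new epoch) and search once more.  (Editor's note:
	 * descriptive summary of the code below.)
	 */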
8489 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
8490 &new_asid);
8491 if (new_asid == -1) {
8492 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8493 set->asid_next, &new_asid);
8494 if (new_asid == -1) {
8495 pmap_reset_asid_set(pmap);
8496 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8497 set->asid_set_size, &new_asid);
8498 KASSERT(new_asid != -1, ("ASID allocation failure"));
8499 }
8500 }
8501 bit_set(set->asid_set, new_asid);
8502 set->asid_next = new_asid + 1;
8503 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
8504 out:
8505 mtx_unlock_spin(&set->asid_set_mutex);
8506 }
8507
8508 static uint64_t __read_mostly ttbr_flags;
8509
8510 /*
8511 * Compute the value that should be stored in ttbr0 to activate the specified
8512 * pmap. This value may change from time to time.
8513 */
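/*
 * Editor's note (assumed ARMv8 TTBR0_EL1 layout): the returned value combines
 * the root table's physical address in the low-order bits, the pmap's ASID
 * shifted into the high-order bits, and globally enabled flags such as
 * TTBR_CnP.
 */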
8514 uint64_t
8515 pmap_to_ttbr0(pmap_t pmap)
8516 {
8517 uint64_t ttbr;
8518
8519 ttbr = pmap->pm_ttbr;
8520 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
8521 ttbr |= ttbr_flags;
8522
8523 return (ttbr);
8524 }
8525
8526 static void
8527 pmap_set_cnp(void *arg)
8528 {
8529 uint64_t ttbr0, ttbr1;
8530 u_int cpuid;
8531
8532 cpuid = *(u_int *)arg;
8533 if (cpuid == curcpu) {
8534 /*
8535 * Set the flags while all CPUs are handling the
8536 		 * smp_rendezvous so they will not call pmap_to_ttbr0. Any calls
8537 * to pmap_to_ttbr0 after this will have the CnP flag set.
8538 * The dsb after invalidating the TLB will act as a barrier
8539 * to ensure all CPUs can observe this change.
8540 */
8541 ttbr_flags |= TTBR_CnP;
8542 }
8543
8544 ttbr0 = READ_SPECIALREG(ttbr0_el1);
8545 ttbr0 |= TTBR_CnP;
8546
8547 ttbr1 = READ_SPECIALREG(ttbr1_el1);
8548 ttbr1 |= TTBR_CnP;
8549
8550 /* Update ttbr{0,1}_el1 with the CnP flag */
8551 WRITE_SPECIALREG(ttbr0_el1, ttbr0);
8552 WRITE_SPECIALREG(ttbr1_el1, ttbr1);
8553 isb();
8554 __asm __volatile("tlbi vmalle1is");
8555 dsb(ish);
8556 isb();
8557 }
8558
8559 /*
8560 * Defer enabling some features until we have read the ID registers to know
8561 * if they are supported on all CPUs.
8562 */
8563 static void
8564 pmap_init_mp(void *dummy __unused)
8565 {
8566 uint64_t reg;
8567
8568 	if (get_kernel_reg(ID_AA64PFR1_EL1, &reg)) {
8569 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
8570 if (bootverbose)
8571 printf("Enabling BTI\n");
8572 pmap_bti_support = true;
8573
8574 pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
8575 sizeof(struct rs_el), NULL, NULL, NULL, NULL,
8576 UMA_ALIGN_PTR, 0);
8577 }
8578 }
8579 }
8580 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
8581
8582 /*
8583 * Defer enabling CnP until we have read the ID registers to know if it's
8584 * supported on all CPUs.
8585 */
8586 static void
8587 pmap_init_cnp(void *dummy __unused)
8588 {
8589 uint64_t reg;
8590 u_int cpuid;
8591
8592 	if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
8593 return;
8594
8595 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
8596 if (bootverbose)
8597 printf("Enabling CnP\n");
8598 cpuid = curcpu;
8599 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
8600 }
8601
8602 }
8603 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
8604
8605 static bool
8606 pmap_activate_int(pmap_t pmap)
8607 {
8608 struct asid_set *set;
8609 int epoch;
8610
8611 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
8612 KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
8613
8614 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
8615 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
8616 /*
8617 * Handle the possibility that the old thread was preempted
8618 * after an "ic" or "tlbi" instruction but before it performed
8619 * a "dsb" instruction. If the old thread migrates to a new
8620 * processor, its completion of a "dsb" instruction on that
8621 * new processor does not guarantee that the "ic" or "tlbi"
8622 * instructions performed on the old processor have completed.
8623 */
8624 dsb(ish);
8625 return (false);
8626 }
8627
8628 set = pmap->pm_asid_set;
8629 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8630
8631 /*
8632 * Ensure that the store to curpmap is globally visible before the
8633 * load from asid_epoch is performed.
8634 */
8635 if (pmap->pm_stage == PM_STAGE1)
8636 PCPU_SET(curpmap, pmap);
8637 else
8638 PCPU_SET(curvmpmap, pmap);
8639 dsb(ish);
8640 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
8641 if (epoch >= 0 && epoch != set->asid_epoch)
8642 pmap_alloc_asid(pmap);
8643
8644 if (pmap->pm_stage == PM_STAGE1) {
8645 set_ttbr0(pmap_to_ttbr0(pmap));
8646 if (PCPU_GET(bcast_tlbi_workaround) != 0)
8647 invalidate_local_icache();
8648 }
8649 return (true);
8650 }
8651
8652 void
8653 pmap_activate_vm(pmap_t pmap)
8654 {
8655
8656 PMAP_ASSERT_STAGE2(pmap);
8657
8658 (void)pmap_activate_int(pmap);
8659 }
8660
8661 void
8662 pmap_activate(struct thread *td)
8663 {
8664 pmap_t pmap;
8665
8666 pmap = vmspace_pmap(td->td_proc->p_vmspace);
8667 PMAP_ASSERT_STAGE1(pmap);
8668 critical_enter();
8669 (void)pmap_activate_int(pmap);
8670 critical_exit();
8671 }
8672
8673 /*
8674 * Activate the thread we are switching to.
8675  * To simplify the assembly in cpu_throw, return the new thread's pcb.
8676 */
8677 struct pcb *
8678 pmap_switch(struct thread *new)
8679 {
8680 pcpu_bp_harden bp_harden;
8681 struct pcb *pcb;
8682
8683 /* Store the new curthread */
8684 PCPU_SET(curthread, new);
8685
8686 /* And the new pcb */
8687 pcb = new->td_pcb;
8688 PCPU_SET(curpcb, pcb);
8689
8690 /*
8691 * TODO: We may need to flush the cache here if switching
8692 * to a user process.
8693 */
8694
8695 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
8696 /*
8697 * Stop userspace from training the branch predictor against
8698 * other processes. This will call into a CPU specific
8699 * function that clears the branch predictor state.
8700 */
8701 bp_harden = PCPU_GET(bp_harden);
8702 if (bp_harden != NULL)
8703 bp_harden();
8704 }
8705
8706 return (pcb);
8707 }
8708
8709 void
8710 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
8711 {
8712
8713 PMAP_ASSERT_STAGE1(pmap);
8714 KASSERT(ADDR_IS_CANONICAL(va),
8715 ("%s: Address not in canonical form: %lx", __func__, va));
8716
8717 if (ADDR_IS_KERNEL(va)) {
8718 cpu_icache_sync_range((void *)va, sz);
8719 } else {
8720 u_int len, offset;
8721 vm_paddr_t pa;
8722
8723 /* Find the length of data in this page to flush */
8724 offset = va & PAGE_MASK;
8725 len = imin(PAGE_SIZE - offset, sz);
8726
8727 while (sz != 0) {
8728 /* Extract the physical address & find it in the DMAP */
8729 pa = pmap_extract(pmap, va);
8730 if (pa != 0)
8731 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
8732 len);
8733
8734 /* Move to the next page */
8735 sz -= len;
8736 va += len;
8737 /* Set the length for the next iteration */
8738 len = imin(PAGE_SIZE, sz);
8739 }
8740 }
8741 }
8742
8743 static int
8744 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
8745 {
8746 pd_entry_t *pdep;
8747 pt_entry_t *ptep, pte;
8748 int rv, lvl, dfsc;
8749
8750 PMAP_ASSERT_STAGE2(pmap);
8751 rv = KERN_FAILURE;
8752
8753 /* Data and insn aborts use same encoding for FSC field. */
8754 dfsc = esr & ISS_DATA_DFSC_MASK;
8755 switch (dfsc) {
8756 case ISS_DATA_DFSC_TF_L0:
8757 case ISS_DATA_DFSC_TF_L1:
8758 case ISS_DATA_DFSC_TF_L2:
8759 case ISS_DATA_DFSC_TF_L3:
8760 PMAP_LOCK(pmap);
8761 pdep = pmap_pde(pmap, far, &lvl);
8762 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
8763 PMAP_UNLOCK(pmap);
8764 break;
8765 }
8766
8767 switch (lvl) {
8768 case 0:
8769 ptep = pmap_l0_to_l1(pdep, far);
8770 break;
8771 case 1:
8772 ptep = pmap_l1_to_l2(pdep, far);
8773 break;
8774 case 2:
8775 ptep = pmap_l2_to_l3(pdep, far);
8776 break;
8777 default:
8778 			panic("%s: Invalid pde level %d", __func__, lvl);
8779 }
8780 goto fault_exec;
8781
8782 case ISS_DATA_DFSC_AFF_L1:
8783 case ISS_DATA_DFSC_AFF_L2:
8784 case ISS_DATA_DFSC_AFF_L3:
8785 PMAP_LOCK(pmap);
8786 ptep = pmap_pte(pmap, far, &lvl);
8787 fault_exec:
8788 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
8789 if (icache_vmid) {
8790 pmap_invalidate_vpipt_icache();
8791 } else {
8792 /*
8793 * If accessing an executable page invalidate
8794 * the I-cache so it will be valid when we
8795 * continue execution in the guest. The D-cache
8796 * is assumed to already be clean to the Point
8797 * of Coherency.
8798 */
8799 if ((pte & ATTR_S2_XN_MASK) !=
8800 ATTR_S2_XN(ATTR_S2_XN_NONE)) {
8801 invalidate_icache();
8802 }
8803 }
8804 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
8805 rv = KERN_SUCCESS;
8806 }
8807 PMAP_UNLOCK(pmap);
8808 break;
8809 }
8810
8811 return (rv);
8812 }
8813
8814 int
8815 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
8816 {
8817 pt_entry_t pte, *ptep;
8818 register_t intr;
8819 uint64_t ec, par;
8820 int lvl, rv;
8821
8822 rv = KERN_FAILURE;
8823
8824 ec = ESR_ELx_EXCEPTION(esr);
8825 switch (ec) {
8826 case EXCP_INSN_ABORT_L:
8827 case EXCP_INSN_ABORT:
8828 case EXCP_DATA_ABORT_L:
8829 case EXCP_DATA_ABORT:
8830 break;
8831 default:
8832 return (rv);
8833 }
8834
8835 if (pmap->pm_stage == PM_STAGE2)
8836 return (pmap_stage2_fault(pmap, esr, far));
8837
8838 /* Data and insn aborts use same encoding for FSC field. */
8839 switch (esr & ISS_DATA_DFSC_MASK) {
8840 case ISS_DATA_DFSC_AFF_L1:
8841 case ISS_DATA_DFSC_AFF_L2:
8842 case ISS_DATA_DFSC_AFF_L3:
8843 PMAP_LOCK(pmap);
8844 ptep = pmap_pte(pmap, far, &lvl);
8845 if (ptep != NULL) {
8846 pmap_set_bits(ptep, ATTR_AF);
8847 rv = KERN_SUCCESS;
8848 /*
8849 * XXXMJ as an optimization we could mark the entry
8850 * dirty if this is a write fault.
8851 */
8852 }
8853 PMAP_UNLOCK(pmap);
8854 break;
8855 case ISS_DATA_DFSC_PF_L1:
8856 case ISS_DATA_DFSC_PF_L2:
8857 case ISS_DATA_DFSC_PF_L3:
8858 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
8859 (esr & ISS_DATA_WnR) == 0)
8860 return (rv);
8861 PMAP_LOCK(pmap);
8862 ptep = pmap_pte(pmap, far, &lvl);
8863 if (ptep != NULL &&
8864 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
8865 if ((pte & ATTR_S1_AP_RW_BIT) ==
8866 ATTR_S1_AP(ATTR_S1_AP_RO)) {
8867 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
8868 pmap_s1_invalidate_page(pmap, far, true);
8869 }
8870 rv = KERN_SUCCESS;
8871 }
8872 PMAP_UNLOCK(pmap);
8873 break;
8874 case ISS_DATA_DFSC_TF_L0:
8875 case ISS_DATA_DFSC_TF_L1:
8876 case ISS_DATA_DFSC_TF_L2:
8877 case ISS_DATA_DFSC_TF_L3:
8878 /*
8879 * Retry the translation. A break-before-make sequence can
8880 * produce a transient fault.
8881 */
8882 if (pmap == kernel_pmap) {
8883 /*
8884 * The translation fault may have occurred within a
8885 * critical section. Therefore, we must check the
8886 * address without acquiring the kernel pmap's lock.
8887 */
8888 if (pmap_klookup(far, NULL))
8889 rv = KERN_SUCCESS;
8890 } else {
8891 PMAP_LOCK(pmap);
8892 /* Ask the MMU to check the address. */
8893 intr = intr_disable();
8894 par = arm64_address_translate_s1e0r(far);
8895 intr_restore(intr);
8896 PMAP_UNLOCK(pmap);
8897
8898 /*
8899 * If the translation was successful, then we can
8900 * return success to the trap handler.
8901 */
8902 if (PAR_SUCCESS(par))
8903 rv = KERN_SUCCESS;
8904 }
8905 break;
8906 }
8907
8908 return (rv);
8909 }
8910
8911 /*
8912 * Increase the starting virtual address of the given mapping if a
8913 * different alignment might result in more superpage mappings.
8914 */
8915 void
8916 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
8917 vm_offset_t *addr, vm_size_t size)
8918 {
8919 vm_offset_t superpage_offset;
8920
8921 if (size < L2_SIZE)
8922 return;
8923 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
8924 offset += ptoa(object->pg_color);
8925 superpage_offset = offset & L2_OFFSET;
8926 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
8927 (*addr & L2_OFFSET) == superpage_offset)
8928 return;
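	/*
	 * Worked example (editor's addition): with 2MB superpages, an object
	 * offset of 0x2345000 yields a superpage_offset of 0x145000; a hint of
	 * *addr == 0x10000000 is then advanced to 0x10145000 so that the
	 * virtual address and the object offset share the same 2MB alignment.
	 */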
8929 if ((*addr & L2_OFFSET) < superpage_offset)
8930 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
8931 else
8932 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
8933 }

/**
 * Get the kernel virtual address of a set of physical pages. If there are
 * physical addresses not covered by the DMAP, perform a transient mapping
 * that will be removed when pmap_unmap_io_transient() is called.
 *
 * \param page		The pages for which the caller wishes to obtain
 *			kernel virtual addresses in the kernel memory map.
 * \param vaddr		On return contains the kernel virtual memory address
 *			of the pages passed in the page parameter.
 * \param count		Number of pages passed in.
 * \param can_fault	true if the thread using the mapped pages can take
 *			page faults, false otherwise.
 *
 * \returns true if the caller must call pmap_unmap_io_transient when
 *	    finished or false otherwise.
 */
bool
pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    bool can_fault)
{
	vm_paddr_t paddr;
	bool needs_mapping;
	int error __diagused, i;

	/*
	 * Allocate any KVA space that we need, this is done in a separate
	 * loop to prevent calling vmem_alloc while pinned.
	 */
	needs_mapping = false;
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (__predict_false(!PHYS_IN_DMAP(paddr))) {
			error = vmem_alloc(kernel_arena, PAGE_SIZE,
			    M_BESTFIT | M_WAITOK, &vaddr[i]);
			KASSERT(error == 0, ("vmem_alloc failed: %d", error));
			needs_mapping = true;
		} else {
			vaddr[i] = PHYS_TO_DMAP(paddr);
		}
	}

	/* Exit early if everything is covered by the DMAP */
	if (!needs_mapping)
		return (false);

	if (!can_fault)
		sched_pin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (!PHYS_IN_DMAP(paddr)) {
			panic(
			    "pmap_map_io_transient: TODO: Map out of DMAP data");
		}
	}

	return (needs_mapping);
}

void
pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
    bool can_fault)
{
	vm_paddr_t paddr;
	int i;

	if (!can_fault)
		sched_unpin();
	for (i = 0; i < count; i++) {
		paddr = VM_PAGE_TO_PHYS(page[i]);
		if (!PHYS_IN_DMAP(paddr)) {
			panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
		}
	}
}
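
/*
 * Typical usage of the two routines above (illustrative sketch, where "m"
 * is a vm_page_t supplied by the caller):
 *
 *	vm_offset_t va;
 *	bool mapped;
 *
 *	mapped = pmap_map_io_transient(&m, &va, 1, false);
 *	... access the page through va; with can_fault == false the thread
 *	    is pinned and must not fault ...
 *	if (mapped)
 *		pmap_unmap_io_transient(&m, &va, 1, false);
 */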

bool
pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
{

	return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
}

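/*
 * rangeset callbacks used to duplicate and free the elements that describe
 * a pmap's BTI-enabled (guarded page) address ranges.
 */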
static void *
bti_dup_range(void *ctx __unused, void *data)
{
	struct rs_el *node, *new_node;

	new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
	if (new_node == NULL)
		return (NULL);
	node = data;
	memcpy(new_node, node, sizeof(*node));
	return (new_node);
}

static void
bti_free_range(void *ctx __unused, void *node)
{

	uma_zfree(pmap_bti_ranges_zone, node);
}

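/*
 * Record sva..eva as a BTI-enabled range in the pmap's rangeset.  The range
 * element is allocated with M_NOWAIT, so this can fail with ENOMEM; the
 * caller is expected to retry after waiting for memory.
 */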
static int
pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rs_el *rs;
	int error;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);
	MPASS(pmap->pm_bti != NULL);
	rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
	if (rs == NULL)
		return (ENOMEM);
	error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
	if (error != 0)
		uma_zfree(pmap_bti_ranges_zone, rs);
	return (error);
}

static void
pmap_bti_deassign_all(pmap_t pmap)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_bti != NULL)
		rangeset_remove_all(pmap->pm_bti);
}

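/*
 * Returns true if the BTI setting is uniform across the user address range
 * sva..eva, that is, if every page in the range is covered by a BTI range
 * or none of them is.  Kernel addresses and pmaps without a BTI rangeset
 * always report true.
 */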
static bool
pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	struct rs_el *prev_rs, *rs;
	vm_offset_t va;

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(ADDR_IS_CANONICAL(sva),
	    ("%s: Start address not in canonical form: %lx", __func__, sva));
	KASSERT(ADDR_IS_CANONICAL(eva),
	    ("%s: End address not in canonical form: %lx", __func__, eva));

	if (pmap->pm_bti == NULL || ADDR_IS_KERNEL(sva))
		return (true);
	MPASS(!ADDR_IS_KERNEL(eva));
	for (va = sva; va < eva; prev_rs = rs) {
		rs = rangeset_lookup(pmap->pm_bti, va);
		if (va == sva)
			prev_rs = rs;
		else if ((rs == NULL) ^ (prev_rs == NULL))
			return (false);
		if (rs == NULL) {
			va += PAGE_SIZE;
			continue;
		}
		va = rs->re_end;
	}
	return (true);
}

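/*
 * Compute the GP (guarded page) PTE bit for a stage 1 mapping at va: the
 * kernel pmap uses ATTR_KERN_GP, while a user mapping is guarded only when
 * va falls within one of the pmap's registered BTI ranges.
 */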
static pt_entry_t
pmap_pte_bti(pmap_t pmap, vm_offset_t va)
{
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	MPASS(ADDR_IS_CANONICAL(va));

	if (pmap->pm_stage != PM_STAGE1)
		return (0);
	if (pmap == kernel_pmap)
		return (ATTR_KERN_GP);
	if (pmap->pm_bti != NULL && rangeset_lookup(pmap->pm_bti, va) != NULL)
		return (ATTR_S1_GP);
	return (0);
}

static void
pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	if (pmap->pm_bti != NULL)
		rangeset_remove(pmap->pm_bti, sva, eva);
}

static int
pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
{

	PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
	PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
	MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
	MPASS(src_pmap->pm_bti != NULL);
	MPASS(dst_pmap->pm_bti != NULL);
	if (src_pmap->pm_bti->rs_data_ctx == NULL)
		return (0);
	return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
}

static void
pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
{
	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	PMAP_ASSERT_STAGE1(pmap);

	pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
	    true);
}

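/*
 * Enable BTI for the user address range sva..eva: record the page-aligned
 * range in the pmap's rangeset and set ATTR_S1_GP on any existing mappings.
 * The rangeset insertion is retried after vm_wait() if it fails with ENOMEM.
 */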
int
pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
	int error;

	if (pmap->pm_bti == NULL)
		return (0);
	if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
		return (EINVAL);
	if (pmap->pm_stage != PM_STAGE1)
		return (EINVAL);
	if (eva <= sva || ADDR_IS_KERNEL(eva))
		return (EFAULT);

	sva = trunc_page(sva);
	eva = round_page(eva);
	for (;;) {
		PMAP_LOCK(pmap);
		error = pmap_bti_assign(pmap, sva, eva);
		if (error == 0)
			pmap_bti_update_range(pmap, sva, eva, true);
		PMAP_UNLOCK(pmap);
		if (error != ENOMEM)
			break;
		vm_wait(NULL);
	}
	return (error);
}

#if defined(KASAN) || defined(KMSAN)
static pd_entry_t *pmap_san_early_l2;

#define SAN_BOOTSTRAP_L2_SIZE	(1 * L2_SIZE)
#define SAN_BOOTSTRAP_SIZE	(2 * PAGE_SIZE)
static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_l2(void)
{
	static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
	static size_t offset = 0;
	vm_offset_t addr;

	if (offset + L2_SIZE > sizeof(bootstrap_data)) {
		panic("%s: out of memory for the bootstrap shadow map L2 entries",
		    __func__);
	}

	addr = (uintptr_t)&bootstrap_data[offset];
	offset += L2_SIZE;
	return (addr);
}

/*
 * Bootstrap page allocator for the SAN shadow map's L1 and L2 page-table
 * pages; L3 entries may be needed here later.
 */
static vm_offset_t __nosanitizeaddress
pmap_san_enter_bootstrap_alloc_pages(int npages)
{
	static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
	static size_t offset = 0;
	vm_offset_t addr;

	if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
		panic("%s: out of memory for the bootstrap shadow map",
		    __func__);
	}

	addr = (uintptr_t)&bootstrap_data[offset];
	offset += (npages * PAGE_SIZE);
	return (addr);
}

static void __nosanitizeaddress
pmap_san_enter_bootstrap(void)
{
	vm_offset_t freemempos;

	/* L1, L2 */
	freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
	bs_state.freemempos = freemempos;
	bs_state.va = KASAN_MIN_ADDRESS;
	pmap_bootstrap_l1_table(&bs_state);
	pmap_san_early_l2 = bs_state.l2;
}

static vm_page_t
pmap_san_enter_alloc_l3(void)
{
	vm_page_t m;

	m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
	    VM_ALLOC_ZERO);
	if (m == NULL)
		panic("%s: no memory to grow shadow map", __func__);
	return (m);
}

static vm_page_t
pmap_san_enter_alloc_l2(void)
{
	return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
	    Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
}

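/*
 * Ensure that the sanitizer shadow map can be written at address va:
 * allocate and install any missing L1/L2/L3 entries and a backing page.
 * Before pmap_bootstrap() has run, a small statically allocated bootstrap
 * shadow map is used instead.
 */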
void __nosanitizeaddress __nosanitizememory
pmap_san_enter(vm_offset_t va)
{
	pd_entry_t *l1, *l2;
	pt_entry_t *l3;
	vm_page_t m;

	if (virtual_avail == 0) {
		vm_offset_t block;
		int slot;
		bool first;

		/* Temporary shadow map prior to pmap_bootstrap(). */
		first = pmap_san_early_l2 == NULL;
		if (first)
			pmap_san_enter_bootstrap();

		l2 = pmap_san_early_l2;
		slot = pmap_l2_index(va);

		if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
			MPASS(first);
			block = pmap_san_enter_bootstrap_alloc_l2();
			pmap_store(&l2[slot],
			    PHYS_TO_PTE(pmap_early_vtophys(block)) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
			dmb(ishst);
		}

		return;
	}

	mtx_assert(&kernel_map->system_mtx, MA_OWNED);
	l1 = pmap_l1(kernel_pmap, va);
	MPASS(l1 != NULL);
	if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
		m = pmap_san_enter_alloc_l3();
		pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
	}
	l2 = pmap_l1_to_l2(l1, va);
	if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
		m = pmap_san_enter_alloc_l2();
		if (m != NULL) {
			pmap_store(l2, VM_PAGE_TO_PTE(m) |
			    PMAP_SAN_PTE_BITS | L2_BLOCK);
		} else {
			m = pmap_san_enter_alloc_l3();
			pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
		}
		dmb(ishst);
	}
	if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
		return;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
		return;
	m = pmap_san_enter_alloc_l3();
	pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
	dmb(ishst);
}
#endif /* KASAN || KMSAN */

/*
 * Track a range of the kernel's virtual address space that is contiguous
 * in various mapping attributes.
 */
struct pmap_kernel_map_range {
	vm_offset_t sva;
	pt_entry_t attrs;
	int l3pages;
	int l3contig;
	int l2blocks;
	int l1blocks;
};

static void
sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t eva)
{
	const char *mode;
	int index;

	if (eva <= range->sva)
		return;

	index = range->attrs & ATTR_S1_IDX_MASK;
	switch (index) {
	case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
		mode = "DEV-NP";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
		mode = "DEV";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
		mode = "UC";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
		mode = "WB";
		break;
	case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
		mode = "WT";
		break;
	default:
		printf(
		    "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
		    __func__, index, range->sva, eva);
		mode = "??";
		break;
	}

	sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d\n",
	    range->sva, eva,
	    (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
	    (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
	    (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
	    (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
	    (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
	    mode, range->l1blocks, range->l2blocks, range->l3contig,
	    range->l3pages);

	/* Reset to sentinel value. */
	range->sva = 0xfffffffffffffffful;
}
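
/*
 * Each range printed above has the form (values are illustrative):
 *
 *	0xffff000000000000-0xffff000001000000 rw--s-     WB 0 8 0 0
 *
 * i.e. the address range, its permissions and guarded-page flag, the memory
 * attribute index, and the number of L1 blocks, L2 blocks, contiguous L3
 * runs, and L3 pages backing the range.
 */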

/*
 * Determine whether the attributes specified by a page table entry match those
 * being tracked by the current range.
 */
static bool
sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
{

	return (range->attrs == attrs);
}

static void
sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
    pt_entry_t attrs)
{

	memset(range, 0, sizeof(*range));
	range->sva = va;
	range->attrs = attrs;
}

/* Get the block/page attributes that correspond to the table attributes */
static pt_entry_t
sysctl_kmaps_table_attrs(pd_entry_t table)
{
	pt_entry_t attrs;

	attrs = 0;
	if ((table & TATTR_UXN_TABLE) != 0)
		attrs |= ATTR_S1_UXN;
	if ((table & TATTR_PXN_TABLE) != 0)
		attrs |= ATTR_S1_PXN;
	if ((table & TATTR_AP_TABLE_RO) != 0)
		attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);

	return (attrs);
}

/* Read the block/page attributes we care about */
static pt_entry_t
sysctl_kmaps_block_attrs(pt_entry_t block)
{
	return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
	    ATTR_S1_GP));
}

/*
 * Given a leaf PTE, derive the mapping's attributes. If they do not match
 * those of the current run, dump the address range and its attributes, and
 * begin a new run.
 */
static void
sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
    vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
    pt_entry_t l3e)
{
	pt_entry_t attrs;

	attrs = sysctl_kmaps_table_attrs(l0e);

	if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		attrs |= sysctl_kmaps_block_attrs(l1e);
		goto done;
	}
	attrs |= sysctl_kmaps_table_attrs(l1e);

	if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
		attrs |= sysctl_kmaps_block_attrs(l2e);
		goto done;
	}
	attrs |= sysctl_kmaps_table_attrs(l2e);
	attrs |= sysctl_kmaps_block_attrs(l3e);

done:
	if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
		sysctl_kmaps_dump(sb, range, va);
		sysctl_kmaps_reinit(range, va, attrs);
	}
}

static int
sysctl_kmaps(SYSCTL_HANDLER_ARGS)
{
	struct pmap_kernel_map_range range;
	struct sbuf sbuf, *sb;
	pd_entry_t l0e, *l1, l1e, *l2, l2e;
	pt_entry_t *l3, l3e;
	vm_offset_t sva;
	vm_paddr_t pa;
	int error, i, j, k, l;

	error = sysctl_wire_old_buffer(req, 0);
	if (error != 0)
		return (error);
	sb = &sbuf;
	sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);

	/* Sentinel value. */
	range.sva = 0xfffffffffffffffful;

	/*
	 * Iterate over the kernel page tables without holding the kernel pmap
	 * lock. Kernel page table pages are never freed, so at worst we will
	 * observe inconsistencies in the output.
	 */
	for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
	    i++) {
		if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
			sbuf_printf(sb, "\nDirect map:\n");
		else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
			sbuf_printf(sb, "\nKernel map:\n");
#ifdef KASAN
		else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
			sbuf_printf(sb, "\nKASAN shadow map:\n");
#endif
#ifdef KMSAN
		else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN shadow map:\n");
		else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
			sbuf_printf(sb, "\nKMSAN origin map:\n");
#endif

		l0e = kernel_pmap->pm_l0[i];
		if ((l0e & ATTR_DESCR_VALID) == 0) {
			sysctl_kmaps_dump(sb, &range, sva);
			sva += L0_SIZE;
			continue;
		}
		pa = PTE_TO_PHYS(l0e);
		l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);

		for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
			l1e = l1[j];
			if ((l1e & ATTR_DESCR_VALID) == 0) {
				sysctl_kmaps_dump(sb, &range, sva);
				sva += L1_SIZE;
				continue;
			}
			if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
				PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
				sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
				    0, 0);
				range.l1blocks++;
				sva += L1_SIZE;
				continue;
			}
			pa = PTE_TO_PHYS(l1e);
			l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);

			for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
				l2e = l2[k];
				if ((l2e & ATTR_DESCR_VALID) == 0) {
					sysctl_kmaps_dump(sb, &range, sva);
					sva += L2_SIZE;
					continue;
				}
				if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, 0);
					range.l2blocks++;
					sva += L2_SIZE;
					continue;
				}
				pa = PTE_TO_PHYS(l2e);
				l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);

				for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
				    l++, sva += L3_SIZE) {
					l3e = l3[l];
					if ((l3e & ATTR_DESCR_VALID) == 0) {
						sysctl_kmaps_dump(sb, &range,
						    sva);
						continue;
					}
					sysctl_kmaps_check(sb, &range, sva,
					    l0e, l1e, l2e, l3e);
					if ((l3e & ATTR_CONTIGUOUS) != 0)
						range.l3contig +=
						    l % L3C_ENTRIES == 0 ?
						    1 : 0;
					else
						range.l3pages++;
				}
			}
		}
	}

	error = sbuf_finish(sb);
	sbuf_delete(sb);
	return (error);
}
SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
    CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
    NULL, 0, sysctl_kmaps, "A",
    "Dump kernel address layout");
