/*-
 * Copyright (c) 1991 Regents of the University of California.
 * All rights reserved.
 * Copyright (c) 1994 John S. Dyson
 * All rights reserved.
 * Copyright (c) 1994 David Greenman
 * All rights reserved.
 * Copyright (c) 2003 Peter Wemm
 * All rights reserved.
 * Copyright (c) 2005-2010 Alan L. Cox <alc@cs.rice.edu>
 * All rights reserved.
 * Copyright (c) 2014 Andrew Turner
 * All rights reserved.
 * Copyright (c) 2014-2016 The FreeBSD Foundation
 * All rights reserved.
 *
 * This code is derived from software contributed to Berkeley by
 * the Systems Programming Group of the University of Utah Computer
 * Science Department and William Jolitz of UUNET Technologies Inc.
 *
 * This software was developed by Andrew Turner under sponsorship from
 * the FreeBSD Foundation.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 * 3. All advertising materials mentioning features or use of this software
 *    must display the following acknowledgement:
 *	This product includes software developed by the University of
 *	California, Berkeley and its contributors.
 * 4. Neither the name of the University nor the names of its contributors
 *    may be used to endorse or promote products derived from this software
 *    without specific prior written permission.
 *
 * THIS SOFTWARE IS PROVIDED BY THE REGENTS AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE REGENTS OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */
/*-
 * Copyright (c) 2003 Networks Associates Technology, Inc.
 * All rights reserved.
 *
 * This software was developed for the FreeBSD Project by Jake Burkholder,
 * Safeport Network Services, and Network Associates Laboratories, the
 * Security Research Division of Network Associates, Inc. under
 * DARPA/SPAWAR contract N66001-01-C-8035 ("CBOSS"), as part of the DARPA
 * CHATS research program.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted provided that the following conditions
 * are met:
 * 1. Redistributions of source code must retain the above copyright
 *    notice, this list of conditions and the following disclaimer.
 * 2. Redistributions in binary form must reproduce the above copyright
 *    notice, this list of conditions and the following disclaimer in the
 *    documentation and/or other materials provided with the distribution.
 *
 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
 * ARE DISCLAIMED.  IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
 * SUCH DAMAGE.
 */

#include <sys/cdefs.h>
/*
 * Manages physical address maps.
 *
 * Since the information managed by this module is also stored by the
 * logical address mapping module, this module may throw away valid
 * virtual-to-physical mappings at almost any time.  However,
 * invalidations of virtual-to-physical mappings must be done as
 * requested.
 *
 * In order to cope with hardware architectures that make
 * virtual-to-physical map invalidation expensive, this module may delay
 * invalidation or protection-reduction operations until they are
 * actually necessary.  This module is given full information as to
 * which processors are currently using which maps, and when physical
 * maps must be made correct.
 */

#include "opt_vm.h"

#include <sys/param.h>
#include <sys/asan.h>
#include <sys/bitstring.h>
#include <sys/bus.h>
#include <sys/systm.h>
#include <sys/kernel.h>
#include <sys/ktr.h>
#include <sys/limits.h>
#include <sys/lock.h>
#include <sys/malloc.h>
#include <sys/mman.h>
#include <sys/msan.h>
#include <sys/msgbuf.h>
#include <sys/mutex.h>
#include <sys/physmem.h>
#include <sys/proc.h>
#include <sys/rangeset.h>
#include <sys/rwlock.h>
#include <sys/sbuf.h>
#include <sys/sx.h>
#include <sys/vmem.h>
#include <sys/vmmeter.h>
#include <sys/sched.h>
#include <sys/sysctl.h>
#include <sys/_unrhdr.h>
#include <sys/smp.h>

#include <vm/vm.h>
#include <vm/vm_param.h>
#include <vm/vm_kern.h>
#include <vm/vm_page.h>
#include <vm/vm_map.h>
#include <vm/vm_object.h>
#include <vm/vm_extern.h>
#include <vm/vm_pageout.h>
#include <vm/vm_pager.h>
#include <vm/vm_phys.h>
#include <vm/vm_radix.h>
#include <vm/vm_reserv.h>
#include <vm/vm_dumpset.h>
#include <vm/uma.h>

#include <machine/asan.h>
#include <machine/machdep.h>
#include <machine/md_var.h>
#include <machine/pcb.h>

#ifdef NUMA
#define	PMAP_MEMDOM	MAXMEMDOM
#else
#define	PMAP_MEMDOM	1
#endif

#define	PMAP_ASSERT_STAGE1(pmap)	MPASS((pmap)->pm_stage == PM_STAGE1)
#define	PMAP_ASSERT_STAGE2(pmap)	MPASS((pmap)->pm_stage == PM_STAGE2)

#define	NL0PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL1PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL2PG		(PAGE_SIZE/(sizeof (pd_entry_t)))
#define	NL3PG		(PAGE_SIZE/(sizeof (pt_entry_t)))

#define	NUL0E		L0_ENTRIES
#define	NUL1E		(NUL0E * NL1PG)
#define	NUL2E		(NUL1E * NL2PG)

#ifdef PV_STATS
#define	PV_STAT(x)	do { x ; } while (0)
#define	__pvused
#else
#define	PV_STAT(x)	do { } while (0)
#define	__pvused	__unused
#endif

#define	pmap_l0_pindex(v)	(NUL2E + NUL1E + ((v) >> L0_SHIFT))
#define	pmap_l1_pindex(v)	(NUL2E + ((v) >> L1_SHIFT))
#define	pmap_l2_pindex(v)	((v) >> L2_SHIFT)
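
/*
 * These pindices place all of a pmap's page-table pages in a single flat
 * namespace: pindices [0, NUL2E) name L3 tables (each mapping an L2_SIZE
 * region), [NUL2E, NUL2E + NUL1E) name L2 tables, and the remainder name
 * L1 tables.  As an illustrative sketch, assuming the 4 KiB translation
 * granule (where L2_SHIFT is 21), pmap_l2_pindex(2 * L2_SIZE) == 2, i.e.,
 * the third L3 table, and pmap_l1_pindex(0) == NUL2E, i.e., the first L2
 * table.
 */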

#ifdef __ARM_FEATURE_BTI_DEFAULT
#define	ATTR_KERN_GP		ATTR_S1_GP
#else
#define	ATTR_KERN_GP		0
#endif
#define	PMAP_SAN_PTE_BITS	(ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP | \
	ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | ATTR_S1_AP(ATTR_S1_AP_RW))

struct pmap_large_md_page {
	struct rwlock	pv_lock;
	struct md_page	pv_page;
	/* Pad to a power of 2, see pmap_init_pv_table(). */
	int		pv_pad[2];
};

__exclusive_cache_line static struct pmap_large_md_page pv_dummy_large;
#define	pv_dummy pv_dummy_large.pv_page
__read_mostly static struct pmap_large_md_page *pv_table;

static struct pmap_large_md_page *
_pa_to_pmdp(vm_paddr_t pa)
{
	struct vm_phys_seg *seg;

	if ((seg = vm_phys_paddr_to_seg(pa)) != NULL)
		return ((struct pmap_large_md_page *)seg->md_first +
		    pmap_l2_pindex(pa) - pmap_l2_pindex(seg->start));
	return (NULL);
}

static struct pmap_large_md_page *
pa_to_pmdp(vm_paddr_t pa)
{
	struct pmap_large_md_page *pvd;

	pvd = _pa_to_pmdp(pa);
	if (pvd == NULL)
		panic("pa 0x%jx not within vm_phys_segs", (uintmax_t)pa);
	return (pvd);
}

static struct pmap_large_md_page *
page_to_pmdp(vm_page_t m)
{
	struct vm_phys_seg *seg;

	seg = &vm_phys_segs[m->segind];
	return ((struct pmap_large_md_page *)seg->md_first +
	    pmap_l2_pindex(VM_PAGE_TO_PHYS(m)) - pmap_l2_pindex(seg->start));
}

#define	pa_to_pvh(pa)	(&(pa_to_pmdp(pa)->pv_page))
#define	page_to_pvh(m)	(&(page_to_pmdp(m)->pv_page))

#define	PHYS_TO_PV_LIST_LOCK(pa)	({			\
	struct pmap_large_md_page *_pvd;			\
	struct rwlock *_lock;					\
	_pvd = _pa_to_pmdp(pa);					\
	if (__predict_false(_pvd == NULL))			\
		_lock = &pv_dummy_large.pv_lock;		\
	else							\
		_lock = &(_pvd->pv_lock);			\
	_lock;							\
})

static struct rwlock *
VM_PAGE_TO_PV_LIST_LOCK(vm_page_t m)
{
	if ((m->flags & PG_FICTITIOUS) == 0)
		return (&page_to_pmdp(m)->pv_lock);
	else
		return (&pv_dummy_large.pv_lock);
}

#define	CHANGE_PV_LIST_LOCK(lockp, new_lock)	do {	\
	struct rwlock **_lockp = (lockp);		\
	struct rwlock *_new_lock = (new_lock);		\
							\
	if (_new_lock != *_lockp) {			\
		if (*_lockp != NULL)			\
			rw_wunlock(*_lockp);		\
		*_lockp = _new_lock;			\
		rw_wlock(*_lockp);			\
	}						\
} while (0)

#define	CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa)		\
	CHANGE_PV_LIST_LOCK(lockp, PHYS_TO_PV_LIST_LOCK(pa))

#define	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m)	\
	CHANGE_PV_LIST_LOCK(lockp, VM_PAGE_TO_PV_LIST_LOCK(m))

#define	RELEASE_PV_LIST_LOCK(lockp)		do {	\
	struct rwlock **_lockp = (lockp);		\
							\
	if (*_lockp != NULL) {				\
		rw_wunlock(*_lockp);			\
		*_lockp = NULL;				\
	}						\
} while (0)
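
/*
 * Taken together, these macros implement hand-over-hand locking of the
 * per-superpage PV lists, keyed by physical address.  A hypothetical
 * caller iterating over mappings looks roughly like:
 *
 *	struct rwlock *lock = NULL;
 *	...
 *	CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
 *	(manipulate m's PV list)
 *	...
 *	RELEASE_PV_LIST_LOCK(&lock);
 *
 * so the lock protecting the previous page's PV list is dropped before
 * the next one is acquired.
 */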

#define	PTE_TO_VM_PAGE(pte)	PHYS_TO_VM_PAGE(PTE_TO_PHYS(pte))
#define	VM_PAGE_TO_PTE(m)	PHYS_TO_PTE(VM_PAGE_TO_PHYS(m))

/*
 * The presence of this flag indicates that the mapping is writeable.
 * If the ATTR_S1_AP_RO bit is also set, then the mapping is clean, otherwise
 * it is dirty.  This flag may only be set on managed mappings.
 *
 * The DBM bit is reserved on ARMv8.0 but it seems we can safely treat it
 * as a software managed bit.
 */
#define	ATTR_SW_DBM	ATTR_DBM
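
/*
 * The resulting stage 1 encoding, as consumed by pmap_pte_dirty() below
 * for managed mappings, is:
 *
 *	ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)	writeable and dirty
 *	ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)	writeable and clean
 *	              ATTR_S1_AP(ATTR_S1_AP_RO)	read-only
 */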

struct pmap kernel_pmap_store;

/* Used for mapping ACPI memory before VM is initialized */
#define	PMAP_PREINIT_MAPPING_COUNT	32
#define	PMAP_PREINIT_MAPPING_SIZE	(PMAP_PREINIT_MAPPING_COUNT * L2_SIZE)
static vm_offset_t preinit_map_va;	/* Start VA of pre-init mapping space */
static int vm_initialized = 0;	/* No need to use pre-init maps when set */

/*
 * Reserve a few L2 blocks starting at 'preinit_map_va'.  Each mapping
 * covers an entire L2 block for simplicity.
 * VA of L2 block = preinit_map_va + i * L2_SIZE
 */
static struct pmap_preinit_mapping {
	vm_paddr_t	pa;
	vm_offset_t	va;
	vm_size_t	size;
} pmap_preinit_mapping[PMAP_PREINIT_MAPPING_COUNT];

vm_offset_t virtual_avail;	/* VA of first avail page (after kernel bss) */
vm_offset_t virtual_end;	/* VA of last avail page (end of kernel AS) */
vm_offset_t kernel_vm_end = 0;

/*
 * Data for the pv entry allocation mechanism.
 */
#ifdef NUMA
static __inline int
pc_to_domain(struct pv_chunk *pc)
{
	return (vm_phys_domain(DMAP_TO_PHYS((vm_offset_t)pc)));
}
#else
static __inline int
pc_to_domain(struct pv_chunk *pc __unused)
{
	return (0);
}
#endif

struct pv_chunks_list {
	struct mtx pvc_lock;
	TAILQ_HEAD(pch, pv_chunk) pvc_list;
	int active_reclaims;
} __aligned(CACHE_LINE_SIZE);

struct pv_chunks_list __exclusive_cache_line pv_chunks[PMAP_MEMDOM];

vm_paddr_t dmap_phys_base;	/* The start of the dmap region */
vm_paddr_t dmap_phys_max;	/* The limit of the dmap region */
vm_offset_t dmap_max_addr;	/* The virtual address limit of the dmap */

extern pt_entry_t pagetable_l0_ttbr1[];

#define	PHYSMAP_SIZE	(2 * (VM_PHYSSEG_MAX - 1))
static vm_paddr_t physmap[PHYSMAP_SIZE];
static u_int physmap_idx;

static SYSCTL_NODE(_vm, OID_AUTO, pmap, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "VM/pmap parameters");

#if PAGE_SIZE == PAGE_SIZE_4K
#define	L1_BLOCKS_SUPPORTED	1
#else
/* TODO: Make this dynamic when we support FEAT_LPA2 (TCR_EL1.DS == 1) */
#define	L1_BLOCKS_SUPPORTED	0
#endif

#define	PMAP_ASSERT_L1_BLOCKS_SUPPORTED	MPASS(L1_BLOCKS_SUPPORTED)

/*
 * This ASID allocator uses a bit vector ("asid_set") to remember which
 * ASIDs it has currently allocated to a pmap, a cursor ("asid_next") to
 * optimize its search for a free ASID in the bit vector, and an epoch
 * number ("asid_epoch") to indicate when it has reclaimed all previously
 * allocated ASIDs that are not currently active on a processor.
 *
 * The current epoch number is always in the range [0, INT_MAX).  Negative
 * numbers and INT_MAX are reserved for special cases that are described
 * below.
 */
struct asid_set {
	int asid_bits;
	bitstr_t *asid_set;
	int asid_set_size;
	int asid_next;
	int asid_epoch;
	struct mtx asid_set_mutex;
};

static struct asid_set asids;
static struct asid_set vmids;

static SYSCTL_NODE(_vm_pmap, OID_AUTO, asid, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
    "ASID allocator");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, bits, CTLFLAG_RD, &asids.asid_bits, 0,
    "The number of bits in an ASID");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, next, CTLFLAG_RD, &asids.asid_next, 0,
    "The last allocated ASID plus one");
SYSCTL_INT(_vm_pmap_asid, OID_AUTO, epoch, CTLFLAG_RD, &asids.asid_epoch, 0,
    "The current epoch number");

static SYSCTL_NODE(_vm_pmap, OID_AUTO, vmid, CTLFLAG_RD, 0, "VMID allocator");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, bits, CTLFLAG_RD, &vmids.asid_bits, 0,
    "The number of bits in a VMID");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, next, CTLFLAG_RD, &vmids.asid_next, 0,
    "The last allocated VMID plus one");
SYSCTL_INT(_vm_pmap_vmid, OID_AUTO, epoch, CTLFLAG_RD, &vmids.asid_epoch, 0,
    "The current epoch number");

void (*pmap_clean_stage2_tlbi)(void);
void (*pmap_invalidate_vpipt_icache)(void);
void (*pmap_stage2_invalidate_range)(uint64_t, vm_offset_t, vm_offset_t, bool);
void (*pmap_stage2_invalidate_all)(uint64_t);

/*
 * A pmap's cookie encodes an ASID and epoch number.  Cookies for reserved
 * ASIDs have a negative epoch number, specifically, INT_MIN.  Cookies for
 * dynamically allocated ASIDs have a non-negative epoch number.
 *
 * An invalid ASID is represented by -1.
 *
 * There are two special-case cookie values: (1) COOKIE_FROM(-1, INT_MIN),
 * which indicates that an ASID should never be allocated to the pmap, and
 * (2) COOKIE_FROM(-1, INT_MAX), which indicates that an ASID should be
 * allocated when the pmap is next activated.
 */
#define	COOKIE_FROM(asid, epoch)	((long)((u_int)(asid) |	\
	    ((u_long)(epoch) << 32)))
#define	COOKIE_TO_ASID(cookie)		((int)(cookie))
#define	COOKIE_TO_EPOCH(cookie)		((int)((u_long)(cookie) >> 32))
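
/*
 * For example, COOKIE_FROM(42, 7) packs the ASID into the low 32 bits and
 * the epoch into the high 32 bits, so COOKIE_TO_ASID() recovers 42 and
 * COOKIE_TO_EPOCH() recovers 7.  The reserved value COOKIE_FROM(-1, INT_MIN)
 * decodes to an ASID of -1, i.e., the invalid ASID.
 */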

#define	TLBI_VA_SHIFT			12
#define	TLBI_VA_MASK			((1ul << 44) - 1)
#define	TLBI_VA(addr)			(((addr) >> TLBI_VA_SHIFT) & TLBI_VA_MASK)
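
/*
 * For example, for the hypothetical user address va = 0x201000, TLBI_VA(va)
 * is 0x201, i.e., bits [55:12] of the VA, which is the operand format
 * expected by the "tlbi va*" instructions.  The callers below merge in the
 * ASID when one is required.
 */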

static int __read_frequently superpages_enabled = 1;
SYSCTL_INT(_vm_pmap, OID_AUTO, superpages_enabled,
    CTLFLAG_RDTUN | CTLFLAG_NOFETCH, &superpages_enabled, 0,
    "Are large page mappings enabled?");

/*
 * True when Branch Target Identification should be used by userspace.  This
 * allows pmap to mark pages as guarded with ATTR_S1_GP.
 */
__read_mostly static bool pmap_bti_support = false;

/*
 * Internal flags for pmap_enter()'s helper functions.
 */
#define	PMAP_ENTER_NORECLAIM	0x1000000	/* Don't reclaim PV entries. */
#define	PMAP_ENTER_NOREPLACE	0x2000000	/* Don't replace mappings. */

TAILQ_HEAD(pv_chunklist, pv_chunk);

static void	free_pv_chunk(struct pv_chunk *pc);
static void	free_pv_chunk_batch(struct pv_chunklist *batch);
static void	free_pv_entry(pmap_t pmap, pv_entry_t pv);
static pv_entry_t get_pv_entry(pmap_t pmap, struct rwlock **lockp);
static vm_page_t reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp);
static void	pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va);
static pv_entry_t pmap_pvh_remove(struct md_page *pvh, pmap_t pmap,
		    vm_offset_t va);

static void pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte);
static bool pmap_activate_int(pmap_t pmap);
static void pmap_alloc_asid(pmap_t pmap);
static int pmap_change_props_locked(vm_offset_t va, vm_size_t size,
    vm_prot_t prot, int mode, bool skip_unmapped);
static bool pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    pt_entry_t l3e, vm_page_t ml3, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va);
static pt_entry_t *pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2,
    vm_offset_t va, struct rwlock **lockp);
static pt_entry_t *pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static bool pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va);
static bool pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va);
static vm_page_t pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va,
    vm_page_t m, vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp);
static int pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2,
    u_int flags, vm_page_t m, struct rwlock **lockp);
static int pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e,
    u_int flags, vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp);
static bool pmap_every_pte_zero(vm_paddr_t pa);
static int pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
    bool all_l3e_AF_set);
static pt_entry_t pmap_load_l3c(pt_entry_t *l3p);
static void pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits);
static bool pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct rwlock **lockp);
static void pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va);
static int pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
    pd_entry_t l1e, struct spglist *free, struct rwlock **lockp);
static int pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t sva,
    pd_entry_t l2e, struct spglist *free, struct rwlock **lockp);
static bool pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
    vm_offset_t *vap, vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
    struct rwlock **lockp);
static void pmap_reset_asid_set(pmap_t pmap);
static bool pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va,
    vm_page_t m, struct rwlock **lockp);

static vm_page_t _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex,
		struct rwlock **lockp);

static void _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m,
    struct spglist *free);
static int pmap_unuse_pt(pmap_t, vm_offset_t, pd_entry_t, struct spglist *);
static void pmap_update_entry(pmap_t pmap, pd_entry_t *pte, pd_entry_t newpte,
    vm_offset_t va, vm_size_t size);
static __inline vm_page_t pmap_remove_pt_page(pmap_t pmap, vm_offset_t va);

static uma_zone_t pmap_bti_ranges_zone;
static bool pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
    pt_entry_t *pte);
static pt_entry_t pmap_pte_bti(pmap_t pmap, vm_offset_t va);
static void pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva);
static void *bti_dup_range(void *ctx, void *data);
static void bti_free_range(void *ctx, void *node);
static int pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap);
static void pmap_bti_deassign_all(pmap_t pmap);

/*
 * These load the old table data and store the new value.
 * They need to be atomic as the System MMU may write to the table at
 * the same time as the CPU.
 */
#define	pmap_clear(table)		atomic_store_64(table, 0)
#define	pmap_clear_bits(table, bits)	atomic_clear_64(table, bits)
#define	pmap_load(table)		(*table)
#define	pmap_load_clear(table)		atomic_swap_64(table, 0)
#define	pmap_load_store(table, entry)	atomic_swap_64(table, entry)
#define	pmap_set_bits(table, bits)	atomic_set_64(table, bits)
#define	pmap_store(table, entry)	atomic_store_64(table, entry)
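
/*
 * A typical update of a live mapping follows the architecture's
 * break-before-make sequence using these primitives, roughly (see
 * pmap_update_entry() for the real implementation):
 *
 *	pmap_clear(pte);		(break: invalidate the old entry)
 *	(TLB invalidation)
 *	pmap_store(pte, newpte);	(make: publish the new entry)
 */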

/********************/
/* Inline functions */
/********************/

static __inline void
pagecopy(void *s, void *d)
{

	memcpy(d, s, PAGE_SIZE);
}

static __inline pd_entry_t *
pmap_l0(pmap_t pmap, vm_offset_t va)
{

	return (&pmap->pm_l0[pmap_l0_index(va)]);
}

static __inline pd_entry_t *
pmap_l0_to_l1(pd_entry_t *l0, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
	return (&l1[pmap_l1_index(va)]);
}

static __inline pd_entry_t *
pmap_l1(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l0;

	l0 = pmap_l0(pmap, va);
	if ((pmap_load(l0) & ATTR_DESCR_MASK) != L0_TABLE)
		return (NULL);

	return (pmap_l0_to_l1(l0, va));
}

static __inline pd_entry_t *
pmap_l1_to_l2(pd_entry_t *l1p, vm_offset_t va)
{
	pd_entry_t l1, *l2p;

	l1 = pmap_load(l1p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l1 & ATTR_DESCR_VALID) != 0,
	    ("%s: L1 entry %#lx for %#lx is invalid", __func__, l1, va));
	KASSERT((l1 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L1 entry %#lx for %#lx is a leaf", __func__, l1, va));
	l2p = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l1));
	return (&l2p[pmap_l2_index(va)]);
}

static __inline pd_entry_t *
pmap_l2(pmap_t pmap, vm_offset_t va)
{
	pd_entry_t *l1;

	l1 = pmap_l1(pmap, va);
	if ((pmap_load(l1) & ATTR_DESCR_MASK) != L1_TABLE)
		return (NULL);

	return (pmap_l1_to_l2(l1, va));
}

static __inline pt_entry_t *
pmap_l2_to_l3(pd_entry_t *l2p, vm_offset_t va)
{
	pd_entry_t l2;
	pt_entry_t *l3p;

	l2 = pmap_load(l2p);

	KASSERT(ADDR_IS_CANONICAL(va),
	    ("%s: Address not in canonical form: %lx", __func__, va));
	/*
	 * The valid bit may be clear if pmap_update_entry() is concurrently
	 * modifying the entry, so for KVA only the entry type may be checked.
	 */
	KASSERT(ADDR_IS_KERNEL(va) || (l2 & ATTR_DESCR_VALID) != 0,
	    ("%s: L2 entry %#lx for %#lx is invalid", __func__, l2, va));
	KASSERT((l2 & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_TABLE,
	    ("%s: L2 entry %#lx for %#lx is a leaf", __func__, l2, va));
	l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(l2));
	return (&l3p[pmap_l3_index(va)]);
}

/*
 * Returns the lowest valid pde for a given virtual address.
 * The next level may or may not point to a valid page or block.
 */
static __inline pd_entry_t *
pmap_pde(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l0, *l1, *l2, desc;

	l0 = pmap_l0(pmap, va);
	desc = pmap_load(l0) & ATTR_DESCR_MASK;
	if (desc != L0_TABLE) {
		*level = -1;
		return (NULL);
	}

	l1 = pmap_l0_to_l1(l0, va);
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc != L1_TABLE) {
		*level = 0;
		return (l0);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc != L2_TABLE) {
		*level = 1;
		return (l1);
	}

	*level = 2;
	return (l2);
}

/*
 * Returns the lowest valid pte block or table entry for a given virtual
 * address.  If there are no valid entries return NULL and set the level to
 * the first invalid level.
 */
static __inline pt_entry_t *
pmap_pte(pmap_t pmap, vm_offset_t va, int *level)
{
	pd_entry_t *l1, *l2, desc;
	pt_entry_t *l3;

	l1 = pmap_l1(pmap, va);
	if (l1 == NULL) {
		*level = 0;
		return (NULL);
	}
	desc = pmap_load(l1) & ATTR_DESCR_MASK;
	if (desc == L1_BLOCK) {
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
		*level = 1;
		return (l1);
	}

	if (desc != L1_TABLE) {
		*level = 1;
		return (NULL);
	}

	l2 = pmap_l1_to_l2(l1, va);
	desc = pmap_load(l2) & ATTR_DESCR_MASK;
	if (desc == L2_BLOCK) {
		*level = 2;
		return (l2);
	}

	if (desc != L2_TABLE) {
		*level = 2;
		return (NULL);
	}

	*level = 3;
	l3 = pmap_l2_to_l3(l2, va);
	if ((pmap_load(l3) & ATTR_DESCR_MASK) != L3_PAGE)
		return (NULL);

	return (l3);
}

/*
 * If the given pmap has an L{1,2}_BLOCK or L3_PAGE entry at the specified
 * level that maps the specified virtual address, then a pointer to that entry
 * is returned.  Otherwise, NULL is returned, unless INVARIANTS are enabled
 * and a diagnostic message is provided, in which case this function panics.
 */
static __always_inline pt_entry_t *
pmap_pte_exists(pmap_t pmap, vm_offset_t va, int level, const char *diag)
{
	pd_entry_t *l0p, *l1p, *l2p;
	pt_entry_t desc, *l3p;
	int walk_level __diagused;

	KASSERT(level >= 0 && level < 4,
	    ("%s: %s passed an out-of-range level (%d)", __func__, diag,
	    level));
	l0p = pmap_l0(pmap, va);
	desc = pmap_load(l0p) & ATTR_DESCR_MASK;
	if (desc == L0_TABLE && level > 0) {
		l1p = pmap_l0_to_l1(l0p, va);
		desc = pmap_load(l1p) & ATTR_DESCR_MASK;
		if (desc == L1_BLOCK && level == 1) {
			PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
			return (l1p);
		}
		if (desc == L1_TABLE && level > 1) {
			l2p = pmap_l1_to_l2(l1p, va);
			desc = pmap_load(l2p) & ATTR_DESCR_MASK;
			if (desc == L2_BLOCK && level == 2)
				return (l2p);
			else if (desc == L2_TABLE && level > 2) {
				l3p = pmap_l2_to_l3(l2p, va);
				desc = pmap_load(l3p) & ATTR_DESCR_MASK;
				if (desc == L3_PAGE && level == 3)
					return (l3p);
				else
					walk_level = 3;
			} else
				walk_level = 2;
		} else
			walk_level = 1;
	} else
		walk_level = 0;
	KASSERT(diag == NULL,
	    ("%s: va %#lx not mapped at level %d, desc %ld at level %d",
	    diag, va, level, desc, walk_level));
	return (NULL);
}

bool
pmap_ps_enabled(pmap_t pmap)
{
	/*
	 * Promotion requires a hypervisor call when the kernel is running
	 * in EL1.  To avoid this, superpage support is disabled on
	 * non-stage 1 pmaps for now.
	 */
	if (pmap->pm_stage != PM_STAGE1)
		return (false);

#ifdef KMSAN
	/*
	 * The break-before-make in pmap_update_entry() results in a situation
	 * where a CPU may call into the KMSAN runtime while the entry is
	 * invalid.  If the entry is used to map the current thread structure,
	 * then the runtime will attempt to access unmapped memory.  Avoid
	 * this by simply disabling superpage promotion for the kernel map.
	 */
	if (pmap == kernel_pmap)
		return (false);
#endif

	return (superpages_enabled != 0);
}

bool
pmap_get_tables(pmap_t pmap, vm_offset_t va, pd_entry_t **l0, pd_entry_t **l1,
    pd_entry_t **l2, pt_entry_t **l3)
{
	pd_entry_t *l0p, *l1p, *l2p;

	if (pmap->pm_l0 == NULL)
		return (false);

	l0p = pmap_l0(pmap, va);
	*l0 = l0p;

	if ((pmap_load(l0p) & ATTR_DESCR_MASK) != L0_TABLE)
		return (false);

	l1p = pmap_l0_to_l1(l0p, va);
	*l1 = l1p;

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) == L1_BLOCK) {
		PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
		*l2 = NULL;
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l1p) & ATTR_DESCR_MASK) != L1_TABLE)
		return (false);

	l2p = pmap_l1_to_l2(l1p, va);
	*l2 = l2p;

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK) {
		*l3 = NULL;
		return (true);
	}

	if ((pmap_load(l2p) & ATTR_DESCR_MASK) != L2_TABLE)
		return (false);

	*l3 = pmap_l2_to_l3(l2p, va);

	return (true);
}

static __inline int
pmap_l3_valid(pt_entry_t l3)
{

	return ((l3 & ATTR_DESCR_MASK) == L3_PAGE);
}

CTASSERT(L1_BLOCK == L2_BLOCK);

static pt_entry_t
pmap_pte_memattr(pmap_t pmap, vm_memattr_t memattr)
{
	pt_entry_t val;

	if (pmap->pm_stage == PM_STAGE1) {
		val = ATTR_S1_IDX(memattr);
		if (memattr == VM_MEMATTR_DEVICE)
			val |= ATTR_S1_XN;
		return (val);
	}

	val = 0;

	switch (memattr) {
	case VM_MEMATTR_DEVICE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_DEVICE_nGnRnE) |
		    ATTR_S2_XN(ATTR_S2_XN_ALL));
	case VM_MEMATTR_UNCACHEABLE:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_NC));
	case VM_MEMATTR_WRITE_BACK:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WB));
	case VM_MEMATTR_WRITE_THROUGH:
		return (ATTR_S2_MEMATTR(ATTR_S2_MEMATTR_WT));
	default:
		panic("%s: invalid memory attribute %x", __func__, memattr);
	}
}

static pt_entry_t
pmap_pte_prot(pmap_t pmap, vm_prot_t prot)
{
	pt_entry_t val;

	val = 0;
	if (pmap->pm_stage == PM_STAGE1) {
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S1_XN;
		if ((prot & VM_PROT_WRITE) == 0)
			val |= ATTR_S1_AP(ATTR_S1_AP_RO);
	} else {
		if ((prot & VM_PROT_WRITE) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
		if ((prot & VM_PROT_READ) != 0)
			val |= ATTR_S2_S2AP(ATTR_S2_S2AP_READ);
		if ((prot & VM_PROT_EXECUTE) == 0)
			val |= ATTR_S2_XN(ATTR_S2_XN_ALL);
	}

	return (val);
}

/*
 * Checks if the PTE is dirty.
 */
static inline int
pmap_pte_dirty(pmap_t pmap, pt_entry_t pte)
{

	KASSERT((pte & ATTR_SW_MANAGED) != 0, ("pte %#lx is unmanaged", pte));

	if (pmap->pm_stage == PM_STAGE1) {
		KASSERT((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) != 0,
		    ("pte %#lx is writeable and missing ATTR_SW_DBM", pte));

		return ((pte & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
		    (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM));
	}

	return ((pte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
	    ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE));
}

static __inline void
pmap_resident_count_inc(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	pmap->pm_stats.resident_count += count;
}

static __inline void
pmap_resident_count_dec(pmap_t pmap, int count)
{

	PMAP_LOCK_ASSERT(pmap, MA_OWNED);
	KASSERT(pmap->pm_stats.resident_count >= count,
	    ("pmap %p resident count underflow %ld %d", pmap,
	    pmap->pm_stats.resident_count, count));
	pmap->pm_stats.resident_count -= count;
}

static vm_paddr_t
pmap_early_vtophys(vm_offset_t va)
{
	vm_paddr_t pa_page;

	pa_page = arm64_address_translate_s1e1r(va) & PAR_PA_MASK;
	return (pa_page | (va & PAR_LOW_MASK));
}

/* State of the bootstrapped DMAP page tables */
struct pmap_bootstrap_state {
	pt_entry_t	*l1;
	pt_entry_t	*l2;
	pt_entry_t	*l3;
	vm_offset_t	freemempos;
	vm_offset_t	va;
	vm_paddr_t	pa;
	pt_entry_t	table_attrs;
	u_int		l0_slot;
	u_int		l1_slot;
	u_int		l2_slot;
	bool		dmap_valid;
};

/* The bootstrap state */
static struct pmap_bootstrap_state bs_state = {
	.l1 = NULL,
	.l2 = NULL,
	.l3 = NULL,
	.table_attrs = TATTR_PXN_TABLE,
	.l0_slot = L0_ENTRIES,
	.l1_slot = Ln_ENTRIES,
	.l2_slot = Ln_ENTRIES,
	.dmap_valid = false,
};
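
/*
 * The *_slot fields start out one past the largest valid table index
 * (L0_ENTRIES or Ln_ENTRIES), so the first pmap_bootstrap_*_table() call
 * for any address observes a slot mismatch and looks up or allocates the
 * next-level table before any entries are written.
 */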

static void
pmap_bootstrap_l0_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l1_pa;
	pd_entry_t l0e;
	u_int l0_slot;

	/* Link the level 0 table to a level 1 table */
	l0_slot = pmap_l0_index(state->va);
	if (l0_slot != state->l0_slot) {
		/*
		 * Make sure we move from a low address to high address
		 * before the DMAP region is ready.  This ensures we never
		 * modify an existing mapping until we can map from a
		 * physical address to a virtual address.
		 */
		MPASS(state->l0_slot < l0_slot ||
		    state->l0_slot == L0_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l2 = NULL;
		state->l3 = NULL;
		state->l1_slot = Ln_ENTRIES;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L0 entry */
		state->l0_slot = l0_slot;
		if (state->dmap_valid) {
			l0e = pagetable_l0_ttbr1[l0_slot];
			if ((l0e & ATTR_DESCR_VALID) != 0) {
				MPASS((l0e & ATTR_DESCR_MASK) == L0_TABLE);
				l1_pa = PTE_TO_PHYS(l0e);
				state->l1 = (pt_entry_t *)PHYS_TO_DMAP(l1_pa);
				return;
			}
		}

		/* Create a new L0 table entry */
		state->l1 = (pt_entry_t *)state->freemempos;
		memset(state->l1, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l1_pa = pmap_early_vtophys((vm_offset_t)state->l1);
		MPASS((l1_pa & Ln_TABLE_MASK) == 0);
		MPASS(pagetable_l0_ttbr1[l0_slot] == 0);
		pmap_store(&pagetable_l0_ttbr1[l0_slot], PHYS_TO_PTE(l1_pa) |
		    TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0 | L0_TABLE);
	}
	KASSERT(state->l1 != NULL, ("%s: NULL l1", __func__));
}

static void
pmap_bootstrap_l1_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l2_pa;
	pd_entry_t l1e;
	u_int l1_slot;

	/* Make sure there is a valid L0 -> L1 table */
	pmap_bootstrap_l0_table(state);

	/* Link the level 1 table to a level 2 table */
	l1_slot = pmap_l1_index(state->va);
	if (l1_slot != state->l1_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l1_slot < l1_slot ||
		    state->l1_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Reset lower levels */
		state->l3 = NULL;
		state->l2_slot = Ln_ENTRIES;

		/* Check the existing L1 entry */
		state->l1_slot = l1_slot;
		if (state->dmap_valid) {
			l1e = state->l1[l1_slot];
			if ((l1e & ATTR_DESCR_VALID) != 0) {
				MPASS((l1e & ATTR_DESCR_MASK) == L1_TABLE);
				l2_pa = PTE_TO_PHYS(l1e);
				state->l2 = (pt_entry_t *)PHYS_TO_DMAP(l2_pa);
				return;
			}
		}

		/* Create a new L1 table entry */
		state->l2 = (pt_entry_t *)state->freemempos;
		memset(state->l2, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l2_pa = pmap_early_vtophys((vm_offset_t)state->l2);
		MPASS((l2_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l1[l1_slot] == 0);
		pmap_store(&state->l1[l1_slot], PHYS_TO_PTE(l2_pa) |
		    state->table_attrs | L1_TABLE);
	}
	KASSERT(state->l2 != NULL, ("%s: NULL l2", __func__));
}

static void
pmap_bootstrap_l2_table(struct pmap_bootstrap_state *state)
{
	vm_paddr_t l3_pa;
	pd_entry_t l2e;
	u_int l2_slot;

	/* Make sure there is a valid L1 -> L2 table */
	pmap_bootstrap_l1_table(state);

	/* Link the level 2 table to a level 3 table */
	l2_slot = pmap_l2_index(state->va);
	if (l2_slot != state->l2_slot) {
		/* See pmap_bootstrap_l0_table for a description */
		MPASS(state->l2_slot < l2_slot ||
		    state->l2_slot == Ln_ENTRIES ||
		    state->dmap_valid);

		/* Check the existing L2 entry */
		state->l2_slot = l2_slot;
		if (state->dmap_valid) {
			l2e = state->l2[l2_slot];
			if ((l2e & ATTR_DESCR_VALID) != 0) {
				MPASS((l2e & ATTR_DESCR_MASK) == L2_TABLE);
				l3_pa = PTE_TO_PHYS(l2e);
				state->l3 = (pt_entry_t *)PHYS_TO_DMAP(l3_pa);
				return;
			}
		}

		/* Create a new L2 table entry */
		state->l3 = (pt_entry_t *)state->freemempos;
		memset(state->l3, 0, PAGE_SIZE);
		state->freemempos += PAGE_SIZE;

		l3_pa = pmap_early_vtophys((vm_offset_t)state->l3);
		MPASS((l3_pa & Ln_TABLE_MASK) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(l3_pa) |
		    state->table_attrs | L2_TABLE);
	}
	KASSERT(state->l3 != NULL, ("%s: NULL l3", __func__));
}

static void
pmap_bootstrap_l2_block(struct pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l2_slot;
	bool first;

	if ((physmap[i + 1] - state->pa) < L2_SIZE)
		return;

	/* Make sure there is a valid L1 table */
	pmap_bootstrap_l1_table(state);

	MPASS((state->va & L2_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    (physmap[i + 1] - state->pa) >= L2_SIZE;
	    state->va += L2_SIZE, state->pa += L2_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L1 slot can address.
		 */
		if (!first && (state->pa & L1_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L2C_ENTRIES
		 * L2 blocks, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 */
		if ((state->pa & L2C_OFFSET) == 0) {
			if (state->va + L2C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L2C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l2_slot = pmap_l2_index(state->va);
		MPASS((state->pa & L2_OFFSET) == 0);
		MPASS(state->l2[l2_slot] == 0);
		pmap_store(&state->l2[l2_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L2_BLOCK);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

static void
pmap_bootstrap_l3_page(struct pmap_bootstrap_state *state, int i)
{
	pt_entry_t contig;
	u_int l3_slot;
	bool first;

	if (physmap[i + 1] - state->pa < L3_SIZE)
		return;

	/* Make sure there is a valid L2 table */
	pmap_bootstrap_l2_table(state);

	MPASS((state->va & L3_OFFSET) == 0);
	for (first = true, contig = 0;
	    state->va < DMAP_MAX_ADDRESS &&
	    physmap[i + 1] - state->pa >= L3_SIZE;
	    state->va += L3_SIZE, state->pa += L3_SIZE) {
		/*
		 * Stop if we are about to walk off the end of what the
		 * current L2 slot can address.
		 */
		if (!first && (state->pa & L2_OFFSET) == 0)
			break;

		/*
		 * If we have an aligned, contiguous chunk of L3C_ENTRIES
		 * L3 pages, set the contiguous bit within each PTE so that
		 * the chunk can be cached using only one TLB entry.
		 */
		if ((state->pa & L3C_OFFSET) == 0) {
			if (state->va + L3C_SIZE < DMAP_MAX_ADDRESS &&
			    physmap[i + 1] - state->pa >= L3C_SIZE) {
				contig = ATTR_CONTIGUOUS;
			} else {
				contig = 0;
			}
		}

		first = false;
		l3_slot = pmap_l3_index(state->va);
		MPASS((state->pa & L3_OFFSET) == 0);
		MPASS(state->l3[l3_slot] == 0);
		pmap_store(&state->l3[l3_slot], PHYS_TO_PTE(state->pa) |
		    ATTR_DEFAULT | ATTR_S1_XN | ATTR_KERN_GP |
		    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) | contig | L3_PAGE);
	}
	MPASS(state->va == (state->pa - dmap_phys_base + DMAP_MIN_ADDRESS));
}

static void
pmap_bootstrap_dmap(vm_paddr_t min_pa)
{
	int i;

	dmap_phys_base = min_pa & ~L1_OFFSET;
	dmap_phys_max = 0;
	dmap_max_addr = 0;

	for (i = 0; i < (physmap_idx * 2); i += 2) {
		bs_state.pa = physmap[i] & ~L3_OFFSET;
		bs_state.va = bs_state.pa - dmap_phys_base + DMAP_MIN_ADDRESS;

		/* Create L3 mappings at the start of the region */
		if ((bs_state.pa & L2_OFFSET) != 0)
			pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa <= physmap[i + 1]);

		if (L1_BLOCKS_SUPPORTED) {
			/* Create L2 mappings at the start of the region */
			if ((bs_state.pa & L1_OFFSET) != 0)
				pmap_bootstrap_l2_block(&bs_state, i);
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create the main L1 block mappings */
			for (; bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L1_SIZE;
			    bs_state.va += L1_SIZE, bs_state.pa += L1_SIZE) {
				/* Make sure there is a valid L1 table */
				pmap_bootstrap_l0_table(&bs_state);
				MPASS((bs_state.pa & L1_OFFSET) == 0);
				pmap_store(
				    &bs_state.l1[pmap_l1_index(bs_state.va)],
				    PHYS_TO_PTE(bs_state.pa) | ATTR_DEFAULT |
				    ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
				    ATTR_S1_XN | ATTR_KERN_GP | L1_BLOCK);
			}
			MPASS(bs_state.pa <= physmap[i + 1]);

			/* Create L2 mappings at the end of the region */
			pmap_bootstrap_l2_block(&bs_state, i);
		} else {
			while (bs_state.va < DMAP_MAX_ADDRESS &&
			    (physmap[i + 1] - bs_state.pa) >= L2_SIZE) {
				pmap_bootstrap_l2_block(&bs_state, i);
			}
		}
		MPASS(bs_state.pa <= physmap[i + 1]);

		/* Create L3 mappings at the end of the region */
		pmap_bootstrap_l3_page(&bs_state, i);
		MPASS(bs_state.pa == physmap[i + 1]);

		if (bs_state.pa > dmap_phys_max) {
			dmap_phys_max = bs_state.pa;
			dmap_max_addr = bs_state.va;
		}
	}

	cpu_tlb_flushID();
}

static void
pmap_bootstrap_l2(vm_offset_t va)
{
	KASSERT((va & L1_OFFSET) == 0, ("Invalid virtual address"));

	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L1_SIZE)
		pmap_bootstrap_l1_table(&bs_state);
}

static void
pmap_bootstrap_l3(vm_offset_t va)
{
	KASSERT((va & L2_OFFSET) == 0, ("Invalid virtual address"));

	/* Leave bs_state.pa as it's only needed to bootstrap blocks and pages */
	bs_state.va = va;

	for (; bs_state.va < VM_MAX_KERNEL_ADDRESS; bs_state.va += L2_SIZE)
		pmap_bootstrap_l2_table(&bs_state);
}

/*
 * Bootstrap the system enough to run with virtual memory.
 */
void
pmap_bootstrap(vm_size_t kernlen)
{
	vm_offset_t dpcpu, msgbufpv;
	vm_paddr_t start_pa, pa, min_pa;
	int i;

	/* Verify that the ASID is set through TTBR0. */
	KASSERT((READ_SPECIALREG(tcr_el1) & TCR_A1) == 0,
	    ("pmap_bootstrap: TCR_EL1.A1 != 0"));

	/* Set this early so we can use the pagetable walking functions */
	kernel_pmap_store.pm_l0 = pagetable_l0_ttbr1;
	PMAP_LOCK_INIT(kernel_pmap);
	kernel_pmap->pm_l0_paddr =
	    pmap_early_vtophys((vm_offset_t)kernel_pmap_store.pm_l0);
	TAILQ_INIT(&kernel_pmap->pm_pvchunk);
	vm_radix_init(&kernel_pmap->pm_root);
	kernel_pmap->pm_cookie = COOKIE_FROM(-1, INT_MIN);
	kernel_pmap->pm_stage = PM_STAGE1;
	kernel_pmap->pm_levels = 4;
	kernel_pmap->pm_ttbr = kernel_pmap->pm_l0_paddr;
	kernel_pmap->pm_asid_set = &asids;

	/* Assume the address we were loaded to is a valid physical address */
	min_pa = pmap_early_vtophys(KERNBASE);

	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	/*
	 * Find the minimum physical address.  physmap is sorted,
	 * but may contain empty ranges.
	 */
	for (i = 0; i < physmap_idx * 2; i += 2) {
		if (physmap[i] == physmap[i + 1])
			continue;
		if (physmap[i] <= min_pa)
			min_pa = physmap[i];
	}

	bs_state.freemempos = KERNBASE + kernlen;
	bs_state.freemempos = roundup2(bs_state.freemempos, PAGE_SIZE);

	/* Create a direct map region early so we can use it for pa -> va */
	pmap_bootstrap_dmap(min_pa);
	bs_state.dmap_valid = true;
	/*
	 * We only use PXN when we know nothing will be executed from it, e.g.
	 * the DMAP region.
	 */
	bs_state.table_attrs &= ~TATTR_PXN_TABLE;

	start_pa = pa = pmap_early_vtophys(KERNBASE);

	/*
	 * Create the l2 tables up to VM_MAX_KERNEL_ADDRESS.  We assume that
	 * the loader allocated the first and only l2 page table page used to
	 * map the kernel, preloaded files and module metadata.
	 */
	pmap_bootstrap_l2(KERNBASE + L1_SIZE);
	/* And the l3 tables for the early devmap */
	pmap_bootstrap_l3(VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE));

	cpu_tlb_flushID();

#define	alloc_pages(var, np)						\
	(var) = bs_state.freemempos;					\
	bs_state.freemempos += ((np) * PAGE_SIZE);			\
	memset((char *)(var), 0, ((np) * PAGE_SIZE));

	/* Allocate dynamic per-cpu area. */
	alloc_pages(dpcpu, DPCPU_SIZE / PAGE_SIZE);
	dpcpu_init((void *)dpcpu, 0);

	/* Allocate memory for the msgbuf, e.g. for /sbin/dmesg */
	alloc_pages(msgbufpv, round_page(msgbufsize) / PAGE_SIZE);
	msgbufp = (void *)msgbufpv;

	/* Reserve some VA space for early BIOS/ACPI mapping */
	preinit_map_va = roundup2(bs_state.freemempos, L2_SIZE);

	virtual_avail = preinit_map_va + PMAP_PREINIT_MAPPING_SIZE;
	virtual_avail = roundup2(virtual_avail, L1_SIZE);
	virtual_end = VM_MAX_KERNEL_ADDRESS - (PMAP_MAPDEV_EARLY_SIZE);
	kernel_vm_end = virtual_avail;

	pa = pmap_early_vtophys(bs_state.freemempos);

	physmem_exclude_region(start_pa, pa - start_pa, EXFLAG_NOALLOC);

	cpu_tlb_flushID();
}

#if defined(KASAN) || defined(KMSAN)
static void
pmap_bootstrap_allocate_san_l2(vm_paddr_t start_pa, vm_paddr_t end_pa,
    vm_offset_t *vap, vm_offset_t eva)
{
	vm_paddr_t pa;
	vm_offset_t va;
	pd_entry_t *l2;

	va = *vap;
	pa = rounddown2(end_pa - L2_SIZE, L2_SIZE);
	for (; pa >= start_pa && va < eva; va += L2_SIZE, pa -= L2_SIZE) {
		l2 = pmap_l2(kernel_pmap, va);

		/*
		 * KASAN stack checking results in us having already allocated
		 * part of our shadow map, so we can just skip those segments.
		 */
		if ((pmap_load(l2) & ATTR_DESCR_VALID) != 0) {
			pa += L2_SIZE;
			continue;
		}

		bzero((void *)PHYS_TO_DMAP(pa), L2_SIZE);
		physmem_exclude_region(pa, L2_SIZE, EXFLAG_NOALLOC);
		pmap_store(l2, PHYS_TO_PTE(pa) | PMAP_SAN_PTE_BITS | L2_BLOCK);
	}
	*vap = va;
}

/*
 * Finish constructing the initial shadow map:
 * - Count how many pages from KERNBASE to virtual_avail (scaled for
 *   shadow map)
 * - Map that entire range using L2 superpages.
 */
static void
pmap_bootstrap_san1(vm_offset_t va, int scale)
{
	vm_offset_t eva;
	vm_paddr_t kernstart;
	int i;

	kernstart = pmap_early_vtophys(KERNBASE);

	/*
	 * Rebuild physmap one more time, we may have excluded more regions
	 * from allocation since pmap_bootstrap().
	 */
	physmap_idx = physmem_avail(physmap, nitems(physmap));
	physmap_idx /= 2;

	eva = va + (virtual_avail - VM_MIN_KERNEL_ADDRESS) / scale;

	/*
	 * Find a slot in the physmap large enough for what we need.  We
	 * try to put the shadow map as high up as we can to avoid
	 * depleting the lower 4GB in case it's needed for, e.g., an xhci
	 * controller that can only do 32-bit DMA.
	 */
1451 for (i = (physmap_idx * 2) - 2; i >= 0; i -= 2) {
1452 vm_paddr_t plow, phigh;
1453
1454 /* L2 mappings must be backed by memory that is L2-aligned */
1455 plow = roundup2(physmap[i], L2_SIZE);
1456 phigh = physmap[i + 1];
1457 if (plow >= phigh)
1458 continue;
1459 if (kernstart >= plow && kernstart < phigh)
1460 phigh = kernstart;
1461 if (phigh - plow >= L2_SIZE) {
1462 pmap_bootstrap_allocate_san_l2(plow, phigh, &va, eva);
1463 if (va >= eva)
1464 break;
1465 }
1466 }
1467 if (i < 0)
1468 panic("Could not find phys region for shadow map");
1469
1470 /*
1471 * Done. We should now have a valid shadow address mapped for all KVA
1472 * that has been mapped so far, i.e., KERNBASE to virtual_avail. Thus,
1473 * shadow accesses by the sanitizer runtime will succeed for this range.
1474 * When the kernel virtual address range is later expanded, as will
1475 * happen in vm_mem_init(), the shadow map will be grown as well. This
1476 * is handled by pmap_san_enter().
1477 */
1478 }
1479
1480 void
pmap_bootstrap_san(void)1481 pmap_bootstrap_san(void)
1482 {
1483 #ifdef KASAN
1484 pmap_bootstrap_san1(KASAN_MIN_ADDRESS, KASAN_SHADOW_SCALE);
1485 #else
1486 static uint8_t kmsan_shad_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1487 static uint8_t kmsan_orig_ptp[PAGE_SIZE * 2] __aligned(PAGE_SIZE);
1488 pd_entry_t *l0, *l1;
1489
1490 if (virtual_avail - VM_MIN_KERNEL_ADDRESS > L1_SIZE)
1491 panic("initial kernel map is too large");
1492
1493 l0 = pmap_l0(kernel_pmap, KMSAN_SHAD_MIN_ADDRESS);
1494 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1495 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp)));
1496 l1 = pmap_l0_to_l1(l0, KMSAN_SHAD_MIN_ADDRESS);
1497 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1498 pmap_early_vtophys((vm_offset_t)kmsan_shad_ptp + PAGE_SIZE)));
1499 pmap_bootstrap_san1(KMSAN_SHAD_MIN_ADDRESS, 1);
1500
1501 l0 = pmap_l0(kernel_pmap, KMSAN_ORIG_MIN_ADDRESS);
1502 pmap_store(l0, L0_TABLE | PHYS_TO_PTE(
1503 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp)));
1504 l1 = pmap_l0_to_l1(l0, KMSAN_ORIG_MIN_ADDRESS);
1505 pmap_store(l1, L1_TABLE | PHYS_TO_PTE(
1506 pmap_early_vtophys((vm_offset_t)kmsan_orig_ptp + PAGE_SIZE)));
1507 pmap_bootstrap_san1(KMSAN_ORIG_MIN_ADDRESS, 1);
1508 #endif
1509 }
1510 #endif
1511
1512 /*
1513 * Initialize a vm_page's machine-dependent fields.
1514 */
1515 void
pmap_page_init(vm_page_t m)1516 pmap_page_init(vm_page_t m)
1517 {
1518
1519 TAILQ_INIT(&m->md.pv_list);
1520 m->md.pv_memattr = VM_MEMATTR_WRITE_BACK;
1521 }
1522
1523 static void
pmap_init_asids(struct asid_set * set,int bits)1524 pmap_init_asids(struct asid_set *set, int bits)
1525 {
1526 int i;
1527
1528 set->asid_bits = bits;
1529
1530 /*
1531 * We may be too early in the overall initialization process to use
1532 * bit_alloc().
1533 */
1534 set->asid_set_size = 1 << set->asid_bits;
1535 set->asid_set = kmem_malloc(bitstr_size(set->asid_set_size),
1536 M_WAITOK | M_ZERO);
1537 for (i = 0; i < ASID_FIRST_AVAILABLE; i++)
1538 bit_set(set->asid_set, i);
1539 set->asid_next = ASID_FIRST_AVAILABLE;
1540 mtx_init(&set->asid_set_mutex, "asid set", NULL, MTX_SPIN);
1541 }
1542
1543 static void
pmap_init_pv_table(void)1544 pmap_init_pv_table(void)
1545 {
1546 struct vm_phys_seg *seg, *next_seg;
1547 struct pmap_large_md_page *pvd;
1548 vm_size_t s;
1549 int domain, i, j, pages;
1550
1551 /*
1552 * We strongly depend on the size being a power of two, so the assert
1553 * is overzealous. However, should the struct be resized to a
1554 * different power of two, the code below needs to be revisited.
1555 */
1556 CTASSERT((sizeof(*pvd) == 64));
1557
1558 /*
1559 * Calculate the size of the array.
1560 */
1561 s = 0;
1562 for (i = 0; i < vm_phys_nsegs; i++) {
1563 seg = &vm_phys_segs[i];
1564 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1565 pmap_l2_pindex(seg->start);
1566 s += round_page(pages * sizeof(*pvd));
1567 }
1568 pv_table = (struct pmap_large_md_page *)kva_alloc(s);
1569 if (pv_table == NULL)
1570 panic("%s: kva_alloc failed\n", __func__);
1571
1572 /*
1573 * Iterate physical segments to allocate domain-local memory for PV
1574 * list headers.
1575 */
1576 pvd = pv_table;
1577 for (i = 0; i < vm_phys_nsegs; i++) {
1578 seg = &vm_phys_segs[i];
1579 pages = pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1580 pmap_l2_pindex(seg->start);
1581 domain = seg->domain;
1582
1583 s = round_page(pages * sizeof(*pvd));
1584
1585 for (j = 0; j < s; j += PAGE_SIZE) {
1586 vm_page_t m = vm_page_alloc_noobj_domain(domain,
1587 VM_ALLOC_ZERO);
1588 if (m == NULL)
1589 panic("failed to allocate PV table page");
1590 pmap_qenter((vm_offset_t)pvd + j, &m, 1);
1591 }
1592
1593 for (j = 0; j < s / sizeof(*pvd); j++) {
1594 rw_init_flags(&pvd->pv_lock, "pmap pv list", RW_NEW);
1595 TAILQ_INIT(&pvd->pv_page.pv_list);
1596 pvd++;
1597 }
1598 }
1599 pvd = &pv_dummy_large;
1600 memset(pvd, 0, sizeof(*pvd));
1601 rw_init_flags(&pvd->pv_lock, "pmap pv list dummy", RW_NEW);
1602 TAILQ_INIT(&pvd->pv_page.pv_list);
1603
1604 /*
1605 * Set pointers from vm_phys_segs to pv_table.
1606 */
1607 for (i = 0, pvd = pv_table; i < vm_phys_nsegs; i++) {
1608 seg = &vm_phys_segs[i];
1609 seg->md_first = pvd;
1610 pvd += pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) -
1611 pmap_l2_pindex(seg->start);
1612
1613 /*
1614 * If there is a following segment, and the final
1615 * superpage of this segment and the initial superpage
1616 * of the next segment are the same then adjust the
1617 * pv_table entry for that next segment down by one so
1618 * that the pv_table entries will be shared.
1619 */
1620 if (i + 1 < vm_phys_nsegs) {
1621 next_seg = &vm_phys_segs[i + 1];
1622 if (pmap_l2_pindex(roundup2(seg->end, L2_SIZE)) - 1 ==
1623 pmap_l2_pindex(next_seg->start)) {
1624 pvd--;
1625 }
1626 }
1627 }
1628 }
1629
1630 /*
1631 * Initialize the pmap module.
1632 *
1633 * Called by vm_mem_init(), to initialize any structures that the pmap
1634 * system needs to map virtual memory.
1635 */
1636 void
pmap_init(void)1637 pmap_init(void)
1638 {
1639 uint64_t mmfr1;
1640 int i, vmid_bits;
1641
1642 /*
1643 * Are large page mappings enabled?
1644 */
1645 TUNABLE_INT_FETCH("vm.pmap.superpages_enabled", &superpages_enabled);
1646 if (superpages_enabled) {
1647 KASSERT(MAXPAGESIZES > 1 && pagesizes[1] == 0,
1648 ("pmap_init: can't assign to pagesizes[1]"));
1649 pagesizes[1] = L2_SIZE;
1650 if (L1_BLOCKS_SUPPORTED) {
1651 KASSERT(MAXPAGESIZES > 2 && pagesizes[2] == 0,
1652 ("pmap_init: can't assign to pagesizes[2]"));
1653 pagesizes[2] = L1_SIZE;
1654 }
1655 }
1656
1657 /*
1658 * Initialize the ASID allocator.
1659 */
1660 pmap_init_asids(&asids,
1661 (READ_SPECIALREG(tcr_el1) & TCR_ASID_16) != 0 ? 16 : 8);
1662
1663 if (has_hyp()) {
1664 mmfr1 = READ_SPECIALREG(id_aa64mmfr1_el1);
1665 vmid_bits = 8;
1666
1667 if (ID_AA64MMFR1_VMIDBits_VAL(mmfr1) ==
1668 ID_AA64MMFR1_VMIDBits_16)
1669 vmid_bits = 16;
1670 pmap_init_asids(&vmids, vmid_bits);
1671 }
1672
1673 /*
1674 * Initialize pv chunk lists.
1675 */
1676 for (i = 0; i < PMAP_MEMDOM; i++) {
1677 mtx_init(&pv_chunks[i].pvc_lock, "pmap pv chunk list", NULL,
1678 MTX_DEF);
1679 TAILQ_INIT(&pv_chunks[i].pvc_list);
1680 }
1681 pmap_init_pv_table();
1682
1683 vm_initialized = 1;
1684 }
1685
1686 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l1, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1687 "L1 (1GB/64GB) page mapping counters");
1688
1689 static COUNTER_U64_DEFINE_EARLY(pmap_l1_demotions);
1690 SYSCTL_COUNTER_U64(_vm_pmap_l1, OID_AUTO, demotions, CTLFLAG_RD,
1691 &pmap_l1_demotions, "L1 (1GB/64GB) page demotions");
1692
1693 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1694 "L2C (32MB/1GB) page mapping counters");
1695
1696 static COUNTER_U64_DEFINE_EARLY(pmap_l2c_demotions);
1697 SYSCTL_COUNTER_U64(_vm_pmap_l2c, OID_AUTO, demotions, CTLFLAG_RD,
1698 &pmap_l2c_demotions, "L2C (32MB/1GB) page demotions");
1699
1700 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l2, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1701 "2MB page mapping counters");
1702
1703 static u_long pmap_l2_demotions;
1704 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, demotions, CTLFLAG_RD,
1705 &pmap_l2_demotions, 0, "2MB page demotions");
1706
1707 static u_long pmap_l2_mappings;
1708 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, mappings, CTLFLAG_RD,
1709 &pmap_l2_mappings, 0, "2MB page mappings");
1710
1711 static u_long pmap_l2_p_failures;
1712 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, p_failures, CTLFLAG_RD,
1713 &pmap_l2_p_failures, 0, "2MB page promotion failures");
1714
1715 static u_long pmap_l2_promotions;
1716 SYSCTL_ULONG(_vm_pmap_l2, OID_AUTO, promotions, CTLFLAG_RD,
1717 &pmap_l2_promotions, 0, "2MB page promotions");
1718
1719 static SYSCTL_NODE(_vm_pmap, OID_AUTO, l3c, CTLFLAG_RD | CTLFLAG_MPSAFE, 0,
1720 "L3C (64KB/2MB) page mapping counters");
1721
1722 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_demotions);
1723 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, demotions, CTLFLAG_RD,
1724 &pmap_l3c_demotions, "L3C (64KB/2MB) page demotions");
1725
1726 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_mappings);
1727 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, mappings, CTLFLAG_RD,
1728 &pmap_l3c_mappings, "L3C (64KB/2MB) page mappings");
1729
1730 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_p_failures);
1731 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, p_failures, CTLFLAG_RD,
1732 &pmap_l3c_p_failures, "L3C (64KB/2MB) page promotion failures");
1733
1734 static COUNTER_U64_DEFINE_EARLY(pmap_l3c_promotions);
1735 SYSCTL_COUNTER_U64(_vm_pmap_l3c, OID_AUTO, promotions, CTLFLAG_RD,
1736 &pmap_l3c_promotions, "L3C (64KB/2MB) page promotions");
1737
1738 /*
1739 * If the given value for "final_only" is false, then any cached intermediate-
1740 * level entries, i.e., L{0,1,2}_TABLE entries, are invalidated in addition to
1741 * any cached final-level entry, i.e., either an L{1,2}_BLOCK or L3_PAGE entry.
1742 * Otherwise, just the cached final-level entry is invalidated.
1743 */
1744 static __inline void
1745 pmap_s1_invalidate_kernel(uint64_t r, bool final_only)
1746 {
1747 if (final_only)
1748 __asm __volatile("tlbi vaale1is, %0" : : "r" (r));
1749 else
1750 __asm __volatile("tlbi vaae1is, %0" : : "r" (r));
1751 }
1752
1753 static __inline void
1754 pmap_s1_invalidate_user(uint64_t r, bool final_only)
1755 {
1756 if (final_only)
1757 __asm __volatile("tlbi vale1is, %0" : : "r" (r));
1758 else
1759 __asm __volatile("tlbi vae1is, %0" : : "r" (r));
1760 }
1761
1762 /*
1763 * Invalidates any cached final- and optionally intermediate-level TLB entries
1764 * for the specified virtual address in the given virtual address space.
1765 */
1766 static __inline void
1767 pmap_s1_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1768 {
1769 uint64_t r;
1770
1771 PMAP_ASSERT_STAGE1(pmap);
1772
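/*
 * Make any prior page table updates visible before the broadcast
 * TLB invalidation, then wait for the invalidation to complete and
 * synchronize the instruction stream.
 */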
1773 dsb(ishst);
1774 r = TLBI_VA(va);
1775 if (pmap == kernel_pmap) {
1776 pmap_s1_invalidate_kernel(r, final_only);
1777 } else {
1778 r |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1779 pmap_s1_invalidate_user(r, final_only);
1780 }
1781 dsb(ish);
1782 isb();
1783 }
1784
1785 static __inline void
1786 pmap_s2_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1787 {
1788 PMAP_ASSERT_STAGE2(pmap);
1789 MPASS(pmap_stage2_invalidate_range != NULL);
1790 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), va, va + PAGE_SIZE,
1791 final_only);
1792 }
1793
1794 static __inline void
1795 pmap_invalidate_page(pmap_t pmap, vm_offset_t va, bool final_only)
1796 {
1797 if (pmap->pm_stage == PM_STAGE1)
1798 pmap_s1_invalidate_page(pmap, va, final_only);
1799 else
1800 pmap_s2_invalidate_page(pmap, va, final_only);
1801 }
1802
1803 /*
1804 * Use stride L{1,2}_SIZE when invalidating the TLB entries for L{1,2}_BLOCK
1805 * mappings. Otherwise, use stride L3_SIZE.
1806 */
1807 static __inline void
1808 pmap_s1_invalidate_strided(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1809 vm_offset_t stride, bool final_only)
1810 {
1811 uint64_t end, r, start;
1812
1813 PMAP_ASSERT_STAGE1(pmap);
1814
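/*
 * Build the TLBI operand from the virtual address (plus the ASID for
 * user pmaps) and step through the range one stride at a time.
 */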
1815 dsb(ishst);
1816 if (pmap == kernel_pmap) {
1817 start = TLBI_VA(sva);
1818 end = TLBI_VA(eva);
1819 for (r = start; r < end; r += TLBI_VA(stride))
1820 pmap_s1_invalidate_kernel(r, final_only);
1821 } else {
1822 start = end = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1823 start |= TLBI_VA(sva);
1824 end |= TLBI_VA(eva);
1825 for (r = start; r < end; r += TLBI_VA(stride))
1826 pmap_s1_invalidate_user(r, final_only);
1827 }
1828 dsb(ish);
1829 isb();
1830 }
1831
1832 /*
1833 * Invalidates any cached final- and optionally intermediate-level TLB entries
1834 * for the specified virtual address range in the given virtual address space.
1835 */
1836 static __inline void
1837 pmap_s1_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1838 bool final_only)
1839 {
1840 pmap_s1_invalidate_strided(pmap, sva, eva, L3_SIZE, final_only);
1841 }
1842
1843 static __inline void
1844 pmap_s2_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1845 bool final_only)
1846 {
1847 PMAP_ASSERT_STAGE2(pmap);
1848 MPASS(pmap_stage2_invalidate_range != NULL);
1849 pmap_stage2_invalidate_range(pmap_to_ttbr0(pmap), sva, eva, final_only);
1850 }
1851
1852 static __inline void
1853 pmap_invalidate_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva,
1854 bool final_only)
1855 {
1856 if (pmap->pm_stage == PM_STAGE1)
1857 pmap_s1_invalidate_range(pmap, sva, eva, final_only);
1858 else
1859 pmap_s2_invalidate_range(pmap, sva, eva, final_only);
1860 }
1861
1862 /*
1863 * Invalidates all cached intermediate- and final-level TLB entries for the
1864 * given virtual address space.
1865 */
1866 static __inline void
1867 pmap_s1_invalidate_all(pmap_t pmap)
1868 {
1869 uint64_t r;
1870
1871 PMAP_ASSERT_STAGE1(pmap);
1872
1873 dsb(ishst);
1874 if (pmap == kernel_pmap) {
1875 __asm __volatile("tlbi vmalle1is");
1876 } else {
1877 r = ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
1878 __asm __volatile("tlbi aside1is, %0" : : "r" (r));
1879 }
1880 dsb(ish);
1881 isb();
1882 }
1883
1884 static __inline void
1885 pmap_s2_invalidate_all(pmap_t pmap)
1886 {
1887 PMAP_ASSERT_STAGE2(pmap);
1888 MPASS(pmap_stage2_invalidate_all != NULL);
1889 pmap_stage2_invalidate_all(pmap_to_ttbr0(pmap));
1890 }
1891
1892 static __inline void
1893 pmap_invalidate_all(pmap_t pmap)
1894 {
1895 if (pmap->pm_stage == PM_STAGE1)
1896 pmap_s1_invalidate_all(pmap);
1897 else
1898 pmap_s2_invalidate_all(pmap);
1899 }
1900
1901 /*
1902 * Routine: pmap_extract
1903 * Function:
1904 * Extract the physical page address associated
1905 * with the given map/virtual_address pair.
1906 */
1907 vm_paddr_t
1908 pmap_extract(pmap_t pmap, vm_offset_t va)
1909 {
1910 pt_entry_t *pte, tpte;
1911 vm_paddr_t pa;
1912 int lvl;
1913
1914 pa = 0;
1915 PMAP_LOCK(pmap);
1916 /*
1917 * Find the block or page map for this virtual address. pmap_pte
1918 * will return either a valid block/page entry, or NULL.
1919 */
1920 pte = pmap_pte(pmap, va, &lvl);
1921 if (pte != NULL) {
1922 tpte = pmap_load(pte);
1923 pa = PTE_TO_PHYS(tpte);
1924 switch (lvl) {
1925 case 1:
1926 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
1927 KASSERT((tpte & ATTR_DESCR_MASK) == L1_BLOCK,
1928 ("pmap_extract: Invalid L1 pte found: %lx",
1929 tpte & ATTR_DESCR_MASK));
1930 pa |= (va & L1_OFFSET);
1931 break;
1932 case 2:
1933 KASSERT((tpte & ATTR_DESCR_MASK) == L2_BLOCK,
1934 ("pmap_extract: Invalid L2 pte found: %lx",
1935 tpte & ATTR_DESCR_MASK));
1936 pa |= (va & L2_OFFSET);
1937 break;
1938 case 3:
1939 KASSERT((tpte & ATTR_DESCR_MASK) == L3_PAGE,
1940 ("pmap_extract: Invalid L3 pte found: %lx",
1941 tpte & ATTR_DESCR_MASK));
1942 pa |= (va & L3_OFFSET);
1943 break;
1944 }
1945 }
1946 PMAP_UNLOCK(pmap);
1947 return (pa);
1948 }
1949
1950 /*
1951 * Routine: pmap_extract_and_hold
1952 * Function:
1953 * Atomically extract and hold the physical page
1954 * with the given pmap and virtual address pair
1955 * if that mapping permits the given protection.
1956 */
1957 vm_page_t
1958 pmap_extract_and_hold(pmap_t pmap, vm_offset_t va, vm_prot_t prot)
1959 {
1960 pt_entry_t *pte, tpte;
1961 vm_offset_t off;
1962 vm_page_t m;
1963 int lvl;
1964 bool use;
1965
1966 m = NULL;
1967 PMAP_LOCK(pmap);
1968 pte = pmap_pte(pmap, va, &lvl);
1969 if (pte != NULL) {
1970 tpte = pmap_load(pte);
1971
1972 KASSERT(lvl > 0 && lvl <= 3,
1973 ("pmap_extract_and_hold: Invalid level %d", lvl));
1974 /*
1975 * Check that the pte is either a L3 page, or a L1 or L2 block
1976 * entry. We can assume L1_BLOCK == L2_BLOCK.
1977 */
1978 KASSERT((lvl == 3 && (tpte & ATTR_DESCR_MASK) == L3_PAGE) ||
1979 (lvl < 3 && (tpte & ATTR_DESCR_MASK) == L1_BLOCK),
1980 ("pmap_extract_and_hold: Invalid pte at L%d: %lx", lvl,
1981 tpte & ATTR_DESCR_MASK));
1982
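/*
 * Only wire the page if the mapping's permissions satisfy the
 * requested protection: read-only requests always qualify, while
 * write requests require a writeable stage 1 or stage 2 entry.
 */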
1983 use = false;
1984 if ((prot & VM_PROT_WRITE) == 0)
1985 use = true;
1986 else if (pmap->pm_stage == PM_STAGE1 &&
1987 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW))
1988 use = true;
1989 else if (pmap->pm_stage == PM_STAGE2 &&
1990 ((tpte & ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)) ==
1991 ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE)))
1992 use = true;
1993
1994 if (use) {
1995 switch (lvl) {
1996 case 1:
1997 off = va & L1_OFFSET;
1998 break;
1999 case 2:
2000 off = va & L2_OFFSET;
2001 break;
2002 case 3:
2003 default:
2004 off = 0;
2005 }
2006 m = PHYS_TO_VM_PAGE(PTE_TO_PHYS(tpte) | off);
2007 if (m != NULL && !vm_page_wire_mapped(m))
2008 m = NULL;
2009 }
2010 }
2011 PMAP_UNLOCK(pmap);
2012 return (m);
2013 }
2014
2015 /*
2016 * Walks the page tables to translate a kernel virtual address to a
2017 * physical address.  Returns true if the kva is valid.  If "pa" is
2018 * not NULL, the physical address is stored there on success.
2019 *
2020 * See the comment above data_abort() for the rationale for specifying
2021 * NO_PERTHREAD_SSP here.
2022 */
2023 bool NO_PERTHREAD_SSP
2024 pmap_klookup(vm_offset_t va, vm_paddr_t *pa)
2025 {
2026 pt_entry_t *pte, tpte;
2027 register_t intr;
2028 uint64_t par;
2029
2030 /*
2031 * Disable interrupts so we don't get interrupted between asking
2032 * for the address translation and getting the result back.
2033 */
2034 intr = intr_disable();
2035 par = arm64_address_translate_s1e1r(va);
2036 intr_restore(intr);
2037
2038 if (PAR_SUCCESS(par)) {
2039 if (pa != NULL)
2040 *pa = (par & PAR_PA_MASK) | (va & PAR_LOW_MASK);
2041 return (true);
2042 }
2043
2044 /*
2045 * Fall back to walking the page table. The address translation
2046 * instruction may fail when the page is in a break-before-make
2047 * sequence. As we only clear the valid bit in said sequence we
2048 * can walk the page table to find the physical address.
2049 */
2050
2051 pte = pmap_l1(kernel_pmap, va);
2052 if (pte == NULL)
2053 return (false);
2054
2055 /*
2056 * A concurrent pmap_update_entry() will clear the entry's valid bit
2057 * but leave the rest of the entry unchanged. Therefore, we treat a
2058 * non-zero entry as being valid, and we ignore the valid bit when
2059 * determining whether the entry maps a block, page, or table.
2060 */
2061 tpte = pmap_load(pte);
2062 if (tpte == 0)
2063 return (false);
2064 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2065 if (pa != NULL)
2066 *pa = PTE_TO_PHYS(tpte) | (va & L1_OFFSET);
2067 return (true);
2068 }
2069 pte = pmap_l1_to_l2(&tpte, va);
2070 tpte = pmap_load(pte);
2071 if (tpte == 0)
2072 return (false);
2073 if ((tpte & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
2074 if (pa != NULL)
2075 *pa = PTE_TO_PHYS(tpte) | (va & L2_OFFSET);
2076 return (true);
2077 }
2078 pte = pmap_l2_to_l3(&tpte, va);
2079 tpte = pmap_load(pte);
2080 if (tpte == 0)
2081 return (false);
2082 if (pa != NULL)
2083 *pa = PTE_TO_PHYS(tpte) | (va & L3_OFFSET);
2084 return (true);
2085 }
2086
2087 /*
2088 * Routine: pmap_kextract
2089 * Function:
2090 * Extract the physical page address associated with the given kernel
2091 * virtual address.
2092 */
2093 vm_paddr_t
2094 pmap_kextract(vm_offset_t va)
2095 {
2096 vm_paddr_t pa;
2097
2098 if (va >= DMAP_MIN_ADDRESS && va < DMAP_MAX_ADDRESS)
2099 return (DMAP_TO_PHYS(va));
2100
2101 if (pmap_klookup(va, &pa) == false)
2102 return (0);
2103 return (pa);
2104 }
2105
2106 /***************************************************
2107 * Low level mapping routines.....
2108 ***************************************************/
2109
2110 void
2111 pmap_kenter(vm_offset_t sva, vm_size_t size, vm_paddr_t pa, int mode)
2112 {
2113 pd_entry_t *pde;
2114 pt_entry_t attr, old_l3e, *pte;
2115 vm_offset_t va;
2116 vm_page_t mpte;
2117 int error, lvl;
2118
2119 KASSERT((pa & L3_OFFSET) == 0,
2120 ("pmap_kenter: Invalid physical address"));
2121 KASSERT((sva & L3_OFFSET) == 0,
2122 ("pmap_kenter: Invalid virtual address"));
2123 KASSERT((size & PAGE_MASK) == 0,
2124 ("pmap_kenter: Mapping is not page-sized"));
2125
2126 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2127 ATTR_KERN_GP | ATTR_S1_IDX(mode);
2128 old_l3e = 0;
2129 va = sva;
2130 while (size != 0) {
2131 pde = pmap_pde(kernel_pmap, va, &lvl);
2132 KASSERT(pde != NULL,
2133 ("pmap_kenter: Invalid page entry, va: 0x%lx", va));
2134 KASSERT(lvl == 2, ("pmap_kenter: Invalid level %d", lvl));
2135
2136 /*
2137 * If we have an aligned, contiguous chunk of L2_SIZE, try
2138 * to create an L2_BLOCK mapping.
2139 */
2140 if ((va & L2_OFFSET) == 0 && size >= L2_SIZE &&
2141 (pa & L2_OFFSET) == 0 && vm_initialized) {
2142 mpte = PTE_TO_VM_PAGE(pmap_load(pde));
2143 KASSERT(pmap_every_pte_zero(VM_PAGE_TO_PHYS(mpte)),
2144 ("pmap_kenter: Unexpected mapping"));
2145 PMAP_LOCK(kernel_pmap);
2146 error = pmap_insert_pt_page(kernel_pmap, mpte, false,
2147 false);
2148 if (error == 0) {
2149 attr &= ~ATTR_CONTIGUOUS;
2150
2151 /*
2152 * Although the page table page "mpte" should
2153 * be devoid of mappings, the TLB might hold
2154 * intermediate entries that reference it, so
2155 * we perform a single-page invalidation.
2156 */
2157 pmap_update_entry(kernel_pmap, pde,
2158 PHYS_TO_PTE(pa) | attr | L2_BLOCK, va,
2159 PAGE_SIZE);
2160 }
2161 PMAP_UNLOCK(kernel_pmap);
2162 if (error == 0) {
2163 va += L2_SIZE;
2164 pa += L2_SIZE;
2165 size -= L2_SIZE;
2166 continue;
2167 }
2168 }
2169
2170 /*
2171 * If we have an aligned, contiguous chunk of L3C_ENTRIES
2172 * L3 pages, set the contiguous bit within each PTE so that
2173 * the chunk can be cached using only one TLB entry.
2174 */
2175 if ((va & L3C_OFFSET) == 0 && (pa & L3C_OFFSET) == 0) {
2176 if (size >= L3C_SIZE)
2177 attr |= ATTR_CONTIGUOUS;
2178 else
2179 attr &= ~ATTR_CONTIGUOUS;
2180 }
2181
2182 pte = pmap_l2_to_l3(pde, va);
2183 old_l3e |= pmap_load_store(pte, PHYS_TO_PTE(pa) | attr |
2184 L3_PAGE);
2185
2186 va += PAGE_SIZE;
2187 pa += PAGE_SIZE;
2188 size -= PAGE_SIZE;
2189 }
2190 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2191 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2192 else {
2193 /*
2194 * Because the old entries were invalid and the new mappings
2195 * are not executable, an isb is not required.
2196 */
2197 dsb(ishst);
2198 }
2199 }
2200
2201 void
2202 pmap_kenter_device(vm_offset_t sva, vm_size_t size, vm_paddr_t pa)
2203 {
2204
2205 pmap_kenter(sva, size, pa, VM_MEMATTR_DEVICE);
2206 }
2207
2208 /*
2209 * Remove a page from the kernel pagetables.
2210 */
2211 void
2212 pmap_kremove(vm_offset_t va)
2213 {
2214 pt_entry_t *pte;
2215
2216 pte = pmap_pte_exists(kernel_pmap, va, 3, __func__);
2217 KASSERT((pmap_load(pte) & ATTR_CONTIGUOUS) == 0,
2218 ("pmap_kremove: unexpected ATTR_CONTIGUOUS"));
2219 pmap_clear(pte);
2220 pmap_s1_invalidate_page(kernel_pmap, va, true);
2221 }
2222
2223 /*
2224 * Remove the specified range of mappings from the kernel address space.
2225 *
2226 * Should only be applied to mappings that were created by pmap_kenter() or
2227 * pmap_kenter_device(). Nothing about this function is actually specific
2228 * to device mappings.
2229 */
2230 void
2231 pmap_kremove_device(vm_offset_t sva, vm_size_t size)
2232 {
2233 pt_entry_t *ptep, *ptep_end;
2234 vm_offset_t va;
2235 int lvl;
2236
2237 KASSERT((sva & L3_OFFSET) == 0,
2238 ("pmap_kremove_device: Invalid virtual address"));
2239 KASSERT((size & PAGE_MASK) == 0,
2240 ("pmap_kremove_device: Mapping is not page-sized"));
2241
2242 va = sva;
2243 while (size != 0) {
2244 ptep = pmap_pte(kernel_pmap, va, &lvl);
2245 KASSERT(ptep != NULL, ("Invalid page table, va: 0x%lx", va));
2246 switch (lvl) {
2247 case 2:
2248 KASSERT((va & L2_OFFSET) == 0,
2249 ("Unaligned virtual address"));
2250 KASSERT(size >= L2_SIZE, ("Insufficient size"));
2251
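/*
 * Flush the mappings removed so far in [sva, va) before this
 * L2 block entry is cleared and invalidated on its own.
 */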
2252 if (va != sva) {
2253 pmap_s1_invalidate_range(kernel_pmap, sva, va,
2254 true);
2255 }
2256 pmap_clear(ptep);
2257 pmap_s1_invalidate_page(kernel_pmap, va, true);
2258 PMAP_LOCK(kernel_pmap);
2259 pmap_remove_kernel_l2(kernel_pmap, ptep, va);
2260 PMAP_UNLOCK(kernel_pmap);
2261
2262 va += L2_SIZE;
2263 sva = va;
2264 size -= L2_SIZE;
2265 break;
2266 case 3:
2267 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
2268 KASSERT((va & L3C_OFFSET) == 0,
2269 ("Unaligned L3C virtual address"));
2270 KASSERT(size >= L3C_SIZE,
2271 ("Insufficient L3C size"));
2272
2273 ptep_end = ptep + L3C_ENTRIES;
2274 for (; ptep < ptep_end; ptep++)
2275 pmap_clear(ptep);
2276
2277 va += L3C_SIZE;
2278 size -= L3C_SIZE;
2279 break;
2280 }
2281 pmap_clear(ptep);
2282
2283 va += PAGE_SIZE;
2284 size -= PAGE_SIZE;
2285 break;
2286 default:
2287 __assert_unreachable();
2288 break;
2289 }
2290 }
2291 if (va != sva)
2292 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2293 }
2294
2295 /*
2296 * Used to map a range of physical addresses into kernel
2297 * virtual address space.
2298 *
2299 * The value passed in '*virt' is a suggested virtual address for
2300 * the mapping. Architectures which can support a direct-mapped
2301 * physical to virtual region can return the appropriate address
2302 * within that region, leaving '*virt' unchanged. Other
2303 * architectures should map the pages starting at '*virt' and
2304 * update '*virt' with the first usable address after the mapped
2305 * region.
2306 */
2307 vm_offset_t
2308 pmap_map(vm_offset_t *virt, vm_paddr_t start, vm_paddr_t end, int prot)
2309 {
2310 return (PHYS_TO_DMAP(start));
2311 }
2312
2313 /*
2314 * Add a list of wired pages to the kva.  This routine is only used
2315 * for temporary kernel mappings that do not need to have page
2316 * modification or references recorded.  Note that old mappings are
2317 * simply written over.  The pages *must* be wired.
2318 *
2319 * Note: SMP coherent.  On arm64 this relies on broadcast TLB
2320 * invalidation rather than a ranged shootdown IPI.
2321 */
2322 void
2323 pmap_qenter(vm_offset_t sva, vm_page_t *ma, int count)
2324 {
2325 pd_entry_t *pde;
2326 pt_entry_t attr, old_l3e, *pte;
2327 vm_offset_t va;
2328 vm_page_t m;
2329 int i, lvl;
2330
2331 old_l3e = 0;
2332 va = sva;
2333 for (i = 0; i < count; i++) {
2334 pde = pmap_pde(kernel_pmap, va, &lvl);
2335 KASSERT(pde != NULL,
2336 ("pmap_qenter: Invalid page entry, va: 0x%lx", va));
2337 KASSERT(lvl == 2,
2338 ("pmap_qenter: Invalid level %d", lvl));
2339
2340 m = ma[i];
2341 attr = ATTR_DEFAULT | ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_S1_XN |
2342 ATTR_KERN_GP | ATTR_S1_IDX(m->md.pv_memattr) | L3_PAGE;
2343 pte = pmap_l2_to_l3(pde, va);
2344 old_l3e |= pmap_load_store(pte, VM_PAGE_TO_PTE(m) | attr);
2345
2346 va += L3_SIZE;
2347 }
2348 if ((old_l3e & ATTR_DESCR_VALID) != 0)
2349 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2350 else {
2351 /*
2352 * Because the old entries were invalid and the new mappings
2353 * are not executable, an isb is not required.
2354 */
2355 dsb(ishst);
2356 }
2357 }
2358
2359 /*
2360 * This routine tears out page mappings from the
2361 * kernel -- it is meant only for temporary mappings.
2362 */
2363 void
2364 pmap_qremove(vm_offset_t sva, int count)
2365 {
2366 pt_entry_t *pte;
2367 vm_offset_t va;
2368
2369 KASSERT(ADDR_IS_CANONICAL(sva),
2370 ("%s: Address not in canonical form: %lx", __func__, sva));
2371 KASSERT(ADDR_IS_KERNEL(sva), ("usermode va %lx", sva));
2372
2373 va = sva;
2374 while (count-- > 0) {
2375 pte = pmap_pte_exists(kernel_pmap, va, 3, NULL);
2376 if (pte != NULL) {
2377 pmap_clear(pte);
2378 }
2379
2380 va += PAGE_SIZE;
2381 }
2382 pmap_s1_invalidate_range(kernel_pmap, sva, va, true);
2383 }
2384
2385 /***************************************************
2386 * Page table page management routines.....
2387 ***************************************************/
2388 /*
2389 * Schedule the specified unused page table page to be freed. Specifically,
2390 * add the page to the specified list of pages that will be released to the
2391 * physical memory manager after the TLB has been updated.
2392 */
2393 static __inline void
2394 pmap_add_delayed_free_list(vm_page_t m, struct spglist *free, bool set_PG_ZERO)
2395 {
2396
2397 if (set_PG_ZERO)
2398 m->flags |= PG_ZERO;
2399 else
2400 m->flags &= ~PG_ZERO;
2401 SLIST_INSERT_HEAD(free, m, plinks.s.ss);
2402 }
2403
2404 /*
2405 * Decrements a page table page's reference count, which is used to record the
2406 * number of valid page table entries within the page. If the reference count
2407 * drops to zero, then the page table page is unmapped. Returns true if the
2408 * page table page was unmapped and false otherwise.
2409 */
2410 static inline bool
2411 pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2412 {
2413
2414 --m->ref_count;
2415 if (m->ref_count == 0) {
2416 _pmap_unwire_l3(pmap, va, m, free);
2417 return (true);
2418 } else
2419 return (false);
2420 }
2421
2422 static void
2423 _pmap_unwire_l3(pmap_t pmap, vm_offset_t va, vm_page_t m, struct spglist *free)
2424 {
2425
2426 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2427 /*
2428 * unmap the page table page
2429 */
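/*
 * The pindex identifies the page table page's level: indices at or
 * above NUL2E + NUL1E are L1 pages, indices at or above NUL2E are L2
 * pages, and lower indices are L3 pages.
 */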
2430 if (m->pindex >= (NUL2E + NUL1E)) {
2431 /* l1 page */
2432 pd_entry_t *l0;
2433
2434 l0 = pmap_l0(pmap, va);
2435 pmap_clear(l0);
2436 } else if (m->pindex >= NUL2E) {
2437 /* l2 page */
2438 pd_entry_t *l1;
2439
2440 l1 = pmap_l1(pmap, va);
2441 pmap_clear(l1);
2442 } else {
2443 /* l3 page */
2444 pd_entry_t *l2;
2445
2446 l2 = pmap_l2(pmap, va);
2447 pmap_clear(l2);
2448 }
2449 pmap_resident_count_dec(pmap, 1);
2450 if (m->pindex < NUL2E) {
2451 /* We just released an l3, unhold the matching l2 */
2452 pd_entry_t *l1, tl1;
2453 vm_page_t l2pg;
2454
2455 l1 = pmap_l1(pmap, va);
2456 tl1 = pmap_load(l1);
2457 l2pg = PTE_TO_VM_PAGE(tl1);
2458 pmap_unwire_l3(pmap, va, l2pg, free);
2459 } else if (m->pindex < (NUL2E + NUL1E)) {
2460 /* We just released an l2, unhold the matching l1 */
2461 pd_entry_t *l0, tl0;
2462 vm_page_t l1pg;
2463
2464 l0 = pmap_l0(pmap, va);
2465 tl0 = pmap_load(l0);
2466 l1pg = PTE_TO_VM_PAGE(tl0);
2467 pmap_unwire_l3(pmap, va, l1pg, free);
2468 }
2469 pmap_invalidate_page(pmap, va, false);
2470
2471 /*
2472 * Put page on a list so that it is released after
2473 * *ALL* TLB shootdowns are done
2474 */
2475 pmap_add_delayed_free_list(m, free, true);
2476 }
2477
2478 /*
2479 * After removing a page table entry, this routine is used to
2480 * conditionally free the page and manage the reference count.
2481 */
2482 static int
2483 pmap_unuse_pt(pmap_t pmap, vm_offset_t va, pd_entry_t ptepde,
2484 struct spglist *free)
2485 {
2486 vm_page_t mpte;
2487
2488 KASSERT(ADDR_IS_CANONICAL(va),
2489 ("%s: Address not in canonical form: %lx", __func__, va));
2490 if (ADDR_IS_KERNEL(va))
2491 return (0);
2492 KASSERT(ptepde != 0, ("pmap_unuse_pt: ptepde != 0"));
2493 mpte = PTE_TO_VM_PAGE(ptepde);
2494 return (pmap_unwire_l3(pmap, va, mpte, free));
2495 }
2496
2497 /*
2498 * Release a page table page reference after a failed attempt to create a
2499 * mapping.
2500 */
2501 static void
2502 pmap_abort_ptp(pmap_t pmap, vm_offset_t va, vm_page_t mpte)
2503 {
2504 struct spglist free;
2505
2506 SLIST_INIT(&free);
2507 if (pmap_unwire_l3(pmap, va, mpte, &free))
2508 vm_page_free_pages_toq(&free, true);
2509 }
2510
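/*
 * Initialize the pmap for proc0.  Rather than allocating a new L0
 * table, reuse the translation table that was installed in ttbr0_el1
 * during early boot.
 */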
2511 void
2512 pmap_pinit0(pmap_t pmap)
2513 {
2514
2515 PMAP_LOCK_INIT(pmap);
2516 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2517 pmap->pm_l0_paddr = READ_SPECIALREG(ttbr0_el1);
2518 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2519 TAILQ_INIT(&pmap->pm_pvchunk);
2520 vm_radix_init(&pmap->pm_root);
2521 pmap->pm_cookie = COOKIE_FROM(ASID_RESERVED_FOR_PID_0, INT_MIN);
2522 pmap->pm_stage = PM_STAGE1;
2523 pmap->pm_levels = 4;
2524 pmap->pm_ttbr = pmap->pm_l0_paddr;
2525 pmap->pm_asid_set = &asids;
2526 pmap->pm_bti = NULL;
2527
2528 PCPU_SET(curpmap, pmap);
2529 }
2530
2531 int
2532 pmap_pinit_stage(pmap_t pmap, enum pmap_stage stage, int levels)
2533 {
2534 vm_page_t m;
2535
2536 /*
2537 * allocate the l0 page
2538 */
2539 m = vm_page_alloc_noobj(VM_ALLOC_WAITOK | VM_ALLOC_WIRED |
2540 VM_ALLOC_ZERO);
2541 pmap->pm_l0_paddr = VM_PAGE_TO_PHYS(m);
2542 pmap->pm_l0 = (pd_entry_t *)PHYS_TO_DMAP(pmap->pm_l0_paddr);
2543
2544 TAILQ_INIT(&pmap->pm_pvchunk);
2545 vm_radix_init(&pmap->pm_root);
2546 bzero(&pmap->pm_stats, sizeof(pmap->pm_stats));
2547 pmap->pm_cookie = COOKIE_FROM(-1, INT_MAX);
2548
2549 MPASS(levels == 3 || levels == 4);
2550 pmap->pm_levels = levels;
2551 pmap->pm_stage = stage;
2552 pmap->pm_bti = NULL;
2553 switch (stage) {
2554 case PM_STAGE1:
2555 pmap->pm_asid_set = &asids;
2556 if (pmap_bti_support) {
2557 pmap->pm_bti = malloc(sizeof(struct rangeset), M_DEVBUF,
2558 M_ZERO | M_WAITOK);
2559 rangeset_init(pmap->pm_bti, bti_dup_range,
2560 bti_free_range, pmap, M_NOWAIT);
2561 }
2562 break;
2563 case PM_STAGE2:
2564 pmap->pm_asid_set = &vmids;
2565 break;
2566 default:
2567 panic("%s: Invalid pmap type %d", __func__, stage);
2568 break;
2569 }
2570
2571 /* XXX Temporarily disable deferred ASID allocation. */
2572 pmap_alloc_asid(pmap);
2573
2574 /*
2575 * Allocate the level 1 entry to use as the root. This will increase
2576 * the refcount on the level 1 page so it won't be removed until
2577 * pmap_release() is called.
2578 */
2579 if (pmap->pm_levels == 3) {
2580 PMAP_LOCK(pmap);
2581 m = _pmap_alloc_l3(pmap, NUL2E + NUL1E, NULL);
2582 PMAP_UNLOCK(pmap);
2583 }
2584 pmap->pm_ttbr = VM_PAGE_TO_PHYS(m);
2585
2586 return (1);
2587 }
2588
2589 int
2590 pmap_pinit(pmap_t pmap)
2591 {
2592
2593 return (pmap_pinit_stage(pmap, PM_STAGE1, 4));
2594 }
2595
2596 /*
2597 * This routine is called if the desired page table page does not exist.
2598 *
2599 * If page table page allocation fails, this routine may sleep before
2600 * returning NULL. It sleeps only if a lock pointer was given.
2601 *
2602 * Note: If a page allocation fails at page table level two or three,
2603 * one or two pages may be held during the wait, only to be released
2604 * afterwards.  This conservative approach is simple to reason about
2605 * and avoids race conditions.
2606 */
2607 static vm_page_t
2608 _pmap_alloc_l3(pmap_t pmap, vm_pindex_t ptepindex, struct rwlock **lockp)
2609 {
2610 vm_page_t m, l1pg, l2pg;
2611
2612 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
2613
2614 /*
2615 * Allocate a page table page.
2616 */
2617 if ((m = vm_page_alloc_noobj(VM_ALLOC_WIRED | VM_ALLOC_ZERO)) == NULL) {
2618 if (lockp != NULL) {
2619 RELEASE_PV_LIST_LOCK(lockp);
2620 PMAP_UNLOCK(pmap);
2621 vm_wait(NULL);
2622 PMAP_LOCK(pmap);
2623 }
2624
2625 /*
2626 * Indicate the need to retry. While waiting, the page table
2627 * page may have been allocated.
2628 */
2629 return (NULL);
2630 }
2631 m->pindex = ptepindex;
2632
2633 /*
2634 * Because of AArch64's weak memory consistency model, we must have a
2635 * barrier here to ensure that the stores for zeroing "m", whether by
2636 * pmap_zero_page() or an earlier function, are visible before adding
2637 * "m" to the page table. Otherwise, a page table walk by another
2638 * processor's MMU could see the mapping to "m" and a stale, non-zero
2639 * PTE within "m".
2640 */
2641 dmb(ishst);
2642
2643 /*
2644 * Map the pagetable page into the process address space, if
2645 * it isn't already there.
2646 */
2647
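/*
 * The pindex encodes the level of the new page table page: the
 * highest indices denote L1 pages, which are linked into the L0
 * table; the next range denotes L2 pages; the rest denote L3 pages.
 */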
2648 if (ptepindex >= (NUL2E + NUL1E)) {
2649 pd_entry_t *l0p, l0e;
2650 vm_pindex_t l0index;
2651
2652 l0index = ptepindex - (NUL2E + NUL1E);
2653 l0p = &pmap->pm_l0[l0index];
2654 KASSERT((pmap_load(l0p) & ATTR_DESCR_VALID) == 0,
2655 ("%s: L0 entry %#lx is valid", __func__, pmap_load(l0p)));
2656 l0e = VM_PAGE_TO_PTE(m) | L0_TABLE;
2657
2658 /*
2659 * Mark all kernel memory as not accessible from userspace
2660 * and userspace memory as not executable from the kernel.
2661 * This has been done for the bootstrap L0 entries in
2662 * locore.S.
2663 */
2664 if (pmap == kernel_pmap)
2665 l0e |= TATTR_UXN_TABLE | TATTR_AP_TABLE_NO_EL0;
2666 else
2667 l0e |= TATTR_PXN_TABLE;
2668 pmap_store(l0p, l0e);
2669 } else if (ptepindex >= NUL2E) {
2670 vm_pindex_t l0index, l1index;
2671 pd_entry_t *l0, *l1;
2672 pd_entry_t tl0;
2673
2674 l1index = ptepindex - NUL2E;
2675 l0index = l1index >> Ln_ENTRIES_SHIFT;
2676
2677 l0 = &pmap->pm_l0[l0index];
2678 tl0 = pmap_load(l0);
2679 if (tl0 == 0) {
2680 /* recurse for allocating page dir */
2681 if (_pmap_alloc_l3(pmap, NUL2E + NUL1E + l0index,
2682 lockp) == NULL) {
2683 vm_page_unwire_noq(m);
2684 vm_page_free_zero(m);
2685 return (NULL);
2686 }
2687 } else {
2688 l1pg = PTE_TO_VM_PAGE(tl0);
2689 l1pg->ref_count++;
2690 }
2691
2692 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l0)));
2693 l1 = &l1[ptepindex & Ln_ADDR_MASK];
2694 KASSERT((pmap_load(l1) & ATTR_DESCR_VALID) == 0,
2695 ("%s: L1 entry %#lx is valid", __func__, pmap_load(l1)));
2696 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
2697 } else {
2698 vm_pindex_t l0index, l1index;
2699 pd_entry_t *l0, *l1, *l2;
2700 pd_entry_t tl0, tl1;
2701
2702 l1index = ptepindex >> Ln_ENTRIES_SHIFT;
2703 l0index = l1index >> Ln_ENTRIES_SHIFT;
2704
2705 l0 = &pmap->pm_l0[l0index];
2706 tl0 = pmap_load(l0);
2707 if (tl0 == 0) {
2708 /* recurse for allocating page dir */
2709 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2710 lockp) == NULL) {
2711 vm_page_unwire_noq(m);
2712 vm_page_free_zero(m);
2713 return (NULL);
2714 }
2715 tl0 = pmap_load(l0);
2716 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2717 l1 = &l1[l1index & Ln_ADDR_MASK];
2718 } else {
2719 l1 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(tl0));
2720 l1 = &l1[l1index & Ln_ADDR_MASK];
2721 tl1 = pmap_load(l1);
2722 if (tl1 == 0) {
2723 /* recurse for allocating page dir */
2724 if (_pmap_alloc_l3(pmap, NUL2E + l1index,
2725 lockp) == NULL) {
2726 vm_page_unwire_noq(m);
2727 vm_page_free_zero(m);
2728 return (NULL);
2729 }
2730 } else {
2731 l2pg = PTE_TO_VM_PAGE(tl1);
2732 l2pg->ref_count++;
2733 }
2734 }
2735
2736 l2 = (pd_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l1)));
2737 l2 = &l2[ptepindex & Ln_ADDR_MASK];
2738 KASSERT((pmap_load(l2) & ATTR_DESCR_VALID) == 0,
2739 ("%s: L2 entry %#lx is valid", __func__, pmap_load(l2)));
2740 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
2741 }
2742
2743 pmap_resident_count_inc(pmap, 1);
2744
2745 return (m);
2746 }
2747
2748 static pd_entry_t *
2749 pmap_alloc_l2(pmap_t pmap, vm_offset_t va, vm_page_t *l2pgp,
2750 struct rwlock **lockp)
2751 {
2752 pd_entry_t *l1, *l2;
2753 vm_page_t l2pg;
2754 vm_pindex_t l2pindex;
2755
2756 KASSERT(ADDR_IS_CANONICAL(va),
2757 ("%s: Address not in canonical form: %lx", __func__, va));
2758
2759 retry:
2760 l1 = pmap_l1(pmap, va);
2761 if (l1 != NULL && (pmap_load(l1) & ATTR_DESCR_MASK) == L1_TABLE) {
2762 l2 = pmap_l1_to_l2(l1, va);
2763 if (!ADDR_IS_KERNEL(va)) {
2764 /* Add a reference to the L2 page. */
2765 l2pg = PTE_TO_VM_PAGE(pmap_load(l1));
2766 l2pg->ref_count++;
2767 } else
2768 l2pg = NULL;
2769 } else if (!ADDR_IS_KERNEL(va)) {
2770 /* Allocate a L2 page. */
2771 l2pindex = pmap_l2_pindex(va) >> Ln_ENTRIES_SHIFT;
2772 l2pg = _pmap_alloc_l3(pmap, NUL2E + l2pindex, lockp);
2773 if (l2pg == NULL) {
2774 if (lockp != NULL)
2775 goto retry;
2776 else
2777 return (NULL);
2778 }
2779 l2 = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(l2pg));
2780 l2 = &l2[pmap_l2_index(va)];
2781 } else
2782 panic("pmap_alloc_l2: missing page table page for va %#lx",
2783 va);
2784 *l2pgp = l2pg;
2785 return (l2);
2786 }
2787
2788 static vm_page_t
2789 pmap_alloc_l3(pmap_t pmap, vm_offset_t va, struct rwlock **lockp)
2790 {
2791 vm_pindex_t ptepindex;
2792 pd_entry_t *pde, tpde;
2793 #ifdef INVARIANTS
2794 pt_entry_t *pte;
2795 #endif
2796 vm_page_t m;
2797 int lvl;
2798
2799 /*
2800 * Calculate pagetable page index
2801 */
2802 ptepindex = pmap_l2_pindex(va);
2803 retry:
2804 /*
2805 * Get the page directory entry
2806 */
2807 pde = pmap_pde(pmap, va, &lvl);
2808
2809 /*
2810 * If the page table page is mapped, we just increment the hold count,
2811 * and activate it. If we get a level 2 pde it will point to a level 3
2812 * table.
2813 */
2814 switch (lvl) {
2815 case -1:
2816 break;
2817 case 0:
2818 #ifdef INVARIANTS
2819 pte = pmap_l0_to_l1(pde, va);
2820 KASSERT(pmap_load(pte) == 0,
2821 ("pmap_alloc_l3: TODO: l0 superpages"));
2822 #endif
2823 break;
2824 case 1:
2825 #ifdef INVARIANTS
2826 pte = pmap_l1_to_l2(pde, va);
2827 KASSERT(pmap_load(pte) == 0,
2828 ("pmap_alloc_l3: TODO: l1 superpages"));
2829 #endif
2830 break;
2831 case 2:
2832 tpde = pmap_load(pde);
2833 if (tpde != 0) {
2834 m = PTE_TO_VM_PAGE(tpde);
2835 m->ref_count++;
2836 return (m);
2837 }
2838 break;
2839 default:
2840 panic("pmap_alloc_l3: Invalid level %d", lvl);
2841 }
2842
2843 /*
2844 * Here if the pte page isn't mapped, or if it has been deallocated.
2845 */
2846 m = _pmap_alloc_l3(pmap, ptepindex, lockp);
2847 if (m == NULL && lockp != NULL)
2848 goto retry;
2849
2850 return (m);
2851 }
2852
2853 /***************************************************
2854 * Pmap allocation/deallocation routines.
2855 ***************************************************/
2856
2857 /*
2858 * Release any resources held by the given physical map.
2859 * Called when a pmap initialized by pmap_pinit is being released.
2860 * Should only be called if the map contains no valid mappings.
2861 */
2862 void
2863 pmap_release(pmap_t pmap)
2864 {
2865 bool rv __diagused;
2866 struct spglist freelist;
2867 struct asid_set *set;
2868 vm_page_t m;
2869 int asid;
2870
2871 if (pmap->pm_levels != 4) {
2872 PMAP_ASSERT_STAGE2(pmap);
2873 KASSERT(pmap->pm_stats.resident_count == 1,
2874 ("pmap_release: pmap resident count %ld != 0",
2875 pmap->pm_stats.resident_count));
2876 KASSERT((pmap->pm_l0[0] & ATTR_DESCR_VALID) == ATTR_DESCR_VALID,
2877 ("pmap_release: Invalid l0 entry: %lx", pmap->pm_l0[0]));
2878
2879 SLIST_INIT(&freelist);
2880 m = PHYS_TO_VM_PAGE(pmap->pm_ttbr);
2881 PMAP_LOCK(pmap);
2882 rv = pmap_unwire_l3(pmap, 0, m, &freelist);
2883 PMAP_UNLOCK(pmap);
2884 MPASS(rv == true);
2885 vm_page_free_pages_toq(&freelist, true);
2886 }
2887
2888 KASSERT(pmap->pm_stats.resident_count == 0,
2889 ("pmap_release: pmap resident count %ld != 0",
2890 pmap->pm_stats.resident_count));
2891 KASSERT(vm_radix_is_empty(&pmap->pm_root),
2892 ("pmap_release: pmap has reserved page table page(s)"));
2893
2894 set = pmap->pm_asid_set;
2895 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
2896
2897 /*
2898 * Allow the ASID to be reused.  For stage 2 pmaps we don't invalidate
2899 * the TLB entries when removing them, relying instead on the TLB
2900 * invalidation that occurs when the VMID generation is updated.
2901 * Because of this we don't reuse VMIDs within a generation.
2902 */
2903 if (pmap->pm_stage == PM_STAGE1) {
2904 mtx_lock_spin(&set->asid_set_mutex);
2905 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch) {
2906 asid = COOKIE_TO_ASID(pmap->pm_cookie);
2907 KASSERT(asid >= ASID_FIRST_AVAILABLE &&
2908 asid < set->asid_set_size,
2909 ("pmap_release: pmap cookie has out-of-range asid"));
2910 bit_clear(set->asid_set, asid);
2911 }
2912 mtx_unlock_spin(&set->asid_set_mutex);
2913
2914 if (pmap->pm_bti != NULL) {
2915 rangeset_fini(pmap->pm_bti);
2916 free(pmap->pm_bti, M_DEVBUF);
2917 }
2918 }
2919
2920 m = PHYS_TO_VM_PAGE(pmap->pm_l0_paddr);
2921 vm_page_unwire_noq(m);
2922 vm_page_free_zero(m);
2923 }
2924
2925 static int
2926 kvm_size(SYSCTL_HANDLER_ARGS)
2927 {
2928 unsigned long ksize = VM_MAX_KERNEL_ADDRESS - VM_MIN_KERNEL_ADDRESS;
2929
2930 return (sysctl_handle_long(oidp, &ksize, 0, req));
2931 }
2932 SYSCTL_PROC(_vm, OID_AUTO, kvm_size, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2933 0, 0, kvm_size, "LU",
2934 "Size of KVM");
2935
2936 static int
2937 kvm_free(SYSCTL_HANDLER_ARGS)
2938 {
2939 unsigned long kfree = VM_MAX_KERNEL_ADDRESS - kernel_vm_end;
2940
2941 return (sysctl_handle_long(oidp, &kfree, 0, req));
2942 }
2943 SYSCTL_PROC(_vm, OID_AUTO, kvm_free, CTLTYPE_LONG | CTLFLAG_RD | CTLFLAG_MPSAFE,
2944 0, 0, kvm_free, "LU",
2945 "Amount of KVM free");
2946
2947 /*
2948 * Grow the number of kernel page table entries, if needed.
2949 */
2950 void
2951 pmap_growkernel(vm_offset_t addr)
2952 {
2953 vm_page_t nkpg;
2954 pd_entry_t *l0, *l1, *l2;
2955
2956 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
2957
2958 addr = roundup2(addr, L2_SIZE);
2959 if (addr - 1 >= vm_map_max(kernel_map))
2960 addr = vm_map_max(kernel_map);
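/*
 * Grow the KASAN and KMSAN shadow maps in step with the kernel map
 * so that the newly added region is covered before it is used.
 */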
2961 if (kernel_vm_end < addr) {
2962 kasan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2963 kmsan_shadow_map(kernel_vm_end, addr - kernel_vm_end);
2964 }
2965 while (kernel_vm_end < addr) {
2966 l0 = pmap_l0(kernel_pmap, kernel_vm_end);
2967 KASSERT(pmap_load(l0) != 0,
2968 ("pmap_growkernel: No level 0 kernel entry"));
2969
2970 l1 = pmap_l0_to_l1(l0, kernel_vm_end);
2971 if (pmap_load(l1) == 0) {
2972 /* We need a new PDP entry */
2973 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT |
2974 VM_ALLOC_WIRED | VM_ALLOC_ZERO);
2975 if (nkpg == NULL)
2976 panic("pmap_growkernel: no memory to grow kernel");
2977 nkpg->pindex = kernel_vm_end >> L1_SHIFT;
2978 /* See the dmb() in _pmap_alloc_l3(). */
2979 dmb(ishst);
2980 pmap_store(l1, VM_PAGE_TO_PTE(nkpg) | L1_TABLE);
2981 continue; /* try again */
2982 }
2983 l2 = pmap_l1_to_l2(l1, kernel_vm_end);
2984 if (pmap_load(l2) != 0) {
2985 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
2986 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
2987 kernel_vm_end = vm_map_max(kernel_map);
2988 break;
2989 }
2990 continue;
2991 }
2992
2993 nkpg = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
2994 VM_ALLOC_ZERO);
2995 if (nkpg == NULL)
2996 panic("pmap_growkernel: no memory to grow kernel");
2997 nkpg->pindex = kernel_vm_end >> L2_SHIFT;
2998 /* See the dmb() in _pmap_alloc_l3(). */
2999 dmb(ishst);
3000 pmap_store(l2, VM_PAGE_TO_PTE(nkpg) | L2_TABLE);
3001
3002 kernel_vm_end = (kernel_vm_end + L2_SIZE) & ~L2_OFFSET;
3003 if (kernel_vm_end - 1 >= vm_map_max(kernel_map)) {
3004 kernel_vm_end = vm_map_max(kernel_map);
3005 break;
3006 }
3007 }
3008 }
3009
3010 /***************************************************
3011 * page management routines.
3012 ***************************************************/
3013
3014 static const uint64_t pc_freemask[_NPCM] = {
3015 [0 ... _NPCM - 2] = PC_FREEN,
3016 [_NPCM - 1] = PC_FREEL
3017 };
3018
3019 #ifdef PV_STATS
3020 static int pc_chunk_count, pc_chunk_allocs, pc_chunk_frees, pc_chunk_tryfail;
3021
3022 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_count, CTLFLAG_RD, &pc_chunk_count, 0,
3023 "Current number of pv entry chunks");
3024 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_allocs, CTLFLAG_RD, &pc_chunk_allocs, 0,
3025 "Current number of pv entry chunks allocated");
3026 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_frees, CTLFLAG_RD, &pc_chunk_frees, 0,
3027 "Current number of pv entry chunks frees");
3028 SYSCTL_INT(_vm_pmap, OID_AUTO, pc_chunk_tryfail, CTLFLAG_RD, &pc_chunk_tryfail, 0,
3029 "Number of times tried to get a chunk page but failed.");
3030
3031 static long pv_entry_frees, pv_entry_allocs, pv_entry_count;
3032 static int pv_entry_spare;
3033
3034 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_frees, CTLFLAG_RD, &pv_entry_frees, 0,
3035 "Current number of pv entry frees");
3036 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_allocs, CTLFLAG_RD, &pv_entry_allocs, 0,
3037 "Current number of pv entry allocs");
3038 SYSCTL_LONG(_vm_pmap, OID_AUTO, pv_entry_count, CTLFLAG_RD, &pv_entry_count, 0,
3039 "Current number of pv entries");
3040 SYSCTL_INT(_vm_pmap, OID_AUTO, pv_entry_spare, CTLFLAG_RD, &pv_entry_spare, 0,
3041 "Current number of spare pv entries");
3042 #endif
3043
3044 /*
3045 * We are in a serious low memory condition. Resort to
3046 * drastic measures to free some pages so we can allocate
3047 * another pv entry chunk.
3048 *
3049 * Returns NULL if PV entries were reclaimed from the specified pmap.
3050 *
3051 * We do not, however, unmap 2mpages because subsequent accesses will
3052 * allocate per-page pv entries until repromotion occurs, thereby
3053 * exacerbating the shortage of free pv entries.
3054 */
3055 static vm_page_t
3056 reclaim_pv_chunk_domain(pmap_t locked_pmap, struct rwlock **lockp, int domain)
3057 {
3058 struct pv_chunks_list *pvc;
3059 struct pv_chunk *pc, *pc_marker, *pc_marker_end;
3060 struct pv_chunk_header pc_marker_b, pc_marker_end_b;
3061 struct md_page *pvh;
3062 pd_entry_t *pde;
3063 pmap_t next_pmap, pmap;
3064 pt_entry_t *pte, tpte;
3065 pv_entry_t pv;
3066 vm_offset_t va;
3067 vm_page_t m, m_pc;
3068 struct spglist free;
3069 uint64_t inuse;
3070 int bit, field, freed, lvl;
3071
3072 PMAP_LOCK_ASSERT(locked_pmap, MA_OWNED);
3073 KASSERT(lockp != NULL, ("reclaim_pv_chunk: lockp is NULL"));
3074
3075 pmap = NULL;
3076 m_pc = NULL;
3077 SLIST_INIT(&free);
3078 bzero(&pc_marker_b, sizeof(pc_marker_b));
3079 bzero(&pc_marker_end_b, sizeof(pc_marker_end_b));
3080 pc_marker = (struct pv_chunk *)&pc_marker_b;
3081 pc_marker_end = (struct pv_chunk *)&pc_marker_end_b;
3082
3083 pvc = &pv_chunks[domain];
3084 mtx_lock(&pvc->pvc_lock);
3085 pvc->active_reclaims++;
3086 TAILQ_INSERT_HEAD(&pvc->pvc_list, pc_marker, pc_lru);
3087 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc_marker_end, pc_lru);
3088 while ((pc = TAILQ_NEXT(pc_marker, pc_lru)) != pc_marker_end &&
3089 SLIST_EMPTY(&free)) {
3090 next_pmap = pc->pc_pmap;
3091 if (next_pmap == NULL) {
3092 /*
3093 * The next chunk is a marker. However, it is
3094 * not our marker, so active_reclaims must be
3095 * > 1. Consequently, the next_chunk code
3096 * will not rotate the pv_chunks list.
3097 */
3098 goto next_chunk;
3099 }
3100 mtx_unlock(&pvc->pvc_lock);
3101
3102 /*
3103 * A pv_chunk can only be removed from the pc_lru list
3104 * when both pvc->pvc_lock is owned and the
3105 * corresponding pmap is locked.
3106 */
3107 if (pmap != next_pmap) {
3108 if (pmap != NULL && pmap != locked_pmap)
3109 PMAP_UNLOCK(pmap);
3110 pmap = next_pmap;
3111 /* Avoid deadlock and lock recursion. */
3112 if (pmap > locked_pmap) {
3113 RELEASE_PV_LIST_LOCK(lockp);
3114 PMAP_LOCK(pmap);
3115 mtx_lock(&pvc->pvc_lock);
3116 continue;
3117 } else if (pmap != locked_pmap) {
3118 if (PMAP_TRYLOCK(pmap)) {
3119 mtx_lock(&pvc->pvc_lock);
3120 continue;
3121 } else {
3122 pmap = NULL; /* pmap is not locked */
3123 mtx_lock(&pvc->pvc_lock);
3124 pc = TAILQ_NEXT(pc_marker, pc_lru);
3125 if (pc == NULL ||
3126 pc->pc_pmap != next_pmap)
3127 continue;
3128 goto next_chunk;
3129 }
3130 }
3131 }
3132
3133 /*
3134 * Destroy every non-wired, 4 KB page mapping in the chunk.
3135 */
3136 freed = 0;
3137 for (field = 0; field < _NPCM; field++) {
3138 for (inuse = ~pc->pc_map[field] & pc_freemask[field];
3139 inuse != 0; inuse &= ~(1UL << bit)) {
3140 bit = ffsl(inuse) - 1;
3141 pv = &pc->pc_pventry[field * 64 + bit];
3142 va = pv->pv_va;
3143 pde = pmap_pde(pmap, va, &lvl);
3144 if (lvl != 2)
3145 continue;
3146 pte = pmap_l2_to_l3(pde, va);
3147 tpte = pmap_load(pte);
3148 if ((tpte & ATTR_SW_WIRED) != 0)
3149 continue;
3150 if ((tpte & ATTR_CONTIGUOUS) != 0)
3151 (void)pmap_demote_l3c(pmap, pte, va);
3152 tpte = pmap_load_clear(pte);
3153 m = PTE_TO_VM_PAGE(tpte);
3154 if (pmap_pte_dirty(pmap, tpte))
3155 vm_page_dirty(m);
3156 if ((tpte & ATTR_AF) != 0) {
3157 pmap_s1_invalidate_page(pmap, va, true);
3158 vm_page_aflag_set(m, PGA_REFERENCED);
3159 }
3160 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3161 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
3162 m->md.pv_gen++;
3163 if (TAILQ_EMPTY(&m->md.pv_list) &&
3164 (m->flags & PG_FICTITIOUS) == 0) {
3165 pvh = page_to_pvh(m);
3166 if (TAILQ_EMPTY(&pvh->pv_list)) {
3167 vm_page_aflag_clear(m,
3168 PGA_WRITEABLE);
3169 }
3170 }
3171 pc->pc_map[field] |= 1UL << bit;
3172 pmap_unuse_pt(pmap, va, pmap_load(pde), &free);
3173 freed++;
3174 }
3175 }
3176 if (freed == 0) {
3177 mtx_lock(&pvc->pvc_lock);
3178 goto next_chunk;
3179 }
3180 /* Every freed mapping is for a 4 KB page. */
3181 pmap_resident_count_dec(pmap, freed);
3182 PV_STAT(atomic_add_long(&pv_entry_frees, freed));
3183 PV_STAT(atomic_add_int(&pv_entry_spare, freed));
3184 PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
3185 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3186 if (pc_is_free(pc)) {
3187 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3188 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3189 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3190 /* Entire chunk is free; return it. */
3191 m_pc = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3192 dump_drop_page(m_pc->phys_addr);
3193 mtx_lock(&pvc->pvc_lock);
3194 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3195 break;
3196 }
3197 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3198 mtx_lock(&pvc->pvc_lock);
3199 /* One freed pv entry in locked_pmap is sufficient. */
3200 if (pmap == locked_pmap)
3201 break;
3202
3203 next_chunk:
3204 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3205 TAILQ_INSERT_AFTER(&pvc->pvc_list, pc, pc_marker, pc_lru);
3206 if (pvc->active_reclaims == 1 && pmap != NULL) {
3207 /*
3208 * Rotate the pv chunks list so that we do not
3209 * scan the same pv chunks that could not be
3210 * freed (because they contained a wired
3211 * and/or superpage mapping) on every
3212 * invocation of reclaim_pv_chunk().
3213 */
3214 while ((pc = TAILQ_FIRST(&pvc->pvc_list)) != pc_marker){
3215 MPASS(pc->pc_pmap != NULL);
3216 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3217 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3218 }
3219 }
3220 }
3221 TAILQ_REMOVE(&pvc->pvc_list, pc_marker, pc_lru);
3222 TAILQ_REMOVE(&pvc->pvc_list, pc_marker_end, pc_lru);
3223 pvc->active_reclaims--;
3224 mtx_unlock(&pvc->pvc_lock);
3225 if (pmap != NULL && pmap != locked_pmap)
3226 PMAP_UNLOCK(pmap);
3227 if (m_pc == NULL && !SLIST_EMPTY(&free)) {
3228 m_pc = SLIST_FIRST(&free);
3229 SLIST_REMOVE_HEAD(&free, plinks.s.ss);
3230 /* Recycle a freed page table page. */
3231 m_pc->ref_count = 1;
3232 }
3233 vm_page_free_pages_toq(&free, true);
3234 return (m_pc);
3235 }
3236
3237 static vm_page_t
3238 reclaim_pv_chunk(pmap_t locked_pmap, struct rwlock **lockp)
3239 {
3240 vm_page_t m;
3241 int i, domain;
3242
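/*
 * Prefer the current CPU's memory domain, then try the remaining
 * domains round-robin until a page is reclaimed.
 */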
3243 domain = PCPU_GET(domain);
3244 for (i = 0; i < vm_ndomains; i++) {
3245 m = reclaim_pv_chunk_domain(locked_pmap, lockp, domain);
3246 if (m != NULL)
3247 break;
3248 domain = (domain + 1) % vm_ndomains;
3249 }
3250
3251 return (m);
3252 }
3253
3254 /*
3255 * Free the pv_entry back to the free list.
3256 */
3257 static void
3258 free_pv_entry(pmap_t pmap, pv_entry_t pv)
3259 {
3260 struct pv_chunk *pc;
3261 int idx, field, bit;
3262
3263 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3264 PV_STAT(atomic_add_long(&pv_entry_frees, 1));
3265 PV_STAT(atomic_add_int(&pv_entry_spare, 1));
3266 PV_STAT(atomic_subtract_long(&pv_entry_count, 1));
3267 pc = pv_to_chunk(pv);
3268 idx = pv - &pc->pc_pventry[0];
3269 field = idx / 64;
3270 bit = idx % 64;
3271 pc->pc_map[field] |= 1ul << bit;
3272 if (!pc_is_free(pc)) {
3273 /* 98% of the time, pc is already at the head of the list. */
3274 if (__predict_false(pc != TAILQ_FIRST(&pmap->pm_pvchunk))) {
3275 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3276 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3277 }
3278 return;
3279 }
3280 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3281 free_pv_chunk(pc);
3282 }
3283
3284 static void
3285 free_pv_chunk_dequeued(struct pv_chunk *pc)
3286 {
3287 vm_page_t m;
3288
3289 PV_STAT(atomic_subtract_int(&pv_entry_spare, _NPCPV));
3290 PV_STAT(atomic_subtract_int(&pc_chunk_count, 1));
3291 PV_STAT(atomic_add_int(&pc_chunk_frees, 1));
3292 /* entire chunk is free, return it */
3293 m = PHYS_TO_VM_PAGE(DMAP_TO_PHYS((vm_offset_t)pc));
3294 dump_drop_page(m->phys_addr);
3295 vm_page_unwire_noq(m);
3296 vm_page_free(m);
3297 }
3298
3299 static void
3300 free_pv_chunk(struct pv_chunk *pc)
3301 {
3302 struct pv_chunks_list *pvc;
3303
3304 pvc = &pv_chunks[pc_to_domain(pc)];
3305 mtx_lock(&pvc->pvc_lock);
3306 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3307 mtx_unlock(&pvc->pvc_lock);
3308 free_pv_chunk_dequeued(pc);
3309 }
3310
3311 static void
3312 free_pv_chunk_batch(struct pv_chunklist *batch)
3313 {
3314 struct pv_chunks_list *pvc;
3315 struct pv_chunk *pc, *npc;
3316 int i;
3317
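/*
 * First unlink every chunk in the batch from its per-domain LRU list,
 * taking each domain's lock only once, and then free the chunks
 * without any list locks held.
 */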
3318 for (i = 0; i < vm_ndomains; i++) {
3319 if (TAILQ_EMPTY(&batch[i]))
3320 continue;
3321 pvc = &pv_chunks[i];
3322 mtx_lock(&pvc->pvc_lock);
3323 TAILQ_FOREACH(pc, &batch[i], pc_list) {
3324 TAILQ_REMOVE(&pvc->pvc_list, pc, pc_lru);
3325 }
3326 mtx_unlock(&pvc->pvc_lock);
3327 }
3328
3329 for (i = 0; i < vm_ndomains; i++) {
3330 TAILQ_FOREACH_SAFE(pc, &batch[i], pc_list, npc) {
3331 free_pv_chunk_dequeued(pc);
3332 }
3333 }
3334 }
3335
3336 /*
3337 * Returns a new PV entry, allocating a new PV chunk from the system when
3338 * needed. If this PV chunk allocation fails and a PV list lock pointer was
3339 * given, a PV chunk is reclaimed from an arbitrary pmap. Otherwise, NULL is
3340 * returned.
3341 *
3342 * The given PV list lock may be released.
3343 */
3344 static pv_entry_t
3345 get_pv_entry(pmap_t pmap, struct rwlock **lockp)
3346 {
3347 struct pv_chunks_list *pvc;
3348 int bit, field;
3349 pv_entry_t pv;
3350 struct pv_chunk *pc;
3351 vm_page_t m;
3352
3353 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3354 PV_STAT(atomic_add_long(&pv_entry_allocs, 1));
3355 retry:
3356 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3357 if (pc != NULL) {
3358 for (field = 0; field < _NPCM; field++) {
3359 if (pc->pc_map[field]) {
3360 bit = ffsl(pc->pc_map[field]) - 1;
3361 break;
3362 }
3363 }
3364 if (field < _NPCM) {
3365 pv = &pc->pc_pventry[field * 64 + bit];
3366 pc->pc_map[field] &= ~(1ul << bit);
3367 /* If this was the last item, move it to tail */
3368 if (pc_is_full(pc)) {
3369 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3370 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc,
3371 pc_list);
3372 }
3373 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3374 PV_STAT(atomic_subtract_int(&pv_entry_spare, 1));
3375 return (pv);
3376 }
3377 }
3378 /* No free items, allocate another chunk */
3379 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3380 if (m == NULL) {
3381 if (lockp == NULL) {
3382 PV_STAT(pc_chunk_tryfail++);
3383 return (NULL);
3384 }
3385 m = reclaim_pv_chunk(pmap, lockp);
3386 if (m == NULL)
3387 goto retry;
3388 }
3389 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3390 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3391 dump_add_page(m->phys_addr);
3392 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3393 pc->pc_pmap = pmap;
3394 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3395 pc->pc_map[0] &= ~1ul; /* preallocated bit 0 */
3396 pvc = &pv_chunks[vm_page_domain(m)];
3397 mtx_lock(&pvc->pvc_lock);
3398 TAILQ_INSERT_TAIL(&pvc->pvc_list, pc, pc_lru);
3399 mtx_unlock(&pvc->pvc_lock);
3400 pv = &pc->pc_pventry[0];
3401 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3402 PV_STAT(atomic_add_long(&pv_entry_count, 1));
3403 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV - 1));
3404 return (pv);
3405 }
3406
3407 /*
3408 * Ensure that the number of spare PV entries in the specified pmap meets or
3409 * exceeds the given count, "needed".
3410 *
3411 * The given PV list lock may be released.
3412 */
3413 static void
3414 reserve_pv_entries(pmap_t pmap, int needed, struct rwlock **lockp)
3415 {
3416 struct pv_chunks_list *pvc;
3417 struct pch new_tail[PMAP_MEMDOM];
3418 struct pv_chunk *pc;
3419 vm_page_t m;
3420 int avail, free, i;
3421 bool reclaimed;
3422
3423 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3424 KASSERT(lockp != NULL, ("reserve_pv_entries: lockp is NULL"));
3425
3426 /*
3427 * Newly allocated PV chunks must be stored in a private list until
3428 * the required number of PV chunks have been allocated. Otherwise,
3429 * reclaim_pv_chunk() could recycle one of these chunks. In
3430 * contrast, these chunks must be added to the pmap upon allocation.
3431 */
3432 for (i = 0; i < PMAP_MEMDOM; i++)
3433 TAILQ_INIT(&new_tail[i]);
3434 retry:
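/*
 * Count the PV entries that remain free in the pmap's existing
 * chunks, stopping as soon as "needed" is satisfied.
 */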
3435 avail = 0;
3436 TAILQ_FOREACH(pc, &pmap->pm_pvchunk, pc_list) {
3437 bit_count((bitstr_t *)pc->pc_map, 0,
3438 sizeof(pc->pc_map) * NBBY, &free);
3439 if (free == 0)
3440 break;
3441 avail += free;
3442 if (avail >= needed)
3443 break;
3444 }
3445 for (reclaimed = false; avail < needed; avail += _NPCPV) {
3446 m = vm_page_alloc_noobj(VM_ALLOC_WIRED);
3447 if (m == NULL) {
3448 m = reclaim_pv_chunk(pmap, lockp);
3449 if (m == NULL)
3450 goto retry;
3451 reclaimed = true;
3452 }
3453 PV_STAT(atomic_add_int(&pc_chunk_count, 1));
3454 PV_STAT(atomic_add_int(&pc_chunk_allocs, 1));
3455 dump_add_page(m->phys_addr);
3456 pc = (void *)PHYS_TO_DMAP(m->phys_addr);
3457 pc->pc_pmap = pmap;
3458 memcpy(pc->pc_map, pc_freemask, sizeof(pc_freemask));
3459 TAILQ_INSERT_HEAD(&pmap->pm_pvchunk, pc, pc_list);
3460 TAILQ_INSERT_TAIL(&new_tail[vm_page_domain(m)], pc, pc_lru);
3461 PV_STAT(atomic_add_int(&pv_entry_spare, _NPCPV));
3462
3463 /*
3464 * The reclaim might have freed a chunk from the current pmap.
3465 * If that chunk contained available entries, we need to
3466 * re-count the number of available entries.
3467 */
3468 if (reclaimed)
3469 goto retry;
3470 }
3471 for (i = 0; i < vm_ndomains; i++) {
3472 if (TAILQ_EMPTY(&new_tail[i]))
3473 continue;
3474 pvc = &pv_chunks[i];
3475 mtx_lock(&pvc->pvc_lock);
3476 TAILQ_CONCAT(&pvc->pvc_list, &new_tail[i], pc_lru);
3477 mtx_unlock(&pvc->pvc_lock);
3478 }
3479 }
3480
3481 /*
3482 * First find and then remove the pv entry for the specified pmap and virtual
3483 * address from the specified pv list. Returns the pv entry if found and NULL
3484 * otherwise. This operation can be performed on pv lists for either 4KB or
3485 * 2MB page mappings.
3486 */
3487 static __inline pv_entry_t
3488 pmap_pvh_remove(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3489 {
3490 pv_entry_t pv;
3491
3492 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
3493 if (pmap == PV_PMAP(pv) && va == pv->pv_va) {
3494 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
3495 pvh->pv_gen++;
3496 break;
3497 }
3498 }
3499 return (pv);
3500 }
3501
3502 /*
3503 * After demotion from a 2MB page mapping to 512 4KB page mappings,
3504 * destroy the pv entry for the 2MB page mapping and reinstantiate the pv
3505 * entries for each of the 4KB page mappings.
3506 */
3507 static void
3508 pmap_pv_demote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
3509 struct rwlock **lockp)
3510 {
3511 struct md_page *pvh;
3512 struct pv_chunk *pc;
3513 pv_entry_t pv;
3514 vm_offset_t va_last;
3515 vm_page_t m;
3516 int bit, field;
3517
3518 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3519 KASSERT((va & L2_OFFSET) == 0,
3520 ("pmap_pv_demote_l2: va is not 2mpage aligned"));
3521 KASSERT((pa & L2_OFFSET) == 0,
3522 ("pmap_pv_demote_l2: pa is not 2mpage aligned"));
3523 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3524
3525 /*
3526 * Transfer the 2mpage's pv entry for this mapping to the first
3527 * page's pv list. Once this transfer begins, the pv list lock
3528 * must not be released until the last pv entry is reinstantiated.
3529 */
3530 pvh = pa_to_pvh(pa);
3531 pv = pmap_pvh_remove(pvh, pmap, va);
3532 KASSERT(pv != NULL, ("pmap_pv_demote_l2: pv not found"));
3533 m = PHYS_TO_VM_PAGE(pa);
3534 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3535 m->md.pv_gen++;
3536 /* Instantiate the remaining Ln_ENTRIES - 1 pv entries. */
3537 PV_STAT(atomic_add_long(&pv_entry_allocs, Ln_ENTRIES - 1));
3538 va_last = va + L2_SIZE - PAGE_SIZE;
3539 for (;;) {
3540 pc = TAILQ_FIRST(&pmap->pm_pvchunk);
3541 KASSERT(!pc_is_full(pc), ("pmap_pv_demote_l2: missing spare"));
3542 for (field = 0; field < _NPCM; field++) {
3543 while (pc->pc_map[field]) {
3544 bit = ffsl(pc->pc_map[field]) - 1;
3545 pc->pc_map[field] &= ~(1ul << bit);
3546 pv = &pc->pc_pventry[field * 64 + bit];
3547 va += PAGE_SIZE;
3548 pv->pv_va = va;
3549 m++;
3550 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
3551 ("pmap_pv_demote_l2: page %p is not managed", m));
3552 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3553 m->md.pv_gen++;
3554 if (va == va_last)
3555 goto out;
3556 }
3557 }
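		/*
		 * The head chunk is out of free entries.  Rotate it to the
		 * tail so that the next iteration allocates from the next
		 * chunk, which the caller's reservation guarantees has a
		 * spare entry.
		 */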
3558 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3559 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3560 }
3561 out:
3562 if (pc_is_full(pc)) {
3563 TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
3564 TAILQ_INSERT_TAIL(&pmap->pm_pvchunk, pc, pc_list);
3565 }
3566 PV_STAT(atomic_add_long(&pv_entry_count, Ln_ENTRIES - 1));
3567 PV_STAT(atomic_subtract_int(&pv_entry_spare, Ln_ENTRIES - 1));
3568 }
3569
3570 /*
3571 * First find and then destroy the pv entry for the specified pmap and virtual
3572 * address. This operation can be performed on pv lists for either 4KB or 2MB
3573 * page mappings.
3574 */
3575 static void
3576 pmap_pvh_free(struct md_page *pvh, pmap_t pmap, vm_offset_t va)
3577 {
3578 pv_entry_t pv;
3579
3580 pv = pmap_pvh_remove(pvh, pmap, va);
3581 KASSERT(pv != NULL, ("pmap_pvh_free: pv not found"));
3582 free_pv_entry(pmap, pv);
3583 }
3584
3585 /*
3586 * Conditionally create the PV entry for a 4KB page mapping if the required
3587 * memory can be allocated without resorting to reclamation.
3588 */
3589 static bool
3590 pmap_try_insert_pv_entry(pmap_t pmap, vm_offset_t va, vm_page_t m,
3591 struct rwlock **lockp)
3592 {
3593 pv_entry_t pv;
3594
3595 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3596 /* Pass NULL instead of the lock pointer to disable reclamation. */
3597 if ((pv = get_pv_entry(pmap, NULL)) != NULL) {
3598 pv->pv_va = va;
3599 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3600 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
3601 m->md.pv_gen++;
3602 return (true);
3603 } else
3604 return (false);
3605 }
3606
3607 /*
3608 * Create the PV entry for a 2MB page mapping. Always returns true unless the
3609 * flag PMAP_ENTER_NORECLAIM is specified. If that flag is specified, returns
3610 * false if the PV entry cannot be allocated without resorting to reclamation.
3611 */
3612 static bool
3613 pmap_pv_insert_l2(pmap_t pmap, vm_offset_t va, pd_entry_t l2e, u_int flags,
3614 struct rwlock **lockp)
3615 {
3616 struct md_page *pvh;
3617 pv_entry_t pv;
3618 vm_paddr_t pa;
3619
3620 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3621 /* Pass NULL instead of the lock pointer to disable reclamation. */
3622 if ((pv = get_pv_entry(pmap, (flags & PMAP_ENTER_NORECLAIM) != 0 ?
3623 NULL : lockp)) == NULL)
3624 return (false);
3625 pv->pv_va = va;
3626 pa = PTE_TO_PHYS(l2e);
3627 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
3628 pvh = pa_to_pvh(pa);
3629 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
3630 pvh->pv_gen++;
3631 return (true);
3632 }
3633
3634 /*
3635 * Conditionally creates the PV entries for a L3C superpage mapping if
3636 * the required memory can be allocated without resorting to reclamation.
3637 */
3638 static bool
3639 pmap_pv_insert_l3c(pmap_t pmap, vm_offset_t va, vm_page_t m,
3640 struct rwlock **lockp)
3641 {
3642 pv_entry_t pv;
3643 vm_offset_t tva;
3644 vm_paddr_t pa __diagused;
3645 vm_page_t mt;
3646
3647 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3648 KASSERT((va & L3C_OFFSET) == 0,
3649 ("pmap_pv_insert_l3c: va is not aligned"));
3650 pa = VM_PAGE_TO_PHYS(m);
3651 KASSERT((pa & L3C_OFFSET) == 0,
3652 ("pmap_pv_insert_l3c: pa is not aligned"));
3653 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3654 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva += L3_SIZE) {
3655 /* Pass NULL instead of lockp to disable reclamation. */
3656 pv = get_pv_entry(pmap, NULL);
3657 if (__predict_false(pv == NULL)) {
3658 while (tva > va) {
3659 mt--;
3660 tva -= L3_SIZE;
3661 pmap_pvh_free(&mt->md, pmap, tva);
3662 }
3663 return (false);
3664 }
3665 pv->pv_va = tva;
3666 TAILQ_INSERT_TAIL(&mt->md.pv_list, pv, pv_next);
3667 mt->md.pv_gen++;
3668 }
3669 return (true);
3670 }
3671
3672 static void
3673 pmap_remove_kernel_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
3674 {
3675 pt_entry_t newl2, oldl2 __diagused;
3676 vm_page_t ml3;
3677 vm_paddr_t ml3pa;
3678
3679 KASSERT(!VIRT_IN_DMAP(va), ("removing direct mapping of %#lx", va));
3680 KASSERT(pmap == kernel_pmap, ("pmap %p is not kernel_pmap", pmap));
3681 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3682
3683 ml3 = pmap_remove_pt_page(pmap, va);
3684 if (ml3 == NULL)
3685 panic("pmap_remove_kernel_l2: Missing pt page");
3686
3687 ml3pa = VM_PAGE_TO_PHYS(ml3);
3688 newl2 = PHYS_TO_PTE(ml3pa) | L2_TABLE;
3689
3690 /*
3691 * If this page table page was unmapped by a promotion, then it
3692 * contains valid mappings. Zero it to invalidate those mappings.
3693 */
3694 if (vm_page_any_valid(ml3))
3695 pagezero((void *)PHYS_TO_DMAP(ml3pa));
3696
3697 /*
3698 * Demote the mapping. The caller must have already invalidated the
3699 * mapping (i.e., the "break" in break-before-make).
3700 */
3701 oldl2 = pmap_load_store(l2, newl2);
3702 KASSERT(oldl2 == 0, ("%s: found existing mapping at %p: %#lx",
3703 __func__, l2, oldl2));
3704 }
3705
3706 /*
3707 * pmap_remove_l2: Unmap a 2MB (level 2) superpage mapping.
3708 */
3709 static int
3710 pmap_remove_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva,
3711 pd_entry_t l1e, struct spglist *free, struct rwlock **lockp)
3712 {
3713 struct md_page *pvh;
3714 pt_entry_t old_l2;
3715 vm_page_t m, ml3, mt;
3716
3717 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3718 KASSERT((sva & L2_OFFSET) == 0, ("pmap_remove_l2: sva is not aligned"));
3719 old_l2 = pmap_load_clear(l2);
3720 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
3721 ("pmap_remove_l2: L2e %lx is not a block mapping", old_l2));
3722
3723 /*
3724 * Since a promotion must break the 4KB page mappings before making
3725 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
3726 */
3727 pmap_s1_invalidate_page(pmap, sva, true);
3728
3729 if (old_l2 & ATTR_SW_WIRED)
3730 pmap->pm_stats.wired_count -= L2_SIZE / PAGE_SIZE;
3731 pmap_resident_count_dec(pmap, L2_SIZE / PAGE_SIZE);
3732 if (old_l2 & ATTR_SW_MANAGED) {
3733 m = PTE_TO_VM_PAGE(old_l2);
3734 pvh = page_to_pvh(m);
3735 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3736 pmap_pvh_free(pvh, pmap, sva);
3737 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++) {
3738 if (pmap_pte_dirty(pmap, old_l2))
3739 vm_page_dirty(mt);
3740 if (old_l2 & ATTR_AF)
3741 vm_page_aflag_set(mt, PGA_REFERENCED);
3742 if (TAILQ_EMPTY(&mt->md.pv_list) &&
3743 TAILQ_EMPTY(&pvh->pv_list))
3744 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3745 }
3746 }
3747 if (pmap == kernel_pmap) {
3748 pmap_remove_kernel_l2(pmap, l2, sva);
3749 } else {
3750 ml3 = pmap_remove_pt_page(pmap, sva);
3751 if (ml3 != NULL) {
3752 KASSERT(vm_page_any_valid(ml3),
3753 ("pmap_remove_l2: l3 page not promoted"));
3754 pmap_resident_count_dec(pmap, 1);
3755 KASSERT(ml3->ref_count == NL3PG,
3756 ("pmap_remove_l2: l3 page ref count error"));
3757 ml3->ref_count = 0;
3758 pmap_add_delayed_free_list(ml3, free, false);
3759 }
3760 }
3761 return (pmap_unuse_pt(pmap, sva, l1e, free));
3762 }
3763
3764 /*
3765 * pmap_remove_l3: Unmap a single 4KB page mapping from a process.
3766 */
3767 static int
3768 pmap_remove_l3(pmap_t pmap, pt_entry_t *l3, vm_offset_t va,
3769 pd_entry_t l2e, struct spglist *free, struct rwlock **lockp)
3770 {
3771 struct md_page *pvh;
3772 pt_entry_t old_l3;
3773 vm_page_t m;
3774
3775 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3776 old_l3 = pmap_load(l3);
3777 if ((old_l3 & ATTR_CONTIGUOUS) != 0)
3778 (void)pmap_demote_l3c(pmap, l3, va);
3779 old_l3 = pmap_load_clear(l3);
3780 pmap_s1_invalidate_page(pmap, va, true);
3781 if (old_l3 & ATTR_SW_WIRED)
3782 pmap->pm_stats.wired_count -= 1;
3783 pmap_resident_count_dec(pmap, 1);
3784 if (old_l3 & ATTR_SW_MANAGED) {
3785 m = PTE_TO_VM_PAGE(old_l3);
3786 if (pmap_pte_dirty(pmap, old_l3))
3787 vm_page_dirty(m);
3788 if (old_l3 & ATTR_AF)
3789 vm_page_aflag_set(m, PGA_REFERENCED);
3790 CHANGE_PV_LIST_LOCK_TO_VM_PAGE(lockp, m);
3791 pmap_pvh_free(&m->md, pmap, va);
3792 if (TAILQ_EMPTY(&m->md.pv_list) &&
3793 (m->flags & PG_FICTITIOUS) == 0) {
3794 pvh = page_to_pvh(m);
3795 if (TAILQ_EMPTY(&pvh->pv_list))
3796 vm_page_aflag_clear(m, PGA_WRITEABLE);
3797 }
3798 }
3799 return (pmap_unuse_pt(pmap, va, l2e, free));
3800 }
3801
3802 /*
3803 * Removes the specified L3C superpage mapping. Requests TLB invalidations
3804 * to be performed by the caller through the returned "*vap". Returns true
3805 * if the level 3 table "ml3" was unmapped and added to the spglist "free".
3806 * Otherwise, returns false.
3807 */
3808 static bool
3809 pmap_remove_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, vm_offset_t *vap,
3810 vm_offset_t va_next, vm_page_t ml3, struct spglist *free,
3811 struct rwlock **lockp)
3812 {
3813 struct md_page *pvh;
3814 struct rwlock *new_lock;
3815 pt_entry_t first_l3e, l3e, *tl3p;
3816 vm_offset_t tva;
3817 vm_page_t m, mt;
3818
3819 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3820 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
3821 0, ("pmap_remove_l3c: l3p is not aligned"));
3822 KASSERT((va & L3C_OFFSET) == 0,
3823 ("pmap_remove_l3c: va is not aligned"));
3824
3825 /*
3826 * Hardware accessed and dirty bit maintenance might only update a
3827 * single L3 entry, so we must combine the accessed and dirty bits
3828 * from this entire set of contiguous L3 entries.
3829 */
3830 first_l3e = pmap_load_clear(l3p);
3831 for (tl3p = l3p + 1; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
3832 l3e = pmap_load_clear(tl3p);
3833 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
3834 ("pmap_remove_l3c: l3e is missing ATTR_CONTIGUOUS"));
3835 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
3836 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
3837 first_l3e &= ~ATTR_S1_AP_RW_BIT;
3838 first_l3e |= l3e & ATTR_AF;
3839 }
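	/*
	 * "first_l3e" now has ATTR_AF set if any constituent entry was
	 * accessed and reads as dirty if any constituent entry was dirty,
	 * so the per-page updates below cover the entire L3C range.
	 */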
3840 if ((first_l3e & ATTR_SW_WIRED) != 0)
3841 pmap->pm_stats.wired_count -= L3C_ENTRIES;
3842 pmap_resident_count_dec(pmap, L3C_ENTRIES);
3843 if ((first_l3e & ATTR_SW_MANAGED) != 0) {
3844 m = PTE_TO_VM_PAGE(first_l3e);
3845 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3846 if (new_lock != *lockp) {
3847 if (*lockp != NULL) {
3848 /*
3849 * Pending TLB invalidations must be
3850 * performed before the PV list lock is
3851 * released. Otherwise, a concurrent
3852 * pmap_remove_all() on a physical page
3853 * could return while a stale TLB entry
3854 * still provides access to that page.
3855 */
3856 if (*vap != va_next) {
3857 pmap_invalidate_range(pmap, *vap, va,
3858 true);
3859 *vap = va_next;
3860 }
3861 rw_wunlock(*lockp);
3862 }
3863 *lockp = new_lock;
3864 rw_wlock(*lockp);
3865 }
3866 pvh = page_to_pvh(m);
3867 for (mt = m, tva = va; mt < &m[L3C_ENTRIES]; mt++, tva +=
3868 L3_SIZE) {
3869 if (pmap_pte_dirty(pmap, first_l3e))
3870 vm_page_dirty(mt);
3871 if ((first_l3e & ATTR_AF) != 0)
3872 vm_page_aflag_set(mt, PGA_REFERENCED);
3873 pmap_pvh_free(&mt->md, pmap, tva);
3874 if (TAILQ_EMPTY(&mt->md.pv_list) &&
3875 TAILQ_EMPTY(&pvh->pv_list))
3876 vm_page_aflag_clear(mt, PGA_WRITEABLE);
3877 }
3878 }
3879 if (*vap == va_next)
3880 *vap = va;
3881 if (ml3 != NULL) {
3882 ml3->ref_count -= L3C_ENTRIES;
3883 if (ml3->ref_count == 0) {
3884 _pmap_unwire_l3(pmap, va, ml3, free);
3885 return (true);
3886 }
3887 }
3888 return (false);
3889 }
3890
3891 /*
3892 * Remove the specified range of addresses from the L3 page table that is
3893 * identified by the given L2 entry.
3894 */
3895 static void
3896 pmap_remove_l3_range(pmap_t pmap, pd_entry_t l2e, vm_offset_t sva,
3897 vm_offset_t eva, struct spglist *free, struct rwlock **lockp)
3898 {
3899 struct md_page *pvh;
3900 struct rwlock *new_lock;
3901 pt_entry_t *l3, old_l3;
3902 vm_offset_t va;
3903 vm_page_t l3pg, m;
3904
3905 KASSERT(ADDR_IS_CANONICAL(sva),
3906 ("%s: Start address not in canonical form: %lx", __func__, sva));
3907 KASSERT(ADDR_IS_CANONICAL(eva) || eva == VM_MAX_USER_ADDRESS,
3908 ("%s: End address not in canonical form: %lx", __func__, eva));
3909
3910 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
3911 KASSERT(rounddown2(sva, L2_SIZE) + L2_SIZE == roundup2(eva, L2_SIZE),
3912 ("pmap_remove_l3_range: range crosses an L3 page table boundary"));
3913 l3pg = !ADDR_IS_KERNEL(sva) ? PTE_TO_VM_PAGE(l2e) : NULL;
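	/*
	 * Kernel page table pages are never freed, so the L3 table page is
	 * only tracked for user addresses.
	 */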
3914 va = eva;
3915 for (l3 = pmap_l2_to_l3(&l2e, sva); sva != eva; l3++, sva += L3_SIZE) {
3916 old_l3 = pmap_load(l3);
3917 if (!pmap_l3_valid(old_l3)) {
3918 if (va != eva) {
3919 pmap_invalidate_range(pmap, va, sva, true);
3920 va = eva;
3921 }
3922 continue;
3923 }
3924 if ((old_l3 & ATTR_CONTIGUOUS) != 0) {
3925 /*
3926 * Is this entire set of contiguous L3 entries being
3927 * removed? Handle the possibility that "eva" is zero
3928 * because of address wraparound.
3929 */
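			/*
			 * The "eva - 1" form keeps the test correct even
			 * when "eva" wrapped to 0: 0 - 1 is the maximum
			 * vm_offset_t, so a trailing L3C range still
			 * satisfies the comparison.
			 */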
3930 if ((sva & L3C_OFFSET) == 0 &&
3931 sva + L3C_OFFSET <= eva - 1) {
3932 if (pmap_remove_l3c(pmap, l3, sva, &va, eva,
3933 l3pg, free, lockp)) {
3934 /* The L3 table was unmapped. */
3935 sva += L3C_SIZE;
3936 break;
3937 }
3938 l3 += L3C_ENTRIES - 1;
3939 sva += L3C_SIZE - L3_SIZE;
3940 continue;
3941 }
3942
3943 (void)pmap_demote_l3c(pmap, l3, sva);
3944 }
3945 old_l3 = pmap_load_clear(l3);
3946 if ((old_l3 & ATTR_SW_WIRED) != 0)
3947 pmap->pm_stats.wired_count--;
3948 pmap_resident_count_dec(pmap, 1);
3949 if ((old_l3 & ATTR_SW_MANAGED) != 0) {
3950 m = PTE_TO_VM_PAGE(old_l3);
3951 if (pmap_pte_dirty(pmap, old_l3))
3952 vm_page_dirty(m);
3953 if ((old_l3 & ATTR_AF) != 0)
3954 vm_page_aflag_set(m, PGA_REFERENCED);
3955 new_lock = VM_PAGE_TO_PV_LIST_LOCK(m);
3956 if (new_lock != *lockp) {
3957 if (*lockp != NULL) {
3958 /*
3959 * Pending TLB invalidations must be
3960 * performed before the PV list lock is
3961 * released. Otherwise, a concurrent
3962 * pmap_remove_all() on a physical page
3963 * could return while a stale TLB entry
3964 * still provides access to that page.
3965 */
3966 if (va != eva) {
3967 pmap_invalidate_range(pmap, va,
3968 sva, true);
3969 va = eva;
3970 }
3971 rw_wunlock(*lockp);
3972 }
3973 *lockp = new_lock;
3974 rw_wlock(*lockp);
3975 }
3976 pmap_pvh_free(&m->md, pmap, sva);
3977 if (TAILQ_EMPTY(&m->md.pv_list) &&
3978 (m->flags & PG_FICTITIOUS) == 0) {
3979 pvh = page_to_pvh(m);
3980 if (TAILQ_EMPTY(&pvh->pv_list))
3981 vm_page_aflag_clear(m, PGA_WRITEABLE);
3982 }
3983 }
3984 if (l3pg != NULL && pmap_unwire_l3(pmap, sva, l3pg, free)) {
3985 /*
3986 * _pmap_unwire_l3() has already invalidated the TLB
3987 * entries at all levels for "sva". So, we need not
3988 * perform "sva += L3_SIZE;" here. Moreover, we need
3989 * not perform "va = sva;" if "sva" is at the start
3990 * of a new valid range consisting of a single page.
3991 */
3992 break;
3993 }
3994 if (va == eva)
3995 va = sva;
3996 }
3997 if (va != eva)
3998 pmap_invalidate_range(pmap, va, sva, true);
3999 }
4000
4001 static void
4002 pmap_remove1(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool map_delete)
4003 {
4004 struct rwlock *lock;
4005 vm_offset_t va_next;
4006 pd_entry_t *l0, *l1, *l2;
4007 pt_entry_t l3_paddr;
4008 struct spglist free;
4009
4010 /*
4011 * Perform an unsynchronized read. This is, however, safe.
4012 */
4013 if (pmap->pm_stats.resident_count == 0)
4014 return;
4015
4016 SLIST_INIT(&free);
4017
4018 PMAP_LOCK(pmap);
4019 if (map_delete)
4020 pmap_bti_on_remove(pmap, sva, eva);
4021
4022 lock = NULL;
4023 for (; sva < eva; sva = va_next) {
4024 if (pmap->pm_stats.resident_count == 0)
4025 break;
4026
4027 l0 = pmap_l0(pmap, sva);
4028 if (pmap_load(l0) == 0) {
4029 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4030 if (va_next < sva)
4031 va_next = eva;
4032 continue;
4033 }
4034
4035 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4036 if (va_next < sva)
4037 va_next = eva;
4038 l1 = pmap_l0_to_l1(l0, sva);
4039 if (pmap_load(l1) == 0)
4040 continue;
4041 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4042 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4043 KASSERT(va_next <= eva,
4044 ("partial update of non-transparent 1G page "
4045 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4046 pmap_load(l1), sva, eva, va_next));
4047 MPASS(pmap != kernel_pmap);
4048 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4049 pmap_clear(l1);
4050 pmap_s1_invalidate_page(pmap, sva, true);
4051 pmap_resident_count_dec(pmap, L1_SIZE / PAGE_SIZE);
4052 pmap_unuse_pt(pmap, sva, pmap_load(l0), &free);
4053 continue;
4054 }
4055
4056 /*
4057 * Calculate index for next page table.
4058 */
4059 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4060 if (va_next < sva)
4061 va_next = eva;
4062
4063 l2 = pmap_l1_to_l2(l1, sva);
4064 if (l2 == NULL)
4065 continue;
4066
4067 l3_paddr = pmap_load(l2);
4068
4069 if ((l3_paddr & ATTR_DESCR_MASK) == L2_BLOCK) {
4070 if (sva + L2_SIZE == va_next && eva >= va_next) {
4071 pmap_remove_l2(pmap, l2, sva, pmap_load(l1),
4072 &free, &lock);
4073 continue;
4074 } else if (pmap_demote_l2_locked(pmap, l2, sva,
4075 &lock) == NULL)
4076 continue;
4077 l3_paddr = pmap_load(l2);
4078 }
4079
4080 /*
4081 * Weed out invalid mappings.
4082 */
4083 if ((l3_paddr & ATTR_DESCR_MASK) != L2_TABLE)
4084 continue;
4085
4086 /*
4087 * Limit our scan to either the end of the va represented
4088 * by the current page table page, or to the end of the
4089 * range being removed.
4090 */
4091 if (va_next > eva)
4092 va_next = eva;
4093
4094 pmap_remove_l3_range(pmap, l3_paddr, sva, va_next, &free,
4095 &lock);
4096 }
4097 if (lock != NULL)
4098 rw_wunlock(lock);
4099 PMAP_UNLOCK(pmap);
4100 vm_page_free_pages_toq(&free, true);
4101 }
4102
4103 /*
4104 * Remove the given range of addresses from the specified map.
4105 *
4106 * It is assumed that the start and end are properly
4107 * rounded to the page size.
4108 */
4109 void
4110 pmap_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4111 {
4112 pmap_remove1(pmap, sva, eva, false);
4113 }
4114
4115 /*
4116 * Remove the given range of addresses as part of a logical unmap
4117 * operation. This has the effect of calling pmap_remove(), but
4118 * also clears any metadata that should persist for the lifetime
4119 * of a logical mapping.
4120 */
4121 void
4122 pmap_map_delete(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
4123 {
4124 pmap_remove1(pmap, sva, eva, true);
4125 }
4126
4127 /*
4128 * Routine: pmap_remove_all
4129 * Function:
4130 * Removes this physical page from
4131 * all physical maps in which it resides.
4132 * Reflects back modify bits to the pager.
4133 *
4134 * Notes:
4135 * Original versions of this routine were very
4136 * inefficient because they iteratively called
4137 * pmap_remove (slow...)
4138 */
4139
4140 void
4141 pmap_remove_all(vm_page_t m)
4142 {
4143 struct md_page *pvh;
4144 pv_entry_t pv;
4145 pmap_t pmap;
4146 struct rwlock *lock;
4147 pd_entry_t *pde, tpde;
4148 pt_entry_t *pte, tpte;
4149 vm_offset_t va;
4150 struct spglist free;
4151 int lvl, pvh_gen, md_gen;
4152
4153 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
4154 ("pmap_remove_all: page %p is not managed", m));
4155 SLIST_INIT(&free);
4156 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
4157 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
4158 rw_wlock(lock);
4159 retry:
4160 while ((pv = TAILQ_FIRST(&pvh->pv_list)) != NULL) {
4161 pmap = PV_PMAP(pv);
4162 if (!PMAP_TRYLOCK(pmap)) {
4163 pvh_gen = pvh->pv_gen;
4164 rw_wunlock(lock);
4165 PMAP_LOCK(pmap);
4166 rw_wlock(lock);
4167 if (pvh_gen != pvh->pv_gen) {
4168 PMAP_UNLOCK(pmap);
4169 goto retry;
4170 }
4171 }
4172 va = pv->pv_va;
4173 pte = pmap_pte_exists(pmap, va, 2, __func__);
4174 pmap_demote_l2_locked(pmap, pte, va, &lock);
4175 PMAP_UNLOCK(pmap);
4176 }
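	/*
	 * All 2MB mappings of the page have now been demoted, so only 4KB
	 * mappings remain to be removed below.
	 */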
4177 while ((pv = TAILQ_FIRST(&m->md.pv_list)) != NULL) {
4178 pmap = PV_PMAP(pv);
4179 if (!PMAP_TRYLOCK(pmap)) {
4180 pvh_gen = pvh->pv_gen;
4181 md_gen = m->md.pv_gen;
4182 rw_wunlock(lock);
4183 PMAP_LOCK(pmap);
4184 rw_wlock(lock);
4185 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
4186 PMAP_UNLOCK(pmap);
4187 goto retry;
4188 }
4189 }
4190 pmap_resident_count_dec(pmap, 1);
4191
4192 pde = pmap_pde(pmap, pv->pv_va, &lvl);
4193 KASSERT(pde != NULL,
4194 ("pmap_remove_all: no page directory entry found"));
4195 KASSERT(lvl == 2,
4196 ("pmap_remove_all: invalid pde level %d", lvl));
4197 tpde = pmap_load(pde);
4198
4199 pte = pmap_l2_to_l3(pde, pv->pv_va);
4200 tpte = pmap_load(pte);
4201 if ((tpte & ATTR_CONTIGUOUS) != 0)
4202 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
4203 tpte = pmap_load_clear(pte);
4204 if (tpte & ATTR_SW_WIRED)
4205 pmap->pm_stats.wired_count--;
4206 if ((tpte & ATTR_AF) != 0) {
4207 pmap_invalidate_page(pmap, pv->pv_va, true);
4208 vm_page_aflag_set(m, PGA_REFERENCED);
4209 }
4210
4211 /*
4212 * Update the vm_page_t clean and reference bits.
4213 */
4214 if (pmap_pte_dirty(pmap, tpte))
4215 vm_page_dirty(m);
4216 pmap_unuse_pt(pmap, pv->pv_va, tpde, &free);
4217 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
4218 m->md.pv_gen++;
4219 free_pv_entry(pmap, pv);
4220 PMAP_UNLOCK(pmap);
4221 }
4222 vm_page_aflag_clear(m, PGA_WRITEABLE);
4223 rw_wunlock(lock);
4224 vm_page_free_pages_toq(&free, true);
4225 }
4226
4227 /*
4228 * Masks and sets bits in a level 2 page table entry in the specified pmap.
4229 */
4230 static void
4231 pmap_protect_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t sva, pt_entry_t mask,
4232 pt_entry_t nbits)
4233 {
4234 pd_entry_t old_l2;
4235 vm_page_t m, mt;
4236
4237 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4238 PMAP_ASSERT_STAGE1(pmap);
4239 KASSERT((sva & L2_OFFSET) == 0,
4240 ("pmap_protect_l2: sva is not 2mpage aligned"));
4241 old_l2 = pmap_load(l2);
4242 KASSERT((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK,
4243 ("pmap_protect_l2: L2e %lx is not a block mapping", old_l2));
4244
4245 /*
4246 * Return if the L2 entry already has the desired access restrictions
4247 * in place.
4248 */
4249 if ((old_l2 & mask) == nbits)
4250 return;
4251
4252 while (!atomic_fcmpset_64(l2, &old_l2, (old_l2 & ~mask) | nbits))
4253 cpu_spinwait();
4254
4255 /*
4256 * When a dirty read/write superpage mapping is write protected,
4257 * update the dirty field of each of the superpage's constituent 4KB
4258 * pages.
4259 */
4260 if ((old_l2 & ATTR_SW_MANAGED) != 0 &&
4261 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4262 pmap_pte_dirty(pmap, old_l2)) {
4263 m = PTE_TO_VM_PAGE(old_l2);
4264 for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
4265 vm_page_dirty(mt);
4266 }
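	/*
	 * The dirty state must be captured here because, in this encoding, a
	 * mapping is dirty precisely when ATTR_SW_DBM is set and the AP bits
	 * permit writes; setting ATTR_S1_AP_RO above would otherwise lose
	 * that information.
	 */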
4267
4268 /*
4269 * Since a promotion must break the 4KB page mappings before making
4270 * the 2MB page mapping, a pmap_s1_invalidate_page() suffices.
4271 */
4272 pmap_s1_invalidate_page(pmap, sva, true);
4273 }
4274
4275 /*
4276 * Masks and sets bits in the specified L3C superpage mapping.
4277 *
4278 * Requests TLB invalidations to be performed by the caller through the
4279 * returned "*vap".
4280 */
4281 static void
4282 pmap_mask_set_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va,
4283 vm_offset_t *vap, vm_offset_t va_next, pt_entry_t mask, pt_entry_t nbits)
4284 {
4285 pt_entry_t l3e, *tl3p;
4286 vm_page_t m, mt;
4287 bool dirty;
4288
4289 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4290 KASSERT(((uintptr_t)l3p & ((L3C_ENTRIES * sizeof(pt_entry_t)) - 1)) ==
4291 0, ("pmap_mask_set_l3c: l3p is not aligned"));
4292 KASSERT((va & L3C_OFFSET) == 0,
4293 ("pmap_mask_set_l3c: va is not aligned"));
4294 dirty = false;
4295 for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
4296 l3e = pmap_load(tl3p);
4297 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
4298 ("pmap_mask_set_l3c: l3e is missing ATTR_CONTIGUOUS"));
4299 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
4300 cpu_spinwait();
4301 if ((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) ==
4302 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RW)))
4303 dirty = true;
4304 }
4305
4306 /*
4307 * When a dirty read/write superpage mapping is write protected,
4308 * update the dirty field of each of the superpage's constituent 4KB
4309 * pages.
4310 */
4311 if ((l3e & ATTR_SW_MANAGED) != 0 &&
4312 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4313 dirty) {
4314 m = PTE_TO_VM_PAGE(pmap_load(l3p));
4315 for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
4316 vm_page_dirty(mt);
4317 }
4318
4319 if (*vap == va_next)
4320 *vap = va;
4321 }
4322
4323 /*
4324 * Masks and sets bits in the last level page table entries in the
4325 * specified pmap and virtual address range.
4326 */
4327 static void
4328 pmap_mask_set_locked(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4329 pt_entry_t nbits, bool invalidate)
4330 {
4331 vm_offset_t va, va_next;
4332 pd_entry_t *l0, *l1, *l2;
4333 pt_entry_t *l3p, l3;
4334
4335 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4336 for (; sva < eva; sva = va_next) {
4337 l0 = pmap_l0(pmap, sva);
4338 if (pmap_load(l0) == 0) {
4339 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
4340 if (va_next < sva)
4341 va_next = eva;
4342 continue;
4343 }
4344
4345 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
4346 if (va_next < sva)
4347 va_next = eva;
4348 l1 = pmap_l0_to_l1(l0, sva);
4349 if (pmap_load(l1) == 0)
4350 continue;
4351 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
4352 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4353 KASSERT(va_next <= eva,
4354 ("partial update of non-transparent 1G page "
4355 "l1 %#lx sva %#lx eva %#lx va_next %#lx",
4356 pmap_load(l1), sva, eva, va_next));
4357 MPASS((pmap_load(l1) & ATTR_SW_MANAGED) == 0);
4358 if ((pmap_load(l1) & mask) != nbits) {
4359 pmap_store(l1, (pmap_load(l1) & ~mask) | nbits);
4360 if (invalidate)
4361 pmap_s1_invalidate_page(pmap, sva, true);
4362 }
4363 continue;
4364 }
4365
4366 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
4367 if (va_next < sva)
4368 va_next = eva;
4369
4370 l2 = pmap_l1_to_l2(l1, sva);
4371 if (pmap_load(l2) == 0)
4372 continue;
4373
4374 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
4375 if (sva + L2_SIZE == va_next && eva >= va_next) {
4376 pmap_protect_l2(pmap, l2, sva, mask, nbits);
4377 continue;
4378 } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
4379 continue;
4380 }
4381 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
4382 ("pmap_protect: Invalid L2 entry after demotion"));
4383
4384 if (va_next > eva)
4385 va_next = eva;
4386
4387 va = va_next;
4388 for (l3p = pmap_l2_to_l3(l2, sva); sva != va_next; l3p++,
4389 sva += L3_SIZE) {
4390 l3 = pmap_load(l3p);
4391
4392 /*
4393 * Go to the next L3 entry if the current one is
4394 * invalid or already has the desired access
4395 * restrictions in place. (The latter case occurs
4396 * frequently. For example, in a "buildworld"
4397 * workload, almost 1 out of 4 L3 entries already
4398 * have the desired restrictions.)
4399 */
4400 if (!pmap_l3_valid(l3) || (l3 & mask) == nbits) {
4401 if (va != va_next) {
4402 if (invalidate)
4403 pmap_s1_invalidate_range(pmap,
4404 va, sva, true);
4405 va = va_next;
4406 }
4407 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4408 l3p += L3C_ENTRIES - 1;
4409 sva += L3C_SIZE - L3_SIZE;
4410 }
4411 continue;
4412 }
4413
4414 if ((l3 & ATTR_CONTIGUOUS) != 0) {
4415 /*
4416 * Is this entire set of contiguous L3 entries
4417 * being protected? Handle the possibility
4418 * that "va_next" is zero because of address
4419 * wraparound.
4420 */
4421 if ((sva & L3C_OFFSET) == 0 &&
4422 sva + L3C_OFFSET <= va_next - 1) {
4423 pmap_mask_set_l3c(pmap, l3p, sva, &va,
4424 va_next, mask, nbits);
4425 l3p += L3C_ENTRIES - 1;
4426 sva += L3C_SIZE - L3_SIZE;
4427 continue;
4428 }
4429
4430 (void)pmap_demote_l3c(pmap, l3p, sva);
4431
4432 /*
4433 * The L3 entry's accessed bit may have changed.
4434 */
4435 l3 = pmap_load(l3p);
4436 }
4437 while (!atomic_fcmpset_64(l3p, &l3, (l3 & ~mask) |
4438 nbits))
4439 cpu_spinwait();
4440
4441 /*
4442 * When a dirty read/write mapping is write protected,
4443 * update the page's dirty field.
4444 */
4445 if ((l3 & ATTR_SW_MANAGED) != 0 &&
4446 (nbits & ATTR_S1_AP(ATTR_S1_AP_RO)) != 0 &&
4447 pmap_pte_dirty(pmap, l3))
4448 vm_page_dirty(PTE_TO_VM_PAGE(l3));
4449
4450 if (va == va_next)
4451 va = sva;
4452 }
4453 if (va != va_next && invalidate)
4454 pmap_s1_invalidate_range(pmap, va, sva, true);
4455 }
4456 }
4457
4458 static void
4459 pmap_mask_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t mask,
4460 pt_entry_t nbits, bool invalidate)
4461 {
4462 PMAP_LOCK(pmap);
4463 pmap_mask_set_locked(pmap, sva, eva, mask, nbits, invalidate);
4464 PMAP_UNLOCK(pmap);
4465 }
4466
4467 /*
4468 * Set the physical protection on the
4469 * specified range of this map as requested.
4470 */
4471 void
4472 pmap_protect(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, vm_prot_t prot)
4473 {
4474 pt_entry_t mask, nbits;
4475
4476 PMAP_ASSERT_STAGE1(pmap);
4477 KASSERT((prot & ~VM_PROT_ALL) == 0, ("invalid prot %x", prot));
4478 if (prot == VM_PROT_NONE) {
4479 pmap_remove(pmap, sva, eva);
4480 return;
4481 }
4482
4483 mask = nbits = 0;
4484 if ((prot & VM_PROT_WRITE) == 0) {
4485 mask |= ATTR_S1_AP_RW_BIT | ATTR_SW_DBM;
4486 nbits |= ATTR_S1_AP(ATTR_S1_AP_RO);
4487 }
4488 if ((prot & VM_PROT_EXECUTE) == 0) {
4489 mask |= ATTR_S1_XN;
4490 nbits |= ATTR_S1_XN;
4491 }
4492 if (pmap == kernel_pmap) {
4493 mask |= ATTR_KERN_GP;
4494 nbits |= ATTR_KERN_GP;
4495 }
4496 if (mask == 0)
4497 return;
4498
4499 pmap_mask_set(pmap, sva, eva, mask, nbits, true);
4500 }
4501
4502 void
4503 pmap_disable_promotion(vm_offset_t sva, vm_size_t size)
4504 {
4505
4506 MPASS((sva & L3_OFFSET) == 0);
4507 MPASS(((sva + size) & L3_OFFSET) == 0);
4508
4509 pmap_mask_set(kernel_pmap, sva, sva + size, ATTR_SW_NO_PROMOTE,
4510 ATTR_SW_NO_PROMOTE, false);
4511 }
4512
4513 /*
4514 * Inserts the specified page table page into the specified pmap's collection
4515 * of idle page table pages. Each of a pmap's page table pages is responsible
4516 * for mapping a distinct range of virtual addresses. The pmap's collection is
4517 * ordered by this virtual address range.
4518 *
4519 * If "promoted" is false, then the page table page "mpte" must be zero filled;
4520 * "mpte"'s valid field will be set to 0.
4521 *
4522 * If "promoted" is true and "all_l3e_AF_set" is false, then "mpte" must
4523 * contain valid mappings with identical attributes except for ATTR_AF;
4524 * "mpte"'s valid field will be set to 1.
4525 *
4526 * If "promoted" and "all_l3e_AF_set" are both true, then "mpte" must contain
4527 * valid mappings with identical attributes including ATTR_AF; "mpte"'s valid
4528 * field will be set to VM_PAGE_BITS_ALL.
4529 */
4530 static __inline int
4531 pmap_insert_pt_page(pmap_t pmap, vm_page_t mpte, bool promoted,
4532 bool all_l3e_AF_set)
4533 {
4534
4535 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4536 KASSERT(promoted || !all_l3e_AF_set,
4537 ("a zero-filled PTP can't have ATTR_AF set in every PTE"));
4538 mpte->valid = promoted ? (all_l3e_AF_set ? VM_PAGE_BITS_ALL : 1) : 0;
4539 return (vm_radix_insert(&pmap->pm_root, mpte));
4540 }
4541
4542 /*
4543 * Removes the page table page mapping the specified virtual address from the
4544 * specified pmap's collection of idle page table pages, and returns it.
4545 * Otherwise, returns NULL if there is no page table page corresponding to the
4546 * specified virtual address.
4547 */
4548 static __inline vm_page_t
4549 pmap_remove_pt_page(pmap_t pmap, vm_offset_t va)
4550 {
4551
4552 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4553 return (vm_radix_remove(&pmap->pm_root, pmap_l2_pindex(va)));
4554 }
4555
4556 /*
4557 * Performs a break-before-make update of a pmap entry. This is needed when
4558 * either promoting or demoting pages to ensure the TLB doesn't get into an
4559 * inconsistent state.
4560 */
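/*
 * The sequence is, in order: disable interrupts, clear ATTR_DESCR_VALID in
 * the old entry (the "break"), invalidate the TLB for the affected range,
 * store the new entry (the "make"), and finally issue a dsb(ishst) barrier
 * before re-enabling interrupts.
 */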
4561 static void
4562 pmap_update_entry(pmap_t pmap, pd_entry_t *ptep, pd_entry_t newpte,
4563 vm_offset_t va, vm_size_t size)
4564 {
4565 register_t intr;
4566
4567 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4568 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4569 ("%s: Updating non-promote pte", __func__));
4570
4571 /*
4572 * Ensure we don't get switched out with the page table in an
4573 * inconsistent state. We also need to ensure no interrupts fire
4574 * as they may make use of an address we are about to invalidate.
4575 */
4576 intr = intr_disable();
4577
4578 /*
4579 * Clear the old mapping's valid bit, but leave the rest of the entry
4580 * unchanged, so that a lockless, concurrent pmap_kextract() can still
4581 * lookup the physical address.
4582 */
4583 pmap_clear_bits(ptep, ATTR_DESCR_VALID);
4584
4585 /*
4586 * When promoting, the L{1,2}_TABLE entry that is being replaced might
4587 * be cached, so we invalidate intermediate entries as well as final
4588 * entries.
4589 */
4590 pmap_s1_invalidate_range(pmap, va, va + size, false);
4591
4592 /* Create the new mapping */
4593 pmap_store(ptep, newpte);
4594 dsb(ishst);
4595
4596 intr_restore(intr);
4597 }
4598
4599 /*
4600 * Performs a break-before-make update of an ATTR_CONTIGUOUS mapping.
4601 */
4602 static void
4603 pmap_update_strided(pmap_t pmap, pd_entry_t *ptep, pd_entry_t *ptep_end,
4604 pd_entry_t newpte, vm_offset_t va, vm_offset_t stride, vm_size_t size)
4605 {
4606 pd_entry_t *lip;
4607 register_t intr;
4608
4609 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4610 KASSERT((newpte & ATTR_SW_NO_PROMOTE) == 0,
4611 ("%s: Updating non-promote pte", __func__));
4612
4613 /*
4614 * Ensure we don't get switched out with the page table in an
4615 * inconsistent state. We also need to ensure no interrupts fire
4616 * as they may make use of an address we are about to invalidate.
4617 */
4618 intr = intr_disable();
4619
4620 /*
4621 * Clear the old mapping's valid bits, but leave the rest of each
4622 * entry unchanged, so that a lockless, concurrent pmap_kextract() can
4623 * still lookup the physical address.
4624 */
4625 for (lip = ptep; lip < ptep_end; lip++)
4626 pmap_clear_bits(lip, ATTR_DESCR_VALID);
4627
4628 /* Only final entries are changing. */
4629 pmap_s1_invalidate_strided(pmap, va, va + size, stride, true);
4630
4631 /* Create the new mapping. */
4632 for (lip = ptep; lip < ptep_end; lip++) {
4633 pmap_store(lip, newpte);
4634 newpte += stride;
4635 }
4636 dsb(ishst);
4637
4638 intr_restore(intr);
4639 }
4640
4641 #if VM_NRESERVLEVEL > 0
4642 /*
4643 * After promotion from 512 4KB page mappings to a single 2MB page mapping,
4644 * replace the many pv entries for the 4KB page mappings by a single pv entry
4645 * for the 2MB page mapping.
4646 */
4647 static void
4648 pmap_pv_promote_l2(pmap_t pmap, vm_offset_t va, vm_paddr_t pa,
4649 struct rwlock **lockp)
4650 {
4651 struct md_page *pvh;
4652 pv_entry_t pv;
4653 vm_offset_t va_last;
4654 vm_page_t m;
4655
4656 KASSERT((pa & L2_OFFSET) == 0,
4657 ("pmap_pv_promote_l2: pa is not 2mpage aligned"));
4658 CHANGE_PV_LIST_LOCK_TO_PHYS(lockp, pa);
4659
4660 /*
4661 * Transfer the first page's pv entry for this mapping to the 2mpage's
4662 * pv list. Aside from avoiding the cost of a call to get_pv_entry(),
4663 * a transfer avoids the possibility that get_pv_entry() calls
4664 * reclaim_pv_chunk() and that reclaim_pv_chunk() removes one of the
4665 * mappings that is being promoted.
4666 */
4667 m = PHYS_TO_VM_PAGE(pa);
4668 va = va & ~L2_OFFSET;
4669 pv = pmap_pvh_remove(&m->md, pmap, va);
4670 KASSERT(pv != NULL, ("pmap_pv_promote_l2: pv not found"));
4671 pvh = page_to_pvh(m);
4672 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
4673 pvh->pv_gen++;
4674 /* Free the remaining NPTEPG - 1 pv entries. */
4675 va_last = va + L2_SIZE - PAGE_SIZE;
4676 do {
4677 m++;
4678 va += PAGE_SIZE;
4679 pmap_pvh_free(&m->md, pmap, va);
4680 } while (va < va_last);
4681 }
4682
4683 /*
4684 * Tries to promote the 512, contiguous 4KB page mappings that are within a
4685 * single level 2 table entry to a single 2MB page mapping. For promotion
4686 * to occur, two conditions must be met: (1) the 4KB page mappings must map
4687 * aligned, contiguous physical memory and (2) the 4KB page mappings must have
4688 * identical characteristics.
4689 */
4690 static bool
4691 pmap_promote_l2(pmap_t pmap, pd_entry_t *l2, vm_offset_t va, vm_page_t mpte,
4692 struct rwlock **lockp)
4693 {
4694 pt_entry_t all_l3e_AF, *firstl3, *l3, newl2, oldl3, pa;
4695
4696 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4697
4698 /*
4699 * Currently, this function only supports promotion on stage 1 pmaps
4700 * because it tests stage 1 specific fields and performs a break-
4701 * before-make sequence that is incorrect for stage 2 pmaps.
4702 */
4703 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4704 return (false);
4705
4706 /*
4707 * Examine the first L3E in the specified PTP. Abort if this L3E is
4708 * ineligible for promotion...
4709 */
4710 firstl3 = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(pmap_load(l2)));
4711 newl2 = pmap_load(firstl3);
4712 if ((newl2 & ATTR_SW_NO_PROMOTE) != 0)
4713 return (false);
4714 /* ... is not the first physical page within an L2 block */
4715 if ((PTE_TO_PHYS(newl2) & L2_OFFSET) != 0 ||
4716 ((newl2 & ATTR_DESCR_MASK) != L3_PAGE)) { /* ... or is invalid */
4717 atomic_add_long(&pmap_l2_p_failures, 1);
4718 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4719 " in pmap %p", va, pmap);
4720 return (false);
4721 }
4722
4723 /*
4724 * Both here and in the below "for" loop, to allow for repromotion
4725 * after MADV_FREE, conditionally write protect a clean L3E before
4726 * possibly aborting the promotion due to other L3E attributes. Why?
4727 * Suppose that MADV_FREE is applied to a part of a superpage, the
4728 * address range [S, E). pmap_advise() will demote the superpage
4729 * mapping, destroy the 4KB page mapping at the end of [S, E), and
4730 * set AP_RO and clear AF in the L3Es for the rest of [S, E). Later,
4731 * imagine that the memory in [S, E) is recycled, but the last 4KB
4732 * page in [S, E) is not the last to be rewritten, or simply accessed.
4733 * In other words, there is still a 4KB page in [S, E), call it P,
4734 * that is writeable but AP_RO is set and AF is clear in P's L3E.
4735 * Unless we write protect P before aborting the promotion, if and
4736 * when P is finally rewritten, there won't be a page fault to trigger
4737 * repromotion.
4738 */
4739 setl2:
4740 if ((newl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4741 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4742 /*
4743 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4744 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4745 */
4746 if (!atomic_fcmpset_64(firstl3, &newl2, newl2 & ~ATTR_SW_DBM))
4747 goto setl2;
4748 newl2 &= ~ATTR_SW_DBM;
4749 CTR2(KTR_PMAP, "pmap_promote_l2: protect for va %#lx"
4750 " in pmap %p", va & ~L2_OFFSET, pmap);
4751 }
4752
4753 /*
4754 * Examine each of the other L3Es in the specified PTP. Abort if this
4755 * L3E maps an unexpected 4KB physical page or does not have identical
4756 * characteristics to the first L3E. If ATTR_AF is not set in every
4757 * PTE, then request that the PTP be refilled on demotion.
4758 */
4759 all_l3e_AF = newl2 & ATTR_AF;
4760 pa = (PTE_TO_PHYS(newl2) | (newl2 & ATTR_DESCR_MASK))
4761 + L2_SIZE - PAGE_SIZE;
4762 for (l3 = firstl3 + NL3PG - 1; l3 > firstl3; l3--) {
4763 oldl3 = pmap_load(l3);
4764 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4765 atomic_add_long(&pmap_l2_p_failures, 1);
4766 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4767 " in pmap %p", va, pmap);
4768 return (false);
4769 }
4770 setl3:
4771 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4772 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4773 /*
4774 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4775 * set, ATTR_SW_DBM can be cleared without a TLB
4776 * invalidation.
4777 */
4778 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4779 ~ATTR_SW_DBM))
4780 goto setl3;
4781 oldl3 &= ~ATTR_SW_DBM;
4782 }
4783 if ((oldl3 & ATTR_PROMOTE) != (newl2 & ATTR_PROMOTE)) {
4784 atomic_add_long(&pmap_l2_p_failures, 1);
4785 CTR2(KTR_PMAP, "pmap_promote_l2: failure for va %#lx"
4786 " in pmap %p", va, pmap);
4787 return (false);
4788 }
4789 all_l3e_AF &= oldl3;
4790 pa -= PAGE_SIZE;
4791 }
4792
4793 /*
4794 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4795 * mapping, so that promotions triggered by speculative mappings,
4796 * such as pmap_enter_quick(), don't automatically mark the
4797 * underlying pages as referenced.
4798 */
4799 newl2 &= ~(ATTR_CONTIGUOUS | ATTR_AF | ATTR_DESCR_MASK) | all_l3e_AF;
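	/*
	 * Note the mask arithmetic: the complemented bits are always cleared,
	 * while or'ing "all_l3e_AF" back into the mask preserves ATTR_AF only
	 * when every constituent L3E had it set.
	 */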
4800
4801 /*
4802 * Save the page table page in its current state until the L2
4803 * mapping the superpage is demoted by pmap_demote_l2() or
4804 * destroyed by pmap_remove_l3().
4805 */
4806 if (mpte == NULL)
4807 mpte = PTE_TO_VM_PAGE(pmap_load(l2));
4808 KASSERT(mpte >= vm_page_array &&
4809 mpte < &vm_page_array[vm_page_array_size],
4810 ("pmap_promote_l2: page table page is out of range"));
4811 KASSERT(mpte->pindex == pmap_l2_pindex(va),
4812 ("pmap_promote_l2: page table page's pindex is wrong"));
4813 if (pmap_insert_pt_page(pmap, mpte, true, all_l3e_AF != 0)) {
4814 atomic_add_long(&pmap_l2_p_failures, 1);
4815 CTR2(KTR_PMAP,
4816 "pmap_promote_l2: failure for va %#lx in pmap %p", va,
4817 pmap);
4818 return (false);
4819 }
4820
4821 if ((newl2 & ATTR_SW_MANAGED) != 0)
4822 pmap_pv_promote_l2(pmap, va, PTE_TO_PHYS(newl2), lockp);
4823
4824 pmap_update_entry(pmap, l2, newl2 | L2_BLOCK, va & ~L2_OFFSET, L2_SIZE);
4825
4826 atomic_add_long(&pmap_l2_promotions, 1);
4827 CTR2(KTR_PMAP, "pmap_promote_l2: success for va %#lx in pmap %p", va,
4828 pmap);
4829 return (true);
4830 }
4831
4832 /*
4833 * Tries to promote an aligned, contiguous set of base page mappings to a
4834 * single L3C page mapping. For promotion to occur, two conditions must be
4835 * met: (1) the base page mappings must map aligned, contiguous physical
4836 * memory and (2) the base page mappings must have identical characteristics
4837 * except for the accessed flag.
4838 */
4839 static bool
4840 pmap_promote_l3c(pmap_t pmap, pd_entry_t *l3p, vm_offset_t va)
4841 {
4842 pd_entry_t all_l3e_AF, firstl3c, *l3, oldl3, pa;
4843
4844 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4845
4846 /*
4847 * Currently, this function only supports promotion on stage 1 pmaps
4848 * because it tests stage 1 specific fields and performs a break-
4849 * before-make sequence that is incorrect for stage 2 pmaps.
4850 */
4851 if (pmap->pm_stage != PM_STAGE1 || !pmap_ps_enabled(pmap))
4852 return (false);
4853
4854 /*
4855 * Compute the address of the first L3 entry in the superpage
4856 * candidate.
4857 */
4858 l3p = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
4859 sizeof(pt_entry_t)) - 1));
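	/*
	 * For example, with 4KB base pages (L3C_ENTRIES == 16 and 8-byte
	 * PTEs), this rounds "l3p" down to a 128-byte boundary, i.e., the
	 * first entry of the containing group of 16 contiguous L3 entries.
	 */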
4860
4861 firstl3c = pmap_load(l3p);
4862
4863 /*
4864 * Examine the first L3 entry. Abort if this L3E is ineligible for
4865 * promotion...
4866 */
4867 if ((firstl3c & ATTR_SW_NO_PROMOTE) != 0)
4868 return (false);
4869 /* ...is not properly aligned... */
4870 if ((PTE_TO_PHYS(firstl3c) & L3C_OFFSET) != 0 ||
4871 (firstl3c & ATTR_DESCR_MASK) != L3_PAGE) { /* ...or is invalid. */
4872 counter_u64_add(pmap_l3c_p_failures, 1);
4873 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4874 " in pmap %p", va, pmap);
4875 return (false);
4876 }
4877
4878 /*
4879 * If the first L3 entry is a clean read-write mapping, convert it
4880 * to a read-only mapping. See pmap_promote_l2() for the rationale.
4881 */
4882 set_first:
4883 if ((firstl3c & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4884 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4885 /*
4886 * When the mapping is clean, i.e., ATTR_S1_AP_RO is set,
4887 * ATTR_SW_DBM can be cleared without a TLB invalidation.
4888 */
4889 if (!atomic_fcmpset_64(l3p, &firstl3c, firstl3c & ~ATTR_SW_DBM))
4890 goto set_first;
4891 firstl3c &= ~ATTR_SW_DBM;
4892 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4893 " in pmap %p", va & ~L3C_OFFSET, pmap);
4894 }
4895
4896 /*
4897 * Check that the rest of the L3 entries are compatible with the first,
4898 * and convert clean read-write mappings to read-only mappings.
4899 */
4900 all_l3e_AF = firstl3c & ATTR_AF;
4901 pa = (PTE_TO_PHYS(firstl3c) | (firstl3c & ATTR_DESCR_MASK)) +
4902 L3C_SIZE - PAGE_SIZE;
4903 for (l3 = l3p + L3C_ENTRIES - 1; l3 > l3p; l3--) {
4904 oldl3 = pmap_load(l3);
4905 if ((PTE_TO_PHYS(oldl3) | (oldl3 & ATTR_DESCR_MASK)) != pa) {
4906 counter_u64_add(pmap_l3c_p_failures, 1);
4907 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4908 " in pmap %p", va, pmap);
4909 return (false);
4910 }
4911 set_l3:
4912 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
4913 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM)) {
4914 /*
4915 * When the mapping is clean, i.e., ATTR_S1_AP_RO is
4916 * set, ATTR_SW_DBM can be cleared without a TLB
4917 * invalidation.
4918 */
4919 if (!atomic_fcmpset_64(l3, &oldl3, oldl3 &
4920 ~ATTR_SW_DBM))
4921 goto set_l3;
4922 oldl3 &= ~ATTR_SW_DBM;
4923 CTR2(KTR_PMAP, "pmap_promote_l3c: protect for va %#lx"
4924 " in pmap %p", (oldl3 & ~ATTR_MASK & L3C_OFFSET) |
4925 (va & ~L3C_OFFSET), pmap);
4926 }
4927 if ((oldl3 & ATTR_PROMOTE) != (firstl3c & ATTR_PROMOTE)) {
4928 counter_u64_add(pmap_l3c_p_failures, 1);
4929 CTR2(KTR_PMAP, "pmap_promote_l3c: failure for va %#lx"
4930 " in pmap %p", va, pmap);
4931 return (false);
4932 }
4933 all_l3e_AF &= oldl3;
4934 pa -= PAGE_SIZE;
4935 }
4936
4937 /*
4938 * Unless all PTEs have ATTR_AF set, clear it from the superpage
4939 * mapping, so that promotions triggered by speculative mappings,
4940 * such as pmap_enter_quick(), don't automatically mark the
4941 * underlying pages as referenced.
4942 */
4943 firstl3c &= ~ATTR_AF | all_l3e_AF;
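	/*
	 * As in pmap_promote_l2(), "~ATTR_AF | all_l3e_AF" is all ones when
	 * every entry had ATTR_AF set (leaving firstl3c unchanged) and
	 * otherwise clears only ATTR_AF.
	 */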
4944
4945 /*
4946 * Remake the mappings with the contiguous bit set.
4947 */
4948 pmap_update_strided(pmap, l3p, l3p + L3C_ENTRIES, firstl3c |
4949 ATTR_CONTIGUOUS, va & ~L3C_OFFSET, L3_SIZE, L3C_SIZE);
4950
4951 counter_u64_add(pmap_l3c_promotions, 1);
4952 CTR2(KTR_PMAP, "pmap_promote_l3c: success for va %#lx in pmap %p", va,
4953 pmap);
4954 return (true);
4955 }
4956 #endif /* VM_NRESERVLEVEL > 0 */
4957
4958 static int
4959 pmap_enter_largepage(pmap_t pmap, vm_offset_t va, pt_entry_t pte, int flags,
4960 int psind)
4961 {
4962 pd_entry_t *l0p, *l1p, *l2p, newpte, origpte;
4963 vm_page_t mp;
4964
4965 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
4966 KASSERT(psind > 0 && psind < MAXPAGESIZES,
4967 ("psind %d unexpected", psind));
4968 KASSERT((PTE_TO_PHYS(pte) & (pagesizes[psind] - 1)) == 0,
4969 ("unaligned phys address %#lx pte %#lx psind %d",
4970 PTE_TO_PHYS(pte), pte, psind));
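	/*
	 * Here psind 1 denotes an L2 block (2MB with 4KB base pages) and
	 * psind 2 denotes an L1 block (1GB), matching pagesizes[].
	 */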
4971
4972 restart:
4973 newpte = pte;
4974 if (!pmap_bti_same(pmap, va, va + pagesizes[psind], &newpte))
4975 return (KERN_PROTECTION_FAILURE);
4976 if (psind == 2) {
4977 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
4978
4979 l0p = pmap_l0(pmap, va);
4980 if ((pmap_load(l0p) & ATTR_DESCR_VALID) == 0) {
4981 mp = _pmap_alloc_l3(pmap, pmap_l0_pindex(va), NULL);
4982 if (mp == NULL) {
4983 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
4984 return (KERN_RESOURCE_SHORTAGE);
4985 PMAP_UNLOCK(pmap);
4986 vm_wait(NULL);
4987 PMAP_LOCK(pmap);
4988 goto restart;
4989 }
4990 l1p = pmap_l0_to_l1(l0p, va);
4991 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4992 origpte = pmap_load(l1p);
4993 } else {
4994 l1p = pmap_l0_to_l1(l0p, va);
4995 KASSERT(l1p != NULL, ("va %#lx lost l1 entry", va));
4996 origpte = pmap_load(l1p);
4997 if ((origpte & ATTR_DESCR_VALID) == 0) {
4998 mp = PTE_TO_VM_PAGE(pmap_load(l0p));
4999 mp->ref_count++;
5000 }
5001 }
5002 KASSERT((PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte) &&
5003 (origpte & ATTR_DESCR_MASK) == L1_BLOCK) ||
5004 (origpte & ATTR_DESCR_VALID) == 0,
5005 ("va %#lx changing 1G phys page l1 %#lx newpte %#lx",
5006 va, origpte, newpte));
5007 pmap_store(l1p, newpte);
5008 } else /* (psind == 1) */ {
5009 l2p = pmap_l2(pmap, va);
5010 if (l2p == NULL) {
5011 mp = _pmap_alloc_l3(pmap, pmap_l1_pindex(va), NULL);
5012 if (mp == NULL) {
5013 if ((flags & PMAP_ENTER_NOSLEEP) != 0)
5014 return (KERN_RESOURCE_SHORTAGE);
5015 PMAP_UNLOCK(pmap);
5016 vm_wait(NULL);
5017 PMAP_LOCK(pmap);
5018 goto restart;
5019 }
5020 l2p = (pd_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mp));
5021 l2p = &l2p[pmap_l2_index(va)];
5022 origpte = pmap_load(l2p);
5023 } else {
5024 l1p = pmap_l1(pmap, va);
5025 origpte = pmap_load(l2p);
5026 if ((origpte & ATTR_DESCR_VALID) == 0) {
5027 mp = PTE_TO_VM_PAGE(pmap_load(l1p));
5028 mp->ref_count++;
5029 }
5030 }
5031 KASSERT((origpte & ATTR_DESCR_VALID) == 0 ||
5032 ((origpte & ATTR_DESCR_MASK) == L2_BLOCK &&
5033 PTE_TO_PHYS(origpte) == PTE_TO_PHYS(newpte)),
5034 ("va %#lx changing 2M phys page l2 %#lx newpte %#lx",
5035 va, origpte, newpte));
5036 pmap_store(l2p, newpte);
5037 }
5038 dsb(ishst);
5039
5040 if ((origpte & ATTR_DESCR_VALID) == 0)
5041 pmap_resident_count_inc(pmap, pagesizes[psind] / PAGE_SIZE);
5042 if ((newpte & ATTR_SW_WIRED) != 0 && (origpte & ATTR_SW_WIRED) == 0)
5043 pmap->pm_stats.wired_count += pagesizes[psind] / PAGE_SIZE;
5044 else if ((newpte & ATTR_SW_WIRED) == 0 &&
5045 (origpte & ATTR_SW_WIRED) != 0)
5046 pmap->pm_stats.wired_count -= pagesizes[psind] / PAGE_SIZE;
5047
5048 return (KERN_SUCCESS);
5049 }
5050
5051 /*
5052 * Insert the given physical page (p) at
5053 * the specified virtual address (v) in the
5054 * target physical map with the protection requested.
5055 *
5056 * If specified, the page will be wired down, meaning
5057 * that the related pte can not be reclaimed.
5058 *
5059 * NB: This is the only routine which MAY NOT lazy-evaluate
5060 * or lose information. That is, this routine must actually
5061 * insert this page into the given map NOW.
5062 */
5063 int
5064 pmap_enter(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
5065 u_int flags, int8_t psind)
5066 {
5067 struct rwlock *lock;
5068 pd_entry_t *pde;
5069 pt_entry_t new_l3, orig_l3;
5070 pt_entry_t *l2, *l3;
5071 pv_entry_t pv;
5072 vm_paddr_t opa, pa;
5073 vm_page_t mpte, om;
5074 bool nosleep;
5075 int lvl, rv;
5076
5077 KASSERT(ADDR_IS_CANONICAL(va),
5078 ("%s: Address not in canonical form: %lx", __func__, va));
5079
5080 va = trunc_page(va);
5081 if ((m->oflags & VPO_UNMANAGED) == 0)
5082 VM_PAGE_OBJECT_BUSY_ASSERT(m);
5083 pa = VM_PAGE_TO_PHYS(m);
5084 new_l3 = (pt_entry_t)(PHYS_TO_PTE(pa) | ATTR_DEFAULT | L3_PAGE);
5085 new_l3 |= pmap_pte_memattr(pmap, m->md.pv_memattr);
5086 new_l3 |= pmap_pte_prot(pmap, prot);
5087 if ((flags & PMAP_ENTER_WIRED) != 0)
5088 new_l3 |= ATTR_SW_WIRED;
5089 if (pmap->pm_stage == PM_STAGE1) {
5090 if (!ADDR_IS_KERNEL(va))
5091 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
5092 else
5093 new_l3 |= ATTR_S1_UXN;
5094 if (pmap != kernel_pmap)
5095 new_l3 |= ATTR_S1_nG;
5096 } else {
5097 /*
5098 * Clear the access flag on executable mappings, this will be
5099 * set later when the page is accessed. The fault handler is
5100 * required to invalidate the I-cache.
5101 *
5102 * TODO: Switch to the valid flag to allow hardware management
5103 * of the access flag. Much of the pmap code assumes the
5104 * valid flag is set and fails to destroy the old page tables
5105 * correctly if it is clear.
5106 */
5107 if (prot & VM_PROT_EXECUTE)
5108 new_l3 &= ~ATTR_AF;
5109 }
5110 if ((m->oflags & VPO_UNMANAGED) == 0) {
5111 new_l3 |= ATTR_SW_MANAGED;
5112 if ((prot & VM_PROT_WRITE) != 0) {
5113 new_l3 |= ATTR_SW_DBM;
5114 if ((flags & VM_PROT_WRITE) == 0) {
5115 if (pmap->pm_stage == PM_STAGE1)
5116 new_l3 |= ATTR_S1_AP(ATTR_S1_AP_RO);
5117 else
5118 new_l3 &=
5119 ~ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
5120 }
5121 }
5122 }
5123
5124 CTR2(KTR_PMAP, "pmap_enter: %.16lx -> %.16lx", va, pa);
5125
5126 lock = NULL;
5127 PMAP_LOCK(pmap);
5128 if ((flags & PMAP_ENTER_LARGEPAGE) != 0) {
5129 KASSERT((m->oflags & VPO_UNMANAGED) != 0,
5130 ("managed largepage va %#lx flags %#x", va, flags));
5131 new_l3 &= ~L3_PAGE;
        if (psind == 2) {
            PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
            new_l3 |= L1_BLOCK;
        } else /* (psind == 1) */
            new_l3 |= L2_BLOCK;
        rv = pmap_enter_largepage(pmap, va, new_l3, flags, psind);
        goto out;
    }
    if (psind == 1) {
        /* Assert the required virtual and physical alignment. */
        KASSERT((va & L2_OFFSET) == 0, ("pmap_enter: va unaligned"));
        KASSERT(m->psind > 0, ("pmap_enter: m->psind < psind"));
        rv = pmap_enter_l2(pmap, va, (new_l3 & ~L3_PAGE) | L2_BLOCK,
            flags, m, &lock);
        goto out;
    }
    mpte = NULL;

    /*
     * In the case that a page table page is not
     * resident, we are creating it here.
     */
retry:
    pde = pmap_pde(pmap, va, &lvl);
    if (pde != NULL && lvl == 2) {
        l3 = pmap_l2_to_l3(pde, va);
        if (!ADDR_IS_KERNEL(va) && mpte == NULL) {
            mpte = PTE_TO_VM_PAGE(pmap_load(pde));
            mpte->ref_count++;
        }
        goto havel3;
    } else if (pde != NULL && lvl == 1) {
        l2 = pmap_l1_to_l2(pde, va);
        if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK &&
            (l3 = pmap_demote_l2_locked(pmap, l2, va, &lock)) != NULL) {
            l3 = &l3[pmap_l3_index(va)];
            if (!ADDR_IS_KERNEL(va)) {
                mpte = PTE_TO_VM_PAGE(pmap_load(l2));
                mpte->ref_count++;
            }
            goto havel3;
        }
        /* We need to allocate an L3 table. */
    }
    if (!ADDR_IS_KERNEL(va)) {
        nosleep = (flags & PMAP_ENTER_NOSLEEP) != 0;

        /*
         * We use _pmap_alloc_l3() instead of pmap_alloc_l3() in order
         * to handle the possibility that a superpage mapping for "va"
         * was created while we slept.
         */
        mpte = _pmap_alloc_l3(pmap, pmap_l2_pindex(va),
            nosleep ? NULL : &lock);
        if (mpte == NULL && nosleep) {
            CTR0(KTR_PMAP, "pmap_enter: mpte == NULL");
            rv = KERN_RESOURCE_SHORTAGE;
            goto out;
        }
        goto retry;
    } else
        panic("pmap_enter: missing L3 table for kernel va %#lx", va);

havel3:
    orig_l3 = pmap_load(l3);
    opa = PTE_TO_PHYS(orig_l3);
    pv = NULL;
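    /*
     * Merge in the BTI guarded-page attribute, if one applies to this
     * address range; see pmap_pte_bti().
     */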
    new_l3 |= pmap_pte_bti(pmap, va);

    /*
     * Is the specified virtual address already mapped?
     */
    if (pmap_l3_valid(orig_l3)) {
        /*
         * Wiring change, just update stats.  We don't worry about
         * wiring PT pages as they remain resident as long as there
         * are valid mappings in them.  Hence, if a user page is wired,
         * the PT page will be also.
         */
        if ((flags & PMAP_ENTER_WIRED) != 0 &&
            (orig_l3 & ATTR_SW_WIRED) == 0)
            pmap->pm_stats.wired_count++;
        else if ((flags & PMAP_ENTER_WIRED) == 0 &&
            (orig_l3 & ATTR_SW_WIRED) != 0)
            pmap->pm_stats.wired_count--;

        /*
         * Remove the extra PT page reference.
         */
        if (mpte != NULL) {
            mpte->ref_count--;
            KASSERT(mpte->ref_count > 0,
                ("pmap_enter: missing reference to page table page,"
                " va: 0x%lx", va));
        }

        /*
         * Has the physical page changed?
         */
        if (opa == pa) {
            /*
             * No, might be a protection or wiring change.
             */
            if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
                (new_l3 & ATTR_SW_DBM) != 0)
                vm_page_aflag_set(m, PGA_WRITEABLE);
            goto validate;
        }

        /*
         * The physical page has changed.  Temporarily invalidate
         * the mapping.
         */
        if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
            (void)pmap_demote_l3c(pmap, l3, va);
        orig_l3 = pmap_load_clear(l3);
        KASSERT(PTE_TO_PHYS(orig_l3) == opa,
            ("pmap_enter: unexpected pa update for %#lx", va));
        if ((orig_l3 & ATTR_SW_MANAGED) != 0) {
            om = PHYS_TO_VM_PAGE(opa);

            /*
             * The pmap lock is sufficient to synchronize with
             * concurrent calls to pmap_page_test_mappings() and
             * pmap_ts_referenced().
             */
            if (pmap_pte_dirty(pmap, orig_l3))
                vm_page_dirty(om);
            if ((orig_l3 & ATTR_AF) != 0) {
                pmap_invalidate_page(pmap, va, true);
                vm_page_aflag_set(om, PGA_REFERENCED);
            }
            CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, om);
            pv = pmap_pvh_remove(&om->md, pmap, va);
            if ((m->oflags & VPO_UNMANAGED) != 0)
                free_pv_entry(pmap, pv);
            if ((om->a.flags & PGA_WRITEABLE) != 0 &&
                TAILQ_EMPTY(&om->md.pv_list) &&
                ((om->flags & PG_FICTITIOUS) != 0 ||
                TAILQ_EMPTY(&page_to_pvh(om)->pv_list)))
                vm_page_aflag_clear(om, PGA_WRITEABLE);
        } else {
            KASSERT((orig_l3 & ATTR_AF) != 0,
                ("pmap_enter: unmanaged mapping lacks ATTR_AF"));
            pmap_invalidate_page(pmap, va, true);
        }
        orig_l3 = 0;
    } else {
        /*
         * Increment the counters.
         */
        if ((new_l3 & ATTR_SW_WIRED) != 0)
            pmap->pm_stats.wired_count++;
        pmap_resident_count_inc(pmap, 1);
    }
    /*
     * Enter on the PV list if part of our managed memory.
     */
    if ((m->oflags & VPO_UNMANAGED) == 0) {
        if (pv == NULL) {
            pv = get_pv_entry(pmap, &lock);
            pv->pv_va = va;
        }
        CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);
        TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
        m->md.pv_gen++;
        if ((new_l3 & ATTR_SW_DBM) != 0)
            vm_page_aflag_set(m, PGA_WRITEABLE);
    }

validate:
    if (pmap->pm_stage == PM_STAGE1) {
        /*
         * Sync the icache if the mapping has exec permission and the
         * attribute VM_MEMATTR_WRITE_BACK is set.  Do it now, before
         * the mapping is stored and made valid for hardware table
         * walks.  If done later, others could access this page before
         * the caches are properly synced.  Don't do it for kernel
         * memory, which is mapped with exec permission even if the
         * memory isn't going to hold executable code.  The only time
         * when an icache sync is needed is after a kernel module is
         * loaded and its relocation info is processed, and that is
         * done in elf_cpu_load_file().
         */
        if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
            m->md.pv_memattr == VM_MEMATTR_WRITE_BACK &&
            (opa != pa || (orig_l3 & ATTR_S1_XN))) {
            PMAP_ASSERT_STAGE1(pmap);
            cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
                PAGE_SIZE);
        }
    } else {
        cpu_dcache_wb_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);
    }

    /*
     * Update the L3 entry
     */
    if (pmap_l3_valid(orig_l3)) {
        KASSERT(opa == pa, ("pmap_enter: invalid update"));
        if ((orig_l3 & ~ATTR_AF) != (new_l3 & ~ATTR_AF)) {
            /* same PA, different attributes */
            if ((orig_l3 & ATTR_CONTIGUOUS) != 0)
                (void)pmap_demote_l3c(pmap, l3, va);
            orig_l3 = pmap_load_store(l3, new_l3);
            pmap_invalidate_page(pmap, va, true);
            if ((orig_l3 & ATTR_SW_MANAGED) != 0 &&
                pmap_pte_dirty(pmap, orig_l3))
                vm_page_dirty(m);
        } else {
            /*
             * orig_l3 == new_l3
             * This can happen if multiple threads simultaneously
             * access a not-yet-mapped page.  It is bad for
             * performance since it can cause a full
             * demotion-NOP-promotion cycle.
             * Other possible reasons are:
             * - the VM and pmap memory layouts have diverged
             * - a TLB flush is missing somewhere and the CPU
             *   doesn't see the actual mapping.
             */
            CTR4(KTR_PMAP, "%s: already mapped page - "
                "pmap %p va %#lx pte 0x%lx",
                __func__, pmap, va, new_l3);
        }
    } else {
        /* New mapping */
        pmap_store(l3, new_l3);
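        /*
         * Publish the new entry before it is used; dsb(ishst) orders
         * the PTE store ahead of subsequent memory accesses.
         */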
        dsb(ishst);
    }

#if VM_NRESERVLEVEL > 0
    /*
     * First, attempt L3C promotion, if the virtual and physical addresses
     * are aligned with each other and an underlying reservation has the
     * neighboring L3 pages allocated.  The first condition is simply an
     * optimization that recognizes some eventual promotion failures early
     * at a lower run-time cost.  Then, if both the page table page and
     * the reservation are fully populated, attempt L2 promotion.
     */
    if ((va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
        (m->flags & PG_FICTITIOUS) == 0 &&
        vm_reserv_is_populated(m, L3C_ENTRIES) &&
        pmap_promote_l3c(pmap, l3, va) &&
        (mpte == NULL || mpte->ref_count == NL3PG) &&
        vm_reserv_level_iffullpop(m) == 0)
        (void)pmap_promote_l2(pmap, pde, va, mpte, &lock);
#endif

    rv = KERN_SUCCESS;
out:
    if (lock != NULL)
        rw_wunlock(lock);
    PMAP_UNLOCK(pmap);
    return (rv);
}

/*
 * Tries to create a read- and/or execute-only L2 page mapping.  Returns
 * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
 * value.  See pmap_enter_l2() for the possible error values when "no sleep",
 * "no replace", and "no reclaim" are specified.
 */
static int
pmap_enter_l2_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot,
    struct rwlock **lockp)
{
    pd_entry_t new_l2;

    PMAP_LOCK_ASSERT(pmap, MA_OWNED);
    PMAP_ASSERT_STAGE1(pmap);
    KASSERT(ADDR_IS_CANONICAL(va),
        ("%s: Address not in canonical form: %lx", __func__, va));

    new_l2 = (pd_entry_t)(VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
        ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
        L2_BLOCK);
    if ((m->oflags & VPO_UNMANAGED) == 0) {
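        /*
         * Managed mappings start with the access flag clear, so the
         * first reference faults and can be recorded for page aging.
         */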
        new_l2 |= ATTR_SW_MANAGED;
        new_l2 &= ~ATTR_AF;
    }
    if ((prot & VM_PROT_EXECUTE) == 0 ||
        m->md.pv_memattr == VM_MEMATTR_DEVICE)
        new_l2 |= ATTR_S1_XN;
    if (!ADDR_IS_KERNEL(va))
        new_l2 |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
    else
        new_l2 |= ATTR_S1_UXN;
    if (pmap != kernel_pmap)
        new_l2 |= ATTR_S1_nG;
    return (pmap_enter_l2(pmap, va, new_l2, PMAP_ENTER_NOSLEEP |
        PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, lockp));
}

/*
 * Returns true if every page table entry in the specified page table is
 * zero.
 */
static bool
pmap_every_pte_zero(vm_paddr_t pa)
{
    pt_entry_t *pt_end, *pte;

    KASSERT((pa & PAGE_MASK) == 0, ("pa is misaligned"));
    pte = (pt_entry_t *)PHYS_TO_DMAP(pa);
    for (pt_end = pte + Ln_ENTRIES; pte < pt_end; pte++) {
        if (*pte != 0)
            return (false);
    }
    return (true);
}

/*
 * Tries to create the specified L2 page mapping.  Returns KERN_SUCCESS if
 * the mapping was created, and one of KERN_FAILURE, KERN_NO_SPACE, or
 * KERN_RESOURCE_SHORTAGE otherwise.  Returns KERN_FAILURE if
 * PMAP_ENTER_NOREPLACE was specified and a base page mapping already exists
 * within the L2 virtual address range starting at the specified virtual
 * address.  Returns KERN_NO_SPACE if PMAP_ENTER_NOREPLACE was specified and
 * an L2 page mapping already exists at the specified virtual address.
 * Returns KERN_RESOURCE_SHORTAGE if either (1) PMAP_ENTER_NOSLEEP was
 * specified and a page table page allocation failed or (2)
 * PMAP_ENTER_NORECLAIM was specified and a PV entry allocation failed.
 */
static int
pmap_enter_l2(pmap_t pmap, vm_offset_t va, pd_entry_t new_l2, u_int flags,
    vm_page_t m, struct rwlock **lockp)
{
    struct spglist free;
    pd_entry_t *l2, old_l2;
    vm_page_t l2pg, mt;
    vm_page_t uwptpg;

    PMAP_LOCK_ASSERT(pmap, MA_OWNED);
    KASSERT(ADDR_IS_CANONICAL(va),
        ("%s: Address not in canonical form: %lx", __func__, va));

    if ((l2 = pmap_alloc_l2(pmap, va, &l2pg, (flags &
        PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp)) == NULL) {
        CTR2(KTR_PMAP, "pmap_enter_l2: failure for va %#lx in pmap %p",
            va, pmap);
        return (KERN_RESOURCE_SHORTAGE);
    }

    /*
     * If bti is not the same for the whole l2 range, return failure
     * and let vm_fault() cope.  Check after l2 allocation, since
     * it could sleep.
     */
    if (!pmap_bti_same(pmap, va, va + L2_SIZE, &new_l2)) {
        KASSERT(l2pg != NULL, ("pmap_enter_l2: missing L2 PTP"));
        pmap_abort_ptp(pmap, va, l2pg);
        return (KERN_PROTECTION_FAILURE);
    }

    /*
     * If there are existing mappings, either abort or remove them.
     */
    if ((old_l2 = pmap_load(l2)) != 0) {
        KASSERT(l2pg == NULL || l2pg->ref_count > 1,
            ("pmap_enter_l2: l2pg's ref count is too low"));
        if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
            if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK) {
                if (l2pg != NULL)
                    l2pg->ref_count--;
                CTR2(KTR_PMAP,
                    "pmap_enter_l2: no space for va %#lx"
                    " in pmap %p", va, pmap);
                return (KERN_NO_SPACE);
            } else if (!ADDR_IS_KERNEL(va) ||
                !pmap_every_pte_zero(PTE_TO_PHYS(old_l2))) {
                if (l2pg != NULL)
                    l2pg->ref_count--;
                CTR2(KTR_PMAP,
                    "pmap_enter_l2: failure for va %#lx"
                    " in pmap %p", va, pmap);
                return (KERN_FAILURE);
            }
        }
        SLIST_INIT(&free);
        if ((old_l2 & ATTR_DESCR_MASK) == L2_BLOCK)
            (void)pmap_remove_l2(pmap, l2, va,
                pmap_load(pmap_l1(pmap, va)), &free, lockp);
        else
            pmap_remove_l3_range(pmap, old_l2, va, va + L2_SIZE,
                &free, lockp);
        if (!ADDR_IS_KERNEL(va)) {
            vm_page_free_pages_toq(&free, true);
            KASSERT(pmap_load(l2) == 0,
                ("pmap_enter_l2: non-zero L2 entry %p", l2));
        } else {
            KASSERT(SLIST_EMPTY(&free),
                ("pmap_enter_l2: freed kernel page table page"));

            /*
             * Both pmap_remove_l2() and pmap_remove_l3_range()
             * will leave the kernel page table page zero filled.
             * Nonetheless, the TLB could have an intermediate
             * entry for the kernel page table page, so request
             * an invalidation at all levels after clearing
             * the L2_TABLE entry.
             */
            mt = PTE_TO_VM_PAGE(pmap_load(l2));
            if (pmap_insert_pt_page(pmap, mt, false, false))
                panic("pmap_enter_l2: trie insert failed");
            pmap_clear(l2);
            pmap_s1_invalidate_page(pmap, va, false);
        }
    }

    /*
     * Allocate a leaf PTP for wired userspace pages.
     */
    uwptpg = NULL;
    if ((new_l2 & ATTR_SW_WIRED) != 0 && pmap != kernel_pmap) {
        uwptpg = vm_page_alloc_noobj(VM_ALLOC_WIRED);
        if (uwptpg == NULL) {
            pmap_abort_ptp(pmap, va, l2pg);
            return (KERN_RESOURCE_SHORTAGE);
        }
        uwptpg->pindex = pmap_l2_pindex(va);
        if (pmap_insert_pt_page(pmap, uwptpg, true, false)) {
            vm_page_unwire_noq(uwptpg);
            vm_page_free(uwptpg);
            pmap_abort_ptp(pmap, va, l2pg);
            return (KERN_RESOURCE_SHORTAGE);
        }
        pmap_resident_count_inc(pmap, 1);
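        /*
         * The PTP will back only wired pages, so start its reference
         * count at the fully populated value.
         */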
        uwptpg->ref_count = NL3PG;
    }
    if ((new_l2 & ATTR_SW_MANAGED) != 0) {
        /*
         * Abort this mapping if its PV entry could not be created.
         */
        if (!pmap_pv_insert_l2(pmap, va, new_l2, flags, lockp)) {
            if (l2pg != NULL)
                pmap_abort_ptp(pmap, va, l2pg);
            if (uwptpg != NULL) {
                mt = pmap_remove_pt_page(pmap, va);
                KASSERT(mt == uwptpg,
                    ("removed pt page %p, expected %p", mt,
                    uwptpg));
                pmap_resident_count_dec(pmap, 1);
                uwptpg->ref_count = 1;
                vm_page_unwire_noq(uwptpg);
                vm_page_free(uwptpg);
            }
            CTR2(KTR_PMAP,
                "pmap_enter_l2: failure for va %#lx in pmap %p",
                va, pmap);
            return (KERN_RESOURCE_SHORTAGE);
        }
        if ((new_l2 & ATTR_SW_DBM) != 0)
            for (mt = m; mt < &m[L2_SIZE / PAGE_SIZE]; mt++)
                vm_page_aflag_set(mt, PGA_WRITEABLE);
    }

    /*
     * Increment counters.
     */
    if ((new_l2 & ATTR_SW_WIRED) != 0)
        pmap->pm_stats.wired_count += L2_SIZE / PAGE_SIZE;
    pmap->pm_stats.resident_count += L2_SIZE / PAGE_SIZE;

    /*
     * Conditionally sync the icache.  See pmap_enter() for details.
     */
    if ((new_l2 & ATTR_S1_XN) == 0 && (PTE_TO_PHYS(new_l2) !=
        PTE_TO_PHYS(old_l2) || (old_l2 & ATTR_S1_XN) != 0) &&
        pmap != kernel_pmap && m->md.pv_memattr == VM_MEMATTR_WRITE_BACK) {
        cpu_icache_sync_range((void *)PHYS_TO_DMAP(PTE_TO_PHYS(new_l2)),
            L2_SIZE);
    }

    /*
     * Map the superpage.
     */
    pmap_store(l2, new_l2);
    dsb(ishst);

    atomic_add_long(&pmap_l2_mappings, 1);
    CTR2(KTR_PMAP, "pmap_enter_l2: success for va %#lx in pmap %p",
        va, pmap);

    return (KERN_SUCCESS);
}

/*
 * Tries to create a read- and/or execute-only L3C page mapping.  Returns
 * KERN_SUCCESS if the mapping was created.  Otherwise, returns an error
 * value.
 */
static int
pmap_enter_l3c_rx(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_page_t *ml3p,
    vm_prot_t prot, struct rwlock **lockp)
{
    pt_entry_t l3e;

    PMAP_LOCK_ASSERT(pmap, MA_OWNED);
    PMAP_ASSERT_STAGE1(pmap);
    KASSERT(ADDR_IS_CANONICAL(va),
        ("%s: Address not in canonical form: %lx", __func__, va));

    l3e = VM_PAGE_TO_PTE(m) | ATTR_DEFAULT |
        ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
        ATTR_CONTIGUOUS | L3_PAGE;
    if ((m->oflags & VPO_UNMANAGED) == 0) {
        l3e |= ATTR_SW_MANAGED;
        l3e &= ~ATTR_AF;
    }
    if ((prot & VM_PROT_EXECUTE) == 0 ||
        m->md.pv_memattr == VM_MEMATTR_DEVICE)
        l3e |= ATTR_S1_XN;
    if (!ADDR_IS_KERNEL(va))
        l3e |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
    else
        l3e |= ATTR_S1_UXN;
    if (pmap != kernel_pmap)
        l3e |= ATTR_S1_nG;
    return (pmap_enter_l3c(pmap, va, l3e, PMAP_ENTER_NOSLEEP |
        PMAP_ENTER_NOREPLACE | PMAP_ENTER_NORECLAIM, m, ml3p, lockp));
}

static int
pmap_enter_l3c(pmap_t pmap, vm_offset_t va, pt_entry_t l3e, u_int flags,
    vm_page_t m, vm_page_t *ml3p, struct rwlock **lockp)
{
    pd_entry_t *l2p, *pde;
    pt_entry_t *l3p, *tl3p;
    vm_page_t mt;
    vm_paddr_t pa;
    vm_pindex_t l2pindex;
    int lvl;

    PMAP_LOCK_ASSERT(pmap, MA_OWNED);
    KASSERT((va & L3C_OFFSET) == 0,
        ("pmap_enter_l3c: va is not aligned"));
    KASSERT(!VA_IS_CLEANMAP(va) || (l3e & ATTR_SW_MANAGED) == 0,
        ("pmap_enter_l3c: managed mapping within the clean submap"));

    /*
     * If the L3 PTP is not resident, we attempt to create it here.
     */
    if (!ADDR_IS_KERNEL(va)) {
        /*
         * Were we given the correct L3 PTP?  If so, we can simply
         * increment its ref count.
         */
        l2pindex = pmap_l2_pindex(va);
        if (*ml3p != NULL && (*ml3p)->pindex == l2pindex) {
            (*ml3p)->ref_count += L3C_ENTRIES;
        } else {
retry:
            /*
             * Get the L2 entry.
             */
            pde = pmap_pde(pmap, va, &lvl);

            /*
             * If the L2 entry is a superpage, we either abort or
             * demote depending on the given flags.
             */
            if (lvl == 1) {
                l2p = pmap_l1_to_l2(pde, va);
                if ((pmap_load(l2p) & ATTR_DESCR_MASK) ==
                    L2_BLOCK) {
                    if ((flags & PMAP_ENTER_NOREPLACE) != 0)
                        return (KERN_FAILURE);
                    l3p = pmap_demote_l2_locked(pmap, l2p,
                        va, lockp);
                    if (l3p != NULL) {
                        *ml3p = PTE_TO_VM_PAGE(
                            pmap_load(l2p));
                        (*ml3p)->ref_count +=
                            L3C_ENTRIES;
                        goto have_l3p;
                    }
                }
                /* We need to allocate an L3 PTP. */
            }

            /*
             * If the L3 PTP is mapped, we just increment its ref
             * count.  Otherwise, we attempt to allocate it.
             */
            if (lvl == 2 && pmap_load(pde) != 0) {
                *ml3p = PTE_TO_VM_PAGE(pmap_load(pde));
                (*ml3p)->ref_count += L3C_ENTRIES;
            } else {
                *ml3p = _pmap_alloc_l3(pmap, l2pindex, (flags &
                    PMAP_ENTER_NOSLEEP) != 0 ? NULL : lockp);
                if (*ml3p == NULL) {
                    if ((flags & PMAP_ENTER_NOSLEEP) != 0)
                        return (KERN_FAILURE);

                    /*
                     * The page table may have changed
                     * while we slept.
                     */
                    goto retry;
                }
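                /*
                 * _pmap_alloc_l3() returns the PTP with one
                 * reference already held, so only the remaining
                 * L3C_ENTRIES - 1 references are added below.
                 */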
                (*ml3p)->ref_count += L3C_ENTRIES - 1;
            }
        }
        l3p = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(*ml3p));
    } else {
        *ml3p = NULL;

        /*
         * If the L2 entry is a superpage, we either abort or demote
         * depending on the given flags.
         */
        pde = pmap_pde(kernel_pmap, va, &lvl);
        if (lvl == 1) {
            l2p = pmap_l1_to_l2(pde, va);
            KASSERT((pmap_load(l2p) & ATTR_DESCR_MASK) == L2_BLOCK,
                ("pmap_enter_l3c: missing L2 block"));
            if ((flags & PMAP_ENTER_NOREPLACE) != 0)
                return (KERN_FAILURE);
            l3p = pmap_demote_l2_locked(pmap, l2p, va, lockp);
        } else {
            KASSERT(lvl == 2,
                ("pmap_enter_l3c: Invalid level %d", lvl));
            l3p = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(
                pmap_load(pde)));
        }
    }
have_l3p:
    l3p = &l3p[pmap_l3_index(va)];

    /*
     * If bti is not the same for the whole L3C range, return failure
     * and let vm_fault() cope.  Check after L3 allocation, since
     * it could sleep.
     */
    if (!pmap_bti_same(pmap, va, va + L3C_SIZE, &l3e)) {
        KASSERT(*ml3p != NULL, ("pmap_enter_l3c: missing L3 PTP"));
        (*ml3p)->ref_count -= L3C_ENTRIES - 1;
        pmap_abort_ptp(pmap, va, *ml3p);
        *ml3p = NULL;
        return (KERN_PROTECTION_FAILURE);
    }

    /*
     * If there are existing mappings, either abort or remove them.
     */
    if ((flags & PMAP_ENTER_NOREPLACE) != 0) {
        for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
            if (pmap_load(tl3p) != 0) {
                if (*ml3p != NULL)
                    (*ml3p)->ref_count -= L3C_ENTRIES;
                return (KERN_FAILURE);
            }
        }
    } else {
        /*
         * Because we increment the L3 page's reference count above,
         * it is guaranteed not to be freed here and we can pass NULL
         * instead of a valid free list.
         */
        pmap_remove_l3_range(pmap, pmap_load(pmap_l2(pmap, va)), va,
            va + L3C_SIZE, NULL, lockp);
    }

    /*
     * Enter on the PV list if part of our managed memory.
     */
    if ((l3e & ATTR_SW_MANAGED) != 0) {
        if (!pmap_pv_insert_l3c(pmap, va, m, lockp)) {
            if (*ml3p != NULL) {
                (*ml3p)->ref_count -= L3C_ENTRIES - 1;
                pmap_abort_ptp(pmap, va, *ml3p);
                *ml3p = NULL;
            }
            return (KERN_RESOURCE_SHORTAGE);
        }
        if ((l3e & ATTR_SW_DBM) != 0)
            for (mt = m; mt < &m[L3C_ENTRIES]; mt++)
                vm_page_aflag_set(mt, PGA_WRITEABLE);
    }

    /*
     * Increment counters.
     */
    if ((l3e & ATTR_SW_WIRED) != 0)
        pmap->pm_stats.wired_count += L3C_ENTRIES;
    pmap_resident_count_inc(pmap, L3C_ENTRIES);

    pa = VM_PAGE_TO_PHYS(m);
    KASSERT((pa & L3C_OFFSET) == 0, ("pmap_enter_l3c: pa is not aligned"));

    /*
     * Sync the icache before the mapping is stored.
     */
    if ((l3e & ATTR_S1_XN) == 0 && pmap != kernel_pmap &&
        m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
        cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), L3C_SIZE);

    /*
     * Map the superpage.
     */
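    /*
     * Every PTE in the run carries ATTR_CONTIGUOUS and differs only in
     * its output address, as the contiguous hint requires.
     */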
    for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
        pmap_store(tl3p, l3e);
        l3e += L3_SIZE;
    }
    dsb(ishst);

    counter_u64_add(pmap_l3c_mappings, 1);
    CTR2(KTR_PMAP, "pmap_enter_l3c: success for va %#lx in pmap %p",
        va, pmap);
    return (KERN_SUCCESS);
}

/*
 * Maps a sequence of resident pages belonging to the same object.
 * The sequence begins with the given page m_start.  This page is
 * mapped at the given virtual address start.  Each subsequent page is
 * mapped at a virtual address that is offset from start by the same
 * amount as the page is offset from m_start within the object.  The
 * last page in the sequence is the page with the largest offset from
 * m_start that can be mapped at a virtual address less than the given
 * virtual address end.  Not every virtual page between start and end
 * is mapped; only those for which a resident page exists with the
 * corresponding offset from m_start are mapped.
 */
void
pmap_enter_object(pmap_t pmap, vm_offset_t start, vm_offset_t end,
    vm_page_t m_start, vm_prot_t prot)
{
    struct rwlock *lock;
    vm_offset_t va;
    vm_page_t m, mpte;
    vm_pindex_t diff, psize;
    int rv;

    VM_OBJECT_ASSERT_LOCKED(m_start->object);

    psize = atop(end - start);
    mpte = NULL;
    m = m_start;
    lock = NULL;
    PMAP_LOCK(pmap);
    while (m != NULL && (diff = m->pindex - m_start->pindex) < psize) {
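        /*
         * Prefer the largest mapping that fits: try an L2 block
         * first, then an L3C contiguous run, and finally fall back
         * to a base page, as in the calls below.
         */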
        va = start + ptoa(diff);
        if ((va & L2_OFFSET) == 0 && va + L2_SIZE <= end &&
            m->psind == 1 && pmap_ps_enabled(pmap) &&
            ((rv = pmap_enter_l2_rx(pmap, va, m, prot, &lock)) ==
            KERN_SUCCESS || rv == KERN_NO_SPACE))
            m = &m[L2_SIZE / PAGE_SIZE - 1];
        else if ((va & L3C_OFFSET) == 0 && va + L3C_SIZE <= end &&
            (VM_PAGE_TO_PHYS(m) & L3C_OFFSET) == 0 &&
            vm_reserv_is_populated(m, L3C_ENTRIES) &&
            pmap_ps_enabled(pmap) &&
            ((rv = pmap_enter_l3c_rx(pmap, va, m, &mpte, prot,
            &lock)) == KERN_SUCCESS || rv == KERN_NO_SPACE))
            m = &m[L3C_ENTRIES - 1];
        else {
            /*
             * In general, if a superpage mapping were possible,
             * it would have been created above.  That said, if
             * start and end are not superpage aligned, then
             * promotion might be possible at the ends of [start,
             * end).  However, in practice, those promotion
             * attempts are so unlikely to succeed that they are
             * not worth trying.
             */
            mpte = pmap_enter_quick_locked(pmap, va, m, prot |
                VM_PROT_NO_PROMOTE, mpte, &lock);
        }
        m = TAILQ_NEXT(m, listq);
    }
    if (lock != NULL)
        rw_wunlock(lock);
    PMAP_UNLOCK(pmap);
}

/*
 * This code makes some *MAJOR* assumptions:
 * 1. The current pmap and the given pmap exist.
 * 2. The mapping is not wired.
 * 3. Read access only.
 * 4. No page table pages are needed.
 * but it is *MUCH* faster than pmap_enter...
 */

void
pmap_enter_quick(pmap_t pmap, vm_offset_t va, vm_page_t m, vm_prot_t prot)
{
    struct rwlock *lock;

    lock = NULL;
    PMAP_LOCK(pmap);
    (void)pmap_enter_quick_locked(pmap, va, m, prot, NULL, &lock);
    if (lock != NULL)
        rw_wunlock(lock);
    PMAP_UNLOCK(pmap);
}

static vm_page_t
pmap_enter_quick_locked(pmap_t pmap, vm_offset_t va, vm_page_t m,
    vm_prot_t prot, vm_page_t mpte, struct rwlock **lockp)
{
    pt_entry_t *l1, *l2, *l3, l3_val;
    vm_paddr_t pa;
    int lvl;

    KASSERT(!VA_IS_CLEANMAP(va) ||
        (m->oflags & VPO_UNMANAGED) != 0,
        ("pmap_enter_quick_locked: managed mapping within the clean submap"));
    PMAP_LOCK_ASSERT(pmap, MA_OWNED);
    PMAP_ASSERT_STAGE1(pmap);
    KASSERT(ADDR_IS_CANONICAL(va),
        ("%s: Address not in canonical form: %lx", __func__, va));
    l2 = NULL;

    CTR2(KTR_PMAP, "pmap_enter_quick_locked: %p %lx", pmap, va);
    /*
     * In the case that a page table page is not
     * resident, we are creating it here.
     */
    if (!ADDR_IS_KERNEL(va)) {
        vm_pindex_t l2pindex;

        /*
         * Calculate pagetable page index
         */
        l2pindex = pmap_l2_pindex(va);
        if (mpte && (mpte->pindex == l2pindex)) {
            mpte->ref_count++;
        } else {
            /*
             * If the page table page is mapped, we just increment
             * the hold count, and activate it.  Otherwise, we
             * attempt to allocate a page table page, passing NULL
             * instead of the PV list lock pointer because we don't
             * intend to sleep.  If this attempt fails, we don't
             * retry.  Instead, we give up.
             */
            l1 = pmap_l1(pmap, va);
            if (l1 != NULL && pmap_load(l1) != 0) {
                if ((pmap_load(l1) & ATTR_DESCR_MASK) ==
                    L1_BLOCK)
                    return (NULL);
                l2 = pmap_l1_to_l2(l1, va);
                if (pmap_load(l2) != 0) {
                    if ((pmap_load(l2) & ATTR_DESCR_MASK) ==
                        L2_BLOCK)
                        return (NULL);
                    mpte = PTE_TO_VM_PAGE(pmap_load(l2));
                    mpte->ref_count++;
                } else {
                    mpte = _pmap_alloc_l3(pmap, l2pindex,
                        NULL);
                    if (mpte == NULL)
                        return (mpte);
                }
            } else {
                mpte = _pmap_alloc_l3(pmap, l2pindex, NULL);
                if (mpte == NULL)
                    return (mpte);
            }
        }
        l3 = (pt_entry_t *)PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mpte));
        l3 = &l3[pmap_l3_index(va)];
    } else {
        mpte = NULL;
        l2 = pmap_pde(kernel_pmap, va, &lvl);
        KASSERT(l2 != NULL,
            ("pmap_enter_quick_locked: Invalid page entry, va: 0x%lx",
            va));
        KASSERT(lvl == 2,
            ("pmap_enter_quick_locked: Invalid level %d", lvl));
        l3 = pmap_l2_to_l3(l2, va);
    }

    /*
     * Abort if a mapping already exists.
     */
    if (pmap_load(l3) != 0) {
        if (mpte != NULL)
            mpte->ref_count--;
        return (NULL);
    }

    /*
     * Enter on the PV list if part of our managed memory.
     */
    if ((m->oflags & VPO_UNMANAGED) == 0 &&
        !pmap_try_insert_pv_entry(pmap, va, m, lockp)) {
        if (mpte != NULL)
            pmap_abort_ptp(pmap, va, mpte);
        return (NULL);
    }

    /*
     * Increment counters.
     */
    pmap_resident_count_inc(pmap, 1);

    pa = VM_PAGE_TO_PHYS(m);
    l3_val = PHYS_TO_PTE(pa) | ATTR_DEFAULT |
        ATTR_S1_IDX(m->md.pv_memattr) | ATTR_S1_AP(ATTR_S1_AP_RO) |
        L3_PAGE;
    l3_val |= pmap_pte_bti(pmap, va);
    if ((prot & VM_PROT_EXECUTE) == 0 ||
        m->md.pv_memattr == VM_MEMATTR_DEVICE)
        l3_val |= ATTR_S1_XN;
    if (!ADDR_IS_KERNEL(va))
        l3_val |= ATTR_S1_AP(ATTR_S1_AP_USER) | ATTR_S1_PXN;
    else
        l3_val |= ATTR_S1_UXN;
    if (pmap != kernel_pmap)
        l3_val |= ATTR_S1_nG;

    /*
     * Now validate mapping with RO protection
     */
    if ((m->oflags & VPO_UNMANAGED) == 0) {
        l3_val |= ATTR_SW_MANAGED;
        l3_val &= ~ATTR_AF;
    }

    /* Sync the icache before the mapping is stored to the PTE. */
    if ((prot & VM_PROT_EXECUTE) && pmap != kernel_pmap &&
        m->md.pv_memattr == VM_MEMATTR_WRITE_BACK)
        cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa), PAGE_SIZE);

    pmap_store(l3, l3_val);
    dsb(ishst);

#if VM_NRESERVLEVEL > 0
    /*
     * First, attempt L3C promotion, if the virtual and physical addresses
     * are aligned with each other and an underlying reservation has the
     * neighboring L3 pages allocated.  The first condition is simply an
     * optimization that recognizes some eventual promotion failures early
     * at a lower run-time cost.  Then, attempt L2 promotion, if both the
     * PTP and the reservation are fully populated.
     */
    if ((prot & VM_PROT_NO_PROMOTE) == 0 &&
        (va & L3C_OFFSET) == (pa & L3C_OFFSET) &&
        (m->flags & PG_FICTITIOUS) == 0 &&
        vm_reserv_is_populated(m, L3C_ENTRIES) &&
        pmap_promote_l3c(pmap, l3, va) &&
        (mpte == NULL || mpte->ref_count == NL3PG) &&
        vm_reserv_level_iffullpop(m) == 0) {
        if (l2 == NULL)
            l2 = pmap_pde(pmap, va, &lvl);

        /*
         * If promotion succeeds, then the next call to this function
         * should not be given the unmapped PTP as a hint.
         */
        if (pmap_promote_l2(pmap, l2, va, mpte, lockp))
            mpte = NULL;
    }
#endif

    return (mpte);
}

/*
 * This code maps large physical mmap regions into the
 * processor address space.  Note that some shortcuts
 * are taken, but the code works.
 */
void
pmap_object_init_pt(pmap_t pmap, vm_offset_t addr, vm_object_t object,
    vm_pindex_t pindex, vm_size_t size)
{

    VM_OBJECT_ASSERT_WLOCKED(object);
    KASSERT(object->type == OBJT_DEVICE || object->type == OBJT_SG,
        ("pmap_object_init_pt: non-device object"));
}

/*
 * Clear the wired attribute from the mappings for the specified range of
 * addresses in the given pmap.  Every valid mapping within that range
 * must have the wired attribute set.  In contrast, invalid mappings
 * cannot have the wired attribute set, so they are ignored.
 *
 * The wired attribute of the page table entry is not a hardware feature,
 * so there is no need to invalidate any TLB entries.
 */
void
pmap_unwire(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
{
    vm_offset_t va_next;
    pd_entry_t *l0, *l1, *l2;
    pt_entry_t *l3;
    bool partial_l3c;

    PMAP_LOCK(pmap);
    for (; sva < eva; sva = va_next) {
        l0 = pmap_l0(pmap, sva);
        if (pmap_load(l0) == 0) {
            va_next = (sva + L0_SIZE) & ~L0_OFFSET;
            if (va_next < sva)
                va_next = eva;
            continue;
        }

        l1 = pmap_l0_to_l1(l0, sva);
        va_next = (sva + L1_SIZE) & ~L1_OFFSET;
        if (va_next < sva)
            va_next = eva;
        if (pmap_load(l1) == 0)
            continue;

        if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
            PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
            KASSERT(va_next <= eva,
                ("partial update of non-transparent 1G page "
                "l1 %#lx sva %#lx eva %#lx va_next %#lx",
                pmap_load(l1), sva, eva, va_next));
            MPASS(pmap != kernel_pmap);
            MPASS((pmap_load(l1) & (ATTR_SW_MANAGED |
                ATTR_SW_WIRED)) == ATTR_SW_WIRED);
            pmap_clear_bits(l1, ATTR_SW_WIRED);
            pmap->pm_stats.wired_count -= L1_SIZE / PAGE_SIZE;
            continue;
        }

        va_next = (sva + L2_SIZE) & ~L2_OFFSET;
        if (va_next < sva)
            va_next = eva;

        l2 = pmap_l1_to_l2(l1, sva);
        if (pmap_load(l2) == 0)
            continue;

        if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK) {
            if ((pmap_load(l2) & ATTR_SW_WIRED) == 0)
                panic("pmap_unwire: l2 %#jx is missing "
                    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l2));

            /*
             * Are we unwiring the entire large page?  If not,
             * demote the mapping and fall through.
             */
            if (sva + L2_SIZE == va_next && eva >= va_next) {
                pmap_clear_bits(l2, ATTR_SW_WIRED);
                pmap->pm_stats.wired_count -= L2_SIZE /
                    PAGE_SIZE;
                continue;
            } else if (pmap_demote_l2(pmap, l2, sva) == NULL)
                panic("pmap_unwire: demotion failed");
        }
        KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
            ("pmap_unwire: Invalid l2 entry after demotion"));

        if (va_next > eva)
            va_next = eva;
        for (partial_l3c = true, l3 = pmap_l2_to_l3(l2, sva);
            sva != va_next; l3++, sva += L3_SIZE) {
            if (pmap_load(l3) == 0)
                continue;
            if ((pmap_load(l3) & ATTR_CONTIGUOUS) != 0) {
                /*
                 * Avoid demotion for whole-page unwiring.
                 */
                if ((sva & L3C_OFFSET) == 0) {
                    /*
                     * Handle the possibility that
                     * "va_next" is zero because of
                     * address wraparound.
                     */
                    partial_l3c = sva + L3C_OFFSET >
                        va_next - 1;
                }
                if (partial_l3c)
                    (void)pmap_demote_l3c(pmap, l3, sva);
            }
            if ((pmap_load(l3) & ATTR_SW_WIRED) == 0)
                panic("pmap_unwire: l3 %#jx is missing "
                    "ATTR_SW_WIRED", (uintmax_t)pmap_load(l3));

            /*
             * ATTR_SW_WIRED must be cleared atomically.  Although
             * the pmap lock synchronizes access to ATTR_SW_WIRED,
             * the System MMU may write to the entry concurrently.
             */
            pmap_clear_bits(l3, ATTR_SW_WIRED);
            pmap->pm_stats.wired_count--;
        }
    }
    PMAP_UNLOCK(pmap);
}

/*
 * This function requires that the caller has already added one to ml3's
 * ref_count in anticipation of creating a 4KB page mapping.
 */
static bool
pmap_copy_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va, pt_entry_t l3e,
    vm_page_t ml3, struct rwlock **lockp)
{
    pt_entry_t *tl3p;

    PMAP_LOCK_ASSERT(pmap, MA_OWNED);
    KASSERT((va & L3C_OFFSET) == 0,
        ("pmap_copy_l3c: va is not aligned"));
    KASSERT((l3e & ATTR_SW_MANAGED) != 0,
        ("pmap_copy_l3c: l3e is not managed"));

    /*
     * Abort if a mapping already exists.
     */
    for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++)
        if (pmap_load(tl3p) != 0) {
            if (ml3 != NULL)
                ml3->ref_count--;
            return (false);
        }

    if (!pmap_pv_insert_l3c(pmap, va, PTE_TO_VM_PAGE(l3e), lockp)) {
        if (ml3 != NULL)
            pmap_abort_ptp(pmap, va, ml3);
        return (false);
    }
    ml3->ref_count += L3C_ENTRIES - 1;

    /*
     * Clear the wired and accessed bits.  However, leave the dirty bit
     * unchanged because read/write superpage mappings are required to be
     * dirty.
     */
    l3e &= ~(ATTR_SW_WIRED | ATTR_AF);

    for (tl3p = l3p; tl3p < &l3p[L3C_ENTRIES]; tl3p++) {
        pmap_store(tl3p, l3e);
        l3e += L3_SIZE;
    }
    pmap_resident_count_inc(pmap, L3C_ENTRIES);
    counter_u64_add(pmap_l3c_mappings, 1);
    CTR2(KTR_PMAP, "pmap_copy_l3c: success for va %#lx in pmap %p",
        va, pmap);
    return (true);
}

/*
 * Copy the range specified by src_addr/len
 * from the source map to the range dst_addr/len
 * in the destination map.
 *
 * This routine is only advisory and need not do anything.
 *
 * Because the executable mappings created by this routine are copied,
 * it should not have to flush the instruction cache.
 */
void
pmap_copy(pmap_t dst_pmap, pmap_t src_pmap, vm_offset_t dst_addr, vm_size_t len,
    vm_offset_t src_addr)
{
    struct rwlock *lock;
    pd_entry_t *l0, *l1, *l2, srcptepaddr;
    pt_entry_t *dst_pte, mask, nbits, ptetemp, *src_pte;
    vm_offset_t addr, end_addr, va_next;
    vm_page_t dst_m, dstmpte, srcmpte;

    PMAP_ASSERT_STAGE1(dst_pmap);
    PMAP_ASSERT_STAGE1(src_pmap);

    if (dst_addr != src_addr)
        return;
    end_addr = src_addr + len;
    lock = NULL;
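    /*
     * Take the two pmap locks in address order so that concurrent
     * copies in opposite directions cannot deadlock.
     */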
    if (dst_pmap < src_pmap) {
        PMAP_LOCK(dst_pmap);
        PMAP_LOCK(src_pmap);
    } else {
        PMAP_LOCK(src_pmap);
        PMAP_LOCK(dst_pmap);
    }
    for (addr = src_addr; addr < end_addr; addr = va_next) {
        l0 = pmap_l0(src_pmap, addr);
        if (pmap_load(l0) == 0) {
            va_next = (addr + L0_SIZE) & ~L0_OFFSET;
            if (va_next < addr)
                va_next = end_addr;
            continue;
        }

        va_next = (addr + L1_SIZE) & ~L1_OFFSET;
        if (va_next < addr)
            va_next = end_addr;
        l1 = pmap_l0_to_l1(l0, addr);
        if (pmap_load(l1) == 0)
            continue;
        if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
            PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
            KASSERT(va_next <= end_addr,
                ("partial update of non-transparent 1G page "
                "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
                pmap_load(l1), addr, end_addr, va_next));
            srcptepaddr = pmap_load(l1);
            l1 = pmap_l1(dst_pmap, addr);
            if (l1 == NULL) {
                if (_pmap_alloc_l3(dst_pmap,
                    pmap_l0_pindex(addr), NULL) == NULL)
                    break;
                l1 = pmap_l1(dst_pmap, addr);
            } else {
                l0 = pmap_l0(dst_pmap, addr);
                dst_m = PTE_TO_VM_PAGE(pmap_load(l0));
                dst_m->ref_count++;
            }
            KASSERT(pmap_load(l1) == 0,
                ("1G mapping present in dst pmap "
                "l1 %#lx addr %#lx end_addr %#lx va_next %#lx",
                pmap_load(l1), addr, end_addr, va_next));
            pmap_store(l1, srcptepaddr & ~ATTR_SW_WIRED);
            pmap_resident_count_inc(dst_pmap, L1_SIZE / PAGE_SIZE);
            continue;
        }

        va_next = (addr + L2_SIZE) & ~L2_OFFSET;
        if (va_next < addr)
            va_next = end_addr;
        l2 = pmap_l1_to_l2(l1, addr);
        srcptepaddr = pmap_load(l2);
        if (srcptepaddr == 0)
            continue;
        if ((srcptepaddr & ATTR_DESCR_MASK) == L2_BLOCK) {
            /*
             * We can only virtual copy whole superpages.
             */
            if ((addr & L2_OFFSET) != 0 ||
                addr + L2_SIZE > end_addr)
                continue;
            l2 = pmap_alloc_l2(dst_pmap, addr, &dst_m, NULL);
            if (l2 == NULL)
                break;
            if (pmap_load(l2) == 0 &&
                ((srcptepaddr & ATTR_SW_MANAGED) == 0 ||
                pmap_pv_insert_l2(dst_pmap, addr, srcptepaddr,
                PMAP_ENTER_NORECLAIM, &lock))) {
                /*
                 * We leave the dirty bit unchanged because
                 * managed read/write superpage mappings are
                 * required to be dirty.  However, managed
                 * superpage mappings are not required to
                 * have their accessed bit set, so we clear
                 * it because we don't know if this mapping
                 * will be used.
                 */
                srcptepaddr &= ~ATTR_SW_WIRED;
                if ((srcptepaddr & ATTR_SW_MANAGED) != 0)
                    srcptepaddr &= ~ATTR_AF;
                pmap_store(l2, srcptepaddr);
                pmap_resident_count_inc(dst_pmap, L2_SIZE /
                    PAGE_SIZE);
                atomic_add_long(&pmap_l2_mappings, 1);
            } else
                pmap_abort_ptp(dst_pmap, addr, dst_m);
            continue;
        }
        KASSERT((srcptepaddr & ATTR_DESCR_MASK) == L2_TABLE,
            ("pmap_copy: invalid L2 entry"));
        srcmpte = PTE_TO_VM_PAGE(srcptepaddr);
        KASSERT(srcmpte->ref_count > 0,
            ("pmap_copy: source page table page is unused"));
        if (va_next > end_addr)
            va_next = end_addr;
        src_pte = (pt_entry_t *)PHYS_TO_DMAP(PTE_TO_PHYS(srcptepaddr));
        src_pte = &src_pte[pmap_l3_index(addr)];
        dstmpte = NULL;
        for (; addr < va_next; addr += PAGE_SIZE, src_pte++) {
            ptetemp = pmap_load(src_pte);

            /*
             * We only virtual copy managed pages.
             */
            if ((ptetemp & ATTR_SW_MANAGED) == 0)
                continue;

            if (dstmpte != NULL) {
                KASSERT(dstmpte->pindex == pmap_l2_pindex(addr),
                    ("dstmpte pindex/addr mismatch"));
                dstmpte->ref_count++;
            } else if ((dstmpte = pmap_alloc_l3(dst_pmap, addr,
                NULL)) == NULL)
                goto out;
            dst_pte = (pt_entry_t *)
                PHYS_TO_DMAP(VM_PAGE_TO_PHYS(dstmpte));
            dst_pte = &dst_pte[pmap_l3_index(addr)];
            if ((ptetemp & ATTR_CONTIGUOUS) != 0 && (addr &
                L3C_OFFSET) == 0 && addr + L3C_OFFSET <=
                va_next - 1) {
                if (!pmap_copy_l3c(dst_pmap, dst_pte, addr,
                    ptetemp, dstmpte, &lock))
                    goto out;
                addr += L3C_SIZE - PAGE_SIZE;
                src_pte += L3C_ENTRIES - 1;
            } else if (pmap_load(dst_pte) == 0 &&
                pmap_try_insert_pv_entry(dst_pmap, addr,
                PTE_TO_VM_PAGE(ptetemp), &lock)) {
                /*
                 * Clear the wired, contiguous, modified, and
                 * accessed bits from the destination PTE.
                 * The contiguous bit is cleared because we
                 * are not copying the entire L3C superpage.
                 */
                mask = ATTR_SW_WIRED | ATTR_CONTIGUOUS |
                    ATTR_AF;
                nbits = 0;
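                /*
                 * When the source uses the software dirty
                 * bit, the copy is made read-only so that
                 * its first write is trapped and tracked.
                 */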
                if ((ptetemp & ATTR_SW_DBM) != 0)
                    nbits |= ATTR_S1_AP_RW_BIT;
                pmap_store(dst_pte, (ptetemp & ~mask) | nbits);
                pmap_resident_count_inc(dst_pmap, 1);
            } else {
                pmap_abort_ptp(dst_pmap, addr, dstmpte);
                goto out;
            }
            /* Have we copied all of the valid mappings? */
            if (dstmpte->ref_count >= srcmpte->ref_count)
                break;
        }
    }
out:
    /*
     * XXX This barrier may not be needed because the destination pmap is
     * not active.
     */
    dsb(ishst);

    if (lock != NULL)
        rw_wunlock(lock);
    PMAP_UNLOCK(src_pmap);
    PMAP_UNLOCK(dst_pmap);
}

int
pmap_vmspace_copy(pmap_t dst_pmap, pmap_t src_pmap)
{
    int error;

    if (dst_pmap->pm_stage != src_pmap->pm_stage)
        return (EINVAL);

    if (dst_pmap->pm_stage != PM_STAGE1 || src_pmap->pm_bti == NULL)
        return (0);

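    /*
     * As in pmap_copy(), take the two pmap locks in address order to
     * avoid a deadlock with a copy in the opposite direction.
     */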
    for (;;) {
        if (dst_pmap < src_pmap) {
            PMAP_LOCK(dst_pmap);
            PMAP_LOCK(src_pmap);
        } else {
            PMAP_LOCK(src_pmap);
            PMAP_LOCK(dst_pmap);
        }
        error = pmap_bti_copy(dst_pmap, src_pmap);
        /* Clean up partial copy on failure due to no memory. */
        if (error == ENOMEM)
            pmap_bti_deassign_all(dst_pmap);
        PMAP_UNLOCK(src_pmap);
        PMAP_UNLOCK(dst_pmap);
        if (error != ENOMEM)
            break;
        vm_wait(NULL);
    }
    return (error);
}

/*
 * pmap_zero_page zeros the specified hardware page by mapping
 * the page into KVM and using bzero to clear its contents.
 */
void
pmap_zero_page(vm_page_t m)
{
    vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

    pagezero((void *)va);
}

/*
 * pmap_zero_page_area zeros the specified hardware page by mapping
 * the page into KVM and using bzero to clear its contents.
 *
 * off and size may not cover an area beyond a single hardware page.
 */
void
pmap_zero_page_area(vm_page_t m, int off, int size)
{
    vm_offset_t va = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m));

    if (off == 0 && size == PAGE_SIZE)
        pagezero((void *)va);
    else
        bzero((char *)va + off, size);
}

/*
 * pmap_copy_page copies the specified (machine independent)
 * page by mapping the page into virtual memory and using
 * bcopy to copy the page, one machine dependent page at a
 * time.
 */
void
pmap_copy_page(vm_page_t msrc, vm_page_t mdst)
{
    vm_offset_t src = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(msrc));
    vm_offset_t dst = PHYS_TO_DMAP(VM_PAGE_TO_PHYS(mdst));

    pagecopy((void *)src, (void *)dst);
}

int unmapped_buf_allowed = 1;

void
pmap_copy_pages(vm_page_t ma[], vm_offset_t a_offset, vm_page_t mb[],
    vm_offset_t b_offset, int xfersize)
{
    void *a_cp, *b_cp;
    vm_page_t m_a, m_b;
    vm_paddr_t p_a, p_b;
    vm_offset_t a_pg_offset, b_pg_offset;
    int cnt;

    while (xfersize > 0) {
        a_pg_offset = a_offset & PAGE_MASK;
        m_a = ma[a_offset >> PAGE_SHIFT];
        p_a = m_a->phys_addr;
        b_pg_offset = b_offset & PAGE_MASK;
        m_b = mb[b_offset >> PAGE_SHIFT];
        p_b = m_b->phys_addr;
        cnt = min(xfersize, PAGE_SIZE - a_pg_offset);
        cnt = min(cnt, PAGE_SIZE - b_pg_offset);
        if (__predict_false(!PHYS_IN_DMAP(p_a))) {
            panic("!DMAP a %lx", p_a);
        } else {
            a_cp = (char *)PHYS_TO_DMAP(p_a) + a_pg_offset;
        }
        if (__predict_false(!PHYS_IN_DMAP(p_b))) {
            panic("!DMAP b %lx", p_b);
        } else {
            b_cp = (char *)PHYS_TO_DMAP(p_b) + b_pg_offset;
        }
        bcopy(a_cp, b_cp, cnt);
        a_offset += cnt;
        b_offset += cnt;
        xfersize -= cnt;
    }
}

vm_offset_t
pmap_quick_enter_page(vm_page_t m)
{

    return (PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)));
}

void
pmap_quick_remove_page(vm_offset_t addr)
{
}

/*
 * Returns true if the pmap's pv is one of the first
 * 16 pvs linked to from this page.  This count may
 * be changed upwards or downwards in the future; it
 * is only necessary that true be returned for a small
 * subset of pmaps for proper page aging.
 */
bool
pmap_page_exists_quick(pmap_t pmap, vm_page_t m)
{
    struct md_page *pvh;
    struct rwlock *lock;
    pv_entry_t pv;
    int loops = 0;
    bool rv;

    KASSERT((m->oflags & VPO_UNMANAGED) == 0,
        ("pmap_page_exists_quick: page %p is not managed", m));
    rv = false;
    lock = VM_PAGE_TO_PV_LIST_LOCK(m);
    rw_rlock(lock);
    TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
        if (PV_PMAP(pv) == pmap) {
            rv = true;
            break;
        }
        loops++;
        if (loops >= 16)
            break;
    }
    if (!rv && loops < 16 && (m->flags & PG_FICTITIOUS) == 0) {
        pvh = page_to_pvh(m);
        TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
            if (PV_PMAP(pv) == pmap) {
                rv = true;
                break;
            }
            loops++;
            if (loops >= 16)
                break;
        }
    }
    rw_runlock(lock);
    return (rv);
}

/*
 * pmap_page_wired_mappings:
 *
 *	Return the number of managed mappings to the given physical page
 *	that are wired.
 */
int
pmap_page_wired_mappings(vm_page_t m)
{
    struct rwlock *lock;
    struct md_page *pvh;
    pmap_t pmap;
    pt_entry_t *pte;
    pv_entry_t pv;
    int count, md_gen, pvh_gen;

    if ((m->oflags & VPO_UNMANAGED) != 0)
        return (0);
    lock = VM_PAGE_TO_PV_LIST_LOCK(m);
    rw_rlock(lock);
restart:
    count = 0;
    TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
        pmap = PV_PMAP(pv);
        if (!PMAP_TRYLOCK(pmap)) {
            md_gen = m->md.pv_gen;
            rw_runlock(lock);
            PMAP_LOCK(pmap);
            rw_rlock(lock);
            if (md_gen != m->md.pv_gen) {
                PMAP_UNLOCK(pmap);
                goto restart;
            }
        }
        pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
        if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
            count++;
        PMAP_UNLOCK(pmap);
    }
    if ((m->flags & PG_FICTITIOUS) == 0) {
        pvh = page_to_pvh(m);
        TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
            pmap = PV_PMAP(pv);
            if (!PMAP_TRYLOCK(pmap)) {
                md_gen = m->md.pv_gen;
                pvh_gen = pvh->pv_gen;
                rw_runlock(lock);
                PMAP_LOCK(pmap);
                rw_rlock(lock);
                if (md_gen != m->md.pv_gen ||
                    pvh_gen != pvh->pv_gen) {
                    PMAP_UNLOCK(pmap);
                    goto restart;
                }
            }
            pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
            if ((pmap_load(pte) & ATTR_SW_WIRED) != 0)
                count++;
            PMAP_UNLOCK(pmap);
        }
    }
    rw_runlock(lock);
    return (count);
}

/*
 * Returns true if the given page is mapped individually or as part of
 * a 2mpage.  Otherwise, returns false.
 */
bool
pmap_page_is_mapped(vm_page_t m)
{
    struct rwlock *lock;
    bool rv;

    if ((m->oflags & VPO_UNMANAGED) != 0)
        return (false);
    lock = VM_PAGE_TO_PV_LIST_LOCK(m);
    rw_rlock(lock);
    rv = !TAILQ_EMPTY(&m->md.pv_list) ||
        ((m->flags & PG_FICTITIOUS) == 0 &&
        !TAILQ_EMPTY(&page_to_pvh(m)->pv_list));
    rw_runlock(lock);
    return (rv);
}

/*
 * Destroy all managed, non-wired mappings in the given user-space
 * pmap.  This pmap cannot be active on any processor besides the
 * caller.
 *
 * This function cannot be applied to the kernel pmap.  Moreover, it
 * is not intended for general use.  It is only to be used during
 * process termination.  Consequently, it can be implemented in ways
 * that make it faster than pmap_remove().  First, it can more quickly
 * destroy mappings by iterating over the pmap's collection of PV
 * entries, rather than searching the page table.  Second, it doesn't
 * have to test and clear the page table entries atomically, because
 * no processor is currently accessing the user address space.  In
 * particular, a page table entry's dirty bit won't change state once
 * this function starts.
 */
void
pmap_remove_pages(pmap_t pmap)
{
    pd_entry_t *pde;
    pt_entry_t *pte, tpte;
    struct spglist free;
    struct pv_chunklist free_chunks[PMAP_MEMDOM];
    vm_page_t m, ml3, mt;
    pv_entry_t pv;
    struct md_page *pvh;
    struct pv_chunk *pc, *npc;
    struct rwlock *lock;
    int64_t bit;
    uint64_t inuse, bitmask;
    int allfree, field, i, idx, lvl;
    int freed __pvused;
    vm_paddr_t pa;

    lock = NULL;

    for (i = 0; i < PMAP_MEMDOM; i++)
        TAILQ_INIT(&free_chunks[i]);
    SLIST_INIT(&free);
    PMAP_LOCK(pmap);
    TAILQ_FOREACH_SAFE(pc, &pmap->pm_pvchunk, pc_list, npc) {
        allfree = 1;
        freed = 0;
        for (field = 0; field < _NPCM; field++) {
            inuse = ~pc->pc_map[field] & pc_freemask[field];
            while (inuse != 0) {
                bit = ffsl(inuse) - 1;
                bitmask = 1UL << bit;
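                /*
                 * Convert the chunk bitmap position into an
                 * index into the chunk's array of PV entries.
                 */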
                idx = field * 64 + bit;
                pv = &pc->pc_pventry[idx];
                inuse &= ~bitmask;

                pde = pmap_pde(pmap, pv->pv_va, &lvl);
                KASSERT(pde != NULL,
                    ("Attempting to remove an unmapped page"));

                switch (lvl) {
                case 1:
                    pte = pmap_l1_to_l2(pde, pv->pv_va);
                    tpte = pmap_load(pte);
                    KASSERT((tpte & ATTR_DESCR_MASK) ==
                        L2_BLOCK,
                        ("Attempting to remove an invalid "
                        "block: %lx", tpte));
                    break;
                case 2:
                    pte = pmap_l2_to_l3(pde, pv->pv_va);
                    tpte = pmap_load(pte);
                    KASSERT((tpte & ATTR_DESCR_MASK) ==
                        L3_PAGE,
                        ("Attempting to remove an invalid "
                        "page: %lx", tpte));
                    break;
                default:
                    panic(
                        "Invalid page directory level: %d",
                        lvl);
                }

                /*
                 * We cannot remove wired mappings at this time.
                 *
                 * For L3C superpages, all of the constituent
                 * PTEs should have the wired bit set, so we
                 * don't check for ATTR_CONTIGUOUS here.
                 */
                if (tpte & ATTR_SW_WIRED) {
                    allfree = 0;
                    continue;
                }

                /* Mark free */
                pc->pc_map[field] |= bitmask;

                /*
                 * Because this pmap is not active on other
                 * processors, the dirty bit cannot have
                 * changed state since we last loaded pte.
                 */
                pmap_clear(pte);

                pa = PTE_TO_PHYS(tpte);

                m = PHYS_TO_VM_PAGE(pa);
                KASSERT(m->phys_addr == pa,
                    ("vm_page_t %p phys_addr mismatch %016jx %016jx",
                    m, (uintmax_t)m->phys_addr,
                    (uintmax_t)tpte));

                KASSERT((m->flags & PG_FICTITIOUS) != 0 ||
                    m < &vm_page_array[vm_page_array_size],
                    ("pmap_remove_pages: bad pte %#jx",
                    (uintmax_t)tpte));

                /*
                 * Update the vm_page_t clean/reference bits.
                 *
                 * We don't check for ATTR_CONTIGUOUS here
                 * because writeable L3C superpages are expected
                 * to be dirty, i.e., every constituent PTE
                 * should be dirty.
                 */
                if (pmap_pte_dirty(pmap, tpte)) {
                    switch (lvl) {
                    case 1:
                        for (mt = m; mt < &m[L2_SIZE /
                            PAGE_SIZE]; mt++)
                            vm_page_dirty(mt);
                        break;
                    case 2:
                        vm_page_dirty(m);
                        break;
                    }
                }

                CHANGE_PV_LIST_LOCK_TO_VM_PAGE(&lock, m);

                switch (lvl) {
                case 1:
                    pmap_resident_count_dec(pmap,
                        L2_SIZE / PAGE_SIZE);
                    pvh = page_to_pvh(m);
                    TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
                    pvh->pv_gen++;
                    if (TAILQ_EMPTY(&pvh->pv_list)) {
                        for (mt = m; mt < &m[L2_SIZE /
                            PAGE_SIZE]; mt++)
                            if ((mt->a.flags &
                                PGA_WRITEABLE) != 0 &&
                                TAILQ_EMPTY(
                                &mt->md.pv_list))
                                vm_page_aflag_clear(
                                    mt, PGA_WRITEABLE);
                    }
                    ml3 = pmap_remove_pt_page(pmap,
                        pv->pv_va);
                    if (ml3 != NULL) {
                        KASSERT(vm_page_any_valid(ml3),
                            ("pmap_remove_pages: l3 page not promoted"));
                        pmap_resident_count_dec(pmap, 1);
                        KASSERT(ml3->ref_count == NL3PG,
                            ("pmap_remove_pages: l3 page ref count error"));
                        ml3->ref_count = 0;
                        pmap_add_delayed_free_list(ml3,
                            &free, false);
                    }
                    break;
                case 2:
                    pmap_resident_count_dec(pmap, 1);
                    TAILQ_REMOVE(&m->md.pv_list, pv,
                        pv_next);
                    m->md.pv_gen++;
                    if ((m->a.flags & PGA_WRITEABLE) != 0 &&
                        TAILQ_EMPTY(&m->md.pv_list) &&
                        (m->flags & PG_FICTITIOUS) == 0) {
                        pvh = page_to_pvh(m);
                        if (TAILQ_EMPTY(&pvh->pv_list))
                            vm_page_aflag_clear(m,
                                PGA_WRITEABLE);
                    }
                    break;
                }
                pmap_unuse_pt(pmap, pv->pv_va, pmap_load(pde),
                    &free);
                freed++;
            }
        }
        PV_STAT(atomic_add_long(&pv_entry_frees, freed));
        PV_STAT(atomic_add_int(&pv_entry_spare, freed));
        PV_STAT(atomic_subtract_long(&pv_entry_count, freed));
        if (allfree) {
            TAILQ_REMOVE(&pmap->pm_pvchunk, pc, pc_list);
            TAILQ_INSERT_TAIL(&free_chunks[pc_to_domain(pc)], pc,
                pc_list);
        }
    }
    if (lock != NULL)
        rw_wunlock(lock);
    pmap_invalidate_all(pmap);
    pmap_bti_deassign_all(pmap);
    free_pv_chunk_batch(free_chunks);
    PMAP_UNLOCK(pmap);
    vm_page_free_pages_toq(&free, true);
}

/*
 * This is used to check if a page has been accessed or modified.
 */
static bool
pmap_page_test_mappings(vm_page_t m, bool accessed, bool modified)
{
    struct rwlock *lock;
    pv_entry_t pv;
    struct md_page *pvh;
    pt_entry_t l3e, mask, *pte, value;
    pmap_t pmap;
    int md_gen, pvh_gen;
    bool rv;

    rv = false;
    lock = VM_PAGE_TO_PV_LIST_LOCK(m);
    rw_rlock(lock);
restart:
    TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
        pmap = PV_PMAP(pv);
        PMAP_ASSERT_STAGE1(pmap);
        if (!PMAP_TRYLOCK(pmap)) {
            md_gen = m->md.pv_gen;
            rw_runlock(lock);
            PMAP_LOCK(pmap);
            rw_rlock(lock);
            if (md_gen != m->md.pv_gen) {
                PMAP_UNLOCK(pmap);
                goto restart;
            }
        }
        pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
        mask = 0;
        value = 0;
        if (modified) {
            mask |= ATTR_S1_AP_RW_BIT;
            value |= ATTR_S1_AP(ATTR_S1_AP_RW);
        }
        if (accessed) {
            mask |= ATTR_AF | ATTR_DESCR_MASK;
            value |= ATTR_AF | L3_PAGE;
        }
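        /*
         * For a contiguous (L3C) mapping, the checked attributes may
         * be spread across the constituent PTEs, so the combined view
         * is loaded below.
         */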
6963 l3e = pmap_load(pte);
6964 if ((l3e & ATTR_CONTIGUOUS) != 0)
6965 l3e = pmap_load_l3c(pte);
6966 PMAP_UNLOCK(pmap);
6967 rv = (l3e & mask) == value;
6968 if (rv)
6969 goto out;
6970 }
6971 if ((m->flags & PG_FICTITIOUS) == 0) {
6972 pvh = page_to_pvh(m);
6973 TAILQ_FOREACH(pv, &pvh->pv_list, pv_next) {
6974 pmap = PV_PMAP(pv);
6975 PMAP_ASSERT_STAGE1(pmap);
6976 if (!PMAP_TRYLOCK(pmap)) {
6977 md_gen = m->md.pv_gen;
6978 pvh_gen = pvh->pv_gen;
6979 rw_runlock(lock);
6980 PMAP_LOCK(pmap);
6981 rw_rlock(lock);
6982 if (md_gen != m->md.pv_gen ||
6983 pvh_gen != pvh->pv_gen) {
6984 PMAP_UNLOCK(pmap);
6985 goto restart;
6986 }
6987 }
6988 pte = pmap_pte_exists(pmap, pv->pv_va, 2, __func__);
6989 mask = 0;
6990 value = 0;
6991 if (modified) {
6992 mask |= ATTR_S1_AP_RW_BIT;
6993 value |= ATTR_S1_AP(ATTR_S1_AP_RW);
6994 }
6995 if (accessed) {
6996 mask |= ATTR_AF | ATTR_DESCR_MASK;
6997 value |= ATTR_AF | L2_BLOCK;
6998 }
6999 rv = (pmap_load(pte) & mask) == value;
7000 PMAP_UNLOCK(pmap);
7001 if (rv)
7002 goto out;
7003 }
7004 }
7005 out:
7006 rw_runlock(lock);
7007 return (rv);
7008 }
7009
7010 /*
7011 * pmap_is_modified:
7012 *
7013 * Return whether or not the specified physical page was modified
7014 * in any physical maps.
7015 */
7016 bool
7017 pmap_is_modified(vm_page_t m)
7018 {
7019
7020 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7021 ("pmap_is_modified: page %p is not managed", m));
7022
7023 /*
7024 * If the page is not busied then this check is racy.
7025 */
7026 if (!pmap_page_is_write_mapped(m))
7027 return (false);
7028 return (pmap_page_test_mappings(m, false, true));
7029 }
7030
7031 /*
7032 * pmap_is_prefaultable:
7033 *
7034 * Return whether or not the specified virtual address is eligible
7035 * for prefault.
7036 */
7037 bool
7038 pmap_is_prefaultable(pmap_t pmap, vm_offset_t addr)
7039 {
7040 pd_entry_t *pde;
7041 pt_entry_t *pte;
7042 bool rv;
7043 int lvl;
7044
7045 /*
7046 * Return true if and only if the L3 entry for the specified virtual
7047 * address is allocated but invalid.
7048 */
7049 rv = false;
7050 PMAP_LOCK(pmap);
7051 pde = pmap_pde(pmap, addr, &lvl);
7052 if (pde != NULL && lvl == 2) {
7053 pte = pmap_l2_to_l3(pde, addr);
7054 rv = pmap_load(pte) == 0;
7055 }
7056 PMAP_UNLOCK(pmap);
7057 return (rv);
7058 }
7059
7060 /*
7061 * pmap_is_referenced:
7062 *
7063 * Return whether or not the specified physical page was referenced
7064 * in any physical maps.
7065 */
7066 bool
7067 pmap_is_referenced(vm_page_t m)
7068 {
7069
7070 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7071 ("pmap_is_referenced: page %p is not managed", m));
7072 return (pmap_page_test_mappings(m, true, false));
7073 }
7074
7075 /*
7076 * Clear the write and modified bits in each of the given page's mappings.
7077 */
7078 void
7079 pmap_remove_write(vm_page_t m)
7080 {
7081 struct md_page *pvh;
7082 pmap_t pmap;
7083 struct rwlock *lock;
7084 pv_entry_t next_pv, pv;
7085 pt_entry_t oldpte, *pte, set, clear, mask, val;
7086 vm_offset_t va;
7087 int md_gen, pvh_gen;
7088
7089 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7090 ("pmap_remove_write: page %p is not managed", m));
7091 vm_page_assert_busied(m);
7092
7093 if (!pmap_page_is_write_mapped(m))
7094 return;
7095 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7096 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7097 rw_wlock(lock);
7098 retry:
7099 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7100 pmap = PV_PMAP(pv);
7101 PMAP_ASSERT_STAGE1(pmap);
7102 if (!PMAP_TRYLOCK(pmap)) {
7103 pvh_gen = pvh->pv_gen;
7104 rw_wunlock(lock);
7105 PMAP_LOCK(pmap);
7106 rw_wlock(lock);
7107 if (pvh_gen != pvh->pv_gen) {
7108 PMAP_UNLOCK(pmap);
7109 goto retry;
7110 }
7111 }
7112 va = pv->pv_va;
7113 pte = pmap_pte_exists(pmap, va, 2, __func__);
7114 if ((pmap_load(pte) & ATTR_SW_DBM) != 0)
7115 (void)pmap_demote_l2_locked(pmap, pte, va, &lock);
7116 KASSERT(lock == VM_PAGE_TO_PV_LIST_LOCK(m),
7117 ("inconsistent pv lock %p %p for page %p",
7118 lock, VM_PAGE_TO_PV_LIST_LOCK(m), m));
7119 PMAP_UNLOCK(pmap);
7120 }
7121 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7122 pmap = PV_PMAP(pv);
7123 if (!PMAP_TRYLOCK(pmap)) {
7124 pvh_gen = pvh->pv_gen;
7125 md_gen = m->md.pv_gen;
7126 rw_wunlock(lock);
7127 PMAP_LOCK(pmap);
7128 rw_wlock(lock);
7129 if (pvh_gen != pvh->pv_gen ||
7130 md_gen != m->md.pv_gen) {
7131 PMAP_UNLOCK(pmap);
7132 goto retry;
7133 }
7134 }
7135 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7136 oldpte = pmap_load(pte);
7137 if ((oldpte & ATTR_SW_DBM) != 0) {
7138 if ((oldpte & ATTR_CONTIGUOUS) != 0) {
7139 (void)pmap_demote_l3c(pmap, pte, pv->pv_va);
7140
7141 /*
7142 * The L3 entry's accessed bit may have
7143 * changed.
7144 */
7145 oldpte = pmap_load(pte);
7146 }
7147 if (pmap->pm_stage == PM_STAGE1) {
7148 set = ATTR_S1_AP_RW_BIT;
7149 clear = 0;
7150 mask = ATTR_S1_AP_RW_BIT;
7151 val = ATTR_S1_AP(ATTR_S1_AP_RW);
7152 } else {
7153 set = 0;
7154 clear = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7155 mask = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7156 val = ATTR_S2_S2AP(ATTR_S2_S2AP_WRITE);
7157 }
7158 clear |= ATTR_SW_DBM;
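/*
 * Atomically write-protect the entry: for stage 1, setting
 * ATTR_S1_AP_RW_BIT selects read-only; for stage 2, clearing the
 * S2AP write bit does.  ATTR_SW_DBM is cleared as well so the
 * entry cannot silently become writable again, while "mask" and
 * "val" record whether the old entry was writable, i.e., dirty.
 */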
7159 while (!atomic_fcmpset_64(pte, &oldpte,
7160 (oldpte | set) & ~clear))
7161 cpu_spinwait();
7162
7163 if ((oldpte & mask) == val)
7164 vm_page_dirty(m);
7165 pmap_invalidate_page(pmap, pv->pv_va, true);
7166 }
7167 PMAP_UNLOCK(pmap);
7168 }
7169 rw_wunlock(lock);
7170 vm_page_aflag_clear(m, PGA_WRITEABLE);
7171 }
7172
7173 /*
7174 * pmap_ts_referenced:
7175 *
7176 * Return a count of reference bits for a page, clearing those bits.
7177 * It is not necessary for every reference bit to be cleared, but it
7178 * is necessary that 0 only be returned when there are truly no
7179 * reference bits set.
7180 *
7181 * As an optimization, update the page's dirty field if a modified bit is
7182 * found while counting reference bits. This opportunistic update can be
7183 * performed at low cost and can eliminate the need for some future calls
7184 * to pmap_is_modified(). However, since this function stops after
7185 * finding PMAP_TS_REFERENCED_MAX reference bits, it may not detect some
7186 * dirty pages. Those dirty pages will only be detected by a future call
7187 * to pmap_is_modified().
7188 */
7189 int
7190 pmap_ts_referenced(vm_page_t m)
7191 {
7192 struct md_page *pvh;
7193 pv_entry_t pv, pvf;
7194 pmap_t pmap;
7195 struct rwlock *lock;
7196 pt_entry_t *pte, tpte;
7197 vm_offset_t va;
7198 vm_paddr_t pa;
7199 int cleared, md_gen, not_cleared, pvh_gen;
7200 struct spglist free;
7201
7202 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7203 ("pmap_ts_referenced: page %p is not managed", m));
7204 SLIST_INIT(&free);
7205 cleared = 0;
7206 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7207 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7208 rw_wlock(lock);
7209 retry:
7210 not_cleared = 0;
7211 if ((pvf = TAILQ_FIRST(&pvh->pv_list)) == NULL)
7212 goto small_mappings;
7213 pv = pvf;
7214 do {
7215 if (pvf == NULL)
7216 pvf = pv;
7217 pmap = PV_PMAP(pv);
7218 if (!PMAP_TRYLOCK(pmap)) {
7219 pvh_gen = pvh->pv_gen;
7220 rw_wunlock(lock);
7221 PMAP_LOCK(pmap);
7222 rw_wlock(lock);
7223 if (pvh_gen != pvh->pv_gen) {
7224 PMAP_UNLOCK(pmap);
7225 goto retry;
7226 }
7227 }
7228 va = pv->pv_va;
7229 pte = pmap_pte_exists(pmap, va, 2, __func__);
7230 tpte = pmap_load(pte);
7231 if (pmap_pte_dirty(pmap, tpte)) {
7232 /*
7233 * Although "tpte" is mapping a 2MB page, because
7234 * this function is called at a 4KB page granularity,
7235 * we only update the 4KB page under test.
7236 */
7237 vm_page_dirty(m);
7238 }
7239 if ((tpte & ATTR_AF) != 0) {
7240 pa = VM_PAGE_TO_PHYS(m);
7241
7242 /*
7243 * Since this reference bit is shared by 512 4KB pages,
7244 * it should not be cleared every time it is tested.
7245 * Apply a simple "hash" function on the physical page
7246 * number, the virtual superpage number, and the pmap
7247 * address to select one 4KB page out of the 512 on
7248 * which testing the reference bit will result in
7249 * clearing that reference bit. This function is
7250 * designed to avoid the selection of the same 4KB page
7251 * for every 2MB page mapping.
7252 *
7253 * On demotion, a mapping that hasn't been referenced
7254 * is simply destroyed. To avoid the possibility of a
7255 * subsequent page fault on a demoted wired mapping,
7256 * always leave its reference bit set. Moreover,
7257 * since the superpage is wired, the current state of
7258 * its reference bit won't affect page replacement.
7259 */
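/*
 * As "m" varies over the 512 constituent 4KB pages of a given
 * 2MB mapping, the low bits of (pa >> PAGE_SHIFT) take every
 * value exactly once, so exactly one of those pages hashes to
 * zero and triggers the clearing below.
 */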
7260 if ((((pa >> PAGE_SHIFT) ^ (va >> L2_SHIFT) ^
7261 (uintptr_t)pmap) & (Ln_ENTRIES - 1)) == 0 &&
7262 (tpte & ATTR_SW_WIRED) == 0) {
7263 pmap_clear_bits(pte, ATTR_AF);
7264 pmap_invalidate_page(pmap, va, true);
7265 cleared++;
7266 } else
7267 not_cleared++;
7268 }
7269 PMAP_UNLOCK(pmap);
7270 /* Rotate the PV list if it has more than one entry. */
7271 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7272 TAILQ_REMOVE(&pvh->pv_list, pv, pv_next);
7273 TAILQ_INSERT_TAIL(&pvh->pv_list, pv, pv_next);
7274 pvh->pv_gen++;
7275 }
7276 if (cleared + not_cleared >= PMAP_TS_REFERENCED_MAX)
7277 goto out;
7278 } while ((pv = TAILQ_FIRST(&pvh->pv_list)) != pvf);
7279 small_mappings:
7280 if ((pvf = TAILQ_FIRST(&m->md.pv_list)) == NULL)
7281 goto out;
7282 pv = pvf;
7283 do {
7284 if (pvf == NULL)
7285 pvf = pv;
7286 pmap = PV_PMAP(pv);
7287 if (!PMAP_TRYLOCK(pmap)) {
7288 pvh_gen = pvh->pv_gen;
7289 md_gen = m->md.pv_gen;
7290 rw_wunlock(lock);
7291 PMAP_LOCK(pmap);
7292 rw_wlock(lock);
7293 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7294 PMAP_UNLOCK(pmap);
7295 goto retry;
7296 }
7297 }
7298 pte = pmap_pte_exists(pmap, pv->pv_va, 3, __func__);
7299 tpte = pmap_load(pte);
7300 if (pmap_pte_dirty(pmap, tpte))
7301 vm_page_dirty(m);
7302 if ((tpte & ATTR_AF) != 0) {
7303 if ((tpte & ATTR_SW_WIRED) == 0) {
7304 /*
7305 * Clear the accessed bit in this L3 entry
7306 * regardless of the contiguous bit.
7307 */
7308 pmap_clear_bits(pte, ATTR_AF);
7309 pmap_invalidate_page(pmap, pv->pv_va, true);
7310 cleared++;
7311 } else
7312 not_cleared++;
7313 } else if ((tpte & ATTR_CONTIGUOUS) != 0 &&
7314 (pmap_load_l3c(pte) & ATTR_AF) != 0) {
7315 /*
7316 * An L3C superpage mapping is regarded as accessed
7317 * until the accessed bit has been cleared in all
7318 * of its constituent entries.
7319 */
7320 not_cleared++;
7321 }
7322 PMAP_UNLOCK(pmap);
7323 /* Rotate the PV list if it has more than one entry. */
7324 if (TAILQ_NEXT(pv, pv_next) != NULL) {
7325 TAILQ_REMOVE(&m->md.pv_list, pv, pv_next);
7326 TAILQ_INSERT_TAIL(&m->md.pv_list, pv, pv_next);
7327 m->md.pv_gen++;
7328 }
7329 } while ((pv = TAILQ_FIRST(&m->md.pv_list)) != pvf && cleared +
7330 not_cleared < PMAP_TS_REFERENCED_MAX);
7331 out:
7332 rw_wunlock(lock);
7333 vm_page_free_pages_toq(&free, true);
7334 return (cleared + not_cleared);
7335 }
7336
7337 /*
7338 * Apply the given advice to the specified range of addresses within the
7339 * given pmap. Depending on the advice, clear the referenced and/or
7340 * modified flags in each mapping and set the mapped page's dirty field.
7341 */
7342 void
7343 pmap_advise(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, int advice)
7344 {
7345 struct rwlock *lock;
7346 vm_offset_t va, va_next, dva;
7347 vm_page_t m;
7348 pd_entry_t *l0, *l1, *l2, oldl2;
7349 pt_entry_t *l3, *dl3, oldl3;
7350
7351 PMAP_ASSERT_STAGE1(pmap);
7352
7353 if (advice != MADV_DONTNEED && advice != MADV_FREE)
7354 return;
7355
7356 PMAP_LOCK(pmap);
7357 for (; sva < eva; sva = va_next) {
7358 l0 = pmap_l0(pmap, sva);
7359 if (pmap_load(l0) == 0) {
7360 va_next = (sva + L0_SIZE) & ~L0_OFFSET;
7361 if (va_next < sva)
7362 va_next = eva;
7363 continue;
7364 }
7365
7366 va_next = (sva + L1_SIZE) & ~L1_OFFSET;
7367 if (va_next < sva)
7368 va_next = eva;
7369 l1 = pmap_l0_to_l1(l0, sva);
7370 if (pmap_load(l1) == 0)
7371 continue;
7372 if ((pmap_load(l1) & ATTR_DESCR_MASK) == L1_BLOCK) {
7373 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7374 continue;
7375 }
7376
7377 va_next = (sva + L2_SIZE) & ~L2_OFFSET;
7378 if (va_next < sva)
7379 va_next = eva;
7380 l2 = pmap_l1_to_l2(l1, sva);
7381 oldl2 = pmap_load(l2);
7382 if (oldl2 == 0)
7383 continue;
7384 if ((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK) {
7385 if ((oldl2 & ATTR_SW_MANAGED) == 0)
7386 continue;
7387 lock = NULL;
7388 if (!pmap_demote_l2_locked(pmap, l2, sva, &lock)) {
7389 if (lock != NULL)
7390 rw_wunlock(lock);
7391
7392 /*
7393 * The 2MB page mapping was destroyed.
7394 */
7395 continue;
7396 }
7397
7398 /*
7399 * Unless the page mappings are wired, remove the
7400 * mapping to a single page so that a subsequent
7401 * access may repromote. Choosing the last page
7402 * within the address range [sva, min(va_next, eva))
7403 * generally results in more repromotions. Since the
7404 * underlying page table page is fully populated, this
7405 * removal never frees a page table page.
7406 */
7407 if ((oldl2 & ATTR_SW_WIRED) == 0) {
7408 va = eva;
7409 if (va > va_next)
7410 va = va_next;
7411 va -= PAGE_SIZE;
7412 KASSERT(va >= sva,
7413 ("pmap_advise: no address gap"));
7414 l3 = pmap_l2_to_l3(l2, va);
7415 KASSERT(pmap_load(l3) != 0,
7416 ("pmap_advise: invalid PTE"));
7417 pmap_remove_l3(pmap, l3, va, pmap_load(l2),
7418 NULL, &lock);
7419 }
7420 if (lock != NULL)
7421 rw_wunlock(lock);
7422 }
7423 KASSERT((pmap_load(l2) & ATTR_DESCR_MASK) == L2_TABLE,
7424 ("pmap_advise: invalid L2 entry after demotion"));
7425 if (va_next > eva)
7426 va_next = eva;
7427 va = va_next;
7428 for (l3 = pmap_l2_to_l3(l2, sva); sva != va_next; l3++,
7429 sva += L3_SIZE) {
7430 oldl3 = pmap_load(l3);
7431 if ((oldl3 & (ATTR_SW_MANAGED | ATTR_DESCR_MASK)) !=
7432 (ATTR_SW_MANAGED | L3_PAGE))
7433 goto maybe_invlrng;
7434 else if (pmap_pte_dirty(pmap, oldl3)) {
7435 if (advice == MADV_DONTNEED) {
7436 /*
7437 * Future calls to pmap_is_modified()
7438 * can be avoided by making the page
7439 * dirty now.
7440 */
7441 m = PTE_TO_VM_PAGE(oldl3);
7442 vm_page_dirty(m);
7443 }
7444 if ((oldl3 & ATTR_CONTIGUOUS) != 0) {
7445 /*
7446 * Unconditionally demote the L3C
7447 * superpage because we do not allow
7448 * writeable, clean superpages.
7449 */
7450 (void)pmap_demote_l3c(pmap, l3, sva);
7451
7452 /*
7453 * Destroy the final mapping before the
7454 * next L3C boundary or va_next,
7455 * whichever comes first, so that a
7456 * subsequent access may act as a
7457 * repromotion trigger.
7458 */
7459 if ((oldl3 & ATTR_SW_WIRED) == 0) {
7460 dva = MIN((sva & ~L3C_OFFSET) +
7461 L3C_SIZE - PAGE_SIZE,
7462 va_next - PAGE_SIZE);
7463 dl3 = pmap_l2_to_l3(l2, dva);
7464 KASSERT(pmap_load(dl3) != 0,
7465 ("pmap_advise: invalid PTE"));
7466 lock = NULL;
7467 pmap_remove_l3(pmap, dl3, dva,
7468 pmap_load(l2), NULL, &lock);
7469 if (lock != NULL)
7470 rw_wunlock(lock);
7471 }
7472
7473 /*
7474 * The L3 entry's accessed bit may have
7475 * changed.
7476 */
7477 oldl3 = pmap_load(l3);
7478 }
7479
7480 /*
7481 * Check that we did not just destroy this entry, so that
7482 * we avoid corrupting the page table.
7483 */
7484 if (oldl3 != 0) {
7485 while (!atomic_fcmpset_long(l3, &oldl3,
7486 (oldl3 & ~ATTR_AF) |
7487 ATTR_S1_AP(ATTR_S1_AP_RO)))
7488 cpu_spinwait();
7489 }
7490 } else if ((oldl3 & ATTR_AF) != 0) {
7491 /*
7492 * Clear the accessed bit in this L3 entry
7493 * regardless of the contiguous bit.
7494 */
7495 pmap_clear_bits(l3, ATTR_AF);
7496 } else
7497 goto maybe_invlrng;
7498 if (va == va_next)
7499 va = sva;
7500 continue;
7501 maybe_invlrng:
7502 if (va != va_next) {
7503 pmap_s1_invalidate_range(pmap, va, sva, true);
7504 va = va_next;
7505 }
7506 }
7507 if (va != va_next)
7508 pmap_s1_invalidate_range(pmap, va, sva, true);
7509 }
7510 PMAP_UNLOCK(pmap);
7511 }
7512
7513 /*
7514 * Clear the modify bits on the specified physical page.
7515 */
7516 void
7517 pmap_clear_modify(vm_page_t m)
7518 {
7519 struct md_page *pvh;
7520 struct rwlock *lock;
7521 pmap_t pmap;
7522 pv_entry_t next_pv, pv;
7523 pd_entry_t *l2, oldl2;
7524 pt_entry_t *l3, oldl3;
7525 vm_offset_t va;
7526 int md_gen, pvh_gen;
7527
7528 KASSERT((m->oflags & VPO_UNMANAGED) == 0,
7529 ("pmap_clear_modify: page %p is not managed", m));
7530 vm_page_assert_busied(m);
7531
7532 if (!pmap_page_is_write_mapped(m))
7533 return;
7534 pvh = (m->flags & PG_FICTITIOUS) != 0 ? &pv_dummy : page_to_pvh(m);
7535 lock = VM_PAGE_TO_PV_LIST_LOCK(m);
7536 rw_wlock(lock);
7537 restart:
7538 TAILQ_FOREACH_SAFE(pv, &pvh->pv_list, pv_next, next_pv) {
7539 pmap = PV_PMAP(pv);
7540 PMAP_ASSERT_STAGE1(pmap);
7541 if (!PMAP_TRYLOCK(pmap)) {
7542 pvh_gen = pvh->pv_gen;
7543 rw_wunlock(lock);
7544 PMAP_LOCK(pmap);
7545 rw_wlock(lock);
7546 if (pvh_gen != pvh->pv_gen) {
7547 PMAP_UNLOCK(pmap);
7548 goto restart;
7549 }
7550 }
7551 va = pv->pv_va;
7552 l2 = pmap_l2(pmap, va);
7553 oldl2 = pmap_load(l2);
7554 /* If oldl2 has ATTR_SW_DBM set, then it is also dirty. */
7555 if ((oldl2 & ATTR_SW_DBM) != 0 &&
7556 pmap_demote_l2_locked(pmap, l2, va, &lock) &&
7557 (oldl2 & ATTR_SW_WIRED) == 0) {
7558 /*
7559 * Write protect the mapping to a single page so that
7560 * a subsequent write access may repromote.
7561 */
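/* Locate the 4KB page backing "m" within the demoted 2MB range. */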
7562 va += VM_PAGE_TO_PHYS(m) - PTE_TO_PHYS(oldl2);
7563 l3 = pmap_l2_to_l3(l2, va);
7564 oldl3 = pmap_load(l3);
7565 while (!atomic_fcmpset_long(l3, &oldl3,
7566 (oldl3 & ~ATTR_SW_DBM) | ATTR_S1_AP(ATTR_S1_AP_RO)))
7567 cpu_spinwait();
7568 vm_page_dirty(m);
7569 pmap_s1_invalidate_page(pmap, va, true);
7570 }
7571 PMAP_UNLOCK(pmap);
7572 }
7573 TAILQ_FOREACH(pv, &m->md.pv_list, pv_next) {
7574 pmap = PV_PMAP(pv);
7575 PMAP_ASSERT_STAGE1(pmap);
7576 if (!PMAP_TRYLOCK(pmap)) {
7577 md_gen = m->md.pv_gen;
7578 pvh_gen = pvh->pv_gen;
7579 rw_wunlock(lock);
7580 PMAP_LOCK(pmap);
7581 rw_wlock(lock);
7582 if (pvh_gen != pvh->pv_gen || md_gen != m->md.pv_gen) {
7583 PMAP_UNLOCK(pmap);
7584 goto restart;
7585 }
7586 }
7587 l2 = pmap_l2(pmap, pv->pv_va);
7588 l3 = pmap_l2_to_l3(l2, pv->pv_va);
7589 oldl3 = pmap_load(l3);
7590 KASSERT((oldl3 & ATTR_CONTIGUOUS) == 0 ||
7591 (oldl3 & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
7592 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
7593 ("writeable L3C superpage not dirty"));
7594 if ((oldl3 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) == ATTR_SW_DBM) {
7595 if ((oldl3 & ATTR_CONTIGUOUS) != 0)
7596 (void)pmap_demote_l3c(pmap, l3, pv->pv_va);
7597 pmap_set_bits(l3, ATTR_S1_AP(ATTR_S1_AP_RO));
7598 pmap_s1_invalidate_page(pmap, pv->pv_va, true);
7599 }
7600 PMAP_UNLOCK(pmap);
7601 }
7602 rw_wunlock(lock);
7603 }
7604
7605 void *
7606 pmap_mapbios(vm_paddr_t pa, vm_size_t size)
7607 {
7608 struct pmap_preinit_mapping *ppim;
7609 vm_offset_t va, offset;
7610 pd_entry_t old_l2e, *pde;
7611 pt_entry_t *l2;
7612 int i, lvl, l2_blocks, free_l2_count, start_idx;
7613
7614 if (!vm_initialized) {
7615 /*
7616 * There are no L3 page tables yet, so map entire L2 blocks, where
7617 * the start VA is preinit_map_va + start_idx * L2_SIZE.
7618 * There may be duplicate mappings (multiple VAs -> same PA), but
7619 * the ARM64 dcache is always PIPT, so that is acceptable.
7620 */
7621 if (size == 0)
7622 return (NULL);
7623
7624 /* Calculate how many L2 blocks are needed for the mapping */
7625 l2_blocks = (roundup2(pa + size, L2_SIZE) -
7626 rounddown2(pa, L2_SIZE)) >> L2_SHIFT;
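/*
 * For example (hypothetical values), pa = 0x8ff00000 and
 * size = 0x300000 span rounddown2(pa, L2_SIZE) = 0x8fe00000
 * up to roundup2(pa + size, L2_SIZE) = 0x90200000, so
 * l2_blocks = 0x400000 >> L2_SHIFT = 2.
 */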
7627
7628 offset = pa & L2_OFFSET;
7629
7630 if (preinit_map_va == 0)
7631 return (NULL);
7632
7633 /* Map 2MiB L2 blocks from reserved VA space */
7634
7635 free_l2_count = 0;
7636 start_idx = -1;
7637 /* Find enough free contiguous VA space */
7638 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7639 ppim = pmap_preinit_mapping + i;
7640 if (free_l2_count > 0 && ppim->pa != 0) {
7641 /* Not enough space here */
7642 free_l2_count = 0;
7643 start_idx = -1;
7644 continue;
7645 }
7646
7647 if (ppim->pa == 0) {
7648 /* Free L2 block */
7649 if (start_idx == -1)
7650 start_idx = i;
7651 free_l2_count++;
7652 if (free_l2_count == l2_blocks)
7653 break;
7654 }
7655 }
7656 if (free_l2_count != l2_blocks)
7657 panic("%s: too many preinit mappings", __func__);
7658
7659 va = preinit_map_va + (start_idx * L2_SIZE);
7660 for (i = start_idx; i < start_idx + l2_blocks; i++) {
7661 /* Mark entries as allocated */
7662 ppim = pmap_preinit_mapping + i;
7663 ppim->pa = pa;
7664 ppim->va = va + offset;
7665 ppim->size = size;
7666 }
7667
7668 /* Map L2 blocks */
7669 pa = rounddown2(pa, L2_SIZE);
7670 old_l2e = 0;
7671 for (i = 0; i < l2_blocks; i++) {
7672 pde = pmap_pde(kernel_pmap, va, &lvl);
7673 KASSERT(pde != NULL,
7674 ("pmap_mapbios: Invalid page entry, va: 0x%lx",
7675 va));
7676 KASSERT(lvl == 1,
7677 ("pmap_mapbios: Invalid level %d", lvl));
7678
7679 /* Insert L2_BLOCK */
7680 l2 = pmap_l1_to_l2(pde, va);
7681 old_l2e |= pmap_load_store(l2,
7682 PHYS_TO_PTE(pa) | ATTR_DEFAULT | ATTR_S1_XN |
7683 ATTR_KERN_GP | ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK) |
7684 L2_BLOCK);
7685
7686 va += L2_SIZE;
7687 pa += L2_SIZE;
7688 }
7689 if ((old_l2e & ATTR_DESCR_VALID) != 0)
7690 pmap_s1_invalidate_all(kernel_pmap);
7691 else {
7692 /*
7693 * Because the old entries were invalid and the new
7694 * mappings are not executable, an isb is not required.
7695 */
7696 dsb(ishst);
7697 }
7698
7699 va = preinit_map_va + (start_idx * L2_SIZE);
7700
7701 } else {
7702 /* kva_alloc may be used to map the pages */
7703 offset = pa & PAGE_MASK;
7704 size = round_page(offset + size);
7705
7706 va = kva_alloc(size);
7707 if (va == 0)
7708 panic("%s: Couldn't allocate KVA", __func__);
7709
7710 pde = pmap_pde(kernel_pmap, va, &lvl);
7711 KASSERT(lvl == 2, ("pmap_mapbios: Invalid level %d", lvl));
7712
7713 /* L3 table is linked */
7714 va = trunc_page(va);
7715 pa = trunc_page(pa);
7716 pmap_kenter(va, size, pa, memory_mapping_mode(pa));
7717 }
7718
7719 return ((void *)(va + offset));
7720 }
7721
7722 void
7723 pmap_unmapbios(void *p, vm_size_t size)
7724 {
7725 struct pmap_preinit_mapping *ppim;
7726 vm_offset_t offset, va, va_trunc;
7727 pd_entry_t *pde;
7728 pt_entry_t *l2;
7729 int i, lvl, l2_blocks, block;
7730 bool preinit_map;
7731
7732 va = (vm_offset_t)p;
7733 l2_blocks =
7734 (roundup2(va + size, L2_SIZE) - rounddown2(va, L2_SIZE)) >> L2_SHIFT;
7735 KASSERT(l2_blocks > 0, ("pmap_unmapbios: invalid size %lx", size));
7736
7737 /* Remove preinit mapping */
7738 preinit_map = false;
7739 block = 0;
7740 for (i = 0; i < PMAP_PREINIT_MAPPING_COUNT; i++) {
7741 ppim = pmap_preinit_mapping + i;
7742 if (ppim->va == va) {
7743 KASSERT(ppim->size == size,
7744 ("pmap_unmapbios: size mismatch"));
7745 ppim->va = 0;
7746 ppim->pa = 0;
7747 ppim->size = 0;
7748 preinit_map = true;
7749 offset = block * L2_SIZE;
7750 va_trunc = rounddown2(va, L2_SIZE) + offset;
7751
7752 /* Remove L2_BLOCK */
7753 pde = pmap_pde(kernel_pmap, va_trunc, &lvl);
7754 KASSERT(pde != NULL,
7755 ("pmap_unmapbios: Invalid page entry, va: 0x%lx",
7756 va_trunc));
7757 l2 = pmap_l1_to_l2(pde, va_trunc);
7758 pmap_clear(l2);
7759
7760 if (block == (l2_blocks - 1))
7761 break;
7762 block++;
7763 }
7764 }
7765 if (preinit_map) {
7766 pmap_s1_invalidate_all(kernel_pmap);
7767 return;
7768 }
7769
7770 /* Unmap the pages reserved with kva_alloc. */
7771 if (vm_initialized) {
7772 offset = va & PAGE_MASK;
7773 size = round_page(offset + size);
7774 va = trunc_page(va);
7775
7776 /* Unmap and invalidate the pages */
7777 pmap_kremove_device(va, size);
7778
7779 kva_free(va, size);
7780 }
7781 }
7782
7783 /*
7784 * Sets the memory attribute for the specified page.
7785 */
7786 void
7787 pmap_page_set_memattr(vm_page_t m, vm_memattr_t ma)
7788 {
7789
7790 m->md.pv_memattr = ma;
7791
7792 /*
7793 * If "m" is a normal page, update its direct mapping. This update
7794 * can be relied upon to perform any cache operations that are
7795 * required for data coherence.
7796 */
7797 if ((m->flags & PG_FICTITIOUS) == 0 &&
7798 pmap_change_attr(PHYS_TO_DMAP(VM_PAGE_TO_PHYS(m)), PAGE_SIZE,
7799 m->md.pv_memattr) != 0)
7800 panic("memory attribute change on the direct map failed");
7801 }
7802
7803 /*
7804 * Changes the specified virtual address range's memory type to that given by
7805 * the parameter "mode". The specified virtual address range must be
7806 * completely contained within either the direct map or the kernel map. If
7807 * the virtual address range is contained within the kernel map, then the
7808 * memory type for each of the corresponding ranges of the direct map is also
7809 * changed. (The corresponding ranges of the direct map are those ranges that
7810 * map the same physical pages as the specified virtual address range.) These
7811 * changes to the direct map are necessary because Intel describes the
7812 * behavior of their processors as "undefined" if two or more mappings to the
7813 * same physical page have different memory types.
7814 *
7815 * Returns zero if the change completed successfully, and either EINVAL or
7816 * ENOMEM if the change failed. Specifically, EINVAL is returned if some part
7817 * of the virtual address range was not mapped, and ENOMEM is returned if
7818 * there was insufficient memory available to complete the change. In the
7819 * latter case, the memory type may have been changed on some part of the
7820 * virtual address range or the direct map.
7821 */
7822 int
7823 pmap_change_attr(vm_offset_t va, vm_size_t size, int mode)
7824 {
7825 int error;
7826
7827 PMAP_LOCK(kernel_pmap);
7828 error = pmap_change_props_locked(va, size, PROT_NONE, mode, false);
7829 PMAP_UNLOCK(kernel_pmap);
7830 return (error);
7831 }
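
/*
 * Example usage (a sketch, not code from this file): a driver that has
 * mapped a device buffer at KVA "va" for "len" bytes (both hypothetical)
 * might make the range uncacheable with
 *
 *	error = pmap_change_attr(va, len, VM_MEMATTR_UNCACHEABLE);
 *
 * On an ENOMEM failure the memory type may have been changed on only
 * part of the range, as described above.
 */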
7832
7833 /*
7834 * Changes the specified virtual address range's protections to those
7835 * specified by "prot". Like pmap_change_attr(), protections for aliases
7836 * in the direct map are updated as well. Protections on aliasing mappings may
7837 * be a subset of the requested protections; for example, mappings in the direct
7838 * map are never executable.
7839 */
7840 int
7841 pmap_change_prot(vm_offset_t va, vm_size_t size, vm_prot_t prot)
7842 {
7843 int error;
7844
7845 /* Only supported within the kernel map. */
7846 if (va < VM_MIN_KERNEL_ADDRESS)
7847 return (EINVAL);
7848
7849 PMAP_LOCK(kernel_pmap);
7850 error = pmap_change_props_locked(va, size, prot, -1, false);
7851 PMAP_UNLOCK(kernel_pmap);
7852 return (error);
7853 }
7854
7855 static int
7856 pmap_change_props_locked(vm_offset_t va, vm_size_t size, vm_prot_t prot,
7857 int mode, bool skip_unmapped)
7858 {
7859 vm_offset_t base, offset, tmpva;
7860 vm_size_t pte_size;
7861 vm_paddr_t pa;
7862 pt_entry_t pte, *ptep, *newpte;
7863 pt_entry_t bits, mask;
7864 int lvl, rv;
7865
7866 PMAP_LOCK_ASSERT(kernel_pmap, MA_OWNED);
7867 base = trunc_page(va);
7868 offset = va & PAGE_MASK;
7869 size = round_page(offset + size);
7870
7871 if (!VIRT_IN_DMAP(base) &&
7872 !(base >= VM_MIN_KERNEL_ADDRESS && base < VM_MAX_KERNEL_ADDRESS))
7873 return (EINVAL);
7874
7875 bits = 0;
7876 mask = 0;
7877 if (mode != -1) {
7878 bits = ATTR_S1_IDX(mode);
7879 mask = ATTR_S1_IDX_MASK;
7880 if (mode == VM_MEMATTR_DEVICE) {
7881 mask |= ATTR_S1_XN;
7882 bits |= ATTR_S1_XN;
7883 }
7884 }
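/*
 * For example, mode == VM_MEMATTR_DEVICE yields bits =
 * ATTR_S1_IDX(VM_MEMATTR_DEVICE) | ATTR_S1_XN with a matching mask,
 * so device mappings are also made non-executable.
 */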
7885 if (prot != VM_PROT_NONE) {
7886 /* Don't mark the DMAP as executable. It never is on arm64. */
7887 if (VIRT_IN_DMAP(base)) {
7888 prot &= ~VM_PROT_EXECUTE;
7889 /*
7890 * XXX Mark the DMAP as writable for now. We rely
7891 * on this in ddb & dtrace to insert breakpoint
7892 * instructions.
7893 */
7894 prot |= VM_PROT_WRITE;
7895 }
7896
7897 if ((prot & VM_PROT_WRITE) == 0) {
7898 bits |= ATTR_S1_AP(ATTR_S1_AP_RO);
7899 }
7900 if ((prot & VM_PROT_EXECUTE) == 0) {
7901 bits |= ATTR_S1_PXN;
7902 }
7903 bits |= ATTR_S1_UXN;
7904 mask |= ATTR_S1_AP_MASK | ATTR_S1_XN;
7905 }
7906
7907 for (tmpva = base; tmpva < base + size; ) {
7908 ptep = pmap_pte(kernel_pmap, tmpva, &lvl);
7909 if (ptep == NULL && !skip_unmapped) {
7910 return (EINVAL);
7911 } else if ((ptep == NULL && skip_unmapped) ||
7912 (pmap_load(ptep) & mask) == bits) {
7913 /*
7914 * We already have the correct attribute or there
7915 * is no memory mapped at this address and we are
7916 * skipping unmapped memory.
7917 */
7918 switch (lvl) {
7919 default:
7920 panic("Invalid DMAP table level: %d\n", lvl);
7921 case 1:
7922 tmpva = (tmpva & ~L1_OFFSET) + L1_SIZE;
7923 break;
7924 case 2:
7925 tmpva = (tmpva & ~L2_OFFSET) + L2_SIZE;
7926 break;
7927 case 3:
7928 tmpva += PAGE_SIZE;
7929 break;
7930 }
7931 } else {
7932 /* We can't demote/promote this entry */
7933 MPASS((pmap_load(ptep) & ATTR_SW_NO_PROMOTE) == 0);
7934
7935 /*
7936 * Find the entry and demote it if the requested change
7937 * only applies to part of the address range mapped by
7938 * the entry.
7939 */
7940 switch (lvl) {
7941 default:
7942 panic("Invalid DMAP table level: %d\n", lvl);
7943 case 1:
7944 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
7945 if ((tmpva & L1_OFFSET) == 0 &&
7946 (base + size - tmpva) >= L1_SIZE) {
7947 pte_size = L1_SIZE;
7948 break;
7949 }
7950 newpte = pmap_demote_l1(kernel_pmap, ptep,
7951 tmpva & ~L1_OFFSET);
7952 if (newpte == NULL)
7953 return (EINVAL);
7954 ptep = pmap_l1_to_l2(ptep, tmpva);
7955 /* FALLTHROUGH */
7956 case 2:
7957 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
7958 if ((tmpva & L2C_OFFSET) == 0 &&
7959 (base + size - tmpva) >= L2C_SIZE) {
7960 pte_size = L2C_SIZE;
7961 break;
7962 }
7963 if (!pmap_demote_l2c(kernel_pmap, ptep,
7964 tmpva))
7965 return (EINVAL);
7966 }
7967 if ((tmpva & L2_OFFSET) == 0 &&
7968 (base + size - tmpva) >= L2_SIZE) {
7969 pte_size = L2_SIZE;
7970 break;
7971 }
7972 newpte = pmap_demote_l2(kernel_pmap, ptep,
7973 tmpva);
7974 if (newpte == NULL)
7975 return (EINVAL);
7976 ptep = pmap_l2_to_l3(ptep, tmpva);
7977 /* FALLTHROUGH */
7978 case 3:
7979 if ((pmap_load(ptep) & ATTR_CONTIGUOUS) != 0) {
7980 if ((tmpva & L3C_OFFSET) == 0 &&
7981 (base + size - tmpva) >= L3C_SIZE) {
7982 pte_size = L3C_SIZE;
7983 break;
7984 }
7985 if (!pmap_demote_l3c(kernel_pmap, ptep,
7986 tmpva))
7987 return (EINVAL);
7988 }
7989 pte_size = PAGE_SIZE;
7990 break;
7991 }
7992
7993 /* Update the entry */
7994 pte = pmap_load(ptep);
7995 pte &= ~mask;
7996 pte |= bits;
7997
7998 switch (pte_size) {
7999 case L2C_SIZE:
8000 pmap_update_strided(kernel_pmap, ptep, ptep +
8001 L2C_ENTRIES, pte, tmpva, L2_SIZE, L2C_SIZE);
8002 break;
8003 case L3C_SIZE:
8004 pmap_update_strided(kernel_pmap, ptep, ptep +
8005 L3C_ENTRIES, pte, tmpva, L3_SIZE, L3C_SIZE);
8006 break;
8007 default:
8008 /*
8009 * We are updating a single block or page entry,
8010 * so regardless of pte_size pass PAGE_SIZE in
8011 * order that a single TLB invalidation is
8012 * performed.
8013 */
8014 pmap_update_entry(kernel_pmap, ptep, pte, tmpva,
8015 PAGE_SIZE);
8016 break;
8017 }
8018
8019 pa = PTE_TO_PHYS(pte);
8020 if (!VIRT_IN_DMAP(tmpva) && PHYS_IN_DMAP(pa)) {
8021 /*
8022 * Keep the DMAP memory in sync.
8023 */
8024 rv = pmap_change_props_locked(
8025 PHYS_TO_DMAP(pa), pte_size,
8026 prot, mode, true);
8027 if (rv != 0)
8028 return (rv);
8029 }
8030
8031 /*
8032 * If moving to a non-cacheable entry flush
8033 * the cache.
8034 */
8035 if (mode == VM_MEMATTR_UNCACHEABLE)
8036 cpu_dcache_wbinv_range((void *)tmpva, pte_size);
8037 tmpva += pte_size;
8038 }
8039 }
8040
8041 return (0);
8042 }
8043
8044 /*
8045 * Create an L2 table to map all addresses within an L1 mapping.
8046 */
8047 static pt_entry_t *
8048 pmap_demote_l1(pmap_t pmap, pt_entry_t *l1, vm_offset_t va)
8049 {
8050 pt_entry_t *l2, newl2, oldl1;
8051 vm_offset_t tmpl1;
8052 vm_paddr_t l2phys, phys;
8053 vm_page_t ml2;
8054 int i;
8055
8056 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8057 oldl1 = pmap_load(l1);
8058 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
8059 KASSERT((oldl1 & ATTR_DESCR_MASK) == L1_BLOCK,
8060 ("pmap_demote_l1: Demoting a non-block entry"));
8061 KASSERT((va & L1_OFFSET) == 0,
8062 ("pmap_demote_l1: Invalid virtual address %#lx", va));
8063 KASSERT((oldl1 & ATTR_SW_MANAGED) == 0,
8064 ("pmap_demote_l1: Level 1 table shouldn't be managed"));
8065 KASSERT((oldl1 & ATTR_SW_NO_PROMOTE) == 0,
8066 ("pmap_demote_l1: Demoting entry with no-demote flag set"));
8067
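/*
 * If "l1" itself lies inside the 1GB range being demoted, updating
 * it through its current mapping may become unsafe once that mapping
 * is replaced, so access it through a temporary page instead (see
 * the pmap_kenter() below).
 */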
8068 tmpl1 = 0;
8069 if (va <= (vm_offset_t)l1 && va + L1_SIZE > (vm_offset_t)l1) {
8070 tmpl1 = kva_alloc(PAGE_SIZE);
8071 if (tmpl1 == 0)
8072 return (NULL);
8073 }
8074
8075 if ((ml2 = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED)) ==
8076 NULL) {
8077 CTR2(KTR_PMAP, "pmap_demote_l1: failure for va %#lx"
8078 " in pmap %p", va, pmap);
8079 l2 = NULL;
8080 goto fail;
8081 }
8082
8083 l2phys = VM_PAGE_TO_PHYS(ml2);
8084 l2 = (pt_entry_t *)PHYS_TO_DMAP(l2phys);
8085
8086 /* The physical address that the range points at */
8087 phys = PTE_TO_PHYS(oldl1);
8088 /* The attributes from the old l1 entry to be copied */
8089 newl2 = oldl1 & ATTR_MASK;
8090
8091 /* Create the new entries */
8092 newl2 |= ATTR_CONTIGUOUS;
8093 for (i = 0; i < Ln_ENTRIES; i++) {
8094 l2[i] = newl2 | phys;
8095 phys += L2_SIZE;
8096 }
8097 KASSERT(l2[0] == (ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) |
8098 L2_BLOCK), ("Invalid l2 page (%lx != %lx)", l2[0],
8099 ATTR_CONTIGUOUS | (oldl1 & ~ATTR_DESCR_MASK) | L2_BLOCK));
8100
8101 if (tmpl1 != 0) {
8102 pmap_kenter(tmpl1, PAGE_SIZE,
8103 DMAP_TO_PHYS((vm_offset_t)l1) & ~L3_OFFSET,
8104 VM_MEMATTR_WRITE_BACK);
8105 l1 = (pt_entry_t *)(tmpl1 + ((vm_offset_t)l1 & PAGE_MASK));
8106 }
8107
8108 pmap_update_entry(pmap, l1, l2phys | L1_TABLE, va, PAGE_SIZE);
8109
8110 counter_u64_add(pmap_l1_demotions, 1);
8111 fail:
8112 if (tmpl1 != 0) {
8113 pmap_kremove(tmpl1);
8114 kva_free(tmpl1, PAGE_SIZE);
8115 }
8116
8117 return (l2);
8118 }
8119
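/*
 * Fill an L3 page table with Ln_ENTRIES copies of "newl3", stepping
 * the physical address by L3_SIZE for each successive 4KB entry.
 */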
8120 static void
8121 pmap_fill_l3(pt_entry_t *firstl3, pt_entry_t newl3)
8122 {
8123 pt_entry_t *l3;
8124
8125 for (l3 = firstl3; l3 - firstl3 < Ln_ENTRIES; l3++) {
8126 *l3 = newl3;
8127 newl3 += L3_SIZE;
8128 }
8129 }
8130
8131 static void
8132 pmap_demote_l2_check(pt_entry_t *firstl3p __unused, pt_entry_t newl3e __unused)
8133 {
8134 #ifdef INVARIANTS
8135 #ifdef DIAGNOSTIC
8136 pt_entry_t *xl3p, *yl3p;
8137
8138 for (xl3p = firstl3p; xl3p < firstl3p + Ln_ENTRIES;
8139 xl3p++, newl3e += PAGE_SIZE) {
8140 if (PTE_TO_PHYS(pmap_load(xl3p)) != PTE_TO_PHYS(newl3e)) {
8141 printf("pmap_demote_l2: xl3e %zd and newl3e map "
8142 "different pages: found %#lx, expected %#lx\n",
8143 xl3p - firstl3p, pmap_load(xl3p), newl3e);
8144 printf("page table dump\n");
8145 for (yl3p = firstl3p; yl3p < firstl3p + Ln_ENTRIES;
8146 yl3p++) {
8147 printf("%zd %#lx\n", yl3p - firstl3p,
8148 pmap_load(yl3p));
8149 }
8150 panic("firstpte");
8151 }
8152 }
8153 #else
8154 KASSERT(PTE_TO_PHYS(pmap_load(firstl3p)) == PTE_TO_PHYS(newl3e),
8155 ("pmap_demote_l2: firstl3 and newl3e map different physical"
8156 " addresses"));
8157 #endif
8158 #endif
8159 }
8160
8161 static void
8162 pmap_demote_l2_abort(pmap_t pmap, vm_offset_t va, pt_entry_t *l2,
8163 struct rwlock **lockp)
8164 {
8165 struct spglist free;
8166
8167 SLIST_INIT(&free);
8168 (void)pmap_remove_l2(pmap, l2, va, pmap_load(pmap_l1(pmap, va)), &free,
8169 lockp);
8170 vm_page_free_pages_toq(&free, true);
8171 }
8172
8173 /*
8174 * Create an L3 table to map all addresses within an L2 mapping.
8175 */
8176 static pt_entry_t *
8177 pmap_demote_l2_locked(pmap_t pmap, pt_entry_t *l2, vm_offset_t va,
8178 struct rwlock **lockp)
8179 {
8180 pt_entry_t *l3, newl3, oldl2;
8181 vm_offset_t tmpl2;
8182 vm_paddr_t l3phys;
8183 vm_page_t ml3;
8184
8185 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8186 PMAP_ASSERT_STAGE1(pmap);
8187 KASSERT(ADDR_IS_CANONICAL(va),
8188 ("%s: Address not in canonical form: %lx", __func__, va));
8189
8190 l3 = NULL;
8191 oldl2 = pmap_load(l2);
8192 KASSERT((oldl2 & ATTR_DESCR_MASK) == L2_BLOCK,
8193 ("pmap_demote_l2: Demoting a non-block entry"));
8194 KASSERT((oldl2 & ATTR_SW_NO_PROMOTE) == 0,
8195 ("pmap_demote_l2: Demoting entry with no-demote flag set"));
8196 va &= ~L2_OFFSET;
8197
8198 tmpl2 = 0;
8199 if (va <= (vm_offset_t)l2 && va + L2_SIZE > (vm_offset_t)l2) {
8200 tmpl2 = kva_alloc(PAGE_SIZE);
8201 if (tmpl2 == 0)
8202 return (NULL);
8203 }
8204
8205 /*
8206 * Invalidate the 2MB page mapping and return "failure" if the
8207 * mapping was never accessed.
8208 */
8209 if ((oldl2 & ATTR_AF) == 0) {
8210 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8211 ("pmap_demote_l2: a wired mapping is missing ATTR_AF"));
8212 pmap_demote_l2_abort(pmap, va, l2, lockp);
8213 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx in pmap %p",
8214 va, pmap);
8215 goto fail;
8216 }
8217
8218 if ((ml3 = pmap_remove_pt_page(pmap, va)) == NULL) {
8219 KASSERT((oldl2 & ATTR_SW_WIRED) == 0,
8220 ("pmap_demote_l2: page table page for a wired mapping"
8221 " is missing"));
8222
8223 /*
8224 * If the page table page is missing and the mapping
8225 * is for a kernel address, the mapping must belong to
8226 * either the direct map or the early kernel memory.
8227 * Page table pages are preallocated for every other
8228 * part of the kernel address space, so the direct map
8229 * region and early kernel memory are the only parts of the
8230 * kernel address space that must be handled here.
8231 */
8232 KASSERT(!ADDR_IS_KERNEL(va) || VIRT_IN_DMAP(va) ||
8233 (va >= VM_MIN_KERNEL_ADDRESS && va < kernel_vm_end),
8234 ("pmap_demote_l2: No saved mpte for va %#lx", va));
8235
8236 /*
8237 * If the 2MB page mapping belongs to the direct map
8238 * region of the kernel's address space, then the page
8239 * allocation request specifies the highest possible
8240 * priority (VM_ALLOC_INTERRUPT). Otherwise, the
8241 * priority is normal.
8242 */
8243 ml3 = vm_page_alloc_noobj(
8244 (VIRT_IN_DMAP(va) ? VM_ALLOC_INTERRUPT : 0) |
8245 VM_ALLOC_WIRED);
8246
8247 /*
8248 * If the allocation of the new page table page fails,
8249 * invalidate the 2MB page mapping and return "failure".
8250 */
8251 if (ml3 == NULL) {
8252 pmap_demote_l2_abort(pmap, va, l2, lockp);
8253 CTR2(KTR_PMAP, "pmap_demote_l2: failure for va %#lx"
8254 " in pmap %p", va, pmap);
8255 goto fail;
8256 }
8257 ml3->pindex = pmap_l2_pindex(va);
8258
8259 if (!ADDR_IS_KERNEL(va)) {
8260 ml3->ref_count = NL3PG;
8261 pmap_resident_count_inc(pmap, 1);
8262 }
8263 }
8264 l3phys = VM_PAGE_TO_PHYS(ml3);
8265 l3 = (pt_entry_t *)PHYS_TO_DMAP(l3phys);
8266 newl3 = ATTR_CONTIGUOUS | (oldl2 & ~ATTR_DESCR_MASK) | L3_PAGE;
8267 KASSERT((oldl2 & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) !=
8268 (ATTR_S1_AP(ATTR_S1_AP_RO) | ATTR_SW_DBM),
8269 ("pmap_demote_l2: L2 entry is writeable but not dirty"));
8270
8271 /*
8272 * If the PTP is not leftover from an earlier promotion or it does not
8273 * have ATTR_AF set in every L3E, then fill it. The new L3Es will all
8274 * have ATTR_AF set.
8275 *
8276 * When pmap_update_entry() clears the old L2 mapping, it (indirectly)
8277 * performs a dsb(). That dsb() ensures that the stores for filling
8278 * "l3" are visible before "l3" is added to the page table.
8279 */
8280 if (!vm_page_all_valid(ml3))
8281 pmap_fill_l3(l3, newl3);
8282
8283 pmap_demote_l2_check(l3, newl3);
8284
8285 /*
8286 * If the mapping has changed attributes, update the L3Es.
8287 */
8288 if ((pmap_load(l3) & ATTR_PROMOTE) != (newl3 & ATTR_PROMOTE))
8289 pmap_fill_l3(l3, newl3);
8290
8291 /*
8292 * Map the temporary page so we don't lose access to the l2 table.
8293 */
8294 if (tmpl2 != 0) {
8295 pmap_kenter(tmpl2, PAGE_SIZE,
8296 DMAP_TO_PHYS((vm_offset_t)l2) & ~L3_OFFSET,
8297 VM_MEMATTR_WRITE_BACK);
8298 l2 = (pt_entry_t *)(tmpl2 + ((vm_offset_t)l2 & PAGE_MASK));
8299 }
8300
8301 /*
8302 * The spare PV entries must be reserved prior to demoting the
8303 * mapping, that is, prior to changing the PDE. Otherwise, the state
8304 * of the L2 and the PV lists will be inconsistent, which can result
8305 * in reclaim_pv_chunk() attempting to remove a PV entry from the
8306 * wrong PV list and pmap_pv_demote_l2() failing to find the expected
8307 * PV entry for the 2MB page mapping that is being demoted.
8308 */
8309 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8310 reserve_pv_entries(pmap, Ln_ENTRIES - 1, lockp);
8311
8312 /*
8313 * Pass PAGE_SIZE so that a single TLB invalidation is performed on
8314 * the 2MB page mapping.
8315 */
8316 pmap_update_entry(pmap, l2, l3phys | L2_TABLE, va, PAGE_SIZE);
8317
8318 /*
8319 * Demote the PV entry.
8320 */
8321 if ((oldl2 & ATTR_SW_MANAGED) != 0)
8322 pmap_pv_demote_l2(pmap, va, PTE_TO_PHYS(oldl2), lockp);
8323
8324 atomic_add_long(&pmap_l2_demotions, 1);
8325 CTR3(KTR_PMAP, "pmap_demote_l2: success for va %#lx"
8326 " in pmap %p %lx", va, pmap, l3[0]);
8327
8328 fail:
8329 if (tmpl2 != 0) {
8330 pmap_kremove(tmpl2);
8331 kva_free(tmpl2, PAGE_SIZE);
8332 }
8333
8334 return (l3);
8336 }
8337
8338 static pt_entry_t *
8339 pmap_demote_l2(pmap_t pmap, pt_entry_t *l2, vm_offset_t va)
8340 {
8341 struct rwlock *lock;
8342 pt_entry_t *l3;
8343
8344 lock = NULL;
8345 l3 = pmap_demote_l2_locked(pmap, l2, va, &lock);
8346 if (lock != NULL)
8347 rw_wunlock(lock);
8348 return (l3);
8349 }
8350
8351 /*
8352 * Demote an L2C superpage mapping to L2C_ENTRIES L2 block mappings.
8353 */
8354 static bool
8355 pmap_demote_l2c(pmap_t pmap, pt_entry_t *l2p, vm_offset_t va)
8356 {
8357 pd_entry_t *l2c_end, *l2c_start, l2e, mask, nbits, *tl2p;
8358 vm_offset_t tmpl3;
8359 register_t intr;
8360
8361 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8362 PMAP_ASSERT_STAGE1(pmap);
8363 l2c_start = (pd_entry_t *)((uintptr_t)l2p & ~((L2C_ENTRIES *
8364 sizeof(pd_entry_t)) - 1));
8365 l2c_end = l2c_start + L2C_ENTRIES;
8366 tmpl3 = 0;
8367 if ((va & ~L2C_OFFSET) < (vm_offset_t)l2c_end &&
8368 (vm_offset_t)l2c_start < (va & ~L2C_OFFSET) + L2C_SIZE) {
8369 tmpl3 = kva_alloc(PAGE_SIZE);
8370 if (tmpl3 == 0)
8371 return (false);
8372 pmap_kenter(tmpl3, PAGE_SIZE,
8373 DMAP_TO_PHYS((vm_offset_t)l2c_start) & ~L3_OFFSET,
8374 VM_MEMATTR_WRITE_BACK);
8375 l2c_start = (pd_entry_t *)(tmpl3 +
8376 ((vm_offset_t)l2c_start & PAGE_MASK));
8377 l2c_end = (pd_entry_t *)(tmpl3 +
8378 ((vm_offset_t)l2c_end & PAGE_MASK));
8379 }
8380 mask = 0;
8381 nbits = ATTR_DESCR_VALID;
8382 intr = intr_disable();
8383
8384 /*
8385 * Break the mappings (break-before-make).
8386 */
8387 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8388 /*
8389 * Clear the mapping's contiguous and valid bits, but leave
8390 * the rest of the entry unchanged, so that a lockless,
8391 * concurrent pmap_kextract() can still lookup the physical
8392 * address.
8393 */
8394 l2e = pmap_load(tl2p);
8395 KASSERT((l2e & ATTR_CONTIGUOUS) != 0,
8396 ("pmap_demote_l2c: missing ATTR_CONTIGUOUS"));
8397 KASSERT((l2e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8398 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8399 ("pmap_demote_l2c: missing ATTR_S1_AP_RW"));
8400 while (!atomic_fcmpset_64(tl2p, &l2e, l2e & ~(ATTR_CONTIGUOUS |
8401 ATTR_DESCR_VALID)))
8402 cpu_spinwait();
8403
8404 /*
8405 * Hardware accessed and dirty bit maintenance might only
8406 * update a single L2 entry, so we must combine the accessed
8407 * and dirty bits from this entire set of contiguous L2
8408 * entries.
8409 */
8410 if ((l2e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8411 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8412 mask = ATTR_S1_AP_RW_BIT;
8413 nbits |= l2e & ATTR_AF;
8414 }
8415 if ((nbits & ATTR_AF) != 0) {
8416 pmap_s1_invalidate_strided(pmap, va & ~L2C_OFFSET, (va +
8417 L2C_SIZE) & ~L2C_OFFSET, L2_SIZE, true);
8418 }
8419
8420 /*
8421 * Remake the mappings, updating the accessed and dirty bits.
8422 */
8423 for (tl2p = l2c_start; tl2p < l2c_end; tl2p++) {
8424 l2e = pmap_load(tl2p);
8425 while (!atomic_fcmpset_64(tl2p, &l2e, (l2e & ~mask) | nbits))
8426 cpu_spinwait();
8427 }
8428 dsb(ishst);
8429
8430 intr_restore(intr);
8431 if (tmpl3 != 0) {
8432 pmap_kremove(tmpl3);
8433 kva_free(tmpl3, PAGE_SIZE);
8434 }
8435 counter_u64_add(pmap_l2c_demotions, 1);
8436 CTR2(KTR_PMAP, "pmap_demote_l2c: success for va %#lx in pmap %p",
8437 va, pmap);
8438 return (true);
8439 }
8440
8441 /*
8442 * Demote a L3C superpage mapping to L3C_ENTRIES 4KB page mappings.
8443 */
8444 static bool
8445 pmap_demote_l3c(pmap_t pmap, pt_entry_t *l3p, vm_offset_t va)
8446 {
8447 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8448 vm_offset_t tmpl3;
8449 register_t intr;
8450
8451 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
8452 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8453 sizeof(pt_entry_t)) - 1));
8454 l3c_end = l3c_start + L3C_ENTRIES;
8455 tmpl3 = 0;
8456 if ((va & ~L3C_OFFSET) < (vm_offset_t)l3c_end &&
8457 (vm_offset_t)l3c_start < (va & ~L3C_OFFSET) + L3C_SIZE) {
8458 tmpl3 = kva_alloc(PAGE_SIZE);
8459 if (tmpl3 == 0)
8460 return (false);
8461 pmap_kenter(tmpl3, PAGE_SIZE,
8462 DMAP_TO_PHYS((vm_offset_t)l3c_start) & ~L3_OFFSET,
8463 VM_MEMATTR_WRITE_BACK);
8464 l3c_start = (pt_entry_t *)(tmpl3 +
8465 ((vm_offset_t)l3c_start & PAGE_MASK));
8466 l3c_end = (pt_entry_t *)(tmpl3 +
8467 ((vm_offset_t)l3c_end & PAGE_MASK));
8468 }
8469 mask = 0;
8470 nbits = ATTR_DESCR_VALID;
8471 intr = intr_disable();
8472
8473 /*
8474 * Break the mappings (break-before-make).
8475 */
8476 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8477 /*
8478 * Clear the mapping's contiguous and valid bits, but leave
8479 * the rest of the entry unchanged, so that a lockless,
8480 * concurrent pmap_kextract() can still lookup the physical
8481 * address.
8482 */
8483 l3e = pmap_load(tl3p);
8484 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8485 ("pmap_demote_l3c: missing ATTR_CONTIGUOUS"));
8486 KASSERT((l3e & (ATTR_SW_DBM | ATTR_S1_AP_RW_BIT)) !=
8487 (ATTR_SW_DBM | ATTR_S1_AP(ATTR_S1_AP_RO)),
8488 ("pmap_demote_l3c: missing ATTR_S1_AP_RW"));
8489 while (!atomic_fcmpset_64(tl3p, &l3e, l3e & ~(ATTR_CONTIGUOUS |
8490 ATTR_DESCR_VALID)))
8491 cpu_spinwait();
8492
8493 /*
8494 * Hardware accessed and dirty bit maintenance might only
8495 * update a single L3 entry, so we must combine the accessed
8496 * and dirty bits from this entire set of contiguous L3
8497 * entries.
8498 */
8499 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8500 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8501 mask = ATTR_S1_AP_RW_BIT;
8502 nbits |= l3e & ATTR_AF;
8503 }
8504 if ((nbits & ATTR_AF) != 0) {
8505 pmap_invalidate_range(pmap, va & ~L3C_OFFSET, (va + L3C_SIZE) &
8506 ~L3C_OFFSET, true);
8507 }
8508
8509 /*
8510 * Remake the mappings, updating the accessed and dirty bits.
8511 */
8512 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8513 l3e = pmap_load(tl3p);
8514 while (!atomic_fcmpset_64(tl3p, &l3e, (l3e & ~mask) | nbits))
8515 cpu_spinwait();
8516 }
8517 dsb(ishst);
8518
8519 intr_restore(intr);
8520 if (tmpl3 != 0) {
8521 pmap_kremove(tmpl3);
8522 kva_free(tmpl3, PAGE_SIZE);
8523 }
8524 counter_u64_add(pmap_l3c_demotions, 1);
8525 CTR2(KTR_PMAP, "pmap_demote_l3c: success for va %#lx in pmap %p",
8526 va, pmap);
8527 return (true);
8528 }
8529
8530 /*
8531 * Accumulate the accessed and dirty bits within a L3C superpage and
8532 * return the specified PTE with them applied correctly.
8533 */
8534 static pt_entry_t
8535 pmap_load_l3c(pt_entry_t *l3p)
8536 {
8537 pt_entry_t *l3c_end, *l3c_start, l3e, mask, nbits, *tl3p;
8538
8539 l3c_start = (pt_entry_t *)((uintptr_t)l3p & ~((L3C_ENTRIES *
8540 sizeof(pt_entry_t)) - 1));
8541 l3c_end = l3c_start + L3C_ENTRIES;
8542 mask = 0;
8543 nbits = 0;
8544 /* Iterate over each mapping in the superpage. */
8545 for (tl3p = l3c_start; tl3p < l3c_end; tl3p++) {
8546 l3e = pmap_load(tl3p);
8547 KASSERT((l3e & ATTR_CONTIGUOUS) != 0,
8548 ("pmap_load_l3c: missing ATTR_CONTIGUOUS"));
8549 /* Update mask if the current page has its dirty bit set. */
8550 if ((l3e & (ATTR_S1_AP_RW_BIT | ATTR_SW_DBM)) ==
8551 (ATTR_S1_AP(ATTR_S1_AP_RW) | ATTR_SW_DBM))
8552 mask = ATTR_S1_AP_RW_BIT;
8553 /* Update nbits if the accessed bit is set. */
8554 nbits |= l3e & ATTR_AF;
8555 }
8556 return ((pmap_load(l3p) & ~mask) | nbits);
8557 }
8558
8559 /*
8560 * Perform the pmap work for mincore(2). If the page is not both referenced and
8561 * modified by this pmap, returns its physical address so that the caller can
8562 * find other mappings.
8563 */
8564 int
8565 pmap_mincore(pmap_t pmap, vm_offset_t addr, vm_paddr_t *pap)
8566 {
8567 pt_entry_t *pte, tpte;
8568 vm_paddr_t mask, pa;
8569 int lvl, val;
8570 bool managed;
8571
8572 PMAP_ASSERT_STAGE1(pmap);
8573 PMAP_LOCK(pmap);
8574 pte = pmap_pte(pmap, addr, &lvl);
8575 if (pte != NULL) {
8576 tpte = pmap_load(pte);
8577
8578 switch (lvl) {
8579 case 3:
8580 mask = L3_OFFSET;
8581 break;
8582 case 2:
8583 mask = L2_OFFSET;
8584 break;
8585 case 1:
8586 mask = L1_OFFSET;
8587 break;
8588 default:
8589 panic("pmap_mincore: invalid level %d", lvl);
8590 }
8591
8592 managed = (tpte & ATTR_SW_MANAGED) != 0;
8593 val = MINCORE_INCORE;
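/* lvl == 2 (a 2MB block) yields psind 1; lvl == 1 (1GB) yields 2. */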
8594 if (lvl != 3)
8595 val |= MINCORE_PSIND(3 - lvl);
8596 if ((managed && pmap_pte_dirty(pmap, tpte)) || (!managed &&
8597 (tpte & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP(ATTR_S1_AP_RW)))
8598 val |= MINCORE_MODIFIED | MINCORE_MODIFIED_OTHER;
8599 if ((tpte & ATTR_AF) == ATTR_AF)
8600 val |= MINCORE_REFERENCED | MINCORE_REFERENCED_OTHER;
8601
8602 pa = PTE_TO_PHYS(tpte) | (addr & mask);
8603 } else {
8604 managed = false;
8605 val = 0;
8606 }
8607
8608 if ((val & (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER)) !=
8609 (MINCORE_MODIFIED_OTHER | MINCORE_REFERENCED_OTHER) && managed) {
8610 *pap = pa;
8611 }
8612 PMAP_UNLOCK(pmap);
8613 return (val);
8614 }
8615
8616 /*
8617 * Garbage collect every ASID that is neither active on a processor nor
8618 * reserved.
8619 */
8620 static void
8621 pmap_reset_asid_set(pmap_t pmap)
8622 {
8623 pmap_t curpmap;
8624 int asid, cpuid, epoch;
8625 struct asid_set *set;
8626 enum pmap_stage stage;
8627
8628 set = pmap->pm_asid_set;
8629 stage = pmap->pm_stage;
8630
8632 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8633 mtx_assert(&set->asid_set_mutex, MA_OWNED);
8634
8635 /*
8636 * Ensure that the store to asid_epoch is globally visible before the
8637 * loads from pc_curpmap are performed.
8638 */
8639 epoch = set->asid_epoch + 1;
8640 if (epoch == INT_MAX)
8641 epoch = 0;
8642 set->asid_epoch = epoch;
8643 dsb(ishst);
8644 if (stage == PM_STAGE1) {
8645 __asm __volatile("tlbi vmalle1is");
8646 } else {
8647 KASSERT(pmap_clean_stage2_tlbi != NULL,
8648 ("%s: Unset stage 2 tlb invalidation callback\n",
8649 __func__));
8650 pmap_clean_stage2_tlbi();
8651 }
8652 dsb(ish);
8653 bit_nclear(set->asid_set, ASID_FIRST_AVAILABLE,
8654 set->asid_set_size - 1);
8655 CPU_FOREACH(cpuid) {
8656 if (cpuid == curcpu)
8657 continue;
8658 if (stage == PM_STAGE1) {
8659 curpmap = pcpu_find(cpuid)->pc_curpmap;
8660 PMAP_ASSERT_STAGE1(pmap);
8661 } else {
8662 curpmap = pcpu_find(cpuid)->pc_curvmpmap;
8663 if (curpmap == NULL)
8664 continue;
8665 PMAP_ASSERT_STAGE2(pmap);
8666 }
8667 KASSERT(curpmap->pm_asid_set == set, ("Incorrect set"));
8668 asid = COOKIE_TO_ASID(curpmap->pm_cookie);
8669 if (asid == -1)
8670 continue;
8671 bit_set(set->asid_set, asid);
8672 curpmap->pm_cookie = COOKIE_FROM(asid, epoch);
8673 }
8674 }
8675
8676 /*
8677 * Allocate a new ASID for the specified pmap.
8678 */
8679 static void
8680 pmap_alloc_asid(pmap_t pmap)
8681 {
8682 struct asid_set *set;
8683 int new_asid;
8684
8685 set = pmap->pm_asid_set;
8686 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8687
8688 mtx_lock_spin(&set->asid_set_mutex);
8689
8690 /*
8691 * While this processor was waiting to acquire the asid set mutex,
8692 * pmap_reset_asid_set() running on another processor might have
8693 * updated this pmap's cookie to the current epoch. In which case, we
8694 * don't need to allocate a new ASID.
8695 */
8696 if (COOKIE_TO_EPOCH(pmap->pm_cookie) == set->asid_epoch)
8697 goto out;
8698
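/*
 * Search for a free ASID from asid_next to the end of the set; on
 * failure, wrap around and search from the first available ASID up
 * to asid_next, and only reset the whole set if that fails too.
 */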
8699 bit_ffc_at(set->asid_set, set->asid_next, set->asid_set_size,
8700 &new_asid);
8701 if (new_asid == -1) {
8702 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8703 set->asid_next, &new_asid);
8704 if (new_asid == -1) {
8705 pmap_reset_asid_set(pmap);
8706 bit_ffc_at(set->asid_set, ASID_FIRST_AVAILABLE,
8707 set->asid_set_size, &new_asid);
8708 KASSERT(new_asid != -1, ("ASID allocation failure"));
8709 }
8710 }
8711 bit_set(set->asid_set, new_asid);
8712 set->asid_next = new_asid + 1;
8713 pmap->pm_cookie = COOKIE_FROM(new_asid, set->asid_epoch);
8714 out:
8715 mtx_unlock_spin(&set->asid_set_mutex);
8716 }
8717
8718 static uint64_t __read_mostly ttbr_flags;
8719
8720 /*
8721 * Compute the value that should be stored in ttbr0 to activate the specified
8722 * pmap. This value may change from time to time.
8723 */
8724 uint64_t
8725 pmap_to_ttbr0(pmap_t pmap)
8726 {
8727 uint64_t ttbr;
8728
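/*
 * Combine the root table address from pm_ttbr, the ASID placed in
 * the upper bits by ASID_TO_OPERAND(), and global flags such as
 * TTBR_CnP.
 */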
8729 ttbr = pmap->pm_ttbr;
8730 ttbr |= ASID_TO_OPERAND(COOKIE_TO_ASID(pmap->pm_cookie));
8731 ttbr |= ttbr_flags;
8732
8733 return (ttbr);
8734 }
8735
8736 static void
8737 pmap_set_cnp(void *arg)
8738 {
8739 uint64_t ttbr0, ttbr1;
8740 u_int cpuid;
8741
8742 cpuid = *(u_int *)arg;
8743 if (cpuid == curcpu) {
8744 /*
8745 * Set the flags while all CPUs are handling the
8746 * smp_rendezvous, so they will not call pmap_to_ttbr0. Any calls
8747 * to pmap_to_ttbr0 after this will have the CnP flag set.
8748 * The dsb after invalidating the TLB will act as a barrier
8749 * to ensure all CPUs can observe this change.
8750 */
8751 ttbr_flags |= TTBR_CnP;
8752 }
8753
8754 ttbr0 = READ_SPECIALREG(ttbr0_el1);
8755 ttbr0 |= TTBR_CnP;
8756
8757 ttbr1 = READ_SPECIALREG(ttbr1_el1);
8758 ttbr1 |= TTBR_CnP;
8759
8760 /* Update ttbr{0,1}_el1 with the CnP flag */
8761 WRITE_SPECIALREG(ttbr0_el1, ttbr0);
8762 WRITE_SPECIALREG(ttbr1_el1, ttbr1);
8763 isb();
8764 __asm __volatile("tlbi vmalle1is");
8765 dsb(ish);
8766 isb();
8767 }
8768
8769 /*
8770 * Defer enabling some features until we have read the ID registers to know
8771 * if they are supported on all CPUs.
8772 */
8773 static void
8774 pmap_init_mp(void *dummy __unused)
8775 {
8776 uint64_t reg;
8777
8778 if (get_kernel_reg(ID_AA64PFR1_EL1, &reg)) {
8779 if (ID_AA64PFR1_BT_VAL(reg) != ID_AA64PFR1_BT_NONE) {
8780 if (bootverbose)
8781 printf("Enabling BTI\n");
8782 pmap_bti_support = true;
8783
8784 pmap_bti_ranges_zone = uma_zcreate("BTI ranges",
8785 sizeof(struct rs_el), NULL, NULL, NULL, NULL,
8786 UMA_ALIGN_PTR, 0);
8787 }
8788 }
8789 }
8790 SYSINIT(pmap_init_mp, SI_SUB_CPU, SI_ORDER_ANY, pmap_init_mp, NULL);
8791
8792 /*
8793 * Defer enabling CnP until we have read the ID registers to know if it's
8794 * supported on all CPUs.
8795 */
8796 static void
8797 pmap_init_cnp(void *dummy __unused)
8798 {
8799 uint64_t reg;
8800 u_int cpuid;
8801
8802 if (!get_kernel_reg(ID_AA64MMFR2_EL1, &reg))
8803 return;
8804
8805 if (ID_AA64MMFR2_CnP_VAL(reg) != ID_AA64MMFR2_CnP_NONE) {
8806 if (bootverbose)
8807 printf("Enabling CnP\n");
8808 cpuid = curcpu;
8809 smp_rendezvous(NULL, pmap_set_cnp, NULL, &cpuid);
8810 }
8812 }
8813 SYSINIT(pmap_init_cnp, SI_SUB_SMP, SI_ORDER_ANY, pmap_init_cnp, NULL);
8814
8815 static bool
8816 pmap_activate_int(pmap_t pmap)
8817 {
8818 struct asid_set *set;
8819 int epoch;
8820
8821 KASSERT(PCPU_GET(curpmap) != NULL, ("no active pmap"));
8822 KASSERT(pmap != kernel_pmap, ("kernel pmap activation"));
8823
8824 if ((pmap->pm_stage == PM_STAGE1 && pmap == PCPU_GET(curpmap)) ||
8825 (pmap->pm_stage == PM_STAGE2 && pmap == PCPU_GET(curvmpmap))) {
8826 /*
8827 * Handle the possibility that the old thread was preempted
8828 * after an "ic" or "tlbi" instruction but before it performed
8829 * a "dsb" instruction. If the old thread migrates to a new
8830 * processor, its completion of a "dsb" instruction on that
8831 * new processor does not guarantee that the "ic" or "tlbi"
8832 * instructions performed on the old processor have completed.
8833 */
8834 dsb(ish);
8835 return (false);
8836 }
8837
8838 set = pmap->pm_asid_set;
8839 KASSERT(set != NULL, ("%s: NULL asid set", __func__));
8840
8841 /*
8842 * Ensure that the store to curpmap is globally visible before the
8843 * load from asid_epoch is performed.
8844 */
8845 if (pmap->pm_stage == PM_STAGE1)
8846 PCPU_SET(curpmap, pmap);
8847 else
8848 PCPU_SET(curvmpmap, pmap);
8849 dsb(ish);
8850 epoch = COOKIE_TO_EPOCH(pmap->pm_cookie);
8851 if (epoch >= 0 && epoch != set->asid_epoch)
8852 pmap_alloc_asid(pmap);
8853
8854 if (pmap->pm_stage == PM_STAGE1) {
8855 set_ttbr0(pmap_to_ttbr0(pmap));
8856 if (PCPU_GET(bcast_tlbi_workaround) != 0)
8857 invalidate_local_icache();
8858 }
8859 return (true);
8860 }
8861
8862 void
8863 pmap_activate_vm(pmap_t pmap)
8864 {
8865
8866 PMAP_ASSERT_STAGE2(pmap);
8867
8868 (void)pmap_activate_int(pmap);
8869 }
8870
8871 void
8872 pmap_activate(struct thread *td)
8873 {
8874 pmap_t pmap;
8875
8876 pmap = vmspace_pmap(td->td_proc->p_vmspace);
8877 PMAP_ASSERT_STAGE1(pmap);
8878 critical_enter();
8879 (void)pmap_activate_int(pmap);
8880 critical_exit();
8881 }
8882
8883 /*
8884 * Activate the thread we are switching to.
8885 * To simplify the assembly in cpu_throw, return the new thread's pcb.
8886 */
8887 struct pcb *
8888 pmap_switch(struct thread *new)
8889 {
8890 pcpu_bp_harden bp_harden;
8891 struct pcb *pcb;
8892
8893 /* Store the new curthread */
8894 PCPU_SET(curthread, new);
8895
8896 /* And the new pcb */
8897 pcb = new->td_pcb;
8898 PCPU_SET(curpcb, pcb);
8899
8900 /*
8901 * TODO: We may need to flush the cache here if switching
8902 * to a user process.
8903 */
8904
8905 if (pmap_activate_int(vmspace_pmap(new->td_proc->p_vmspace))) {
8906 /*
8907 * Stop userspace from training the branch predictor against
8908 * other processes. This will call into a CPU specific
8909 * function that clears the branch predictor state.
8910 */
8911 bp_harden = PCPU_GET(bp_harden);
8912 if (bp_harden != NULL)
8913 bp_harden();
8914 }
8915
8916 return (pcb);
8917 }
8918
8919 void
8920 pmap_sync_icache(pmap_t pmap, vm_offset_t va, vm_size_t sz)
8921 {
8922
8923 PMAP_ASSERT_STAGE1(pmap);
8924 KASSERT(ADDR_IS_CANONICAL(va),
8925 ("%s: Address not in canonical form: %lx", __func__, va));
8926
8927 if (ADDR_IS_KERNEL(va)) {
8928 cpu_icache_sync_range((void *)va, sz);
8929 } else {
8930 u_int len, offset;
8931 vm_paddr_t pa;
8932
8933 /* Find the length of data in this page to flush */
8934 offset = va & PAGE_MASK;
8935 len = imin(PAGE_SIZE - offset, sz);
8936
8937 while (sz != 0) {
8938 /* Extract the physical address & find it in the DMAP */
8939 pa = pmap_extract(pmap, va);
8940 if (pa != 0)
8941 cpu_icache_sync_range((void *)PHYS_TO_DMAP(pa),
8942 len);
8943
8944 /* Move to the next page */
8945 sz -= len;
8946 va += len;
8947 /* Set the length for the next iteration */
8948 len = imin(PAGE_SIZE, sz);
8949 }
8950 }
8951 }
8952
8953 static int
8954 pmap_stage2_fault(pmap_t pmap, uint64_t esr, uint64_t far)
8955 {
8956 pd_entry_t *pdep;
8957 pt_entry_t *ptep, pte;
8958 int rv, lvl, dfsc;
8959
8960 PMAP_ASSERT_STAGE2(pmap);
8961 rv = KERN_FAILURE;
8962
8963 	/* Data and insn aborts use the same encoding for the FSC field. */
8964 dfsc = esr & ISS_DATA_DFSC_MASK;
8965 switch (dfsc) {
8966 case ISS_DATA_DFSC_TF_L0:
8967 case ISS_DATA_DFSC_TF_L1:
8968 case ISS_DATA_DFSC_TF_L2:
8969 case ISS_DATA_DFSC_TF_L3:
8970 PMAP_LOCK(pmap);
8971 pdep = pmap_pde(pmap, far, &lvl);
8972 if (pdep == NULL || lvl != (dfsc - ISS_DATA_DFSC_TF_L1)) {
8973 PMAP_UNLOCK(pmap);
8974 break;
8975 }
8976
8977 switch (lvl) {
8978 case 0:
8979 ptep = pmap_l0_to_l1(pdep, far);
8980 break;
8981 case 1:
8982 ptep = pmap_l1_to_l2(pdep, far);
8983 break;
8984 case 2:
8985 ptep = pmap_l2_to_l3(pdep, far);
8986 break;
8987 default:
8988 panic("%s: Invalid pde level %d", __func__,lvl);
8989 }
8990 goto fault_exec;
8991
8992 case ISS_DATA_DFSC_AFF_L1:
8993 case ISS_DATA_DFSC_AFF_L2:
8994 case ISS_DATA_DFSC_AFF_L3:
8995 PMAP_LOCK(pmap);
8996 ptep = pmap_pte(pmap, far, &lvl);
8997 fault_exec:
8998 if (ptep != NULL && (pte = pmap_load(ptep)) != 0) {
8999 if (icache_vmid) {
9000 pmap_invalidate_vpipt_icache();
9001 } else {
9002 /*
9003 				 * If accessing an executable page, invalidate
9004 * the I-cache so it will be valid when we
9005 * continue execution in the guest. The D-cache
9006 * is assumed to already be clean to the Point
9007 * of Coherency.
9008 */
9009 if ((pte & ATTR_S2_XN_MASK) !=
9010 ATTR_S2_XN(ATTR_S2_XN_NONE)) {
9011 invalidate_icache();
9012 }
9013 }
9014 pmap_set_bits(ptep, ATTR_AF | ATTR_DESCR_VALID);
9015 rv = KERN_SUCCESS;
9016 }
9017 PMAP_UNLOCK(pmap);
9018 break;
9019 }
9020
9021 return (rv);
9022 }
9023
9024 int
9025 pmap_fault(pmap_t pmap, uint64_t esr, uint64_t far)
9026 {
9027 pt_entry_t pte, *ptep;
9028 register_t intr;
9029 uint64_t ec, par;
9030 int lvl, rv;
9031
9032 rv = KERN_FAILURE;
9033
9034 ec = ESR_ELx_EXCEPTION(esr);
9035 switch (ec) {
9036 case EXCP_INSN_ABORT_L:
9037 case EXCP_INSN_ABORT:
9038 case EXCP_DATA_ABORT_L:
9039 case EXCP_DATA_ABORT:
9040 break;
9041 default:
9042 return (rv);
9043 }
9044
9045 if (pmap->pm_stage == PM_STAGE2)
9046 return (pmap_stage2_fault(pmap, esr, far));
9047
9048 	/* Data and insn aborts use the same encoding for the FSC field. */
9049 switch (esr & ISS_DATA_DFSC_MASK) {
9050 case ISS_DATA_DFSC_AFF_L1:
9051 case ISS_DATA_DFSC_AFF_L2:
9052 case ISS_DATA_DFSC_AFF_L3:
9053 PMAP_LOCK(pmap);
9054 ptep = pmap_pte(pmap, far, &lvl);
9055 if (ptep != NULL) {
9056 pmap_set_bits(ptep, ATTR_AF);
9057 rv = KERN_SUCCESS;
9058 /*
9059 * XXXMJ as an optimization we could mark the entry
9060 * dirty if this is a write fault.
9061 */
9062 }
9063 PMAP_UNLOCK(pmap);
9064 break;
9065 case ISS_DATA_DFSC_PF_L1:
9066 case ISS_DATA_DFSC_PF_L2:
9067 case ISS_DATA_DFSC_PF_L3:
9068 if ((ec != EXCP_DATA_ABORT_L && ec != EXCP_DATA_ABORT) ||
9069 (esr & ISS_DATA_WnR) == 0)
9070 return (rv);
9071 PMAP_LOCK(pmap);
9072 ptep = pmap_pte(pmap, far, &lvl);
9073 if (ptep != NULL &&
9074 ((pte = pmap_load(ptep)) & ATTR_SW_DBM) != 0) {
9075 if ((pte & ATTR_S1_AP_RW_BIT) ==
9076 ATTR_S1_AP(ATTR_S1_AP_RO)) {
9077 pmap_clear_bits(ptep, ATTR_S1_AP_RW_BIT);
9078 pmap_s1_invalidate_page(pmap, far, true);
9079 }
9080 rv = KERN_SUCCESS;
9081 }
9082 PMAP_UNLOCK(pmap);
9083 break;
9084 case ISS_DATA_DFSC_TF_L0:
9085 case ISS_DATA_DFSC_TF_L1:
9086 case ISS_DATA_DFSC_TF_L2:
9087 case ISS_DATA_DFSC_TF_L3:
9088 /*
9089 * Retry the translation. A break-before-make sequence can
9090 * produce a transient fault.
9091 */
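/*
 * In a break-before-make update the old entry is cleared and the TLB
 * invalidated before the new entry is written, so a concurrent access
 * in that window can fault even though the mapping is logically valid.
 */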
9092 if (pmap == kernel_pmap) {
9093 /*
9094 * The translation fault may have occurred within a
9095 * critical section. Therefore, we must check the
9096 * address without acquiring the kernel pmap's lock.
9097 */
9098 if (pmap_klookup(far, NULL))
9099 rv = KERN_SUCCESS;
9100 } else {
9101 PMAP_LOCK(pmap);
9102 /* Ask the MMU to check the address. */
9103 intr = intr_disable();
9104 par = arm64_address_translate_s1e0r(far);
9105 intr_restore(intr);
9106 PMAP_UNLOCK(pmap);
9107
9108 /*
9109 * If the translation was successful, then we can
9110 * return success to the trap handler.
9111 */
9112 if (PAR_SUCCESS(par))
9113 rv = KERN_SUCCESS;
9114 }
9115 break;
9116 }
9117
9118 return (rv);
9119 }
9120
9121 /*
9122 * Increase the starting virtual address of the given mapping if a
9123 * different alignment might result in more superpage mappings.
9124 */
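/*
 * For example, assuming 4 KB base pages and 2 MB L2 blocks: a 3 MB mapping
 * at object offset 0x1ff000 has a superpage_offset of 0x1ff000.  Choosing
 * *addr so that (*addr & L2_OFFSET) == 0x1ff000 places bytes
 * [0x1000, 0x201000) of the mapping at a 2 MB-aligned virtual address,
 * allowing them to be mapped by a single L2 block.
 */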
9125 void
9126 pmap_align_superpage(vm_object_t object, vm_ooffset_t offset,
9127 vm_offset_t *addr, vm_size_t size)
9128 {
9129 vm_offset_t superpage_offset;
9130
9131 if (size < L2_SIZE)
9132 return;
9133 if (object != NULL && (object->flags & OBJ_COLORED) != 0)
9134 offset += ptoa(object->pg_color);
9135 superpage_offset = offset & L2_OFFSET;
9136 if (size - ((L2_SIZE - superpage_offset) & L2_OFFSET) < L2_SIZE ||
9137 (*addr & L2_OFFSET) == superpage_offset)
9138 return;
9139 if ((*addr & L2_OFFSET) < superpage_offset)
9140 *addr = (*addr & ~L2_OFFSET) + superpage_offset;
9141 else
9142 *addr = ((*addr + L2_OFFSET) & ~L2_OFFSET) + superpage_offset;
9143 }
9144
9145 /**
9146 * Get the kernel virtual address of a set of physical pages. If there are
9147  * physical addresses not covered by the DMAP, perform a transient mapping
9148 * that will be removed when calling pmap_unmap_io_transient.
9149 *
9150  * \param page        The pages for which the caller wishes to obtain
9151  *                    kernel virtual addresses.
9152 * \param vaddr On return contains the kernel virtual memory address
9153 * of the pages passed in the page parameter.
9154 * \param count Number of pages passed in.
9155 * \param can_fault true if the thread using the mapped pages can take
9156 * page faults, false otherwise.
9157 *
9158 * \returns true if the caller must call pmap_unmap_io_transient when
9159 * finished or false otherwise.
9160 *
9161 */
9162 bool
9163 pmap_map_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9164 bool can_fault)
9165 {
9166 vm_paddr_t paddr;
9167 bool needs_mapping;
9168 int error __diagused, i;
9169
9170 /*
9171 	 * Allocate any KVA space that we need; this is done in a separate
9172 * loop to prevent calling vmem_alloc while pinned.
9173 */
9174 needs_mapping = false;
9175 for (i = 0; i < count; i++) {
9176 paddr = VM_PAGE_TO_PHYS(page[i]);
9177 if (__predict_false(!PHYS_IN_DMAP(paddr))) {
9178 error = vmem_alloc(kernel_arena, PAGE_SIZE,
9179 M_BESTFIT | M_WAITOK, &vaddr[i]);
9180 KASSERT(error == 0, ("vmem_alloc failed: %d", error));
9181 needs_mapping = true;
9182 } else {
9183 vaddr[i] = PHYS_TO_DMAP(paddr);
9184 }
9185 }
9186
9187 /* Exit early if everything is covered by the DMAP */
9188 if (!needs_mapping)
9189 return (false);
9190
9191 if (!can_fault)
9192 sched_pin();
9193 for (i = 0; i < count; i++) {
9194 paddr = VM_PAGE_TO_PHYS(page[i]);
9195 if (!PHYS_IN_DMAP(paddr)) {
9196 panic(
9197 "pmap_map_io_transient: TODO: Map out of DMAP data");
9198 }
9199 }
9200
9201 return (needs_mapping);
9202 }
9203
9204 void
9205 pmap_unmap_io_transient(vm_page_t page[], vm_offset_t vaddr[], int count,
9206 bool can_fault)
9207 {
9208 vm_paddr_t paddr;
9209 int i;
9210
9211 if (!can_fault)
9212 sched_unpin();
9213 for (i = 0; i < count; i++) {
9214 paddr = VM_PAGE_TO_PHYS(page[i]);
9215 if (!PHYS_IN_DMAP(paddr)) {
9216 panic("ARM64TODO: pmap_unmap_io_transient: Unmap data");
9217 }
9218 }
9219 }
9220
9221 bool
9222 pmap_is_valid_memattr(pmap_t pmap __unused, vm_memattr_t mode)
9223 {
9224
9225 return (mode >= VM_MEMATTR_DEVICE && mode <= VM_MEMATTR_WRITE_THROUGH);
9226 }
9227
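/*
 * Duplicate and free callbacks for the pm_bti rangeset; elements are backed
 * by the pmap_bti_ranges_zone UMA zone created in pmap_init_mp() above.
 */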
9228 static void *
9229 bti_dup_range(void *ctx __unused, void *data)
9230 {
9231 struct rs_el *node, *new_node;
9232
9233 new_node = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9234 if (new_node == NULL)
9235 return (NULL);
9236 node = data;
9237 memcpy(new_node, node, sizeof(*node));
9238 return (new_node);
9239 }
9240
9241 static void
9242 bti_free_range(void *ctx __unused, void *node)
9243 {
9244
9245 uma_zfree(pmap_bti_ranges_zone, node);
9246 }
9247
9248 static int
9249 pmap_bti_assign(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9250 {
9251 struct rs_el *rs;
9252 int error;
9253
9254 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9255 PMAP_ASSERT_STAGE1(pmap);
9256 MPASS(pmap->pm_bti != NULL);
9257 rs = uma_zalloc(pmap_bti_ranges_zone, M_NOWAIT);
9258 if (rs == NULL)
9259 return (ENOMEM);
9260 error = rangeset_insert(pmap->pm_bti, sva, eva, rs);
9261 if (error != 0)
9262 uma_zfree(pmap_bti_ranges_zone, rs);
9263 return (error);
9264 }
9265
9266 static void
9267 pmap_bti_deassign_all(pmap_t pmap)
9268 {
9269
9270 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9271 if (pmap->pm_bti != NULL)
9272 rangeset_remove_all(pmap->pm_bti);
9273 }
9274
9275 /*
9276 * Returns true if the BTI setting is the same across the specified address
9277 * range, and false otherwise. When returning true, updates the referenced PTE
9278 * to reflect the BTI setting.
9279 *
9280 * Only stage 1 pmaps support BTI. The kernel pmap is always a stage 1 pmap
9281 * that has the same BTI setting implicitly across its entire address range.
9282 */
9283 static bool
9284 pmap_bti_same(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, pt_entry_t *pte)
9285 {
9286 struct rs_el *next_rs, *rs;
9287 vm_offset_t va;
9288
9289 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9290 KASSERT(ADDR_IS_CANONICAL(sva),
9291 ("%s: Start address not in canonical form: %lx", __func__, sva));
9292 KASSERT(ADDR_IS_CANONICAL(eva),
9293 ("%s: End address not in canonical form: %lx", __func__, eva));
9294 KASSERT((*pte & ATTR_S1_GP) == 0,
9295 ("%s: pte %lx has ATTR_S1_GP preset", __func__, *pte));
9296
9297 if (pmap == kernel_pmap) {
9298 *pte |= ATTR_KERN_GP;
9299 return (true);
9300 }
9301 if (pmap->pm_bti == NULL)
9302 return (true);
9303 PMAP_ASSERT_STAGE1(pmap);
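/*
 * Walk the ranges overlapping [sva, eva).  If no range contains sva,
 * the setting is uniform only if no range starts before eva; otherwise
 * the ranges must cover [sva, eva) without a gap.
 */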
9304 rs = rangeset_lookup(pmap->pm_bti, sva);
9305 if (rs == NULL) {
9306 rs = rangeset_next(pmap->pm_bti, sva);
9307 return (rs == NULL ||
9308 rs->re_start >= eva);
9309 }
9310 while ((va = rs->re_end) < eva) {
9311 next_rs = rangeset_next(pmap->pm_bti, va);
9312 if (next_rs == NULL ||
9313 va != next_rs->re_start)
9314 return (false);
9315 rs = next_rs;
9316 }
9317 if (rs != NULL)
9318 *pte |= ATTR_S1_GP;
9319 return (true);
9320 }
9321
9322 static pt_entry_t
9323 pmap_pte_bti(pmap_t pmap, vm_offset_t va)
9324 {
9325 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9326 MPASS(ADDR_IS_CANONICAL(va));
9327
9328 if (pmap->pm_stage != PM_STAGE1)
9329 return (0);
9330 if (pmap == kernel_pmap)
9331 return (ATTR_KERN_GP);
9332 if (pmap->pm_bti != NULL && rangeset_lookup(pmap->pm_bti, va) != NULL)
9333 return (ATTR_S1_GP);
9334 return (0);
9335 }
9336
9337 static void
9338 pmap_bti_on_remove(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9339 {
9340
9341 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9342 if (pmap->pm_bti != NULL)
9343 rangeset_remove(pmap->pm_bti, sva, eva);
9344 }
9345
9346 static int
9347 pmap_bti_copy(pmap_t dst_pmap, pmap_t src_pmap)
9348 {
9349
9350 PMAP_LOCK_ASSERT(dst_pmap, MA_OWNED);
9351 PMAP_LOCK_ASSERT(src_pmap, MA_OWNED);
9352 MPASS(src_pmap->pm_stage == dst_pmap->pm_stage);
9353 MPASS(src_pmap->pm_bti != NULL);
9354 MPASS(dst_pmap->pm_bti != NULL);
9355 if (src_pmap->pm_bti->rs_data_ctx == NULL)
9356 return (0);
9357 return (rangeset_copy(dst_pmap->pm_bti, src_pmap->pm_bti));
9358 }
9359
9360 static void
9361 pmap_bti_update_range(pmap_t pmap, vm_offset_t sva, vm_offset_t eva, bool set)
9362 {
9363 PMAP_LOCK_ASSERT(pmap, MA_OWNED);
9364 PMAP_ASSERT_STAGE1(pmap);
9365
9366 pmap_mask_set_locked(pmap, sva, eva, ATTR_S1_GP, set ? ATTR_S1_GP : 0,
9367 true);
9368 }
9369
9370 int
9371 pmap_bti_set(pmap_t pmap, vm_offset_t sva, vm_offset_t eva)
9372 {
9373 int error;
9374
9375 if (pmap->pm_bti == NULL)
9376 return (0);
9377 if (!ADDR_IS_CANONICAL(sva) || !ADDR_IS_CANONICAL(eva))
9378 return (EINVAL);
9379 if (pmap->pm_stage != PM_STAGE1)
9380 return (EINVAL);
9381 if (eva <= sva || ADDR_IS_KERNEL(eva))
9382 return (EFAULT);
9383
9384 sva = trunc_page(sva);
9385 eva = round_page(eva);
9386 for (;;) {
9387 PMAP_LOCK(pmap);
9388 error = pmap_bti_assign(pmap, sva, eva);
9389 if (error == 0)
9390 pmap_bti_update_range(pmap, sva, eva, true);
9391 PMAP_UNLOCK(pmap);
9392 if (error != ENOMEM)
9393 break;
9394 vm_wait(NULL);
9395 }
9396 return (error);
9397 }
9398
9399 #if defined(KASAN) || defined(KMSAN)
9400 static pd_entry_t *pmap_san_early_l2;
9401
9402 #define SAN_BOOTSTRAP_L2_SIZE (1 * L2_SIZE)
9403 #define SAN_BOOTSTRAP_SIZE (2 * PAGE_SIZE)
9404 static vm_offset_t __nosanitizeaddress
9405 pmap_san_enter_bootstrap_alloc_l2(void)
9406 {
9407 static uint8_t bootstrap_data[SAN_BOOTSTRAP_L2_SIZE] __aligned(L2_SIZE);
9408 static size_t offset = 0;
9409 vm_offset_t addr;
9410
9411 if (offset + L2_SIZE > sizeof(bootstrap_data)) {
9412 panic("%s: out of memory for the bootstrap shadow map L2 entries",
9413 __func__);
9414 }
9415
9416 addr = (uintptr_t)&bootstrap_data[offset];
9417 offset += L2_SIZE;
9418 return (addr);
9419 }
9420
9421 /*
9422  * Bootstrap pages for the SAN shadow map's L1 and L2 tables; L3 entries
 * may be needed later.
9423 */
9424 static vm_offset_t __nosanitizeaddress
9425 pmap_san_enter_bootstrap_alloc_pages(int npages)
9426 {
9427 static uint8_t bootstrap_data[SAN_BOOTSTRAP_SIZE] __aligned(PAGE_SIZE);
9428 static size_t offset = 0;
9429 vm_offset_t addr;
9430
9431 if (offset + (npages * PAGE_SIZE) > sizeof(bootstrap_data)) {
9432 panic("%s: out of memory for the bootstrap shadow map",
9433 __func__);
9434 }
9435
9436 addr = (uintptr_t)&bootstrap_data[offset];
9437 offset += (npages * PAGE_SIZE);
9438 return (addr);
9439 }
9440
9441 static void __nosanitizeaddress
9442 pmap_san_enter_bootstrap(void)
9443 {
9444 vm_offset_t freemempos;
9445
9446 /* L1, L2 */
9447 freemempos = pmap_san_enter_bootstrap_alloc_pages(2);
9448 bs_state.freemempos = freemempos;
9449 bs_state.va = KASAN_MIN_ADDRESS;
9450 pmap_bootstrap_l1_table(&bs_state);
9451 pmap_san_early_l2 = bs_state.l2;
9452 }
9453
9454 static vm_page_t
9455 pmap_san_enter_alloc_l3(void)
9456 {
9457 vm_page_t m;
9458
9459 m = vm_page_alloc_noobj(VM_ALLOC_INTERRUPT | VM_ALLOC_WIRED |
9460 VM_ALLOC_ZERO);
9461 if (m == NULL)
9462 panic("%s: no memory to grow shadow map", __func__);
9463 return (m);
9464 }
9465
9466 static vm_page_t
9467 pmap_san_enter_alloc_l2(void)
9468 {
9469 return (vm_page_alloc_noobj_contig(VM_ALLOC_WIRED | VM_ALLOC_ZERO,
9470 Ln_ENTRIES, 0, ~0ul, L2_SIZE, 0, VM_MEMATTR_DEFAULT));
9471 }
9472
9473 void __nosanitizeaddress __nosanitizememory
9474 pmap_san_enter(vm_offset_t va)
9475 {
9476 pd_entry_t *l1, *l2;
9477 pt_entry_t *l3;
9478 vm_page_t m;
9479
9480 if (virtual_avail == 0) {
9481 vm_offset_t block;
9482 int slot;
9483 bool first;
9484
9485 /* Temporary shadow map prior to pmap_bootstrap(). */
9486 first = pmap_san_early_l2 == NULL;
9487 if (first)
9488 pmap_san_enter_bootstrap();
9489
9490 l2 = pmap_san_early_l2;
9491 slot = pmap_l2_index(va);
9492
9493 if ((pmap_load(&l2[slot]) & ATTR_DESCR_VALID) == 0) {
9494 MPASS(first);
9495 block = pmap_san_enter_bootstrap_alloc_l2();
9496 pmap_store(&l2[slot],
9497 PHYS_TO_PTE(pmap_early_vtophys(block)) |
9498 PMAP_SAN_PTE_BITS | L2_BLOCK);
9499 dmb(ishst);
9500 }
9501
9502 return;
9503 }
9504
9505 mtx_assert(&kernel_map->system_mtx, MA_OWNED);
9506 l1 = pmap_l1(kernel_pmap, va);
9507 MPASS(l1 != NULL);
9508 if ((pmap_load(l1) & ATTR_DESCR_VALID) == 0) {
9509 m = pmap_san_enter_alloc_l3();
9510 pmap_store(l1, VM_PAGE_TO_PTE(m) | L1_TABLE);
9511 }
9512 l2 = pmap_l1_to_l2(l1, va);
9513 if ((pmap_load(l2) & ATTR_DESCR_VALID) == 0) {
9514 m = pmap_san_enter_alloc_l2();
9515 if (m != NULL) {
9516 pmap_store(l2, VM_PAGE_TO_PTE(m) |
9517 PMAP_SAN_PTE_BITS | L2_BLOCK);
9518 } else {
9519 m = pmap_san_enter_alloc_l3();
9520 pmap_store(l2, VM_PAGE_TO_PTE(m) | L2_TABLE);
9521 }
9522 dmb(ishst);
9523 }
9524 if ((pmap_load(l2) & ATTR_DESCR_MASK) == L2_BLOCK)
9525 return;
9526 l3 = pmap_l2_to_l3(l2, va);
9527 if ((pmap_load(l3) & ATTR_DESCR_VALID) != 0)
9528 return;
9529 m = pmap_san_enter_alloc_l3();
9530 pmap_store(l3, VM_PAGE_TO_PTE(m) | PMAP_SAN_PTE_BITS | L3_PAGE);
9531 dmb(ishst);
9532 }
9533 #endif /* KASAN || KMSAN */
9534
9535 /*
9536 * Track a range of the kernel's virtual address space that is contiguous
9537 * in various mapping attributes.
9538 */
9539 struct pmap_kernel_map_range {
9540 vm_offset_t sva;
9541 pt_entry_t attrs;
9542 int l3pages;
9543 int l3contig;
9544 int l2blocks;
9545 int l2contig;
9546 int l1blocks;
9547 };
9548
9549 static void
9550 sysctl_kmaps_dump(struct sbuf *sb, struct pmap_kernel_map_range *range,
9551 vm_offset_t eva)
9552 {
9553 const char *mode;
9554 int index;
9555
9556 if (eva <= range->sva)
9557 return;
9558
9559 index = range->attrs & ATTR_S1_IDX_MASK;
9560 switch (index) {
9561 case ATTR_S1_IDX(VM_MEMATTR_DEVICE_NP):
9562 mode = "DEV-NP";
9563 break;
9564 case ATTR_S1_IDX(VM_MEMATTR_DEVICE):
9565 mode = "DEV";
9566 break;
9567 case ATTR_S1_IDX(VM_MEMATTR_UNCACHEABLE):
9568 mode = "UC";
9569 break;
9570 case ATTR_S1_IDX(VM_MEMATTR_WRITE_BACK):
9571 mode = "WB";
9572 break;
9573 case ATTR_S1_IDX(VM_MEMATTR_WRITE_THROUGH):
9574 mode = "WT";
9575 break;
9576 default:
9577 printf(
9578 "%s: unknown memory type %x for range 0x%016lx-0x%016lx\n",
9579 __func__, index, range->sva, eva);
9580 mode = "??";
9581 break;
9582 }
9583
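/*
 * Columns: VA range; access flags ('r' always, 'w' if writable, 'x' if
 * EL1-executable, 'X' if EL0-executable, 'u' user or 's' supervisor,
 * 'g' if BTI-guarded); memory type; then counts of L1 blocks, L2
 * contiguous runs, L2 blocks, L3 contiguous runs, and L3 pages.
 */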
9584 sbuf_printf(sb, "0x%016lx-0x%016lx r%c%c%c%c%c %6s %d %d %d %d %d\n",
9585 range->sva, eva,
9586 (range->attrs & ATTR_S1_AP_RW_BIT) == ATTR_S1_AP_RW ? 'w' : '-',
9587 (range->attrs & ATTR_S1_PXN) != 0 ? '-' : 'x',
9588 (range->attrs & ATTR_S1_UXN) != 0 ? '-' : 'X',
9589 (range->attrs & ATTR_S1_AP(ATTR_S1_AP_USER)) != 0 ? 'u' : 's',
9590 (range->attrs & ATTR_S1_GP) != 0 ? 'g' : '-',
9591 mode, range->l1blocks, range->l2contig, range->l2blocks,
9592 range->l3contig, range->l3pages);
9593
9594 /* Reset to sentinel value. */
9595 range->sva = 0xfffffffffffffffful;
9596 }
9597
9598 /*
9599 * Determine whether the attributes specified by a page table entry match those
9600 * being tracked by the current range.
9601 */
9602 static bool
9603 sysctl_kmaps_match(struct pmap_kernel_map_range *range, pt_entry_t attrs)
9604 {
9605
9606 return (range->attrs == attrs);
9607 }
9608
9609 static void
9610 sysctl_kmaps_reinit(struct pmap_kernel_map_range *range, vm_offset_t va,
9611 pt_entry_t attrs)
9612 {
9613
9614 memset(range, 0, sizeof(*range));
9615 range->sva = va;
9616 range->attrs = attrs;
9617 }
9618
9619 /* Get the block/page attributes that correspond to the table attributes */
9620 static pt_entry_t
9621 sysctl_kmaps_table_attrs(pd_entry_t table)
9622 {
9623 pt_entry_t attrs;
9624
9625 attrs = 0;
9626 if ((table & TATTR_UXN_TABLE) != 0)
9627 attrs |= ATTR_S1_UXN;
9628 if ((table & TATTR_PXN_TABLE) != 0)
9629 attrs |= ATTR_S1_PXN;
9630 if ((table & TATTR_AP_TABLE_RO) != 0)
9631 attrs |= ATTR_S1_AP(ATTR_S1_AP_RO);
9632
9633 return (attrs);
9634 }
9635
9636 /* Read the block/page attributes we care about */
9637 static pt_entry_t
9638 sysctl_kmaps_block_attrs(pt_entry_t block)
9639 {
9640 return (block & (ATTR_S1_AP_MASK | ATTR_S1_XN | ATTR_S1_IDX_MASK |
9641 ATTR_S1_GP));
9642 }
9643
9644 /*
9645 * Given a leaf PTE, derive the mapping's attributes. If they do not match
9646 * those of the current run, dump the address range and its attributes, and
9647 * begin a new run.
9648 */
9649 static void
9650 sysctl_kmaps_check(struct sbuf *sb, struct pmap_kernel_map_range *range,
9651 vm_offset_t va, pd_entry_t l0e, pd_entry_t l1e, pd_entry_t l2e,
9652 pt_entry_t l3e)
9653 {
9654 pt_entry_t attrs;
9655
9656 attrs = sysctl_kmaps_table_attrs(l0e);
9657
9658 if ((l1e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9659 attrs |= sysctl_kmaps_block_attrs(l1e);
9660 goto done;
9661 }
9662 attrs |= sysctl_kmaps_table_attrs(l1e);
9663
9664 if ((l2e & ATTR_DESCR_TYPE_MASK) == ATTR_DESCR_TYPE_BLOCK) {
9665 attrs |= sysctl_kmaps_block_attrs(l2e);
9666 goto done;
9667 }
9668 attrs |= sysctl_kmaps_table_attrs(l2e);
9669 attrs |= sysctl_kmaps_block_attrs(l3e);
9670
9671 done:
9672 if (range->sva > va || !sysctl_kmaps_match(range, attrs)) {
9673 sysctl_kmaps_dump(sb, range, va);
9674 sysctl_kmaps_reinit(range, va, attrs);
9675 }
9676 }
9677
9678 static int
9679 sysctl_kmaps(SYSCTL_HANDLER_ARGS)
9680 {
9681 struct pmap_kernel_map_range range;
9682 struct sbuf sbuf, *sb;
9683 pd_entry_t l0e, *l1, l1e, *l2, l2e;
9684 pt_entry_t *l3, l3e;
9685 vm_offset_t sva;
9686 vm_paddr_t pa;
9687 int error, i, j, k, l;
9688
9689 error = sysctl_wire_old_buffer(req, 0);
9690 if (error != 0)
9691 return (error);
9692 sb = &sbuf;
9693 sbuf_new_for_sysctl(sb, NULL, PAGE_SIZE, req);
9694
9695 /* Sentinel value. */
9696 range.sva = 0xfffffffffffffffful;
9697
9698 /*
9699 * Iterate over the kernel page tables without holding the kernel pmap
9700 * lock. Kernel page table pages are never freed, so at worst we will
9701 * observe inconsistencies in the output.
9702 */
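/*
 * With 48-bit virtual addressing, 0xffff000000000000 is the base of the
 * upper (TTBR1) region, so the walk begins at the first kernel L0 entry.
 */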
9703 for (sva = 0xffff000000000000ul, i = pmap_l0_index(sva); i < Ln_ENTRIES;
9704 i++) {
9705 if (i == pmap_l0_index(DMAP_MIN_ADDRESS))
9706 sbuf_printf(sb, "\nDirect map:\n");
9707 else if (i == pmap_l0_index(VM_MIN_KERNEL_ADDRESS))
9708 sbuf_printf(sb, "\nKernel map:\n");
9709 #ifdef KASAN
9710 else if (i == pmap_l0_index(KASAN_MIN_ADDRESS))
9711 sbuf_printf(sb, "\nKASAN shadow map:\n");
9712 #endif
9713 #ifdef KMSAN
9714 else if (i == pmap_l0_index(KMSAN_SHAD_MIN_ADDRESS))
9715 sbuf_printf(sb, "\nKMSAN shadow map:\n");
9716 else if (i == pmap_l0_index(KMSAN_ORIG_MIN_ADDRESS))
9717 sbuf_printf(sb, "\nKMSAN origin map:\n");
9718 #endif
9719
9720 l0e = kernel_pmap->pm_l0[i];
9721 if ((l0e & ATTR_DESCR_VALID) == 0) {
9722 sysctl_kmaps_dump(sb, &range, sva);
9723 sva += L0_SIZE;
9724 continue;
9725 }
9726 pa = PTE_TO_PHYS(l0e);
9727 l1 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9728
9729 for (j = pmap_l1_index(sva); j < Ln_ENTRIES; j++) {
9730 l1e = l1[j];
9731 if ((l1e & ATTR_DESCR_VALID) == 0) {
9732 sysctl_kmaps_dump(sb, &range, sva);
9733 sva += L1_SIZE;
9734 continue;
9735 }
9736 if ((l1e & ATTR_DESCR_MASK) == L1_BLOCK) {
9737 PMAP_ASSERT_L1_BLOCKS_SUPPORTED;
9738 sysctl_kmaps_check(sb, &range, sva, l0e, l1e,
9739 0, 0);
9740 range.l1blocks++;
9741 sva += L1_SIZE;
9742 continue;
9743 }
9744 pa = PTE_TO_PHYS(l1e);
9745 l2 = (pd_entry_t *)PHYS_TO_DMAP(pa);
9746
9747 for (k = pmap_l2_index(sva); k < Ln_ENTRIES; k++) {
9748 l2e = l2[k];
9749 if ((l2e & ATTR_DESCR_VALID) == 0) {
9750 sysctl_kmaps_dump(sb, &range, sva);
9751 sva += L2_SIZE;
9752 continue;
9753 }
9754 if ((l2e & ATTR_DESCR_MASK) == L2_BLOCK) {
9755 sysctl_kmaps_check(sb, &range, sva,
9756 l0e, l1e, l2e, 0);
9757 if ((l2e & ATTR_CONTIGUOUS) != 0)
9758 range.l2contig +=
9759 k % L2C_ENTRIES == 0 ?
9760 1 : 0;
9761 else
9762 range.l2blocks++;
9763 sva += L2_SIZE;
9764 continue;
9765 }
9766 pa = PTE_TO_PHYS(l2e);
9767 l3 = (pt_entry_t *)PHYS_TO_DMAP(pa);
9768
9769 for (l = pmap_l3_index(sva); l < Ln_ENTRIES;
9770 l++, sva += L3_SIZE) {
9771 l3e = l3[l];
9772 if ((l3e & ATTR_DESCR_VALID) == 0) {
9773 sysctl_kmaps_dump(sb, &range,
9774 sva);
9775 continue;
9776 }
9777 sysctl_kmaps_check(sb, &range, sva,
9778 l0e, l1e, l2e, l3e);
9779 if ((l3e & ATTR_CONTIGUOUS) != 0)
9780 range.l3contig +=
9781 l % L3C_ENTRIES == 0 ?
9782 1 : 0;
9783 else
9784 range.l3pages++;
9785 }
9786 }
9787 }
9788 }
9789
9790 error = sbuf_finish(sb);
9791 sbuf_delete(sb);
9792 return (error);
9793 }
9794 SYSCTL_OID(_vm_pmap, OID_AUTO, kernel_maps,
9795 CTLTYPE_STRING | CTLFLAG_RD | CTLFLAG_MPSAFE | CTLFLAG_SKIP,
9796 NULL, 0, sysctl_kmaps, "A",
9797 "Dump kernel address layout");
9798